From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/rgw/CMakeLists.txt | 579 ++ src/rgw/MAINTAINERS.md | 28 + src/rgw/driver/daos/README.md | 47 + src/rgw/driver/dbstore/CMakeLists.txt | 71 + src/rgw/driver/dbstore/README.md | 53 + src/rgw/driver/dbstore/common/connection_pool.h | 147 + src/rgw/driver/dbstore/common/dbstore.cc | 2252 ++++ src/rgw/driver/dbstore/common/dbstore.h | 2016 ++++ src/rgw/driver/dbstore/common/dbstore_log.h | 15 + src/rgw/driver/dbstore/config/sqlite.cc | 2070 ++++ src/rgw/driver/dbstore/config/sqlite.h | 172 + src/rgw/driver/dbstore/config/sqlite_schema.h | 299 + src/rgw/driver/dbstore/config/store.cc | 38 + src/rgw/driver/dbstore/config/store.h | 27 + src/rgw/driver/dbstore/dbstore_main.cc | 199 + src/rgw/driver/dbstore/dbstore_mgr.cc | 140 + src/rgw/driver/dbstore/dbstore_mgr.h | 56 + src/rgw/driver/dbstore/sqlite/CMakeLists.txt | 16 + src/rgw/driver/dbstore/sqlite/connection.cc | 34 + src/rgw/driver/dbstore/sqlite/connection.h | 64 + src/rgw/driver/dbstore/sqlite/error.cc | 37 + src/rgw/driver/dbstore/sqlite/error.h | 81 + src/rgw/driver/dbstore/sqlite/sqliteDB.cc | 2996 ++++++ src/rgw/driver/dbstore/sqlite/sqliteDB.h | 551 + src/rgw/driver/dbstore/sqlite/statement.cc | 196 + src/rgw/driver/dbstore/sqlite/statement.h | 83 + src/rgw/driver/dbstore/tests/CMakeLists.txt | 17 + src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc | 157 + src/rgw/driver/dbstore/tests/dbstore_tests.cc | 1417 +++ src/rgw/driver/immutable_config/store.cc | 422 + src/rgw/driver/immutable_config/store.h | 180 + src/rgw/driver/json_config/store.cc | 177 + src/rgw/driver/json_config/store.h | 27 + src/rgw/driver/rados/cls_fifo_legacy.cc | 2539 +++++ src/rgw/driver/rados/cls_fifo_legacy.h | 334 + src/rgw/driver/rados/config/impl.cc | 129 + src/rgw/driver/rados/config/impl.h | 139 + 
src/rgw/driver/rados/config/period.cc | 230 + src/rgw/driver/rados/config/period_config.cc | 55 + src/rgw/driver/rados/config/realm.cc | 364 + src/rgw/driver/rados/config/store.cc | 52 + src/rgw/driver/rados/config/store.h | 182 + src/rgw/driver/rados/config/zone.cc | 312 + src/rgw/driver/rados/config/zonegroup.cc | 315 + src/rgw/driver/rados/rgw_bucket.cc | 3316 ++++++ src/rgw/driver/rados/rgw_bucket.h | 766 ++ src/rgw/driver/rados/rgw_bucket_sync.cc | 1018 ++ src/rgw/driver/rados/rgw_bucket_sync.h | 416 + src/rgw/driver/rados/rgw_cr_rados.cc | 1165 +++ src/rgw/driver/rados/rgw_cr_rados.h | 1647 +++ src/rgw/driver/rados/rgw_cr_tools.cc | 292 + src/rgw/driver/rados/rgw_cr_tools.h | 85 + src/rgw/driver/rados/rgw_d3n_datacache.cc | 369 + src/rgw/driver/rados/rgw_d3n_datacache.h | 259 + src/rgw/driver/rados/rgw_data_sync.cc | 6762 ++++++++++++ src/rgw/driver/rados/rgw_data_sync.h | 868 ++ src/rgw/driver/rados/rgw_datalog.cc | 1090 ++ src/rgw/driver/rados/rgw_datalog.h | 394 + src/rgw/driver/rados/rgw_datalog_notify.cc | 76 + src/rgw/driver/rados/rgw_datalog_notify.h | 31 + src/rgw/driver/rados/rgw_etag_verifier.cc | 191 + src/rgw/driver/rados/rgw_etag_verifier.h | 90 + src/rgw/driver/rados/rgw_gc.cc | 811 ++ src/rgw/driver/rados/rgw_gc.h | 82 + src/rgw/driver/rados/rgw_gc_log.cc | 55 + src/rgw/driver/rados/rgw_lc_tier.cc | 1310 +++ src/rgw/driver/rados/rgw_lc_tier.h | 51 + src/rgw/driver/rados/rgw_log_backing.cc | 708 ++ src/rgw/driver/rados/rgw_log_backing.h | 394 + src/rgw/driver/rados/rgw_metadata.cc | 233 + src/rgw/driver/rados/rgw_metadata.h | 298 + src/rgw/driver/rados/rgw_notify.cc | 1023 ++ src/rgw/driver/rados/rgw_notify.h | 121 + src/rgw/driver/rados/rgw_obj_manifest.cc | 409 + src/rgw/driver/rados/rgw_obj_manifest.h | 622 ++ src/rgw/driver/rados/rgw_object_expirer_core.cc | 442 + src/rgw/driver/rados/rgw_object_expirer_core.h | 146 + src/rgw/driver/rados/rgw_otp.cc | 211 + src/rgw/driver/rados/rgw_otp.h | 110 + src/rgw/driver/rados/rgw_period.cc | 324 + 
src/rgw/driver/rados/rgw_pubsub_push.cc | 460 + src/rgw/driver/rados/rgw_pubsub_push.h | 47 + src/rgw/driver/rados/rgw_putobj_processor.cc | 761 ++ src/rgw/driver/rados/rgw_putobj_processor.h | 282 + src/rgw/driver/rados/rgw_rados.cc | 10076 ++++++++++++++++++ src/rgw/driver/rados/rgw_rados.h | 1661 +++ src/rgw/driver/rados/rgw_reshard.cc | 1419 +++ src/rgw/driver/rados/rgw_reshard.h | 274 + src/rgw/driver/rados/rgw_rest_bucket.cc | 413 + src/rgw/driver/rados/rgw_rest_bucket.h | 36 + src/rgw/driver/rados/rgw_rest_log.cc | 1268 +++ src/rgw/driver/rados/rgw_rest_log.h | 337 + src/rgw/driver/rados/rgw_rest_pubsub.h | 38 + src/rgw/driver/rados/rgw_rest_realm.cc | 376 + src/rgw/driver/rados/rgw_rest_realm.h | 16 + src/rgw/driver/rados/rgw_rest_user.cc | 1137 +++ src/rgw/driver/rados/rgw_rest_user.h | 36 + src/rgw/driver/rados/rgw_sal_rados.cc | 3846 +++++++ src/rgw/driver/rados/rgw_sal_rados.h | 978 ++ src/rgw/driver/rados/rgw_service.cc | 476 + src/rgw/driver/rados/rgw_service.h | 215 + src/rgw/driver/rados/rgw_sync.cc | 2568 +++++ src/rgw/driver/rados/rgw_sync.h | 547 + src/rgw/driver/rados/rgw_sync_counters.cc | 28 + src/rgw/driver/rados/rgw_sync_counters.h | 25 + src/rgw/driver/rados/rgw_sync_error_repo.cc | 205 + src/rgw/driver/rados/rgw_sync_error_repo.h | 59 + src/rgw/driver/rados/rgw_sync_module.cc | 87 + src/rgw/driver/rados/rgw_sync_module.h | 203 + src/rgw/driver/rados/rgw_sync_module_aws.cc | 1823 ++++ src/rgw/driver/rados/rgw_sync_module_aws.h | 108 + src/rgw/driver/rados/rgw_sync_module_es.cc | 962 ++ src/rgw/driver/rados/rgw_sync_module_es.h | 59 + src/rgw/driver/rados/rgw_sync_module_es_rest.cc | 428 + src/rgw/driver/rados/rgw_sync_module_es_rest.h | 18 + src/rgw/driver/rados/rgw_sync_module_log.cc | 76 + src/rgw/driver/rados/rgw_sync_module_log.h | 15 + src/rgw/driver/rados/rgw_sync_trace.cc | 290 + src/rgw/driver/rados/rgw_sync_trace.h | 141 + src/rgw/driver/rados/rgw_tools.cc | 437 + src/rgw/driver/rados/rgw_tools.h | 276 + 
src/rgw/driver/rados/rgw_trim_bilog.cc | 1445 +++ src/rgw/driver/rados/rgw_trim_bilog.h | 121 + src/rgw/driver/rados/rgw_trim_datalog.cc | 252 + src/rgw/driver/rados/rgw_trim_datalog.h | 28 + src/rgw/driver/rados/rgw_trim_mdlog.cc | 795 ++ src/rgw/driver/rados/rgw_trim_mdlog.h | 25 + src/rgw/driver/rados/rgw_user.cc | 2776 +++++ src/rgw/driver/rados/rgw_user.h | 885 ++ src/rgw/driver/rados/rgw_zone.cc | 1288 +++ src/rgw/driver/rados/rgw_zone.h | 943 ++ src/rgw/jwt-cpp/base.h | 168 + src/rgw/jwt-cpp/jwt.h | 1615 +++ src/rgw/librgw.cc | 89 + src/rgw/picojson/picojson.h | 1177 +++ src/rgw/rgw-gap-list | 456 + src/rgw/rgw-gap-list-comparator | 119 + src/rgw/rgw-orphan-list | 278 + src/rgw/rgw-restore-bucket-index | 250 + src/rgw/rgw_acl.cc | 442 + src/rgw/rgw_acl.h | 414 + src/rgw/rgw_acl_s3.cc | 643 ++ src/rgw/rgw_acl_s3.h | 115 + src/rgw/rgw_acl_swift.cc | 438 + src/rgw/rgw_acl_swift.h | 58 + src/rgw/rgw_acl_types.h | 213 + src/rgw/rgw_admin.cc | 10799 ++++++++++++++++++++ src/rgw/rgw_aio.cc | 138 + src/rgw/rgw_aio.h | 104 + src/rgw/rgw_aio_throttle.cc | 202 + src/rgw/rgw_aio_throttle.h | 133 + src/rgw/rgw_amqp.cc | 1051 ++ src/rgw/rgw_amqp.h | 82 + src/rgw/rgw_appmain.cc | 605 ++ src/rgw/rgw_arn.cc | 387 + src/rgw/rgw_arn.h | 121 + src/rgw/rgw_asio_client.cc | 192 + src/rgw/rgw_asio_client.h | 62 + src/rgw/rgw_asio_frontend.cc | 1199 +++ src/rgw/rgw_asio_frontend.h | 25 + src/rgw/rgw_asio_frontend_timer.h | 66 + src/rgw/rgw_auth.cc | 934 ++ src/rgw/rgw_auth.h | 791 ++ src/rgw/rgw_auth_filters.h | 302 + src/rgw/rgw_auth_keystone.cc | 767 ++ src/rgw/rgw_auth_keystone.h | 202 + src/rgw/rgw_auth_registry.h | 97 + src/rgw/rgw_auth_s3.cc | 1355 +++ src/rgw/rgw_auth_s3.h | 649 ++ src/rgw/rgw_b64.h | 84 + src/rgw/rgw_basic_types.cc | 180 + src/rgw/rgw_basic_types.h | 291 + src/rgw/rgw_bucket.cc | 186 + src/rgw/rgw_bucket.h | 36 + src/rgw/rgw_bucket_encryption.cc | 49 + src/rgw/rgw_bucket_encryption.h | 142 + src/rgw/rgw_bucket_layout.cc | 380 + src/rgw/rgw_bucket_layout.h | 
282 + src/rgw/rgw_bucket_sync_cache.h | 116 + src/rgw/rgw_bucket_types.h | 233 + src/rgw/rgw_cache.cc | 419 + src/rgw/rgw_cache.h | 222 + src/rgw/rgw_client_io.cc | 34 + src/rgw/rgw_client_io.h | 435 + src/rgw/rgw_client_io_filters.h | 454 + src/rgw/rgw_common.cc | 3075 ++++++ src/rgw/rgw_common.h | 1842 ++++ src/rgw/rgw_compression.cc | 236 + src/rgw/rgw_compression.h | 62 + src/rgw/rgw_compression_types.h | 76 + src/rgw/rgw_coroutine.cc | 1130 ++ src/rgw/rgw_coroutine.h | 722 ++ src/rgw/rgw_cors.cc | 193 + src/rgw/rgw_cors.h | 146 + src/rgw/rgw_cors_s3.cc | 246 + src/rgw/rgw_cors_s3.h | 58 + src/rgw/rgw_cors_swift.h | 83 + src/rgw/rgw_cr_rest.cc | 351 + src/rgw/rgw_cr_rest.h | 590 ++ src/rgw/rgw_crypt.cc | 1537 +++ src/rgw/rgw_crypt.h | 174 + src/rgw/rgw_crypt_sanitize.cc | 88 + src/rgw/rgw_crypt_sanitize.h | 68 + src/rgw/rgw_d3n_cacherequest.h | 145 + src/rgw/rgw_dencoder.cc | 41 + src/rgw/rgw_dmclock.h | 52 + src/rgw/rgw_dmclock_async_scheduler.cc | 183 + src/rgw/rgw_dmclock_async_scheduler.h | 217 + src/rgw/rgw_dmclock_scheduler.h | 86 + src/rgw/rgw_dmclock_scheduler_ctx.cc | 178 + src/rgw/rgw_dmclock_scheduler_ctx.h | 119 + src/rgw/rgw_dmclock_sync_scheduler.cc | 117 + src/rgw/rgw_dmclock_sync_scheduler.h | 77 + src/rgw/rgw_env.cc | 158 + src/rgw/rgw_es_main.cc | 76 + src/rgw/rgw_es_query.cc | 696 ++ src/rgw/rgw_es_query.h | 164 + src/rgw/rgw_file.cc | 2787 +++++ src/rgw/rgw_file.h | 2857 ++++++ src/rgw/rgw_flight.cc | 724 ++ src/rgw/rgw_flight.h | 221 + src/rgw/rgw_flight_frontend.cc | 246 + src/rgw/rgw_flight_frontend.h | 86 + src/rgw/rgw_formats.cc | 381 + src/rgw/rgw_formats.h | 134 + src/rgw/rgw_frontend.cc | 105 + src/rgw/rgw_frontend.h | 211 + src/rgw/rgw_gc_log.h | 28 + src/rgw/rgw_http_client.cc | 1223 +++ src/rgw/rgw_http_client.h | 348 + src/rgw/rgw_http_client_curl.cc | 112 + src/rgw/rgw_http_client_curl.h | 29 + src/rgw/rgw_http_client_types.h | 69 + src/rgw/rgw_http_errors.h | 44 + src/rgw/rgw_iam_policy.cc | 1663 +++ src/rgw/rgw_iam_policy.h | 
579 ++ src/rgw/rgw_iam_policy_keywords.gperf | 136 + src/rgw/rgw_iam_policy_keywords.h | 139 + src/rgw/rgw_jsonparser.cc | 133 + src/rgw/rgw_kafka.cc | 742 ++ src/rgw/rgw_kafka.h | 66 + src/rgw/rgw_keystone.cc | 684 ++ src/rgw/rgw_keystone.h | 333 + src/rgw/rgw_kmip_client.cc | 82 + src/rgw/rgw_kmip_client.h | 65 + src/rgw/rgw_kmip_client_impl.cc | 728 ++ src/rgw/rgw_kmip_client_impl.h | 27 + src/rgw/rgw_kms.cc | 1279 +++ src/rgw/rgw_kms.h | 64 + src/rgw/rgw_lc.cc | 2869 ++++++ src/rgw/rgw_lc.h | 640 ++ src/rgw/rgw_lc_s3.cc | 353 + src/rgw/rgw_lc_s3.h | 100 + src/rgw/rgw_ldap.cc | 130 + src/rgw/rgw_ldap.h | 138 + src/rgw/rgw_lib.cc | 610 ++ src/rgw/rgw_lib.h | 209 + src/rgw/rgw_lib_frontend.h | 113 + src/rgw/rgw_loadgen.cc | 131 + src/rgw/rgw_loadgen.h | 72 + src/rgw/rgw_loadgen_process.cc | 147 + src/rgw/rgw_log.cc | 722 ++ src/rgw/rgw_log.h | 289 + src/rgw/rgw_lua.cc | 214 + src/rgw/rgw_lua.h | 67 + src/rgw/rgw_lua_background.cc | 181 + src/rgw/rgw_lua_background.h | 230 + src/rgw/rgw_lua_data_filter.cc | 143 + src/rgw/rgw_lua_data_filter.h | 52 + src/rgw/rgw_lua_request.cc | 906 ++ src/rgw/rgw_lua_request.h | 26 + src/rgw/rgw_lua_utils.cc | 77 + src/rgw/rgw_lua_utils.h | 315 + src/rgw/rgw_lua_version.h | 11 + src/rgw/rgw_main.cc | 188 + src/rgw/rgw_main.h | 134 + src/rgw/rgw_mdlog.h | 185 + src/rgw/rgw_mdlog_types.h | 35 + src/rgw/rgw_meta_sync_status.h | 121 + src/rgw/rgw_metadata.cc | 683 ++ src/rgw/rgw_multi.cc | 103 + src/rgw/rgw_multi.h | 62 + src/rgw/rgw_multi_del.cc | 73 + src/rgw/rgw_multi_del.h | 62 + src/rgw/rgw_multiparser.cc | 47 + src/rgw/rgw_multipart_meta_filter.cc | 32 + src/rgw/rgw_notify_event_type.cc | 119 + src/rgw/rgw_notify_event_type.h | 49 + src/rgw/rgw_obj_manifest.cc | 260 + src/rgw/rgw_obj_types.h | 622 ++ src/rgw/rgw_object_expirer.cc | 106 + src/rgw/rgw_object_lock.cc | 100 + src/rgw/rgw_object_lock.h | 222 + src/rgw/rgw_oidc_provider.cc | 182 + src/rgw/rgw_oidc_provider.h | 121 + src/rgw/rgw_op.cc | 8958 ++++++++++++++++ 
src/rgw/rgw_op.h | 2672 +++++ src/rgw/rgw_op_type.h | 133 + src/rgw/rgw_opa.cc | 97 + src/rgw/rgw_opa.h | 11 + src/rgw/rgw_orphan.cc | 1598 +++ src/rgw/rgw_orphan.h | 304 + src/rgw/rgw_os_lib.cc | 63 + src/rgw/rgw_os_lib.h | 9 + src/rgw/rgw_perf_counters.cc | 78 + src/rgw/rgw_perf_counters.h | 60 + src/rgw/rgw_period.cc | 350 + src/rgw/rgw_period_history.cc | 353 + src/rgw/rgw_period_history.h | 114 + src/rgw/rgw_period_puller.cc | 123 + src/rgw/rgw_period_puller.h | 24 + src/rgw/rgw_period_pusher.cc | 316 + src/rgw/rgw_period_pusher.h | 54 + src/rgw/rgw_placement_types.h | 118 + src/rgw/rgw_policy_s3.cc | 305 + src/rgw/rgw_policy_s3.h | 57 + src/rgw/rgw_polparser.cc | 105 + src/rgw/rgw_pool_types.h | 157 + src/rgw/rgw_process.cc | 472 + src/rgw/rgw_process.h | 159 + src/rgw/rgw_process_env.h | 50 + src/rgw/rgw_public_access.cc | 33 + src/rgw/rgw_public_access.h | 67 + src/rgw/rgw_pubsub.cc | 736 ++ src/rgw/rgw_pubsub.h | 629 ++ src/rgw/rgw_putobj.cc | 99 + src/rgw/rgw_putobj.h | 73 + src/rgw/rgw_quota.cc | 1049 ++ src/rgw/rgw_quota.h | 49 + src/rgw/rgw_quota_types.h | 87 + src/rgw/rgw_ratelimit.h | 292 + src/rgw/rgw_realm.cc | 265 + src/rgw/rgw_realm_reloader.cc | 188 + src/rgw/rgw_realm_reloader.h | 64 + src/rgw/rgw_realm_watcher.cc | 148 + src/rgw/rgw_realm_watcher.h | 66 + src/rgw/rgw_request.h | 40 + src/rgw/rgw_resolve.cc | 45 + src/rgw/rgw_resolve.h | 24 + src/rgw/rgw_rest.cc | 2335 +++++ src/rgw/rgw_rest.h | 819 ++ src/rgw/rgw_rest_admin.h | 12 + src/rgw/rgw_rest_client.cc | 1124 ++ src/rgw/rgw_rest_client.h | 257 + src/rgw/rgw_rest_config.cc | 57 + src/rgw/rgw_rest_config.h | 64 + src/rgw/rgw_rest_conn.cc | 526 + src/rgw/rgw_rest_conn.h | 557 + src/rgw/rgw_rest_iam.cc | 90 + src/rgw/rgw_rest_iam.h | 48 + src/rgw/rgw_rest_info.cc | 49 + src/rgw/rgw_rest_info.h | 33 + src/rgw/rgw_rest_metadata.cc | 321 + src/rgw/rgw_rest_metadata.h | 107 + src/rgw/rgw_rest_oidc_provider.cc | 233 + src/rgw/rgw_rest_oidc_provider.h | 71 + src/rgw/rgw_rest_pubsub.cc | 954 ++ 
src/rgw/rgw_rest_ratelimit.cc | 349 + src/rgw/rgw_rest_ratelimit.h | 34 + src/rgw/rgw_rest_role.cc | 1022 ++ src/rgw/rgw_rest_role.h | 181 + src/rgw/rgw_rest_s3.cc | 6477 ++++++++++++ src/rgw/rgw_rest_s3.h | 1215 +++ src/rgw/rgw_rest_s3website.h | 100 + src/rgw/rgw_rest_sts.cc | 819 ++ src/rgw/rgw_rest_sts.h | 235 + src/rgw/rgw_rest_swift.cc | 3114 ++++++ src/rgw/rgw_rest_swift.h | 685 ++ src/rgw/rgw_rest_usage.cc | 121 + src/rgw/rgw_rest_usage.h | 34 + src/rgw/rgw_rest_user_policy.cc | 413 + src/rgw/rgw_rest_user_policy.h | 73 + src/rgw/rgw_role.cc | 444 + src/rgw/rgw_role.h | 209 + src/rgw/rgw_s3select.cc | 1001 ++ src/rgw/rgw_s3select.h | 10 + src/rgw/rgw_s3select_private.h | 258 + src/rgw/rgw_sal.cc | 402 + src/rgw/rgw_sal.h | 1644 +++ src/rgw/rgw_sal_config.h | 301 + src/rgw/rgw_sal_daos.cc | 2473 +++++ src/rgw/rgw_sal_daos.h | 1054 ++ src/rgw/rgw_sal_dbstore.cc | 2045 ++++ src/rgw/rgw_sal_dbstore.h | 921 ++ src/rgw/rgw_sal_filter.cc | 1370 +++ src/rgw/rgw_sal_filter.h | 921 ++ src/rgw/rgw_sal_fwd.h | 41 + src/rgw/rgw_sal_motr.cc | 4024 ++++++++ src/rgw/rgw_sal_motr.h | 1204 +++ src/rgw/rgw_sal_store.h | 419 + src/rgw/rgw_signal.cc | 91 + src/rgw/rgw_signal.h | 31 + src/rgw/rgw_string.cc | 45 + src/rgw/rgw_string.h | 235 + src/rgw/rgw_sts.cc | 469 + src/rgw/rgw_sts.h | 251 + src/rgw/rgw_swift_auth.cc | 775 ++ src/rgw/rgw_swift_auth.h | 354 + src/rgw/rgw_sync.cc | 24 + src/rgw/rgw_sync_checkpoint.cc | 273 + src/rgw/rgw_sync_checkpoint.h | 35 + src/rgw/rgw_sync_policy.cc | 787 ++ src/rgw/rgw_sync_policy.h | 682 ++ src/rgw/rgw_tag.cc | 67 + src/rgw/rgw_tag.h | 49 + src/rgw/rgw_tag_s3.cc | 66 + src/rgw/rgw_tag_s3.h | 49 + src/rgw/rgw_tar.h | 153 + src/rgw/rgw_token.cc | 144 + src/rgw/rgw_token.h | 170 + src/rgw/rgw_tools.cc | 124 + src/rgw/rgw_torrent.cc | 261 + src/rgw/rgw_torrent.h | 139 + src/rgw/rgw_tracer.cc | 13 + src/rgw/rgw_tracer.h | 34 + src/rgw/rgw_url.cc | 49 + src/rgw/rgw_url.h | 12 + src/rgw/rgw_usage.cc | 171 + src/rgw/rgw_usage.h | 30 + 
src/rgw/rgw_user.cc | 127 + src/rgw/rgw_user_types.h | 158 + src/rgw/rgw_web_idp.h | 26 + src/rgw/rgw_website.cc | 341 + src/rgw/rgw_website.h | 243 + src/rgw/rgw_worker.h | 91 + src/rgw/rgw_xml.cc | 502 + src/rgw/rgw_xml.h | 371 + src/rgw/rgw_xml_enc.cc | 25 + src/rgw/rgw_zone.cc | 1371 +++ src/rgw/rgw_zone_features.h | 47 + src/rgw/rgw_zone_types.h | 625 ++ src/rgw/rgwam.py | 240 + src/rgw/services/svc_bi.h | 44 + src/rgw/services/svc_bi_rados.cc | 509 + src/rgw/services/svc_bi_rados.h | 166 + src/rgw/services/svc_bilog_rados.cc | 220 + src/rgw/services/svc_bilog_rados.h | 60 + src/rgw/services/svc_bucket.cc | 25 + src/rgw/services/svc_bucket.h | 111 + src/rgw/services/svc_bucket_sobj.cc | 644 ++ src/rgw/services/svc_bucket_sobj.h | 180 + src/rgw/services/svc_bucket_sync.h | 55 + src/rgw/services/svc_bucket_sync_sobj.cc | 903 ++ src/rgw/services/svc_bucket_sync_sobj.h | 123 + src/rgw/services/svc_bucket_types.h | 38 + src/rgw/services/svc_cls.cc | 478 + src/rgw/services/svc_cls.h | 166 + src/rgw/services/svc_config_key.h | 31 + src/rgw/services/svc_config_key_rados.cc | 50 + src/rgw/services/svc_config_key_rados.h | 54 + src/rgw/services/svc_finisher.cc | 58 + src/rgw/services/svc_finisher.h | 44 + src/rgw/services/svc_mdlog.cc | 549 + src/rgw/services/svc_mdlog.h | 118 + src/rgw/services/svc_meta.cc | 46 + src/rgw/services/svc_meta.h | 48 + src/rgw/services/svc_meta_be.cc | 193 + src/rgw/services/svc_meta_be.h | 294 + src/rgw/services/svc_meta_be_otp.cc | 73 + src/rgw/services/svc_meta_be_otp.h | 89 + src/rgw/services/svc_meta_be_params.h | 25 + src/rgw/services/svc_meta_be_sobj.cc | 246 + src/rgw/services/svc_meta_be_sobj.h | 194 + src/rgw/services/svc_meta_be_types.h | 26 + src/rgw/services/svc_notify.cc | 515 + src/rgw/services/svc_notify.h | 106 + src/rgw/services/svc_otp.cc | 186 + src/rgw/services/svc_otp.h | 95 + src/rgw/services/svc_otp_types.h | 29 + src/rgw/services/svc_quota.cc | 18 + src/rgw/services/svc_quota.h | 22 + src/rgw/services/svc_rados.cc | 
445 + src/rgw/services/svc_rados.h | 252 + src/rgw/services/svc_role_rados.cc | 82 + src/rgw/services/svc_role_rados.h | 50 + src/rgw/services/svc_sync_modules.cc | 44 + src/rgw/services/svc_sync_modules.h | 34 + src/rgw/services/svc_sys_obj.cc | 183 + src/rgw/services/svc_sys_obj.h | 270 + src/rgw/services/svc_sys_obj_cache.cc | 670 ++ src/rgw/services/svc_sys_obj_cache.h | 222 + src/rgw/services/svc_sys_obj_core.cc | 666 ++ src/rgw/services/svc_sys_obj_core.h | 145 + src/rgw/services/svc_sys_obj_core_types.h | 34 + src/rgw/services/svc_sys_obj_types.h | 15 + src/rgw/services/svc_tier_rados.cc | 36 + src/rgw/services/svc_tier_rados.h | 154 + src/rgw/services/svc_user.cc | 11 + src/rgw/services/svc_user.h | 127 + src/rgw/services/svc_user_rados.cc | 968 ++ src/rgw/services/svc_user_rados.h | 211 + src/rgw/services/svc_zone.cc | 1100 ++ src/rgw/services/svc_zone.h | 165 + src/rgw/services/svc_zone_utils.cc | 64 + src/rgw/services/svc_zone_utils.h | 38 + 495 files changed, 242939 insertions(+) create mode 100644 src/rgw/CMakeLists.txt create mode 100644 src/rgw/MAINTAINERS.md create mode 100644 src/rgw/driver/daos/README.md create mode 100644 src/rgw/driver/dbstore/CMakeLists.txt create mode 100644 src/rgw/driver/dbstore/README.md create mode 100644 src/rgw/driver/dbstore/common/connection_pool.h create mode 100644 src/rgw/driver/dbstore/common/dbstore.cc create mode 100644 src/rgw/driver/dbstore/common/dbstore.h create mode 100644 src/rgw/driver/dbstore/common/dbstore_log.h create mode 100644 src/rgw/driver/dbstore/config/sqlite.cc create mode 100644 src/rgw/driver/dbstore/config/sqlite.h create mode 100644 src/rgw/driver/dbstore/config/sqlite_schema.h create mode 100644 src/rgw/driver/dbstore/config/store.cc create mode 100644 src/rgw/driver/dbstore/config/store.h create mode 100644 src/rgw/driver/dbstore/dbstore_main.cc create mode 100644 src/rgw/driver/dbstore/dbstore_mgr.cc create mode 100644 src/rgw/driver/dbstore/dbstore_mgr.h create mode 100644 
src/rgw/driver/dbstore/sqlite/CMakeLists.txt create mode 100644 src/rgw/driver/dbstore/sqlite/connection.cc create mode 100644 src/rgw/driver/dbstore/sqlite/connection.h create mode 100644 src/rgw/driver/dbstore/sqlite/error.cc create mode 100644 src/rgw/driver/dbstore/sqlite/error.h create mode 100644 src/rgw/driver/dbstore/sqlite/sqliteDB.cc create mode 100644 src/rgw/driver/dbstore/sqlite/sqliteDB.h create mode 100644 src/rgw/driver/dbstore/sqlite/statement.cc create mode 100644 src/rgw/driver/dbstore/sqlite/statement.h create mode 100644 src/rgw/driver/dbstore/tests/CMakeLists.txt create mode 100644 src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc create mode 100644 src/rgw/driver/dbstore/tests/dbstore_tests.cc create mode 100644 src/rgw/driver/immutable_config/store.cc create mode 100644 src/rgw/driver/immutable_config/store.h create mode 100644 src/rgw/driver/json_config/store.cc create mode 100644 src/rgw/driver/json_config/store.h create mode 100644 src/rgw/driver/rados/cls_fifo_legacy.cc create mode 100644 src/rgw/driver/rados/cls_fifo_legacy.h create mode 100644 src/rgw/driver/rados/config/impl.cc create mode 100644 src/rgw/driver/rados/config/impl.h create mode 100644 src/rgw/driver/rados/config/period.cc create mode 100644 src/rgw/driver/rados/config/period_config.cc create mode 100644 src/rgw/driver/rados/config/realm.cc create mode 100644 src/rgw/driver/rados/config/store.cc create mode 100644 src/rgw/driver/rados/config/store.h create mode 100644 src/rgw/driver/rados/config/zone.cc create mode 100644 src/rgw/driver/rados/config/zonegroup.cc create mode 100644 src/rgw/driver/rados/rgw_bucket.cc create mode 100644 src/rgw/driver/rados/rgw_bucket.h create mode 100644 src/rgw/driver/rados/rgw_bucket_sync.cc create mode 100644 src/rgw/driver/rados/rgw_bucket_sync.h create mode 100644 src/rgw/driver/rados/rgw_cr_rados.cc create mode 100644 src/rgw/driver/rados/rgw_cr_rados.h create mode 100644 src/rgw/driver/rados/rgw_cr_tools.cc create mode 100644 
src/rgw/driver/rados/rgw_cr_tools.h create mode 100644 src/rgw/driver/rados/rgw_d3n_datacache.cc create mode 100644 src/rgw/driver/rados/rgw_d3n_datacache.h create mode 100644 src/rgw/driver/rados/rgw_data_sync.cc create mode 100644 src/rgw/driver/rados/rgw_data_sync.h create mode 100644 src/rgw/driver/rados/rgw_datalog.cc create mode 100644 src/rgw/driver/rados/rgw_datalog.h create mode 100644 src/rgw/driver/rados/rgw_datalog_notify.cc create mode 100644 src/rgw/driver/rados/rgw_datalog_notify.h create mode 100644 src/rgw/driver/rados/rgw_etag_verifier.cc create mode 100644 src/rgw/driver/rados/rgw_etag_verifier.h create mode 100644 src/rgw/driver/rados/rgw_gc.cc create mode 100644 src/rgw/driver/rados/rgw_gc.h create mode 100644 src/rgw/driver/rados/rgw_gc_log.cc create mode 100644 src/rgw/driver/rados/rgw_lc_tier.cc create mode 100644 src/rgw/driver/rados/rgw_lc_tier.h create mode 100644 src/rgw/driver/rados/rgw_log_backing.cc create mode 100644 src/rgw/driver/rados/rgw_log_backing.h create mode 100644 src/rgw/driver/rados/rgw_metadata.cc create mode 100644 src/rgw/driver/rados/rgw_metadata.h create mode 100644 src/rgw/driver/rados/rgw_notify.cc create mode 100644 src/rgw/driver/rados/rgw_notify.h create mode 100644 src/rgw/driver/rados/rgw_obj_manifest.cc create mode 100644 src/rgw/driver/rados/rgw_obj_manifest.h create mode 100644 src/rgw/driver/rados/rgw_object_expirer_core.cc create mode 100644 src/rgw/driver/rados/rgw_object_expirer_core.h create mode 100644 src/rgw/driver/rados/rgw_otp.cc create mode 100644 src/rgw/driver/rados/rgw_otp.h create mode 100644 src/rgw/driver/rados/rgw_period.cc create mode 100644 src/rgw/driver/rados/rgw_pubsub_push.cc create mode 100644 src/rgw/driver/rados/rgw_pubsub_push.h create mode 100644 src/rgw/driver/rados/rgw_putobj_processor.cc create mode 100644 src/rgw/driver/rados/rgw_putobj_processor.h create mode 100644 src/rgw/driver/rados/rgw_rados.cc create mode 100644 src/rgw/driver/rados/rgw_rados.h create mode 100644 
src/rgw/driver/rados/rgw_reshard.cc create mode 100644 src/rgw/driver/rados/rgw_reshard.h create mode 100644 src/rgw/driver/rados/rgw_rest_bucket.cc create mode 100644 src/rgw/driver/rados/rgw_rest_bucket.h create mode 100644 src/rgw/driver/rados/rgw_rest_log.cc create mode 100644 src/rgw/driver/rados/rgw_rest_log.h create mode 100644 src/rgw/driver/rados/rgw_rest_pubsub.h create mode 100644 src/rgw/driver/rados/rgw_rest_realm.cc create mode 100644 src/rgw/driver/rados/rgw_rest_realm.h create mode 100644 src/rgw/driver/rados/rgw_rest_user.cc create mode 100644 src/rgw/driver/rados/rgw_rest_user.h create mode 100644 src/rgw/driver/rados/rgw_sal_rados.cc create mode 100644 src/rgw/driver/rados/rgw_sal_rados.h create mode 100644 src/rgw/driver/rados/rgw_service.cc create mode 100644 src/rgw/driver/rados/rgw_service.h create mode 100644 src/rgw/driver/rados/rgw_sync.cc create mode 100644 src/rgw/driver/rados/rgw_sync.h create mode 100644 src/rgw/driver/rados/rgw_sync_counters.cc create mode 100644 src/rgw/driver/rados/rgw_sync_counters.h create mode 100644 src/rgw/driver/rados/rgw_sync_error_repo.cc create mode 100644 src/rgw/driver/rados/rgw_sync_error_repo.h create mode 100644 src/rgw/driver/rados/rgw_sync_module.cc create mode 100644 src/rgw/driver/rados/rgw_sync_module.h create mode 100644 src/rgw/driver/rados/rgw_sync_module_aws.cc create mode 100644 src/rgw/driver/rados/rgw_sync_module_aws.h create mode 100644 src/rgw/driver/rados/rgw_sync_module_es.cc create mode 100644 src/rgw/driver/rados/rgw_sync_module_es.h create mode 100644 src/rgw/driver/rados/rgw_sync_module_es_rest.cc create mode 100644 src/rgw/driver/rados/rgw_sync_module_es_rest.h create mode 100644 src/rgw/driver/rados/rgw_sync_module_log.cc create mode 100644 src/rgw/driver/rados/rgw_sync_module_log.h create mode 100644 src/rgw/driver/rados/rgw_sync_trace.cc create mode 100644 src/rgw/driver/rados/rgw_sync_trace.h create mode 100644 src/rgw/driver/rados/rgw_tools.cc create mode 100644 
src/rgw/driver/rados/rgw_tools.h create mode 100644 src/rgw/driver/rados/rgw_trim_bilog.cc create mode 100644 src/rgw/driver/rados/rgw_trim_bilog.h create mode 100644 src/rgw/driver/rados/rgw_trim_datalog.cc create mode 100644 src/rgw/driver/rados/rgw_trim_datalog.h create mode 100644 src/rgw/driver/rados/rgw_trim_mdlog.cc create mode 100644 src/rgw/driver/rados/rgw_trim_mdlog.h create mode 100644 src/rgw/driver/rados/rgw_user.cc create mode 100644 src/rgw/driver/rados/rgw_user.h create mode 100644 src/rgw/driver/rados/rgw_zone.cc create mode 100644 src/rgw/driver/rados/rgw_zone.h create mode 100644 src/rgw/jwt-cpp/base.h create mode 100644 src/rgw/jwt-cpp/jwt.h create mode 100644 src/rgw/librgw.cc create mode 100644 src/rgw/picojson/picojson.h create mode 100755 src/rgw/rgw-gap-list create mode 100755 src/rgw/rgw-gap-list-comparator create mode 100755 src/rgw/rgw-orphan-list create mode 100755 src/rgw/rgw-restore-bucket-index create mode 100644 src/rgw/rgw_acl.cc create mode 100644 src/rgw/rgw_acl.h create mode 100644 src/rgw/rgw_acl_s3.cc create mode 100644 src/rgw/rgw_acl_s3.h create mode 100644 src/rgw/rgw_acl_swift.cc create mode 100644 src/rgw/rgw_acl_swift.h create mode 100644 src/rgw/rgw_acl_types.h create mode 100644 src/rgw/rgw_admin.cc create mode 100644 src/rgw/rgw_aio.cc create mode 100644 src/rgw/rgw_aio.h create mode 100644 src/rgw/rgw_aio_throttle.cc create mode 100644 src/rgw/rgw_aio_throttle.h create mode 100644 src/rgw/rgw_amqp.cc create mode 100644 src/rgw/rgw_amqp.h create mode 100644 src/rgw/rgw_appmain.cc create mode 100644 src/rgw/rgw_arn.cc create mode 100644 src/rgw/rgw_arn.h create mode 100644 src/rgw/rgw_asio_client.cc create mode 100644 src/rgw/rgw_asio_client.h create mode 100644 src/rgw/rgw_asio_frontend.cc create mode 100644 src/rgw/rgw_asio_frontend.h create mode 100644 src/rgw/rgw_asio_frontend_timer.h create mode 100644 src/rgw/rgw_auth.cc create mode 100644 src/rgw/rgw_auth.h create mode 100644 src/rgw/rgw_auth_filters.h create 
mode 100644 src/rgw/rgw_auth_keystone.cc create mode 100644 src/rgw/rgw_auth_keystone.h create mode 100644 src/rgw/rgw_auth_registry.h create mode 100644 src/rgw/rgw_auth_s3.cc create mode 100644 src/rgw/rgw_auth_s3.h create mode 100644 src/rgw/rgw_b64.h create mode 100644 src/rgw/rgw_basic_types.cc create mode 100644 src/rgw/rgw_basic_types.h create mode 100644 src/rgw/rgw_bucket.cc create mode 100644 src/rgw/rgw_bucket.h create mode 100644 src/rgw/rgw_bucket_encryption.cc create mode 100644 src/rgw/rgw_bucket_encryption.h create mode 100644 src/rgw/rgw_bucket_layout.cc create mode 100644 src/rgw/rgw_bucket_layout.h create mode 100644 src/rgw/rgw_bucket_sync_cache.h create mode 100644 src/rgw/rgw_bucket_types.h create mode 100644 src/rgw/rgw_cache.cc create mode 100644 src/rgw/rgw_cache.h create mode 100644 src/rgw/rgw_client_io.cc create mode 100644 src/rgw/rgw_client_io.h create mode 100644 src/rgw/rgw_client_io_filters.h create mode 100644 src/rgw/rgw_common.cc create mode 100644 src/rgw/rgw_common.h create mode 100644 src/rgw/rgw_compression.cc create mode 100644 src/rgw/rgw_compression.h create mode 100644 src/rgw/rgw_compression_types.h create mode 100644 src/rgw/rgw_coroutine.cc create mode 100644 src/rgw/rgw_coroutine.h create mode 100644 src/rgw/rgw_cors.cc create mode 100644 src/rgw/rgw_cors.h create mode 100644 src/rgw/rgw_cors_s3.cc create mode 100644 src/rgw/rgw_cors_s3.h create mode 100644 src/rgw/rgw_cors_swift.h create mode 100644 src/rgw/rgw_cr_rest.cc create mode 100644 src/rgw/rgw_cr_rest.h create mode 100644 src/rgw/rgw_crypt.cc create mode 100644 src/rgw/rgw_crypt.h create mode 100644 src/rgw/rgw_crypt_sanitize.cc create mode 100644 src/rgw/rgw_crypt_sanitize.h create mode 100644 src/rgw/rgw_d3n_cacherequest.h create mode 100644 src/rgw/rgw_dencoder.cc create mode 100644 src/rgw/rgw_dmclock.h create mode 100644 src/rgw/rgw_dmclock_async_scheduler.cc create mode 100644 src/rgw/rgw_dmclock_async_scheduler.h create mode 100644 
src/rgw/rgw_dmclock_scheduler.h create mode 100644 src/rgw/rgw_dmclock_scheduler_ctx.cc create mode 100644 src/rgw/rgw_dmclock_scheduler_ctx.h create mode 100644 src/rgw/rgw_dmclock_sync_scheduler.cc create mode 100644 src/rgw/rgw_dmclock_sync_scheduler.h create mode 100644 src/rgw/rgw_env.cc create mode 100644 src/rgw/rgw_es_main.cc create mode 100644 src/rgw/rgw_es_query.cc create mode 100644 src/rgw/rgw_es_query.h create mode 100644 src/rgw/rgw_file.cc create mode 100644 src/rgw/rgw_file.h create mode 100644 src/rgw/rgw_flight.cc create mode 100644 src/rgw/rgw_flight.h create mode 100644 src/rgw/rgw_flight_frontend.cc create mode 100644 src/rgw/rgw_flight_frontend.h create mode 100644 src/rgw/rgw_formats.cc create mode 100644 src/rgw/rgw_formats.h create mode 100644 src/rgw/rgw_frontend.cc create mode 100644 src/rgw/rgw_frontend.h create mode 100644 src/rgw/rgw_gc_log.h create mode 100644 src/rgw/rgw_http_client.cc create mode 100644 src/rgw/rgw_http_client.h create mode 100644 src/rgw/rgw_http_client_curl.cc create mode 100644 src/rgw/rgw_http_client_curl.h create mode 100644 src/rgw/rgw_http_client_types.h create mode 100644 src/rgw/rgw_http_errors.h create mode 100644 src/rgw/rgw_iam_policy.cc create mode 100644 src/rgw/rgw_iam_policy.h create mode 100644 src/rgw/rgw_iam_policy_keywords.gperf create mode 100644 src/rgw/rgw_iam_policy_keywords.h create mode 100644 src/rgw/rgw_jsonparser.cc create mode 100644 src/rgw/rgw_kafka.cc create mode 100644 src/rgw/rgw_kafka.h create mode 100644 src/rgw/rgw_keystone.cc create mode 100644 src/rgw/rgw_keystone.h create mode 100644 src/rgw/rgw_kmip_client.cc create mode 100644 src/rgw/rgw_kmip_client.h create mode 100644 src/rgw/rgw_kmip_client_impl.cc create mode 100644 src/rgw/rgw_kmip_client_impl.h create mode 100644 src/rgw/rgw_kms.cc create mode 100644 src/rgw/rgw_kms.h create mode 100644 src/rgw/rgw_lc.cc create mode 100644 src/rgw/rgw_lc.h create mode 100644 src/rgw/rgw_lc_s3.cc create mode 100644 
src/rgw/rgw_lc_s3.h create mode 100644 src/rgw/rgw_ldap.cc create mode 100644 src/rgw/rgw_ldap.h create mode 100644 src/rgw/rgw_lib.cc create mode 100644 src/rgw/rgw_lib.h create mode 100644 src/rgw/rgw_lib_frontend.h create mode 100644 src/rgw/rgw_loadgen.cc create mode 100644 src/rgw/rgw_loadgen.h create mode 100644 src/rgw/rgw_loadgen_process.cc create mode 100644 src/rgw/rgw_log.cc create mode 100644 src/rgw/rgw_log.h create mode 100644 src/rgw/rgw_lua.cc create mode 100644 src/rgw/rgw_lua.h create mode 100644 src/rgw/rgw_lua_background.cc create mode 100644 src/rgw/rgw_lua_background.h create mode 100644 src/rgw/rgw_lua_data_filter.cc create mode 100644 src/rgw/rgw_lua_data_filter.h create mode 100644 src/rgw/rgw_lua_request.cc create mode 100644 src/rgw/rgw_lua_request.h create mode 100644 src/rgw/rgw_lua_utils.cc create mode 100644 src/rgw/rgw_lua_utils.h create mode 100644 src/rgw/rgw_lua_version.h create mode 100644 src/rgw/rgw_main.cc create mode 100644 src/rgw/rgw_main.h create mode 100644 src/rgw/rgw_mdlog.h create mode 100644 src/rgw/rgw_mdlog_types.h create mode 100644 src/rgw/rgw_meta_sync_status.h create mode 100644 src/rgw/rgw_metadata.cc create mode 100644 src/rgw/rgw_multi.cc create mode 100644 src/rgw/rgw_multi.h create mode 100644 src/rgw/rgw_multi_del.cc create mode 100644 src/rgw/rgw_multi_del.h create mode 100644 src/rgw/rgw_multiparser.cc create mode 100644 src/rgw/rgw_multipart_meta_filter.cc create mode 100644 src/rgw/rgw_notify_event_type.cc create mode 100644 src/rgw/rgw_notify_event_type.h create mode 100644 src/rgw/rgw_obj_manifest.cc create mode 100644 src/rgw/rgw_obj_types.h create mode 100644 src/rgw/rgw_object_expirer.cc create mode 100644 src/rgw/rgw_object_lock.cc create mode 100644 src/rgw/rgw_object_lock.h create mode 100644 src/rgw/rgw_oidc_provider.cc create mode 100644 src/rgw/rgw_oidc_provider.h create mode 100644 src/rgw/rgw_op.cc create mode 100644 src/rgw/rgw_op.h create mode 100644 src/rgw/rgw_op_type.h create mode 
100644 src/rgw/rgw_opa.cc create mode 100644 src/rgw/rgw_opa.h create mode 100644 src/rgw/rgw_orphan.cc create mode 100644 src/rgw/rgw_orphan.h create mode 100644 src/rgw/rgw_os_lib.cc create mode 100644 src/rgw/rgw_os_lib.h create mode 100644 src/rgw/rgw_perf_counters.cc create mode 100644 src/rgw/rgw_perf_counters.h create mode 100644 src/rgw/rgw_period.cc create mode 100644 src/rgw/rgw_period_history.cc create mode 100644 src/rgw/rgw_period_history.h create mode 100644 src/rgw/rgw_period_puller.cc create mode 100644 src/rgw/rgw_period_puller.h create mode 100644 src/rgw/rgw_period_pusher.cc create mode 100644 src/rgw/rgw_period_pusher.h create mode 100644 src/rgw/rgw_placement_types.h create mode 100644 src/rgw/rgw_policy_s3.cc create mode 100644 src/rgw/rgw_policy_s3.h create mode 100644 src/rgw/rgw_polparser.cc create mode 100644 src/rgw/rgw_pool_types.h create mode 100644 src/rgw/rgw_process.cc create mode 100644 src/rgw/rgw_process.h create mode 100644 src/rgw/rgw_process_env.h create mode 100644 src/rgw/rgw_public_access.cc create mode 100644 src/rgw/rgw_public_access.h create mode 100644 src/rgw/rgw_pubsub.cc create mode 100644 src/rgw/rgw_pubsub.h create mode 100644 src/rgw/rgw_putobj.cc create mode 100644 src/rgw/rgw_putobj.h create mode 100644 src/rgw/rgw_quota.cc create mode 100644 src/rgw/rgw_quota.h create mode 100644 src/rgw/rgw_quota_types.h create mode 100644 src/rgw/rgw_ratelimit.h create mode 100644 src/rgw/rgw_realm.cc create mode 100644 src/rgw/rgw_realm_reloader.cc create mode 100644 src/rgw/rgw_realm_reloader.h create mode 100644 src/rgw/rgw_realm_watcher.cc create mode 100644 src/rgw/rgw_realm_watcher.h create mode 100644 src/rgw/rgw_request.h create mode 100644 src/rgw/rgw_resolve.cc create mode 100644 src/rgw/rgw_resolve.h create mode 100644 src/rgw/rgw_rest.cc create mode 100644 src/rgw/rgw_rest.h create mode 100644 src/rgw/rgw_rest_admin.h create mode 100644 src/rgw/rgw_rest_client.cc create mode 100644 src/rgw/rgw_rest_client.h create 
mode 100644 src/rgw/rgw_rest_config.cc create mode 100644 src/rgw/rgw_rest_config.h create mode 100644 src/rgw/rgw_rest_conn.cc create mode 100644 src/rgw/rgw_rest_conn.h create mode 100644 src/rgw/rgw_rest_iam.cc create mode 100644 src/rgw/rgw_rest_iam.h create mode 100644 src/rgw/rgw_rest_info.cc create mode 100644 src/rgw/rgw_rest_info.h create mode 100644 src/rgw/rgw_rest_metadata.cc create mode 100644 src/rgw/rgw_rest_metadata.h create mode 100644 src/rgw/rgw_rest_oidc_provider.cc create mode 100644 src/rgw/rgw_rest_oidc_provider.h create mode 100644 src/rgw/rgw_rest_pubsub.cc create mode 100644 src/rgw/rgw_rest_ratelimit.cc create mode 100644 src/rgw/rgw_rest_ratelimit.h create mode 100644 src/rgw/rgw_rest_role.cc create mode 100644 src/rgw/rgw_rest_role.h create mode 100644 src/rgw/rgw_rest_s3.cc create mode 100644 src/rgw/rgw_rest_s3.h create mode 100644 src/rgw/rgw_rest_s3website.h create mode 100644 src/rgw/rgw_rest_sts.cc create mode 100644 src/rgw/rgw_rest_sts.h create mode 100644 src/rgw/rgw_rest_swift.cc create mode 100644 src/rgw/rgw_rest_swift.h create mode 100644 src/rgw/rgw_rest_usage.cc create mode 100644 src/rgw/rgw_rest_usage.h create mode 100644 src/rgw/rgw_rest_user_policy.cc create mode 100644 src/rgw/rgw_rest_user_policy.h create mode 100644 src/rgw/rgw_role.cc create mode 100644 src/rgw/rgw_role.h create mode 100644 src/rgw/rgw_s3select.cc create mode 100644 src/rgw/rgw_s3select.h create mode 100644 src/rgw/rgw_s3select_private.h create mode 100644 src/rgw/rgw_sal.cc create mode 100644 src/rgw/rgw_sal.h create mode 100644 src/rgw/rgw_sal_config.h create mode 100644 src/rgw/rgw_sal_daos.cc create mode 100644 src/rgw/rgw_sal_daos.h create mode 100644 src/rgw/rgw_sal_dbstore.cc create mode 100644 src/rgw/rgw_sal_dbstore.h create mode 100644 src/rgw/rgw_sal_filter.cc create mode 100644 src/rgw/rgw_sal_filter.h create mode 100644 src/rgw/rgw_sal_fwd.h create mode 100644 src/rgw/rgw_sal_motr.cc create mode 100644 src/rgw/rgw_sal_motr.h create 
mode 100644 src/rgw/rgw_sal_store.h create mode 100644 src/rgw/rgw_signal.cc create mode 100644 src/rgw/rgw_signal.h create mode 100644 src/rgw/rgw_string.cc create mode 100644 src/rgw/rgw_string.h create mode 100644 src/rgw/rgw_sts.cc create mode 100644 src/rgw/rgw_sts.h create mode 100644 src/rgw/rgw_swift_auth.cc create mode 100644 src/rgw/rgw_swift_auth.h create mode 100644 src/rgw/rgw_sync.cc create mode 100644 src/rgw/rgw_sync_checkpoint.cc create mode 100644 src/rgw/rgw_sync_checkpoint.h create mode 100644 src/rgw/rgw_sync_policy.cc create mode 100644 src/rgw/rgw_sync_policy.h create mode 100644 src/rgw/rgw_tag.cc create mode 100644 src/rgw/rgw_tag.h create mode 100644 src/rgw/rgw_tag_s3.cc create mode 100644 src/rgw/rgw_tag_s3.h create mode 100644 src/rgw/rgw_tar.h create mode 100644 src/rgw/rgw_token.cc create mode 100644 src/rgw/rgw_token.h create mode 100644 src/rgw/rgw_tools.cc create mode 100644 src/rgw/rgw_torrent.cc create mode 100644 src/rgw/rgw_torrent.h create mode 100644 src/rgw/rgw_tracer.cc create mode 100644 src/rgw/rgw_tracer.h create mode 100644 src/rgw/rgw_url.cc create mode 100644 src/rgw/rgw_url.h create mode 100644 src/rgw/rgw_usage.cc create mode 100644 src/rgw/rgw_usage.h create mode 100644 src/rgw/rgw_user.cc create mode 100644 src/rgw/rgw_user_types.h create mode 100644 src/rgw/rgw_web_idp.h create mode 100644 src/rgw/rgw_website.cc create mode 100644 src/rgw/rgw_website.h create mode 100644 src/rgw/rgw_worker.h create mode 100644 src/rgw/rgw_xml.cc create mode 100644 src/rgw/rgw_xml.h create mode 100644 src/rgw/rgw_xml_enc.cc create mode 100644 src/rgw/rgw_zone.cc create mode 100644 src/rgw/rgw_zone_features.h create mode 100644 src/rgw/rgw_zone_types.h create mode 100755 src/rgw/rgwam.py create mode 100644 src/rgw/services/svc_bi.h create mode 100644 src/rgw/services/svc_bi_rados.cc create mode 100644 src/rgw/services/svc_bi_rados.h create mode 100644 src/rgw/services/svc_bilog_rados.cc create mode 100644 
src/rgw/services/svc_bilog_rados.h create mode 100644 src/rgw/services/svc_bucket.cc create mode 100644 src/rgw/services/svc_bucket.h create mode 100644 src/rgw/services/svc_bucket_sobj.cc create mode 100644 src/rgw/services/svc_bucket_sobj.h create mode 100644 src/rgw/services/svc_bucket_sync.h create mode 100644 src/rgw/services/svc_bucket_sync_sobj.cc create mode 100644 src/rgw/services/svc_bucket_sync_sobj.h create mode 100644 src/rgw/services/svc_bucket_types.h create mode 100644 src/rgw/services/svc_cls.cc create mode 100644 src/rgw/services/svc_cls.h create mode 100644 src/rgw/services/svc_config_key.h create mode 100644 src/rgw/services/svc_config_key_rados.cc create mode 100644 src/rgw/services/svc_config_key_rados.h create mode 100644 src/rgw/services/svc_finisher.cc create mode 100644 src/rgw/services/svc_finisher.h create mode 100644 src/rgw/services/svc_mdlog.cc create mode 100644 src/rgw/services/svc_mdlog.h create mode 100644 src/rgw/services/svc_meta.cc create mode 100644 src/rgw/services/svc_meta.h create mode 100644 src/rgw/services/svc_meta_be.cc create mode 100644 src/rgw/services/svc_meta_be.h create mode 100644 src/rgw/services/svc_meta_be_otp.cc create mode 100644 src/rgw/services/svc_meta_be_otp.h create mode 100644 src/rgw/services/svc_meta_be_params.h create mode 100644 src/rgw/services/svc_meta_be_sobj.cc create mode 100644 src/rgw/services/svc_meta_be_sobj.h create mode 100644 src/rgw/services/svc_meta_be_types.h create mode 100644 src/rgw/services/svc_notify.cc create mode 100644 src/rgw/services/svc_notify.h create mode 100644 src/rgw/services/svc_otp.cc create mode 100644 src/rgw/services/svc_otp.h create mode 100644 src/rgw/services/svc_otp_types.h create mode 100644 src/rgw/services/svc_quota.cc create mode 100644 src/rgw/services/svc_quota.h create mode 100644 src/rgw/services/svc_rados.cc create mode 100644 src/rgw/services/svc_rados.h create mode 100644 src/rgw/services/svc_role_rados.cc create mode 100644 
src/rgw/services/svc_role_rados.h create mode 100644 src/rgw/services/svc_sync_modules.cc create mode 100644 src/rgw/services/svc_sync_modules.h create mode 100644 src/rgw/services/svc_sys_obj.cc create mode 100644 src/rgw/services/svc_sys_obj.h create mode 100644 src/rgw/services/svc_sys_obj_cache.cc create mode 100644 src/rgw/services/svc_sys_obj_cache.h create mode 100644 src/rgw/services/svc_sys_obj_core.cc create mode 100644 src/rgw/services/svc_sys_obj_core.h create mode 100644 src/rgw/services/svc_sys_obj_core_types.h create mode 100644 src/rgw/services/svc_sys_obj_types.h create mode 100644 src/rgw/services/svc_tier_rados.cc create mode 100644 src/rgw/services/svc_tier_rados.h create mode 100644 src/rgw/services/svc_user.cc create mode 100644 src/rgw/services/svc_user.h create mode 100644 src/rgw/services/svc_user_rados.cc create mode 100644 src/rgw/services/svc_user_rados.h create mode 100644 src/rgw/services/svc_zone.cc create mode 100644 src/rgw/services/svc_zone.h create mode 100644 src/rgw/services/svc_zone_utils.cc create mode 100644 src/rgw/services/svc_zone_utils.h (limited to 'src/rgw') diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt new file mode 100644 index 000000000..b010f303a --- /dev/null +++ b/src/rgw/CMakeLists.txt @@ -0,0 +1,579 @@ +find_program(GPERF gperf) +if(NOT GPERF) + message(FATAL_ERROR "Can't find gperf") +endif() + +if(WITH_RADOSGW_BACKTRACE_LOGGING) + add_definitions(-D_BACKTRACE_LOGGING) +endif(WITH_RADOSGW_BACKTRACE_LOGGING) + +if(WITH_RADOSGW_SELECT_PARQUET) + set(ARROW_LIBRARIES Arrow::Arrow Arrow::Parquet) + add_definitions(-D_ARROW_EXIST) + message("-- arrow is installed, radosgw/s3select-op is able to process parquet objects") +endif(WITH_RADOSGW_SELECT_PARQUET) + +if(WITH_RADOSGW_ARROW_FLIGHT) + set(ARROW_FLIGHT_LIBRARIES Arrow::Arrow Arrow::Parquet Arrow::Flight utf8proc::utf8proc) # order is important + add_definitions(-D_ARROW_EXIST) + message("-- arrow flight is installed") 
+endif(WITH_RADOSGW_ARROW_FLIGHT) + +function(gperf_generate input output) + add_custom_command( + OUTPUT ${output} + COMMAND ${GPERF} ${input} | sed "s/register //g" > ${output} + DEPENDS ${input} + COMMENT "Generate ${output}" + ) +endfunction() + +find_package(ICU 52.0 COMPONENTS uc REQUIRED) + +set(librgw_common_srcs + services/svc_finisher.cc + services/svc_bi_rados.cc + services/svc_bilog_rados.cc + services/svc_bucket.cc + services/svc_bucket_sobj.cc + services/svc_bucket_sync_sobj.cc + services/svc_cls.cc + services/svc_config_key_rados.cc + services/svc_mdlog.cc + services/svc_meta.cc + services/svc_meta_be.cc + services/svc_meta_be_otp.cc + services/svc_meta_be_sobj.cc + services/svc_notify.cc + services/svc_otp.cc + services/svc_quota.cc + services/svc_sync_modules.cc + services/svc_rados.cc + services/svc_role_rados.cc + services/svc_sys_obj.cc + services/svc_sys_obj_cache.cc + services/svc_sys_obj_core.cc + services/svc_tier_rados.cc + services/svc_user.cc + services/svc_user_rados.cc + services/svc_zone.cc + services/svc_zone_utils.cc + rgw_acl.cc + rgw_acl_s3.cc + rgw_acl_swift.cc + rgw_aio.cc + rgw_aio_throttle.cc + rgw_auth.cc + rgw_auth_s3.cc + rgw_arn.cc + rgw_basic_types.cc + rgw_bucket.cc + rgw_bucket_layout.cc + rgw_cache.cc + rgw_common.cc + rgw_compression.cc + rgw_cors.cc + rgw_cors_s3.cc + rgw_env.cc + rgw_es_query.cc + rgw_formats.cc + rgw_http_client.cc + rgw_keystone.cc + rgw_ldap.cc + rgw_lc.cc + rgw_lc_s3.cc + rgw_metadata.cc + rgw_multi.cc + rgw_multi_del.cc + rgw_multipart_meta_filter.cc + rgw_obj_manifest.cc + rgw_period.cc + rgw_realm.cc + rgw_sync.cc + rgw_sync_policy.cc + rgw_notify_event_type.cc + rgw_period_history.cc + rgw_period_puller.cc + rgw_pubsub.cc + rgw_coroutine.cc + rgw_cr_rest.cc + rgw_op.cc + rgw_policy_s3.cc + rgw_public_access.cc + rgw_putobj.cc + rgw_quota.cc + rgw_resolve.cc + rgw_rest.cc + rgw_rest_client.cc + rgw_rest_config.cc + rgw_rest_conn.cc + rgw_rest_metadata.cc + rgw_rest_ratelimit.cc + 
rgw_rest_role.cc + rgw_rest_s3.cc + rgw_rest_pubsub.cc + rgw_s3select.cc + rgw_role.cc + rgw_sal.cc + rgw_sal_filter.cc + rgw_string.cc + rgw_tag.cc + rgw_tag_s3.cc + rgw_tools.cc + rgw_user.cc + rgw_website.cc + rgw_xml.cc + rgw_torrent.cc + rgw_crypt.cc + rgw_crypt_sanitize.cc + rgw_iam_policy.cc + rgw_rest_user_policy.cc + rgw_zone.cc + rgw_sts.cc + rgw_rest_sts.cc + rgw_perf_counters.cc + rgw_rest_oidc_provider.cc + rgw_rest_iam.cc + rgw_object_lock.cc + rgw_kms.cc + rgw_kmip_client.cc + rgw_url.cc + rgw_oidc_provider.cc + rgw_log.cc + rgw_lua_request.cc + rgw_lua_utils.cc + rgw_lua.cc + rgw_lua_data_filter.cc + rgw_bucket_encryption.cc + rgw_tracer.cc + rgw_lua_background.cc + driver/rados/cls_fifo_legacy.cc + driver/rados/rgw_bucket.cc + driver/rados/rgw_bucket_sync.cc + driver/rados/rgw_cr_rados.cc + driver/rados/rgw_cr_tools.cc + driver/rados/rgw_d3n_datacache.cc + driver/rados/rgw_datalog.cc + driver/rados/rgw_datalog_notify.cc + driver/rados/rgw_data_sync.cc + driver/rados/rgw_etag_verifier.cc + driver/rados/rgw_gc.cc + driver/rados/rgw_gc_log.cc + driver/rados/rgw_lc_tier.cc + driver/rados/rgw_log_backing.cc + driver/rados/rgw_metadata.cc + driver/rados/rgw_notify.cc + driver/rados/rgw_obj_manifest.cc + driver/rados/rgw_object_expirer_core.cc + driver/rados/rgw_otp.cc + driver/rados/rgw_period.cc + driver/rados/rgw_pubsub_push.cc + driver/rados/rgw_putobj_processor.cc + driver/rados/rgw_rados.cc + driver/rados/rgw_reshard.cc + driver/rados/rgw_rest_bucket.cc + driver/rados/rgw_rest_log.cc + driver/rados/rgw_rest_realm.cc + driver/rados/rgw_rest_user.cc + driver/rados/rgw_sal_rados.cc + driver/rados/rgw_service.cc + driver/rados/rgw_sync.cc + driver/rados/rgw_sync_counters.cc + driver/rados/rgw_sync_error_repo.cc + driver/rados/rgw_sync_module.cc + driver/rados/rgw_sync_module_aws.cc + driver/rados/rgw_sync_module_es.cc + driver/rados/rgw_sync_module_es_rest.cc + driver/rados/rgw_sync_module_log.cc + driver/rados/rgw_sync_trace.cc + 
driver/rados/rgw_tools.cc + driver/rados/rgw_trim_bilog.cc + driver/rados/rgw_trim_datalog.cc + driver/rados/rgw_trim_mdlog.cc + driver/rados/rgw_user.cc + driver/rados/rgw_zone.cc) + +list(APPEND librgw_common_srcs + driver/immutable_config/store.cc + driver/json_config/store.cc + driver/rados/config/impl.cc + driver/rados/config/period.cc + driver/rados/config/period_config.cc + driver/rados/config/realm.cc + driver/rados/config/store.cc + driver/rados/config/zone.cc + driver/rados/config/zonegroup.cc) + +if(WITH_RADOSGW_AMQP_ENDPOINT) + list(APPEND librgw_common_srcs rgw_amqp.cc) +endif() +if(WITH_RADOSGW_KAFKA_ENDPOINT) + list(APPEND librgw_common_srcs rgw_kafka.cc) +endif() +if(WITH_RADOSGW_DBSTORE) + add_subdirectory(driver/dbstore) + list(APPEND librgw_common_srcs rgw_sal_dbstore.cc) +endif() +if(WITH_RADOSGW_MOTR) + list(APPEND librgw_common_srcs rgw_sal_motr.cc) +endif() +if(WITH_RADOSGW_DAOS) + list(APPEND librgw_common_srcs rgw_sal_daos.cc) +endif() +if(WITH_JAEGER) + list(APPEND librgw_common_srcs rgw_tracer.cc) +endif() +if(WITH_RADOSGW_ARROW_FLIGHT) + # NOTE: eventually don't want this in common but just in radosgw daemon + # list(APPEND radosgw_srcs rgw_flight.cc rgw_flight_frontend.cc) + list(APPEND librgw_common_srcs rgw_flight.cc rgw_flight_frontend.cc) +endif(WITH_RADOSGW_ARROW_FLIGHT) + + +add_library(rgw_common STATIC ${librgw_common_srcs}) + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wimplicit-const-int-float-conversion" + COMPILER_SUPPORTS_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION) +if(COMPILER_SUPPORTS_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION) + target_compile_definitions(common-objs PRIVATE + HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION) +endif() + +target_link_libraries(rgw_common + PRIVATE + global + cls_2pc_queue_client + cls_cmpomap_client + cls_lock_client + cls_log_client + cls_otp_client + cls_refcount_client + cls_rgw_client + cls_rgw_gc_client + cls_timeindex_client + cls_user_client + cls_version_client + librados + 
rt + ICU::uc + OATH::OATH + dmclock::dmclock + ${CURL_LIBRARIES} + ${EXPAT_LIBRARIES} + ${ARROW_LIBRARIES} + ${ARROW_FLIGHT_LIBRARIES} + ${ALLOC_LIBS} + PUBLIC + ${LUA_LIBRARIES} + RapidJSON::RapidJSON + spawn + fmt::fmt) +target_include_directories(rgw_common + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/services" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw" + PUBLIC "${LUA_INCLUDE_DIR}") +if(WITH_RADOSGW_KAFKA_ENDPOINT) + # used by rgw_kafka.cc + target_link_libraries(rgw_common + PRIVATE + RDKafka::RDKafka) +endif() +if(WITH_RADOSGW_AMQP_ENDPOINT) + # used by rgw_amqp.cc + target_link_libraries(rgw_common + PRIVATE + RabbitMQ::RabbitMQ + OpenSSL::SSL) +endif() +if(WITH_OPENLDAP) + target_link_libraries(rgw_common + PRIVATE + OpenLDAP::OpenLDAP) +endif() +if(WITH_RADOSGW_LUA_PACKAGES) + target_link_libraries(rgw_common + PRIVATE Boost::filesystem StdFilesystem::filesystem) +endif() + +if(WITH_LTTNG) + # rgw/rgw_op.cc includes "tracing/rgw_op.h" + # rgw/rgw_rados.cc includes "tracing/rgw_rados.h" + add_dependencies(rgw_common rgw_op-tp rgw_rados-tp) +endif() + +if(WITH_JAEGER) + add_dependencies(rgw_common jaeger_base) + target_link_libraries(rgw_common PUBLIC jaeger_base) +endif() + +if(WITH_RADOSGW_DBSTORE) + target_link_libraries(rgw_common PRIVATE global dbstore) +endif() + +if(WITH_RADOSGW_MOTR) + find_package(motr REQUIRED) + target_link_libraries(rgw_common PRIVATE motr::motr) +endif() + +if(WITH_RADOSGW_DAOS) + find_package(DAOS REQUIRED) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG ") + target_link_libraries(rgw_common PRIVATE daos dfs ds3 uuid duns) + target_include_directories(rgw_common PRIVATE ${PC_DAOS_INCLUDEDIR} ) + link_directories( ${PC_DAOS_LIBRARY_DIRS} ) +endif() + +set(rgw_a_srcs + rgw_appmain.cc + rgw_asio_client.cc + rgw_asio_frontend.cc + rgw_auth_keystone.cc + rgw_client_io.cc + rgw_file.cc + rgw_frontend.cc + rgw_http_client_curl.cc + rgw_kmip_client_impl.cc + rgw_lib.cc + 
rgw_loadgen.cc + rgw_loadgen_process.cc + rgw_log.cc + rgw_lua_request.cc + rgw_opa.cc + rgw_os_lib.cc + rgw_period_pusher.cc + rgw_process.cc + rgw_realm_reloader.cc + rgw_realm_watcher.cc + rgw_rest_config.cc + rgw_rest_info.cc + rgw_rest_metadata.cc + rgw_rest_ratelimit.cc + rgw_rest_sts.cc + rgw_rest_swift.cc + rgw_rest_usage.cc + rgw_signal.cc + rgw_swift_auth.cc + rgw_usage.cc + rgw_sts.cc + driver/rados/rgw_rest_bucket.cc + driver/rados/rgw_rest_log.cc + driver/rados/rgw_rest_realm.cc) + +gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf + rgw_iam_policy_keywords.frag.cc) +set_source_files_properties(rgw_iam_policy.cc PROPERTIES + OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/rgw/rgw_iam_policy_keywords.frag.cc + COMPILE_FLAGS -I${CMAKE_BINARY_DIR}/src/rgw) + + +add_library(rgw_a STATIC + ${rgw_a_srcs}) + +target_compile_definitions(rgw_a PUBLIC "-DCLS_CLIENT_HIDE_IOCTX") + +target_include_directories(rgw_a + PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados" + PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip") + +if(WITH_RADOSGW_AMQP_ENDPOINT) + find_package(RabbitMQ REQUIRED) +endif() +if(WITH_RADOSGW_KAFKA_ENDPOINT) + find_package(RDKafka 0.9.2 REQUIRED) +endif() + +target_link_libraries(rgw_a + PRIVATE + common_utf8 global + ${CRYPTO_LIBS} + ${ARROW_LIBRARIES} + ${ARROW_FLIGHT_LIBRARIES} + OATH::OATH + PUBLIC + rgw_common + spawn) + +if(WITH_CURL_OPENSSL) + # used by rgw_http_client_curl.cc + target_link_libraries(rgw_a PRIVATE OpenSSL::Crypto) +endif() + +set(rgw_libs rgw_a) + +set(rgw_schedulers_srcs + rgw_dmclock_scheduler_ctx.cc + rgw_dmclock_sync_scheduler.cc + rgw_dmclock_async_scheduler.cc) + +add_library(rgw_schedulers STATIC ${rgw_schedulers_srcs}) +target_link_libraries(rgw_schedulers + PUBLIC dmclock::dmclock spawn) + +set(radosgw_srcs + rgw_main.cc) + +add_executable(radosgw ${radosgw_srcs}) + +if(WITH_RADOSGW_ARROW_FLIGHT) + # 
target_compile_definitions(radosgw PUBLIC WITH_ARROW_FLIGHT) + target_compile_definitions(rgw_common PUBLIC WITH_ARROW_FLIGHT) + target_include_directories(rgw_common + PUBLIC "${CMAKE_SOURCE_DIR}/src/arrow/cpp/src") + # target_include_directories(radosgw PUBLIC Arrow::Arrow) +endif(WITH_RADOSGW_ARROW_FLIGHT) + +target_compile_definitions(radosgw PUBLIC "-DCLS_CLIENT_HIDE_IOCTX") +target_include_directories(radosgw + PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src" + PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw" + PRIVATE "${LUA_INCLUDE_DIR}") + +target_include_directories(radosgw SYSTEM PUBLIC "../rapidjson/include") + +target_link_libraries(radosgw PRIVATE ${rgw_libs} rgw_schedulers kmip) +if(WITH_RADOSGW_BEAST_OPENSSL) + # used by rgw_asio_frontend.cc + target_link_libraries(radosgw PRIVATE OpenSSL::SSL) +endif() +install(TARGETS radosgw DESTINATION bin) + +set(radosgw_admin_srcs + rgw_admin.cc + rgw_sync_checkpoint.cc + rgw_orphan.cc) + +# this is unsatisfying and hopefully temporary; ARROW should not be +# part of radosgw_admin +if(WITH_RADOSGW_ARROW_FLIGHT) + list(APPEND radosgw_admin_srcs rgw_flight.cc) +endif(WITH_RADOSGW_ARROW_FLIGHT) + +add_executable(radosgw-admin ${radosgw_admin_srcs}) +target_link_libraries(radosgw-admin ${rgw_libs} librados + cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global ${LIB_RESOLV} + OATH::OATH + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES}) + +# this is unsatisfying and hopefully temporary; ARROW should not be +# part of radosgw_admin +if(WITH_RADOSGW_ARROW_FLIGHT) + target_link_libraries(radosgw-admin ${ARROW_LIBRARIES} ${ARROW_FLIGHT_LIBRARIES}) +endif(WITH_RADOSGW_ARROW_FLIGHT) + +install(TARGETS radosgw-admin DESTINATION bin) + +set(radosgw_es_srcs + rgw_es_main.cc) +add_executable(radosgw-es ${radosgw_es_srcs}) +target_link_libraries(radosgw-es ${rgw_libs} librados + 
cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global ${LIB_RESOLV} + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES}) +install(TARGETS radosgw-es DESTINATION bin) + +set(radosgw_token_srcs + rgw_token.cc) +add_executable(radosgw-token ${radosgw_token_srcs}) +target_link_libraries(radosgw-token librados + global ${ALLOC_LIBS}) +install(TARGETS radosgw-token DESTINATION bin) + +set(radosgw_object_expirer_srcs + rgw_object_expirer.cc) +add_executable(radosgw-object-expirer ${radosgw_object_expirer_srcs}) +target_link_libraries(radosgw-object-expirer ${rgw_libs} librados + cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global ${LIB_RESOLV} + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES}) +install(TARGETS radosgw-object-expirer DESTINATION bin) + +set(radosgw_polparser_srcs + rgw_polparser.cc) +add_executable(rgw-policy-check ${radosgw_polparser_srcs}) +target_link_libraries(rgw-policy-check ${rgw_libs}) +install(TARGETS rgw-policy-check DESTINATION bin) + +set(librgw_srcs + librgw.cc) +add_library(rgw SHARED ${librgw_srcs}) + +target_compile_definitions(rgw PUBLIC "-DCLS_CLIENT_HIDE_IOCTX") +target_include_directories(rgw + PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src" + PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw" + PRIVATE "${LUA_INCLUDE_DIR}") + +target_include_directories(rgw SYSTEM PUBLIC "../rapidjson/include") + +target_link_libraries(rgw + PRIVATE + ${rgw_libs} + rgw_schedulers + kmip + librados + cls_rgw_client + cls_otp_client + cls_lock_client + cls_refcount_client + cls_log_client + cls_timeindex_client + cls_version_client + cls_user_client + ${LIB_RESOLV} + ${CURL_LIBRARIES} + ${EXPAT_LIBRARIES} + PUBLIC + RapidJSON::RapidJSON + dmclock::dmclock) + +if(WITH_RADOSGW_AMQP_ENDPOINT) + target_link_libraries(rgw PRIVATE 
RabbitMQ::RabbitMQ) + target_link_libraries(rgw PRIVATE OpenSSL::SSL) +endif() + +if(WITH_RADOSGW_KAFKA_ENDPOINT) + target_link_libraries(rgw PRIVATE RDKafka::RDKafka) +endif() + +set_target_properties(rgw PROPERTIES OUTPUT_NAME rgw VERSION 2.0.0 + SOVERSION 2) +install(TARGETS rgw DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +if(WITH_TESTS) + add_executable(ceph_rgw_jsonparser + rgw_jsonparser.cc) + target_link_libraries(ceph_rgw_jsonparser + ${rgw_libs} + global) + + add_executable(ceph_rgw_multiparser + rgw_multiparser.cc) + target_link_libraries(ceph_rgw_multiparser + ${rgw_libs} + global) + + install(TARGETS + ceph_rgw_jsonparser + ceph_rgw_multiparser + DESTINATION bin) +endif(WITH_TESTS) + +install(PROGRAMS + rgw-gap-list + rgw-gap-list-comparator + rgw-orphan-list + rgw-restore-bucket-index + DESTINATION bin) diff --git a/src/rgw/MAINTAINERS.md b/src/rgw/MAINTAINERS.md new file mode 100644 index 000000000..4636a636e --- /dev/null +++ b/src/rgw/MAINTAINERS.md @@ -0,0 +1,28 @@ +# RGW Maintainers + +Maintainers are the default assignee for related tracker issues and pull requests. + +| Component | Name | +|---------------------------------|---------------------------------| +| auth, STS | Pritha Srivastava | +| bucket index, resharding | J. 
Eric Ivancich | +| bucket notifications | Yuval Lifshitz | +| data caching | Mark Kogan | +| garbage collection | Pritha Srivastava | +| http frontends | Casey Bodley | +| lifecycle | Matt Benjamin | +| lua scripting | Yuval Lifshitz | +| multisite | Casey Bodley | +| object i/o | Casey Bodley | +| rgw orchestration, admin APIs | Ali Maredia | +| radosgw-admin | Daniel Gryniewicz | +| rest ops | Daniel Gryniewicz | +| rgw-nfs | Matt Benjamin | +| performance | Mark Kogan | +| s3 select | Gal Salomon | +| storage abstraction layer | Daniel Gryniewicz | + +# Looking for maintainer + +* security (crypto, SSE, CVEs) +* swift api diff --git a/src/rgw/driver/daos/README.md b/src/rgw/driver/daos/README.md new file mode 100644 index 000000000..de6d215a0 --- /dev/null +++ b/src/rgw/driver/daos/README.md @@ -0,0 +1,47 @@ +# DAOS + +Standalone RADOS Gateway (RGW) on [DAOS](http://daos.io/) (Experimental) + +## CMake Option + +Add below cmake option + +```bash + -DWITH_RADOSGW_DAOS=ON +``` + +## Build + +```bash + cd build + ninja [vstart] +``` + +## Running Test cluster + +Edit ceph.conf to add below option + +```conf + [client] + rgw backend store = daos +``` + +Restart vstart cluster or just RGW server + +```bash + [..] RGW=1 ../src/vstart.sh -d +``` + +The above configuration brings up an RGW server on DAOS. + +## Creating a test user + + To create a `testid` user to be used for s3 operations, use the following command: + + ```bash +local akey='0555b35654ad1656d804' +local skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==' + radosgw-admin user create --uid testid \ + --access-key $akey --secret $skey \ + --display-name 'M. 
Tester' --email tester@ceph.com --no-mon-config + ``` diff --git a/src/rgw/driver/dbstore/CMakeLists.txt b/src/rgw/driver/dbstore/CMakeLists.txt new file mode 100644 index 000000000..a3aca7a64 --- /dev/null +++ b/src/rgw/driver/dbstore/CMakeLists.txt @@ -0,0 +1,71 @@ +#need to update cmake version here +cmake_minimum_required(VERSION 3.14.0) +project(dbstore) + +option(USE_SQLITE "Enable SQLITE DB" ON) + +set (CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/common") + +set(dbstore_srcs + common/dbstore_log.h + common/dbstore.h + common/dbstore.cc + config/store.cc) +IF(USE_SQLITE) + list(APPEND dbstore_srcs + config/sqlite.cc + sqlite/connection.cc + sqlite/error.cc + sqlite/statement.cc) +endif() + +set(dbstore_mgr_srcs + dbstore_mgr.h + dbstore_mgr.cc + ) + +add_library(dbstore_lib ${dbstore_srcs}) +target_include_directories(dbstore_lib + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw" + PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/store/rados" + PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") +set(link_targets spawn) +if(WITH_JAEGER) + list(APPEND link_targets jaeger_base) +endif() +list(APPEND link_targets rgw_common) +target_link_libraries(dbstore_lib PUBLIC ${link_targets}) + +set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore_lib) + +IF(USE_SQLITE) + add_subdirectory(sqlite) + set(CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/sqlite") + add_compile_definitions(SQLITE_ENABLED=1) + set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} rgw_common) + set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} sqlite_db) + add_dependencies(sqlite_db dbstore_lib) +ENDIF() + +# add pthread library +set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} pthread) + +find_package(gtest QUIET) +if(WITH_TESTS) + add_subdirectory(tests) +else() + message(WARNING "Gtest not enabled") +endif() + +include_directories(${CMAKE_INCLUDE_DIR}) +add_library(dbstore STATIC ${dbstore_mgr_srcs}) +target_link_libraries(dbstore ${CMAKE_LINK_LIBRARIES}) + +# testing purpose 
+set(dbstore_main_srcs + dbstore_main.cc) + +set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore) +add_executable(dbstore-bin ${dbstore_main_srcs}) +add_dependencies(dbstore-bin dbstore) +target_link_libraries(dbstore-bin ${CMAKE_LINK_LIBRARIES}) diff --git a/src/rgw/driver/dbstore/README.md b/src/rgw/driver/dbstore/README.md new file mode 100644 index 000000000..0867bc2cc --- /dev/null +++ b/src/rgw/driver/dbstore/README.md @@ -0,0 +1,53 @@ +# DBStore +Standalone Rados Gateway (RGW) on DBStore (Experimental) + + +## CMake Option +Add below cmake option (enabled by default) + + -DWITH_RADOSGW_DBSTORE=ON + + +## Build + + cd build + ninja [vstart] + + +## Running Test cluster +Edit ceph.conf to add below option + + [client] + rgw backend store = dbstore + +Start vstart cluster + + [..] RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -n -d + +The above vstart command brings up RGW server on dbstore and creates few default users (eg., testid) to be used for s3 operations. + +`radosgw-admin` can be used to create and remove other users. + + +By default, dbstore creates .db file *'/var/lib/ceph/radosgw/dbstore-default_ns.db'* to store the data. 
This can be configured using below options in ceph.conf + + [client] + dbstore db dir = + dbstore db name prefix = + + +## DBStore Unit Tests +To execute DBStore unit test cases (using Gtest framework), from build directory + + ninja unittest_dbstore_tests + ./bin/unittest_dbstore_tests [logfile] [loglevel] + (default logfile: rgw_dbstore_tests.log, loglevel: 20) + ninja unittest_dbstore_mgr_tests + ./bin/unittest_dbstore_mgr_tests + +To execute Sample test file + + ninja src/rgw/driver/dbstore/install + ./bin/dbstore-bin [logfile] [loglevel] + (default logfile: rgw_dbstore_bin.log, loglevel: 20) + diff --git a/src/rgw/driver/dbstore/common/connection_pool.h b/src/rgw/driver/dbstore/common/connection_pool.h new file mode 100644 index 000000000..07f3c81c3 --- /dev/null +++ b/src/rgw/driver/dbstore/common/connection_pool.h @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include +#include +#include +#include "common/dout.h" + +namespace rgw::dbstore { + +template +class ConnectionHandle; + +/// A thread-safe base class that manages a fixed-size pool of generic database +/// connections and supports the reclamation of ConnectionHandles. This class +/// is the subset of ConnectionPool which doesn't depend on the Factory type. +template +class ConnectionPoolBase { + public: + ConnectionPoolBase(std::size_t max_connections) + : connections(max_connections) + {} + private: + friend class ConnectionHandle; + + // TODO: the caller may detect a connection error that prevents the connection + // from being reused. 
allow them to indicate these errors here + void put(std::unique_ptr connection) + { + auto lock = std::scoped_lock{mutex}; + connections.push_back(std::move(connection)); + + if (connections.size() == 1) { // was empty + cond.notify_one(); + } + } + protected: + std::mutex mutex; + std::condition_variable cond; + boost::circular_buffer> connections; +}; + +/// Handle to a database connection borrowed from the pool. Automatically +/// returns the connection to its pool on the handle's destruction. +template +class ConnectionHandle { + ConnectionPoolBase* pool = nullptr; + std::unique_ptr conn; + public: + ConnectionHandle() noexcept = default; + ConnectionHandle(ConnectionPoolBase* pool, + std::unique_ptr conn) noexcept + : pool(pool), conn(std::move(conn)) {} + + ~ConnectionHandle() { + if (conn) { + pool->put(std::move(conn)); + } + } + + ConnectionHandle(ConnectionHandle&&) = default; + ConnectionHandle& operator=(ConnectionHandle&& o) noexcept { + if (conn) { + pool->put(std::move(conn)); + } + conn = std::move(o.conn); + pool = o.pool; + return *this; + } + + explicit operator bool() const noexcept { return static_cast(conn); } + Connection& operator*() const noexcept { return *conn; } + Connection* operator->() const noexcept { return conn.get(); } + Connection* get() const noexcept { return conn.get(); } +}; + + +// factory_of concept requires the function signature: +// F(const DoutPrefixProvider*) -> std::unique_ptr +template +concept factory_of = requires (F factory, const DoutPrefixProvider* dpp) { + { factory(dpp) } -> std::same_as>; + requires std::move_constructible; +}; + + +/// Generic database connection pool that enforces a limit on open connections. +template Factory> +class ConnectionPool : public ConnectionPoolBase { + public: + ConnectionPool(Factory factory, std::size_t max_connections) + : ConnectionPoolBase(max_connections), + factory(std::move(factory)) + {} + + /// Borrow a connection from the pool. 
If all existing connections are in use, + /// use the connection factory to create another one. If we've reached the + /// limit on open connections, wait on a condition variable for the next one + /// returned to the pool. + auto get(const DoutPrefixProvider* dpp) + -> ConnectionHandle + { + auto lock = std::unique_lock{this->mutex}; + std::unique_ptr conn; + + if (!this->connections.empty()) { + // take an existing connection + conn = std::move(this->connections.front()); + this->connections.pop_front(); + } else if (total < this->connections.capacity()) { + // add another connection to the pool + conn = factory(dpp); + ++total; + } else { + // wait for the next put() + // TODO: support optional_yield + ldpp_dout(dpp, 4) << "ConnectionPool waiting on a connection" << dendl; + this->cond.wait(lock, [&] { return !this->connections.empty(); }); + ldpp_dout(dpp, 4) << "ConnectionPool done waiting" << dendl; + conn = std::move(this->connections.front()); + this->connections.pop_front(); + } + + return {this, std::move(conn)}; + } + private: + Factory factory; + std::size_t total = 0; +}; + +} // namespace rgw::dbstore diff --git a/src/rgw/driver/dbstore/common/dbstore.cc b/src/rgw/driver/dbstore/common/dbstore.cc new file mode 100644 index 000000000..dc5a90c31 --- /dev/null +++ b/src/rgw/driver/dbstore/common/dbstore.cc @@ -0,0 +1,2252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "dbstore.h" + +using namespace std; + +namespace rgw { namespace store { + +map DB::objectmap = {}; + +map DB::getObjectMap() { + return DB::objectmap; +} + +int DB::Initialize(string logfile, int loglevel) +{ + int ret = -1; + const DoutPrefixProvider *dpp = get_def_dpp(); + + if (!cct) { + cout << "Failed to Initialize. 
No ceph Context \n"; + return -1; + } + + if (loglevel > 0) { + cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel); + } + if (!logfile.empty()) { + cct->_log->set_log_file(logfile); + cct->_log->reopen_log_file(); + } + + + db = openDB(dpp); + + if (!db) { + ldpp_dout(dpp, 0) <<"Failed to open database " << dendl; + return ret; + } + + ret = InitializeDBOps(dpp); + + if (ret) { + ldpp_dout(dpp, 0) <<"InitializeDBOps failed " << dendl; + closeDB(dpp); + db = NULL; + return ret; + } + + ldpp_dout(dpp, 0) << "DB successfully initialized - name:" \ + << db_name << "" << dendl; + + return ret; +} + +int DB::createGC(const DoutPrefixProvider *dpp) { + int ret = 0; + /* create gc thread */ + + gc_worker = std::make_unique(dpp, this); + gc_worker->create("db_gc"); + + return ret; +} + +int DB::stopGC() { + if (gc_worker) { + gc_worker->signal_stop(); + gc_worker->join(); + } + return 0; +} + +int DB::Destroy(const DoutPrefixProvider *dpp) +{ + if (!db) + return 0; + + stopGC(); + + closeDB(dpp); + + + ldpp_dout(dpp, 20)<<"DB successfully destroyed - name:" \ + < DB::getDBOp(const DoutPrefixProvider *dpp, std::string_view Op, + const DBOpParams *params) +{ + if (!Op.compare("InsertUser")) + return dbops.InsertUser; + if (!Op.compare("RemoveUser")) + return dbops.RemoveUser; + if (!Op.compare("GetUser")) + return dbops.GetUser; + if (!Op.compare("InsertBucket")) + return dbops.InsertBucket; + if (!Op.compare("UpdateBucket")) + return dbops.UpdateBucket; + if (!Op.compare("RemoveBucket")) + return dbops.RemoveBucket; + if (!Op.compare("GetBucket")) + return dbops.GetBucket; + if (!Op.compare("ListUserBuckets")) + return dbops.ListUserBuckets; + if (!Op.compare("InsertLCEntry")) + return dbops.InsertLCEntry; + if (!Op.compare("RemoveLCEntry")) + return dbops.RemoveLCEntry; + if (!Op.compare("GetLCEntry")) + return dbops.GetLCEntry; + if (!Op.compare("ListLCEntries")) + return dbops.ListLCEntries; + if (!Op.compare("InsertLCHead")) + return dbops.InsertLCHead; + if 
(!Op.compare("RemoveLCHead")) + return dbops.RemoveLCHead; + if (!Op.compare("GetLCHead")) + return dbops.GetLCHead; + + /* Object Operations */ + map::iterator iter; + class ObjectOp* Ob; + + { + const std::lock_guard lk(mtx); + iter = DB::objectmap.find(params->op.bucket.info.bucket.name); + } + + if (iter == DB::objectmap.end()) { + ldpp_dout(dpp, 30)<<"No objectmap found for bucket: " \ + <op.bucket.info.bucket.name << dendl; + /* not found */ + return nullptr; + } + + Ob = iter->second; + + if (!Op.compare("PutObject")) + return Ob->PutObject; + if (!Op.compare("DeleteObject")) + return Ob->DeleteObject; + if (!Op.compare("GetObject")) + return Ob->GetObject; + if (!Op.compare("UpdateObject")) + return Ob->UpdateObject; + if (!Op.compare("ListBucketObjects")) + return Ob->ListBucketObjects; + if (!Op.compare("ListVersionedObjects")) + return Ob->ListVersionedObjects; + if (!Op.compare("PutObjectData")) + return Ob->PutObjectData; + if (!Op.compare("UpdateObjectData")) + return Ob->UpdateObjectData; + if (!Op.compare("GetObjectData")) + return Ob->GetObjectData; + if (!Op.compare("DeleteObjectData")) + return Ob->DeleteObjectData; + if (!Op.compare("DeleteStaleObjectData")) + return Ob->DeleteStaleObjectData; + + return nullptr; +} + +int DB::objectmapInsert(const DoutPrefixProvider *dpp, string bucket, class ObjectOp* ptr) +{ + map::iterator iter; + class ObjectOp *Ob; + + const std::lock_guard lk(mtx); + iter = DB::objectmap.find(bucket); + + if (iter != DB::objectmap.end()) { + // entry already exists + // return success or replace it or + // return error ? 
+ // + // return success for now & delete the newly allocated ptr + ldpp_dout(dpp, 30)<<"Objectmap entry already exists for bucket("\ + <InitializeObjectOps(getDBname(), dpp); + + DB::objectmap.insert(pair(bucket, Ob)); + + return 0; +} + +int DB::objectmapDelete(const DoutPrefixProvider *dpp, string bucket) +{ + map::iterator iter; + + const std::lock_guard lk(mtx); + iter = DB::objectmap.find(bucket); + + if (iter == DB::objectmap.end()) { + // entry doesn't exist + // return success or return error ? + // return success for now + ldpp_dout(dpp, 20)<<"Objectmap entry for bucket("<cct = cct; + + //reset params here + params->user_table = user_table; + params->bucket_table = bucket_table; + params->quota_table = quota_table; + params->lc_entry_table = lc_entry_table; + params->lc_head_table = lc_head_table; + + ret = 0; +out: + return ret; +} + +int DB::ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params) { + int ret = -1; + shared_ptr db_op; + + db_op = getDBOp(dpp, Op, params); + + if (!db_op) { + ldpp_dout(dpp, 0)<<"No db_op found for Op("<Execute(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0)<<"In Process op Execute failed for fop(" << Op << ")" << dendl; + } else { + ldpp_dout(dpp, 20)<<"Successfully processed fop(" << Op << ")" << dendl; + } + + return ret; +} + +int DB::get_user(const DoutPrefixProvider *dpp, + const std::string& query_str, const std::string& query_str_val, + RGWUserInfo& uinfo, map *pattrs, + RGWObjVersionTracker *pobjv_tracker) { + int ret = 0; + + if (query_str.empty() || query_str_val.empty()) { + ldpp_dout(dpp, 0)<<"In GetUser - Invalid query(" << query_str <<"), query_str_val(" << query_str_val <<")" << dendl; + return -1; + } + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.query_str = query_str; + + // validate query_str with UserTable entries names + if (query_str == "username") { + params.op.user.uinfo.display_name = query_str_val; + } else if (query_str == "email") { + 
params.op.user.uinfo.user_email = query_str_val; + } else if (query_str == "access_key") { + RGWAccessKey k(query_str_val, ""); + map keys; + keys[query_str_val] = k; + params.op.user.uinfo.access_keys = keys; + } else if (query_str == "user_id") { + params.op.user.uinfo.user_id = uinfo.user_id; + } else { + ldpp_dout(dpp, 0)<<"In GetUser Invalid query string :" <read_version = params.op.user.user_version; + } + +out: + return ret; +} + +int DB::store_user(const DoutPrefixProvider *dpp, + RGWUserInfo& uinfo, bool exclusive, map *pattrs, + RGWObjVersionTracker *pobjv, RGWUserInfo* pold_info) +{ + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + int ret = 0; + + /* Check if the user already exists and return the old info, caller will have a use for it */ + RGWUserInfo orig_info; + RGWObjVersionTracker objv_tracker = {}; + obj_version& obj_ver = objv_tracker.read_version; + + orig_info.user_id = uinfo.user_id; + ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker); + + if (!ret && obj_ver.ver) { + /* already exists. */ + + if (pold_info) { + *pold_info = orig_info; + } + + if (pobjv && (pobjv->read_version.ver != obj_ver.ver)) { + /* Object version mismatch.. return ECANCELED */ + ret = -ECANCELED; + ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <read_version = obj_ver; + pobjv->write_version = obj_ver; + } + +out: + return ret; +} + +int DB::remove_user(const DoutPrefixProvider *dpp, + RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv) +{ + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + int ret = 0; + + RGWUserInfo orig_info; + RGWObjVersionTracker objv_tracker = {}; + + orig_info.user_id = uinfo.user_id; + ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker); + + if (ret) { + return ret; + } + + if (!ret && objv_tracker.read_version.ver) { + /* already exists. 
*/ + + if (pobjv && (pobjv->read_version.ver != objv_tracker.read_version.ver)) { + /* Object version mismatch.. return ECANCELED */ + ret = -ECANCELED; + ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <& attrs, + RGWBucketInfo& info, + obj_version *pobjv, + obj_version *pep_objv, + real_time creation_time, + rgw_bucket *pmaster_bucket, + uint32_t *pmaster_num_shards, + optional_yield y, + bool exclusive) +{ + /* + * XXX: Simple creation for now. + * + * Referring to RGWRados::create_bucket(), + * Check if bucket already exists, select_bucket_placement, + * is explicit put/remove instance info needed? - should not be ideally + */ + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + int ret = 0; + + /* Check if the bucket already exists and return the old info, caller will have a use for it */ + RGWBucketInfo orig_info; + orig_info.bucket.name = bucket.name; + ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr, nullptr); + + if (!ret && !orig_info.owner.id.empty() && exclusive) { + /* already exists. Return the old info */ + + info = std::move(orig_info); + return ret; + } + + RGWObjVersionTracker& objv_tracker = info.objv_tracker; + + objv_tracker.read_version.clear(); + + if (pobjv) { + objv_tracker.write_version = *pobjv; + } else { + objv_tracker.generate_new_write_ver(cct); + } + params.op.bucket.bucket_version = objv_tracker.write_version; + objv_tracker.read_version = params.op.bucket.bucket_version; + + uint64_t bid = next_bucket_id(); + string s = getDBname() + "." 
+ std::to_string(bid); + bucket.marker = bucket.bucket_id = s; + + info.bucket = bucket; + info.owner = owner.user_id; + info.zonegroup = zonegroup_id; + info.placement_rule = placement_rule; + info.swift_ver_location = swift_ver_location; + info.swift_versioning = (!swift_ver_location.empty()); + + info.requester_pays = false; + if (real_clock::is_zero(creation_time)) { + info.creation_time = ceph::real_clock::now(); + } else { + info.creation_time = creation_time; + } + if (pquota_info) { + info.quota = *pquota_info; + } + + params.op.bucket.info = info; + params.op.bucket.bucket_attrs = attrs; + params.op.bucket.mtime = ceph::real_time(); + params.op.user.uinfo.user_id.id = owner.user_id.id; + + ret = ProcessOp(dpp, "InsertBucket", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"create_bucket failed with err:(" <add(std::move(entry)); + } + + if (query_str == "all") { + // userID/OwnerID may have changed. Update it. + user.id = params.op.bucket.info.owner.id; + } + +out: + return ret; +} + +int DB::update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str, + RGWBucketInfo& info, + bool exclusive, + const rgw_user* powner_id, + map* pattrs, + ceph::real_time* pmtime, + RGWObjVersionTracker* pobjv) +{ + int ret = 0; + DBOpParams params = {}; + obj_version bucket_version; + RGWBucketInfo orig_info; + + /* Check if the bucket already exists and return the old info, caller will have a use for it */ + orig_info.bucket.name = info.bucket.name; + params.op.bucket.info.bucket.name = info.bucket.name; + ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr, + &bucket_version); + + if (ret) { + ldpp_dout(dpp, 0)<<"Failed to read bucket info err:(" <read_version.ver != bucket_version.ver) { + ldpp_dout(dpp, 0)<<"Read version mismatch err:(" <id; + } else { + params.op.user.uinfo.user_id.id = orig_info.owner.id; + } + + /* Update version & mtime */ + params.op.bucket.bucket_version.ver = ++(bucket_version.ver); + + if (pmtime) { + 
params.op.bucket.mtime = *pmtime;; + } else { + params.op.bucket.mtime = ceph::real_time(); + } + + if (query_str == "attrs") { + params.op.query_str = "attrs"; + params.op.bucket.bucket_attrs = *pattrs; + } else if (query_str == "owner") { + /* Update only owner i.e, chown. + * Update creation_time too */ + params.op.query_str = "owner"; + params.op.bucket.info.creation_time = params.op.bucket.mtime; + } else if (query_str == "info") { + params.op.query_str = "info"; + params.op.bucket.info = info; + } else { + ret = -1; + ldpp_dout(dpp, 0)<<"In UpdateBucket Invalid query_str : " << query_str << dendl; + goto out; + } + + ret = ProcessOp(dpp, "UpdateBucket", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In UpdateBucket failed err:(" <read_version = params.op.bucket.bucket_version; + pobjv->write_version = params.op.bucket.bucket_version; + } + +out: + return ret; +} + +/** + * Get ordered listing of the objects in a bucket. + * + * max_p: maximum number of results to return + * bucket: bucket to list contents of + * prefix: only return results that match this prefix + * delim: do not include results that match this string. + * Any skipped results will have the matching portion of their name + * inserted in common_prefixes with a "true" mark. + * marker: if filled in, begin the listing with this object. + * end_marker: if filled in, end the listing with this object. + * result: the objects are put in here. + * common_prefixes: if delim is filled in, any matching prefixes are + * placed here. + * is_truncated: if number of objects in the bucket is bigger than + * max, then truncated. 
+ */ +int DB::Bucket::List::list_objects(const DoutPrefixProvider *dpp, int64_t max, + vector *result, + map *common_prefixes, bool *is_truncated) +{ + int ret = 0; + DB *store = target->get_store(); + int64_t count = 0; + std::string prev_obj; + + DBOpParams db_params = {}; + store->InitializeParams(dpp, &db_params); + + db_params.op.bucket.info = target->get_bucket_info(); + /* XXX: Handle whole marker? key -> name, instance, ns? */ + db_params.op.obj.min_marker = params.marker.name; + db_params.op.obj.max_marker = params.end_marker.name; + db_params.op.obj.prefix = params.prefix + "%"; + db_params.op.list_max_count = max + 1; /* +1 for next_marker */ + + ret = store->ProcessOp(dpp, "ListBucketObjects", &db_params); + + if (ret) { + ldpp_dout(dpp, 0)<<"In ListBucketObjects failed err:(" <= max) { + *is_truncated = true; + next_marker.name = entry.key.name; + next_marker.instance = entry.key.instance; + break; + } + + if (!params.delim.empty()) { + const std::string& objname = entry.key.name; + const int delim_pos = objname.find(params.delim, params.prefix.size()); + if (delim_pos >= 0) { + /* extract key -with trailing delimiter- for CommonPrefix */ + const std::string& prefix_key = + objname.substr(0, delim_pos + params.delim.length()); + + if (common_prefixes && + common_prefixes->find(prefix_key) == common_prefixes->end()) { + next_marker = prefix_key; + (*common_prefixes)[prefix_key] = true; + count++; + } + continue; + } + } + + if (!params.end_marker.name.empty() && + params.end_marker.name.compare(entry.key.name) <= 0) { + // should not include end_marker + *is_truncated = false; + break; + } + count++; + result->push_back(std::move(entry)); + } +out: + return ret; +} + +int DB::raw_obj::InitializeParamsfromRawObj(const DoutPrefixProvider *dpp, + DBOpParams* params) { + int ret = 0; + + if (!params) + return -1; + + params->op.bucket.info.bucket.name = bucket_name; + params->op.obj.state.obj.key.name = obj_name; + params->op.obj.state.obj.key.instance = 
obj_instance; + params->op.obj.state.obj.key.ns = obj_ns; + params->op.obj.obj_id = obj_id; + + if (multipart_part_str != "0.0") { + params->op.obj.is_multipart = true; + } else { + params->op.obj.is_multipart = false; + } + + params->op.obj_data.multipart_part_str = multipart_part_str; + params->op.obj_data.part_num = part_num; + + return ret; +} + +int DB::Object::InitializeParamsfromObject(const DoutPrefixProvider *dpp, + DBOpParams* params) { + int ret = 0; + string bucket = bucket_info.bucket.name; + + if (!params) + return -1; + + params->op.bucket.info.bucket.name = bucket; + params->op.obj.state.obj = obj; + params->op.obj.obj_id = obj_id; + + return ret; +} + +int DB::Object::get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params) { + int ret = 0; + + if (params.op.obj.state.obj.key.name.empty()) { + /* Initialize */ + store->InitializeParams(dpp, ¶ms); + InitializeParamsfromObject(dpp, ¶ms); + } + + ret = store->ProcessOp(dpp, "GetObject", ¶ms); + + /* pick one field check if object exists */ + if (!ret && !params.op.obj.state.exists) { + ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl; + ret = -ENOENT; + } + + return ret; +} + +int DB::Object::obj_omap_set_val_by_key(const DoutPrefixProvider *dpp, + const std::string& key, bufferlist& val, + bool must_exist) { + int ret = 0; + + DBOpParams params = {}; + + ret = get_object_impl(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <ProcessOp(dpp, "UpdateObject", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <& keys, + std::map* vals) +{ + int ret = 0; + DBOpParams params = {}; + std::map omap; + + if (!vals) + return -1; + + ret = get_object_impl(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <ProcessOp(dpp, "UpdateObject", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <& info) +{ + int ret = 0; + 
DBOpParams params = {}; + std::map omap; + + ret = get_object_impl(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <set_instance(buf); +} + +int DB::Object::obj_omap_get_all(const DoutPrefixProvider *dpp, + std::map *m) +{ + int ret = 0; + DBOpParams params = {}; + std::map omap; + + if (!m) + return -1; + + ret = get_object_impl(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" < *m, bool* pmore) +{ + int ret = 0; + DBOpParams params = {}; + std::map omap; + map::iterator iter; + uint64_t count = 0; + + if (!m) + return -1; + + ret = get_object_impl(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <first < marker) + continue; + + if ((++count) > max_count) { + *pmore = true; + break; + } + + (*m)[iter->first] = iter->second; + } + +out: + return ret; +} + +int DB::Object::set_attrs(const DoutPrefixProvider *dpp, + map& setattrs, + map* rmattrs) +{ + int ret = 0; + + DBOpParams params = {}; + rgw::sal::Attrs *attrs; + map::iterator iter; + RGWObjState* state; + + store->InitializeParams(dpp, ¶ms); + InitializeParamsfromObject(dpp, ¶ms); + ret = get_state(dpp, &state, true); + + if (ret && !state->exists) { + ldpp_dout(dpp, 0) <<"get_state failed err:(" <begin(); iter != rmattrs->end(); ++iter) { + (*attrs).erase(iter->first); + } + } + for (iter = setattrs.begin(); iter != setattrs.end(); ++iter) { + (*attrs)[iter->first] = iter->second; + } + + params.op.query_str = "attrs"; + /* As per https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html, + * the only way for users to modify object metadata is to make a copy of the object and + * set the metadata. 
+ * Hence do not update mtime for any other attr changes */ + + ret = store->ProcessOp(dpp, "UpdateObject", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" < *attrset; + + store->InitializeParams(dpp, ¶ms); + InitializeParamsfromObject(dpp, ¶ms); + + ret = store->ProcessOp(dpp, "GetObject", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0) <<"In GetObject failed err:(" <ProcessOp(dpp, "UpdateObject", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <InitializeParams(dpp, ¶ms); + InitializeParamsfromRawObj(dpp, ¶ms); + + ret = db->ProcessOp(dpp, "GetObjectData", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <InitializeParams(dpp, ¶ms); + InitializeParamsfromRawObj(dpp, ¶ms); + + /* XXX: Check for chunk_size ?? */ + params.op.obj_data.offset = ofs; + unsigned write_len = std::min((uint64_t)bl.length() - write_ofs, len); + bl.begin(write_ofs).copy(write_len, params.op.obj_data.data); + params.op.obj_data.size = params.op.obj_data.data.length(); + params.op.obj.state.mtime = real_clock::now(); + + ret = db->ProcessOp(dpp, "PutObjectData", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In PutObjectData failed err:(" <& list_entries) { + int ret = 0; + store = get_store(); + DBOpParams db_params = {}; + + store->InitializeParams(dpp, &db_params); + InitializeParamsfromObject(dpp, &db_params); + + db_params.op.list_max_count = MAX_VERSIONED_OBJECTS; + + ret = store->ProcessOp(dpp, "ListVersionedObjects", &db_params); + + if (ret) { + ldpp_dout(dpp, 0)<<"In ListVersionedObjects failed err:(" <InitializeParams(dpp, ¶ms); + InitializeParamsfromObject(dpp, ¶ms); + params.op.obj.state.obj.key = ent.key; + + ret = get_object_impl(dpp, params); + + if (ret) { + ldpp_dout(dpp, 0) <<"get_object_impl of versioned object failed err:(" <shadow_obj to store ObjectID string */ + s->shadow_obj = params.op.obj.obj_id; + + *state = &obj_state; + **state = *s; + +out: + return ret; + +} + +int DB::Object::get_state(const 
DoutPrefixProvider *dpp, RGWObjState** pstate, bool follow_olh) +{ + return get_obj_state(dpp, bucket_info, obj, follow_olh, pstate); +} + +int DB::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest) +{ + RGWObjState* state; + int r = source->get_state(dpp, &state, true); + if (r < 0) + return r; + if (!state->exists) + return -ENOENT; + if (!state->get_attr(name, dest)) + return -ENODATA; + + return 0; +} + +int DB::Object::Read::prepare(const DoutPrefixProvider *dpp) +{ + DB *store = source->get_store(); + CephContext *cct = store->ctx(); + + bufferlist etag; + + map::iterator iter; + + RGWObjState* astate; + + int r = source->get_state(dpp, &astate, true); + if (r < 0) + return r; + + if (!astate->exists) { + return -ENOENT; + } + + state.obj = astate->obj; + source->obj_id = astate->shadow_obj; + + if (params.target_obj) { + *params.target_obj = state.obj; + } + if (params.attrs) { + *params.attrs = astate->attrset; + if (cct->_conf->subsys.should_gather()) { + for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) { + ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl; + } + } + } + + if (conds.if_match || conds.if_nomatch) { + r = get_attr(dpp, RGW_ATTR_ETAG, etag); + if (r < 0) + return r; + + if (conds.if_match) { + string if_match_str = rgw_string_unquote(conds.if_match); + ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl; + if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + + if (conds.if_nomatch) { + string if_nomatch_str = rgw_string_unquote(conds.if_nomatch); + ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl; + if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) { + return -ERR_NOT_MODIFIED; + } + } + } + + if (params.obj_size) + 
*params.obj_size = astate->size; + if (params.lastmod) + *params.lastmod = astate->mtime; + + return 0; +} + +int DB::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) +{ + if (ofs < 0) { + ofs += obj_size; + if (ofs < 0) + ofs = 0; + end = obj_size - 1; + } else if (end < 0) { + end = obj_size - 1; + } + + if (obj_size > 0) { + if (ofs >= (off_t)obj_size) { + return -ERANGE; + } + if (end >= (off_t)obj_size) { + end = obj_size - 1; + } + } + return 0; +} + +int DB::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp) +{ + DB *store = source->get_store(); + + uint64_t read_ofs = ofs; + uint64_t len, read_len; + + bufferlist read_bl; + uint64_t max_chunk_size = store->get_max_chunk_size(); + + RGWObjState* astate; + int r = source->get_state(dpp, &astate, true); + if (r < 0) + return r; + + if (!astate->exists) { + return -ENOENT; + } + + if (astate->size == 0) { + end = 0; + } else if (end >= (int64_t)astate->size) { + end = astate->size - 1; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + + if (len > max_chunk_size) { + len = max_chunk_size; + } + + int head_data_size = astate->data.length(); + bool reading_from_head = (ofs < head_data_size); + + if (reading_from_head) { + if (astate) { // && astate->prefetch_data)? 
+ if (!ofs && astate->data.length() >= len) { + bl = astate->data; + return bl.length(); + } + + if (ofs < astate->data.length()) { + unsigned copy_len = std::min((uint64_t)head_data_size - ofs, len); + astate->data.begin(ofs).copy(copy_len, bl); + return bl.length(); + } + } + } + + /* tail object */ + int part_num = (ofs / max_chunk_size); + /* XXX: Handle multipart_str */ + raw_obj read_obj(store, source->get_bucket_info().bucket.name, astate->obj.key.name, + astate->obj.key.instance, astate->obj.key.ns, source->obj_id, "0.0", part_num); + + read_len = len; + + ldpp_dout(dpp, 20) << "dbstore->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl; + + // read from non head object + r = read_obj.read(dpp, read_ofs, read_len, bl); + + if (r < 0) { + return r; + } + + return bl.length(); +} + +static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const DB::raw_obj& read_obj, off_t obj_ofs, + off_t len, bool is_head_obj, + RGWObjState* astate, void *arg) +{ + struct db_get_obj_data* d = static_cast(arg); + return d->store->get_obj_iterate_cb(dpp, read_obj, obj_ofs, len, + is_head_obj, astate, arg); +} + +int DB::get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const raw_obj& read_obj, off_t obj_ofs, + off_t len, bool is_head_obj, + RGWObjState* astate, void *arg) +{ + struct db_get_obj_data* d = static_cast(arg); + bufferlist bl; + int r = 0; + + if (is_head_obj) { + bl = astate->data; + } else { + // read from non head object + raw_obj robj = read_obj; + /* read entire data. 
So pass offset as '0' & len as '-1' */ + r = robj.read(dpp, 0, -1, bl); + + if (r <= 0) { + return r; + } + } + + unsigned read_ofs = 0, read_len = 0; + while (read_ofs < bl.length()) { + unsigned chunk_len = std::min((uint64_t)bl.length() - read_ofs, (uint64_t)len); + r = d->client_cb->handle_data(bl, read_ofs, chunk_len); + if (r < 0) + return r; + read_ofs += chunk_len; + read_len += chunk_len; + ldpp_dout(dpp, 20) << "dbstore->get_obj_iterate_cb obj-ofs=" << obj_ofs << " len=" << len << " chunk_len = " << chunk_len << " read_len = " << read_len << dendl; + } + + + d->offset += read_len; + + return read_len; +} + +int DB::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb) +{ + DB *store = source->get_store(); + const uint64_t chunk_size = store->get_max_chunk_size(); + + db_get_obj_data data(store, cb, ofs); + + int r = source->iterate_obj(dpp, source->get_bucket_info(), state.obj, + ofs, end, chunk_size, _get_obj_iterate_cb, &data); + if (r < 0) { + ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl; + return r; + } + + return 0; +} + +int DB::Object::iterate_obj(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, const rgw_obj& obj, + off_t ofs, off_t end, uint64_t max_chunk_size, + iterate_obj_cb cb, void *arg) +{ + DB *store = get_store(); + uint64_t len; + RGWObjState* astate; + + int r = get_state(dpp, &astate, true); + if (r < 0) { + return r; + } + + if (!astate->exists) { + return -ENOENT; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + /* XXX: Will it really help to store all parts info in astate like manifest in Rados? 
*/ + int part_num = 0; + int head_data_size = astate->data.length(); + + while (ofs <= end && (uint64_t)ofs < astate->size) { + part_num = (ofs / max_chunk_size); + uint64_t read_len = std::min(len, max_chunk_size); + + /* XXX: Handle multipart_str */ + raw_obj read_obj(store, get_bucket_info().bucket.name, astate->obj.key.name, + astate->obj.key.instance, astate->obj.key.ns, obj_id, "0.0", part_num); + bool reading_from_head = (ofs < head_data_size); + + r = cb(dpp, read_obj, ofs, read_len, reading_from_head, astate, arg); + if (r <= 0) { + return r; + } + /* r refers to chunk_len (no. of bytes) handled in cb */ + len -= r; + ofs += r; + } + + return 0; +} + +int DB::Object::Write::prepare(const DoutPrefixProvider* dpp) +{ + DB *store = target->get_store(); + + int ret = -1; + + /* XXX: handle assume_noent */ + + obj_state.obj = target->obj; + + if (target->obj_id.empty()) { + if (!target->obj.key.instance.empty() && (target->obj.key.instance != "null")) { + /* versioned object. Set obj_id same as versionID/instance */ + target->obj_id = target->obj.key.instance; + } else { + // generate obj_id + char buf[33]; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + target->obj_id = buf; + } + } + + ret = 0; + return ret; +} + +/* writes tail objects */ +int DB::Object::Write::write_data(const DoutPrefixProvider* dpp, + bufferlist& data, uint64_t ofs) { + DB *store = target->get_store(); + /* tail objects */ + /* XXX: Split into parts each of max_chunk_size. 
But later make tail + * object chunk size limit to sqlite blob limit */ + int part_num = 0; + + uint64_t max_chunk_size = store->get_max_chunk_size(); + + /* tail_obj ofs should be greater than max_head_size */ + if (mp_part_str == "0.0") { // ensure not multipart meta object + if (ofs < store->get_max_head_size()) { + return -1; + } + } + + uint64_t end = data.length(); + uint64_t write_ofs = 0; + /* as we are writing max_chunk_size at a time in sal_dbstore DBAtomicWriter::process(), + * maybe this while loop is not needed + */ + while (write_ofs < end) { + part_num = (ofs / max_chunk_size); + uint64_t len = std::min(end, max_chunk_size); + + /* XXX: Handle multipart_str */ + raw_obj write_obj(store, target->get_bucket_info().bucket.name, obj_state.obj.key.name, + obj_state.obj.key.instance, obj_state.obj.key.ns, target->obj_id, mp_part_str, part_num); + + + ldpp_dout(dpp, 20) << "dbstore->write obj-ofs=" << ofs << " write_len=" << len << dendl; + + // write into non head object + int r = write_obj.write(dpp, ofs, write_ofs, len, data); + if (r < 0) { + return r; + } + /* r refers to chunk_len (no. 
of bytes) handled in raw_obj::write */ + len -= r; + ofs += r; + write_ofs += r; + } + + return 0; +} + +/* Write metadata & head object data */ +int DB::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, + uint64_t size, uint64_t accounted_size, + map& attrs, + bool assume_noent, bool modify_tail) +{ + DB *store = target->get_store(); + + RGWObjState* state = &obj_state; + map *attrset; + DBOpParams params = {}; + int ret = 0; + string etag; + string content_type; + bufferlist acl_bl; + string storage_class; + + map::iterator iter; + + store->InitializeParams(dpp, ¶ms); + target->InitializeParamsfromObject(dpp, ¶ms); + + obj_state = params.op.obj.state; + + if (real_clock::is_zero(meta.set_mtime)) { + meta.set_mtime = real_clock::now(); + } + + attrset = &state->attrset; + if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule()) { + // && meta.flags == PUT_OBJ_CREATE) { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter == attrs.end()) { + real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime); + string mode = target->bucket_info.obj_lock.get_mode(); + RGWObjectRetention obj_retention(mode, lock_until_date); + bufferlist bl; + obj_retention.encode(bl); + (*attrset)[RGW_ATTR_OBJECT_RETENTION] = bl; + } + } + + state->mtime = meta.set_mtime; + + if (meta.data) { + /* if we want to overwrite the data, we also want to overwrite the + xattrs, so just remove the object */ + params.op.obj.head_data = *meta.data; + } + + if (meta.rmattrs) { + for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) { + const string& name = iter->first; + (*attrset).erase(name.c_str()); + } + } + + if (meta.manifest) { + storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class; + + /* remove existing manifest attr */ + iter = attrs.find(RGW_ATTR_MANIFEST); + if (iter != attrs.end()) + attrs.erase(iter); + + bufferlist bl; + encode(*meta.manifest, bl); + 
(*attrset)[RGW_ATTR_MANIFEST] = bl; + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + (*attrset)[name.c_str()] = bl; + + if (name.compare(RGW_ATTR_ETAG) == 0) { + etag = rgw_bl_str(bl); + params.op.obj.etag = etag; + } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) { + content_type = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_ACL) == 0) { + acl_bl = bl; + } + } + + if (!storage_class.empty()) { + bufferlist bl; + bl.append(storage_class); + (*attrset)[RGW_ATTR_STORAGE_CLASS] = bl; + } + + params.op.obj.state = *state ; + params.op.obj.state.exists = true; + params.op.obj.state.size = size; + params.op.obj.state.accounted_size = accounted_size; + params.op.obj.owner = target->get_bucket_info().owner.id; + params.op.obj.category = meta.category; + + if (meta.mtime) { + *meta.mtime = meta.set_mtime; + } + + params.op.query_str = "meta"; + params.op.obj.obj_id = target->obj_id; + + /* Check if versioned */ + bool is_versioned = !target->obj.key.instance.empty() && (target->obj.key.instance != "null"); + params.op.obj.is_versioned = is_versioned; + + if (is_versioned && (params.op.obj.category == RGWObjCategory::Main)) { + /* versioned object */ + params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_VER; + } + ret = store->ProcessOp(dpp, "PutObject", ¶ms); + if (ret) { + ldpp_dout(dpp, 0)<<"In PutObject failed err:(" <list_versioned_objects(dpp, del_params.op.obj.list_entries); + if (ret) { + ldpp_dout(dpp, 0)<<"ListVersionedObjects failed err:(" <get_store(); + + ret = store->ProcessOp(dpp, "DeleteObject", &del_params); + if (ret) { + ldpp_dout(dpp, 0) << "In DeleteObject failed err:(" <ProcessOp(dpp, "UpdateObjectData", &update_params); + + if (ret) { + ldpp_dout(dpp, 0) << "Updating tail objects mtime failed err:(" <get_store(); + bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 
BUCKET_VERSIONS_SUSPENDED); + int ret = -1; + DBOpParams olh_params = {}; + std::string version_id; + DBOpParams next_params = del_params; + + version_id = del_params.op.obj.state.obj.key.instance; + + DBOpParams dm_params = del_params; + + // create delete marker + + store->InitializeParams(dpp, &dm_params); + target->InitializeParamsfromObject(dpp, &dm_params); + dm_params.op.obj.category = RGWObjCategory::None; + + if (versioning_suspended) { + dm_params.op.obj.state.obj.key.instance = "null"; + } else { + store->gen_rand_obj_instance_name(&dm_params.op.obj.state.obj.key); + dm_params.op.obj.obj_id = dm_params.op.obj.state.obj.key.instance; + } + + dm_params.op.obj.flags |= (rgw_bucket_dir_entry::FLAG_DELETE_MARKER); + + ret = store->ProcessOp(dpp, "PutObject", &dm_params); + + if (ret) { + ldpp_dout(dpp, 0) << "delete_olh: failed to create delete marker - err:(" <* entry) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.lc_entry.index = oid; + params.op.lc_entry.entry.set_bucket(marker); + + params.op.query_str = "get_entry"; + ret = ProcessOp(dpp, "GetLCEntry", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <reset(e); + } + +out: + return ret; +} + +int DB::get_next_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.lc_entry.index = oid; + params.op.lc_entry.entry.set_bucket(marker); + + params.op.query_str = "get_next_entry"; + ret = ProcessOp(dpp, "GetLCEntry", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <reset(e); + } + +out: + return ret; +} + +int DB::set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + 
params.op.lc_entry.index = oid; + params.op.lc_entry.entry = entry; + + ret = ProcessOp(dpp, "InsertLCEntry", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In InsertLCEntry failed err:(" <>& entries) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + entries.clear(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.lc_entry.index = oid; + params.op.lc_entry.min_marker = marker; + params.op.list_max_count = max_entries; + + ret = ProcessOp(dpp, "ListLCEntries", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In ListLCEntries failed err:(" <(std::move(entry))); + } + +out: + return ret; +} + +int DB::rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.lc_entry.index = oid; + params.op.lc_entry.entry = entry; + + ret = ProcessOp(dpp, "RemoveLCEntry", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In RemoveLCEntry failed err:(" <* head) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.lc_head.index = oid; + + ret = ProcessOp(dpp, "GetLCHead", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In GetLCHead failed err:(" <(params.op.lc_head.head); + +out: + return ret; +} + +int DB::put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head) +{ + int ret = 0; + const DoutPrefixProvider *dpp = get_def_dpp(); + + DBOpParams params = {}; + InitializeParams(dpp, ¶ms); + + params.op.lc_head.index = oid; + params.op.lc_head.head = head; + + ret = ProcessOp(dpp, "InsertLCHead", ¶ms); + + if (ret) { + ldpp_dout(dpp, 0)<<"In InsertLCHead failed err:(" < lk(mtx); + + ldpp_dout(dpp, 2) << " DB GC started " << dendl; + int max = 100; + RGWUserBuckets buckets; + bool is_truncated = false; + + do { + std::string& marker = bucket_marker; + rgw_user user; + user.id = user_marker; + buckets.clear(); + is_truncated 
= false; + + int r = db->list_buckets(dpp, "all", user, marker, string(), + max, false, &buckets, &is_truncated); + + if (r < 0) { //do nothing? retry later ? + break; + } + + for (const auto& ent : buckets.get_buckets()) { + const std::string &bname = ent.first; + + r = db->delete_stale_objs(dpp, bname, gc_obj_min_wait); + + if (r < 0) { //do nothing? skip to next entry? + ldpp_dout(dpp, 2) << " delete_stale_objs failed for bucket( " << bname <<")" << dendl; + } + bucket_marker = bname; + user_marker = user.id; + + /* XXX: If using locks, unlock here and reacquire in the next iteration */ + cv.wait_for(lk, std::chrono::milliseconds(100)); + if (stop_signalled) { + goto done; + } + } + } while(is_truncated); + + bucket_marker.clear(); + cv.wait_for(lk, std::chrono::milliseconds(gc_interval*10)); + } while(! stop_signalled); + +done: + return nullptr; +} + +} } // namespace rgw::store + diff --git a/src/rgw/driver/dbstore/common/dbstore.h b/src/rgw/driver/dbstore/common/dbstore.h new file mode 100644 index 000000000..b26cc116e --- /dev/null +++ b/src/rgw/driver/dbstore/common/dbstore.h @@ -0,0 +1,2016 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "fmt/format.h" +#include +#include "rgw_sal_store.h" +#include "rgw_common.h" +#include "driver/rados/rgw_bucket.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "common/ceph_context.h" +#include "rgw_obj_manifest.h" +#include "rgw_multi.h" + +namespace rgw { namespace store { + +class DB; + +struct DBOpUserInfo { + RGWUserInfo uinfo = {}; + obj_version user_version; + rgw::sal::Attrs user_attrs; +}; + +struct DBOpBucketInfo { + RGWBucketEnt ent; // maybe not needed. 
not used in create/get_bucket + RGWBucketInfo info; + RGWUser* owner = nullptr; + rgw::sal::Attrs bucket_attrs; + obj_version bucket_version; + ceph::real_time mtime; + // used for list query + std::string min_marker; + std::string max_marker; + std::list list_entries; +}; + +struct DBOpObjectInfo { + RGWAccessControlPolicy acls; + RGWObjState state = {}; + + /* Below are taken from rgw_bucket_dir_entry */ + RGWObjCategory category; + std::string etag; + std::string owner; + std::string owner_display_name; + std::string content_type; + std::string storage_class; + bool appendable; + uint64_t index_ver; + std::string tag; + uint16_t flags; + uint64_t versioned_epoch; + + /* from state.manifest (RGWObjManifest) */ + std::map objs; + uint64_t head_size{0}; + rgw_placement_rule head_placement_rule; + uint64_t max_head_size{0}; + std::string obj_id; + rgw_bucket_placement tail_placement; /* might be different than the original bucket, + as object might have been copied across pools */ + std::map rules; + std::string tail_instance; /* tail object's instance */ + + + /* Obj's omap store */ + std::map omap; + + /* Extra fields */ + bool is_multipart; + std::list mp_parts; + + bufferlist head_data; + std::string min_marker; + std::string max_marker; + std::string prefix; + std::list list_entries; + /* XXX: Maybe use std::vector instead of std::list */ + + /* for versioned objects */ + bool is_versioned; + uint64_t version_num = 0; +}; + +struct DBOpObjectDataInfo { + RGWObjState state; + uint64_t part_num; + std::string multipart_part_str; + uint64_t offset; + uint64_t size; + bufferlist data{}; +}; + +struct DBOpLCHeadInfo { + std::string index; + rgw::sal::StoreLifecycle::StoreLCHead head; +}; + +struct DBOpLCEntryInfo { + std::string index; + rgw::sal::StoreLifecycle::StoreLCEntry entry; + // used for list query + std::string min_marker; + std::list list_entries; +}; + +struct DBOpInfo { + std::string name; // Op name + /* Support only single access_key for now. 
So store + * it separately as primary access_key_id & secret to + * be able to query easily. + * + * XXX: Swift keys and subuser not supported for now */ + DBOpUserInfo user; + std::string query_str; + DBOpBucketInfo bucket; + DBOpObjectInfo obj; + DBOpObjectDataInfo obj_data; + DBOpLCHeadInfo lc_head; + DBOpLCEntryInfo lc_entry; + uint64_t list_max_count; +}; + +struct DBOpParams { + CephContext *cct; + + /* Tables */ + std::string user_table; + std::string bucket_table; + std::string object_table; + + /* Ops*/ + DBOpInfo op; + + std::string objectdata_table; + std::string object_trigger; + std::string object_view; + std::string quota_table; + std::string lc_head_table; + std::string lc_entry_table; + std::string obj; +}; + +/* Used for prepared schemas. + * Difference with above structure is that all + * the fields are strings here to accommodate any + * style identifiers used by backend db. By default + * initialized with sqlitedb style, can be overriden + * using InitPrepareParams() + * + * These identifiers are used in prepare and bind statements + * to get the right index of each param. + */ +struct DBOpUserPrepareInfo { + static constexpr const char* user_id = ":user_id"; + static constexpr const char* tenant = ":tenant"; + static constexpr const char* ns = ":ns"; + static constexpr const char* display_name = ":display_name"; + static constexpr const char* user_email = ":user_email"; + /* Support only single access_key for now. So store + * it separately as primary access_key_id & secret to + * be able to query easily. + * + * In future, when need to support & query from multiple + * access keys, better to maintain them in a separate table. 
+ */ + static constexpr const char* access_keys_id = ":access_keys_id"; + static constexpr const char* access_keys_secret = ":access_keys_secret"; + static constexpr const char* access_keys = ":access_keys"; + static constexpr const char* swift_keys = ":swift_keys"; + static constexpr const char* subusers = ":subusers"; + static constexpr const char* suspended = ":suspended"; + static constexpr const char* max_buckets = ":max_buckets"; + static constexpr const char* op_mask = ":op_mask"; + static constexpr const char* user_caps = ":user_caps"; + static constexpr const char* admin = ":admin"; + static constexpr const char* system = ":system"; + static constexpr const char* placement_name = ":placement_name"; + static constexpr const char* placement_storage_class = ":placement_storage_class"; + static constexpr const char* placement_tags = ":placement_tags"; + static constexpr const char* bucket_quota = ":bucket_quota"; + static constexpr const char* temp_url_keys = ":temp_url_keys"; + static constexpr const char* user_quota = ":user_quota"; + static constexpr const char* type = ":type"; + static constexpr const char* mfa_ids = ":mfa_ids"; + static constexpr const char* user_attrs = ":user_attrs"; + static constexpr const char* user_ver = ":user_vers"; + static constexpr const char* user_ver_tag = ":user_ver_tag"; +}; + +struct DBOpBucketPrepareInfo { + static constexpr const char* bucket_name = ":bucket_name"; + static constexpr const char* tenant = ":tenant"; + static constexpr const char* marker = ":marker"; + static constexpr const char* bucket_id = ":bucket_id"; + static constexpr const char* size = ":size"; + static constexpr const char* size_rounded = ":size_rounded"; + static constexpr const char* creation_time = ":creation_time"; + static constexpr const char* count = ":count"; + static constexpr const char* placement_name = ":placement_name"; + static constexpr const char* placement_storage_class = ":placement_storage_class"; + /* ownerid - maps to 
DBOpUserPrepareInfo */ + static constexpr const char* flags = ":flags"; + static constexpr const char* zonegroup = ":zonegroup"; + static constexpr const char* has_instance_obj = ":has_instance_obj"; + static constexpr const char* quota = ":quota"; + static constexpr const char* requester_pays = ":requester_pays"; + static constexpr const char* has_website = ":has_website"; + static constexpr const char* website_conf = ":website_conf"; + static constexpr const char* swift_versioning = ":swift_versioning"; + static constexpr const char* swift_ver_location = ":swift_ver_location"; + static constexpr const char* mdsearch_config = ":mdsearch_config"; + static constexpr const char* new_bucket_instance_id = ":new_bucket_instance_id"; + static constexpr const char* obj_lock = ":obj_lock"; + static constexpr const char* sync_policy_info_groups = ":sync_policy_info_groups"; + static constexpr const char* bucket_attrs = ":bucket_attrs"; + static constexpr const char* bucket_ver = ":bucket_vers"; + static constexpr const char* bucket_ver_tag = ":bucket_ver_tag"; + static constexpr const char* mtime = ":mtime"; + static constexpr const char* min_marker = ":min_marker"; + static constexpr const char* max_marker = ":max_marker"; +}; + +struct DBOpObjectPrepareInfo { + static constexpr const char* obj_name = ":obj_name"; + static constexpr const char* obj_instance = ":obj_instance"; + static constexpr const char* obj_ns = ":obj_ns"; + static constexpr const char* acls = ":acls"; + static constexpr const char* index_ver = ":index_ver"; + static constexpr const char* tag = ":tag"; + static constexpr const char* flags = ":flags"; + static constexpr const char* versioned_epoch = ":versioned_epoch"; + static constexpr const char* obj_category = ":obj_category"; + static constexpr const char* etag = ":etag"; + static constexpr const char* owner = ":owner"; + static constexpr const char* owner_display_name = ":owner_display_name"; + static constexpr const char* storage_class = 
":storage_class"; + static constexpr const char* appendable = ":appendable"; + static constexpr const char* content_type = ":content_type"; + static constexpr const char* index_hash_source = ":index_hash_source"; + static constexpr const char* obj_size = ":obj_size"; + static constexpr const char* accounted_size = ":accounted_size"; + static constexpr const char* mtime = ":mtime"; + static constexpr const char* epoch = ":epoch"; + static constexpr const char* obj_tag = ":obj_tag"; + static constexpr const char* tail_tag = ":tail_tag"; + static constexpr const char* write_tag = ":write_tag"; + static constexpr const char* fake_tag = ":fake_tag"; + static constexpr const char* shadow_obj = ":shadow_obj"; + static constexpr const char* has_data = ":has_data"; + static constexpr const char* is_versioned = ":is_versioned"; + static constexpr const char* version_num = ":version_num"; + static constexpr const char* pg_ver = ":pg_ver"; + static constexpr const char* zone_short_id = ":zone_short_id"; + static constexpr const char* obj_version = ":obj_version"; + static constexpr const char* obj_version_tag = ":obj_version_tag"; + static constexpr const char* obj_attrs = ":obj_attrs"; + static constexpr const char* head_size = ":head_size"; + static constexpr const char* max_head_size = ":max_head_size"; + static constexpr const char* obj_id = ":obj_id"; + static constexpr const char* tail_instance = ":tail_instance"; + static constexpr const char* head_placement_rule_name = ":head_placement_rule_name"; + static constexpr const char* head_placement_storage_class = ":head_placement_storage_class"; + static constexpr const char* tail_placement_rule_name = ":tail_placement_rule_name"; + static constexpr const char* tail_placement_storage_class = ":tail_placement_storage_class"; + static constexpr const char* manifest_part_objs = ":manifest_part_objs"; + static constexpr const char* manifest_part_rules = ":manifest_part_rules"; + static constexpr const char* omap = ":omap"; + 
static constexpr const char* is_multipart = ":is_multipart"; + static constexpr const char* mp_parts = ":mp_parts"; + static constexpr const char* head_data = ":head_data"; + static constexpr const char* min_marker = ":min_marker"; + static constexpr const char* max_marker = ":max_marker"; + static constexpr const char* prefix = ":prefix"; + /* Below used to update mp_parts obj name + * from meta object to src object on completion */ + static constexpr const char* new_obj_name = ":new_obj_name"; + static constexpr const char* new_obj_instance = ":new_obj_instance"; + static constexpr const char* new_obj_ns = ":new_obj_ns"; +}; + +struct DBOpObjectDataPrepareInfo { + static constexpr const char* part_num = ":part_num"; + static constexpr const char* offset = ":offset"; + static constexpr const char* data = ":data"; + static constexpr const char* size = ":size"; + static constexpr const char* multipart_part_str = ":multipart_part_str"; +}; + +struct DBOpLCEntryPrepareInfo { + static constexpr const char* index = ":index"; + static constexpr const char* bucket_name = ":bucket_name"; + static constexpr const char* start_time = ":start_time"; + static constexpr const char* status = ":status"; + static constexpr const char* min_marker = ":min_marker"; +}; + +struct DBOpLCHeadPrepareInfo { + static constexpr const char* index = ":index"; + static constexpr const char* start_date = ":start_date"; + static constexpr const char* marker = ":marker"; +}; + +struct DBOpPrepareInfo { + DBOpUserPrepareInfo user; + std::string_view query_str; // view into DBOpInfo::query_str + DBOpBucketPrepareInfo bucket; + DBOpObjectPrepareInfo obj; + DBOpObjectDataPrepareInfo obj_data; + DBOpLCHeadPrepareInfo lc_head; + DBOpLCEntryPrepareInfo lc_entry; + static constexpr const char* list_max_count = ":list_max_count"; +}; + +struct DBOpPrepareParams { + /* Tables */ + std::string user_table; + std::string bucket_table; + std::string object_table; + + /* Ops */ + DBOpPrepareInfo op; + + + 
std::string objectdata_table; + std::string object_trigger; + std::string object_view; + std::string quota_table; + std::string lc_head_table; + std::string lc_entry_table; +}; + +struct DBOps { + std::shared_ptr InsertUser; + std::shared_ptr RemoveUser; + std::shared_ptr GetUser; + std::shared_ptr InsertBucket; + std::shared_ptr UpdateBucket; + std::shared_ptr RemoveBucket; + std::shared_ptr GetBucket; + std::shared_ptr ListUserBuckets; + std::shared_ptr InsertLCEntry; + std::shared_ptr RemoveLCEntry; + std::shared_ptr GetLCEntry; + std::shared_ptr ListLCEntries; + std::shared_ptr InsertLCHead; + std::shared_ptr RemoveLCHead; + std::shared_ptr GetLCHead; +}; + +class ObjectOp { + public: + ObjectOp() {}; + + virtual ~ObjectOp() {} + + std::shared_ptr PutObject; + std::shared_ptr DeleteObject; + std::shared_ptr GetObject; + std::shared_ptr UpdateObject; + std::shared_ptr ListBucketObjects; + std::shared_ptr ListVersionedObjects; + std::shared_ptr PutObjectData; + std::shared_ptr UpdateObjectData; + std::shared_ptr GetObjectData; + std::shared_ptr DeleteObjectData; + std::shared_ptr DeleteStaleObjectData; + + virtual int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp) { return 0; } +}; + +class DBOp { + private: + static constexpr std::string_view CreateUserTableQ = + /* Corresponds to rgw::sal::User + * + * For now only UserID is made Primary key. + * If multiple tenants are stored in single .db handle, should + * make both (UserID, Tenant) as Primary Key. + * + * XXX: + * - AccessKeys, SwiftKeys, Subusers (map<>) are stored as blob. + * To enable easy query, first accesskey is stored in separate fields + * AccessKeysID, AccessKeysSecret. + * In future, may be have separate table to store these keys and + * query on that table. + * - Quota stored as blob .. should be linked to quota table. 
+ */ + "CREATE TABLE IF NOT EXISTS '{}' ( \ + UserID TEXT NOT NULL UNIQUE, \ + Tenant TEXT , \ + NS TEXT , \ + DisplayName TEXT , \ + UserEmail TEXT , \ + AccessKeysID TEXT , \ + AccessKeysSecret TEXT , \ + AccessKeys BLOB , \ + SwiftKeys BLOB , \ + SubUsers BLOB , \ + Suspended INTEGER , \ + MaxBuckets INTEGER , \ + OpMask INTEGER , \ + UserCaps BLOB , \ + Admin INTEGER , \ + System INTEGER , \ + PlacementName TEXT , \ + PlacementStorageClass TEXT , \ + PlacementTags BLOB , \ + BucketQuota BLOB , \ + TempURLKeys BLOB , \ + UserQuota BLOB , \ + TYPE INTEGER , \ + MfaIDs BLOB , \ + AssumedRoleARN TEXT , \ + UserAttrs BLOB, \ + UserVersion INTEGER, \ + UserVersionTag TEXT, \ + PRIMARY KEY (UserID) \n);"; + + static constexpr std::string_view CreateBucketTableQ = + /* Corresponds to rgw::sal::Bucket + * + * For now only BucketName is made Primary key. Since buckets should + * be unique across users in rgw, OwnerID is not made part of primary key. + * However it is still referenced as foreign key + * + * If multiple tenants are stored in single .db handle, should + * make both (BucketName, Tenant) as Primary Key. Also should + * reference (UserID, Tenant) as Foreign key. + * + * leaving below RADOS specific fields + * - rgw_data_placement_target explicit_placement (struct rgw_bucket) + * - rgw::BucketLayout layout (struct RGWBucketInfo) + * - const static uint32_t NUM_SHARDS_BLIND_BUCKET (struct RGWBucketInfo), + * should be '0' indicating no sharding. + * - cls_rgw_reshard_status reshard_status (struct RGWBucketInfo) + * + * XXX: + * - Quota stored as blob .. should be linked to quota table. + * - WebsiteConf stored as BLOB..if required, should be split + * - Storing bucket_version (struct RGWBucket), objv_tracker + * (struct RGWBucketInfo) separately. Are they same? 
+ * + */ + "CREATE TABLE IF NOT EXISTS '{}' ( \ + BucketName TEXT NOT NULL UNIQUE , \ + Tenant TEXT, \ + Marker TEXT, \ + BucketID TEXT, \ + Size INTEGER, \ + SizeRounded INTEGER,\ + CreationTime BLOB, \ + Count INTEGER, \ + PlacementName TEXT , \ + PlacementStorageClass TEXT , \ + OwnerID TEXT NOT NULL, \ + Flags INTEGER, \ + Zonegroup TEXT, \ + HasInstanceObj BOOLEAN, \ + Quota BLOB, \ + RequesterPays BOOLEAN, \ + HasWebsite BOOLEAN, \ + WebsiteConf BLOB, \ + SwiftVersioning BOOLEAN, \ + SwiftVerLocation TEXT, \ + MdsearchConfig BLOB, \ + NewBucketInstanceID TEXT,\ + ObjectLock BLOB, \ + SyncPolicyInfoGroups BLOB, \ + BucketAttrs BLOB, \ + BucketVersion INTEGER, \ + BucketVersionTag TEXT, \ + Mtime BLOB, \ + PRIMARY KEY (BucketName) \ + FOREIGN KEY (OwnerID) \ + REFERENCES '{}' (UserID) ON DELETE CASCADE ON UPDATE CASCADE \n);"; + + static constexpr std::string_view CreateObjectTableTriggerQ = + "CREATE TRIGGER IF NOT EXISTS '{}' \ + AFTER INSERT ON '{}' \ + BEGIN \ + UPDATE '{}' \ + SET VersionNum = (SELECT COALESCE(max(VersionNum), 0) from '{}' where ObjName = new.ObjName) + 1 \ + where ObjName = new.ObjName and ObjInstance = new.ObjInstance; \ + END;"; + + static constexpr std::string_view CreateObjectTableQ = + /* Corresponds to rgw::sal::Object + * + * For now only BucketName, ObjName is made Primary key. + * If multiple tenants are stored in single .db handle, should + * include Tenant too in the Primary Key. Also should + * reference (BucketID, Tenant) as Foreign key. + * + * referring to + * - rgw_bucket_dir_entry - following are added for now + * flags, + * versioned_epoch + * tag + * index_ver + * meta.category + * meta.etag + * meta.storageclass + * meta.appendable + * meta.content_type + * meta.owner + * meta.owner_display_name + * + * - RGWObjState. 
Below are omitted from that struct + * as they seem in-memory variables + * * is_atomic, has_atts, exists, prefetch_data, keep_tail, + * - RGWObjManifest + * + * Extra field added "IsMultipart" to flag multipart uploads, + * HeadData to store first chunk data. + */ + "CREATE TABLE IF NOT EXISTS '{}' ( \ + ObjName TEXT NOT NULL , \ + ObjInstance TEXT, \ + ObjNS TEXT, \ + BucketName TEXT NOT NULL , \ + ACLs BLOB, \ + IndexVer INTEGER, \ + Tag TEXT, \ + Flags INTEGER, \ + VersionedEpoch INTEGER, \ + ObjCategory INTEGER, \ + Etag TEXT, \ + Owner TEXT, \ + OwnerDisplayName TEXT, \ + StorageClass TEXT, \ + Appendable BOOL, \ + ContentType TEXT, \ + IndexHashSource TEXT, \ + ObjSize INTEGER, \ + AccountedSize INTEGER, \ + Mtime BLOB, \ + Epoch INTEGER, \ + ObjTag BLOB, \ + TailTag BLOB, \ + WriteTag TEXT, \ + FakeTag BOOL, \ + ShadowObj TEXT, \ + HasData BOOL, \ + IsVersioned BOOL, \ + VersionNum INTEGER, \ + PGVer INTEGER, \ + ZoneShortID INTEGER, \ + ObjVersion INTEGER, \ + ObjVersionTag TEXT, \ + ObjAttrs BLOB, \ + HeadSize INTEGER, \ + MaxHeadSize INTEGER, \ + ObjID TEXT NOT NULL, \ + TailInstance TEXT, \ + HeadPlacementRuleName TEXT, \ + HeadPlacementRuleStorageClass TEXT, \ + TailPlacementRuleName TEXT, \ + TailPlacementStorageClass TEXT, \ + ManifestPartObjs BLOB, \ + ManifestPartRules BLOB, \ + Omap BLOB, \ + IsMultipart BOOL, \ + MPPartsList BLOB, \ + HeadData BLOB, \ + PRIMARY KEY (ObjName, ObjInstance, BucketName), \ + FOREIGN KEY (BucketName) \ + REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);"; + + static constexpr std::string_view CreateObjectDataTableQ = + /* Extra field 'MultipartPartStr' added which signifies multipart + * . For regular object, it is '0.0' + * + * - part: a collection of stripes that make a contiguous part of an + object. A regular object will only have one part (although might have + many stripes), a multipart object might have many parts. 
Each part + has a fixed stripe size (ObjChunkSize), although the last stripe of a + part might be smaller than that. + */ + "CREATE TABLE IF NOT EXISTS '{}' ( \ + ObjName TEXT NOT NULL , \ + ObjInstance TEXT, \ + ObjNS TEXT, \ + BucketName TEXT NOT NULL , \ + ObjID TEXT NOT NULL , \ + MultipartPartStr TEXT, \ + PartNum INTEGER NOT NULL, \ + Offset INTEGER, \ + Size INTEGER, \ + Mtime BLOB, \ + Data BLOB, \ + PRIMARY KEY (ObjName, BucketName, ObjInstance, ObjID, MultipartPartStr, PartNum), \ + FOREIGN KEY (BucketName) \ + REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);"; + + static constexpr std::string_view CreateObjectViewQ = + /* This query creats temporary view with entries from ObjectData table which have + * corresponding head object (i.e, with same ObjName, ObjInstance, ObjNS, ObjID) + * in the Object table. + * + * GC thread can use this view to delete stale entries from the ObjectData table which + * do not exist in this view. + * + * XXX: This view is throwing ForeignKey mismatch error, mostly may be because all the keys + * of objectdata table are not referenced here. So this view is not used atm. 
+ */ + "CREATE TEMP VIEW IF NOT EXISTS '{}' AS \ + SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING \ + (ObjName, BucketName, ObjInstance, ObjID);"; + + + static constexpr std::string_view CreateQuotaTableQ = + "CREATE TABLE IF NOT EXISTS '{}' ( \ + QuotaID INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE , \ + MaxSizeSoftThreshold INTEGER , \ + MaxObjsSoftThreshold INTEGER , \ + MaxSize INTEGER , \ + MaxObjects INTEGER , \ + Enabled Boolean , \ + CheckOnRaw Boolean \n);"; + + static constexpr std::string_view CreateLCEntryTableQ = + "CREATE TABLE IF NOT EXISTS '{}' ( \ + LCIndex TEXT NOT NULL , \ + BucketName TEXT NOT NULL , \ + StartTime INTEGER , \ + Status INTEGER , \ + PRIMARY KEY (LCIndex, BucketName) \n);"; + + static constexpr std::string_view CreateLCHeadTableQ = + "CREATE TABLE IF NOT EXISTS '{}' ( \ + LCIndex TEXT NOT NULL , \ + Marker TEXT , \ + StartDate INTEGER , \ + PRIMARY KEY (LCIndex) \n);"; + + static constexpr std::string_view DropQ = "DROP TABLE IF EXISTS '{}'"; + static constexpr std::string_view ListAllQ = "SELECT * from '{}'"; + + public: + DBOp() {} + virtual ~DBOp() {} + std::mutex mtx; // to protect prepared stmt + + /* Build the CREATE statement for the schema named by type, + * substituting the table/trigger/view names held in params. + * "ObjectView" formats CreateObjectViewQ, whose three placeholders are + * the view name, the objectdata table and the object table. + * Calls ceph_abort_msgf() when type matches none of the known names. */ + static std::string CreateTableSchema(std::string_view type, + const DBOpParams *params) { + if (!type.compare("User")) + return fmt::format(CreateUserTableQ, + params->user_table); + if (!type.compare("Bucket")) + return fmt::format(CreateBucketTableQ, + params->bucket_table, + params->user_table); + if (!type.compare("Object")) + return fmt::format(CreateObjectTableQ, + params->object_table, + params->bucket_table); + if (!type.compare("ObjectTrigger")) + return fmt::format(CreateObjectTableTriggerQ, + params->object_trigger, + params->object_table, + params->object_table, + params->object_table); + if (!type.compare("ObjectData")) + return fmt::format(CreateObjectDataTableQ, + params->objectdata_table, + params->bucket_table); + if (!type.compare("ObjectView")) + return fmt::format(CreateObjectViewQ, + 
params->object_view, + params->objectdata_table, + params->object_table); + if (!type.compare("Quota")) + return fmt::format(CreateQuotaTableQ, + params->quota_table); + if (!type.compare("LCHead")) + return fmt::format(CreateLCHeadTableQ, + params->lc_head_table); + if (!type.compare("LCEntry")) + return fmt::format(CreateLCEntryTableQ, + params->lc_entry_table); + + /* %.*s takes its precision argument as int, not size_t */ + ceph_abort_msgf("incorrect table type %.*s", static_cast<int>(type.size()), type.data()); + } + + static std::string DeleteTableSchema(std::string_view table) { + return fmt::format(DropQ, table); + } + static std::string ListTableSchema(std::string_view table) { + return fmt::format(ListAllQ, table); + } + + virtual int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; } + virtual int Bind(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; } + virtual int Execute(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; } +}; + +class InsertUserOp : virtual public DBOp { + private: + /* For existing entries, - + * (1) INSERT or REPLACE - it will delete previous entry and then + * inserts new one. Since it deletes previous entries, it will + * trigger all foreign key cascade deletes or other triggers. + * (2) INSERT or UPDATE - this will set NULL values to unassigned + * fields. + * more info: https://code-examples.net/en/q/377728 + * + * For now using INSERT or REPLACE. If required of updating existing + * record, will use another query. 
+ */ + static constexpr std::string_view Query = "INSERT OR REPLACE INTO '{}' \ + (UserID, Tenant, NS, DisplayName, UserEmail, \ + AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\ + SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \ + System, PlacementName, PlacementStorageClass, PlacementTags, \ + BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, \ + UserAttrs, UserVersion, UserVersionTag) \ + VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \ + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});"; + + public: + virtual ~InsertUserOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.user_table, + params.op.user.user_id, params.op.user.tenant, params.op.user.ns, + params.op.user.display_name, params.op.user.user_email, + params.op.user.access_keys_id, params.op.user.access_keys_secret, + params.op.user.access_keys, params.op.user.swift_keys, + params.op.user.subusers, params.op.user.suspended, + params.op.user.max_buckets, params.op.user.op_mask, + params.op.user.user_caps, params.op.user.admin, params.op.user.system, + params.op.user.placement_name, params.op.user.placement_storage_class, + params.op.user.placement_tags, params.op.user.bucket_quota, + params.op.user.temp_url_keys, params.op.user.user_quota, + params.op.user.type, params.op.user.mfa_ids, + params.op.user.user_attrs, params.op.user.user_ver, + params.op.user.user_ver_tag); + } + +}; + +class RemoveUserOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' where UserID = {}"; + + public: + virtual ~RemoveUserOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.user_table, + params.op.user.user_id); + } +}; + +class GetUserOp: virtual public DBOp { + private: + /* If below query columns are updated, make sure to update the indexes + * in list_user() cbk in sqliteDB.cc */ + static constexpr std::string_view Query = "SELECT \ + UserID, 
Tenant, NS, DisplayName, UserEmail, \ + AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\ + SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \ + System, PlacementName, PlacementStorageClass, PlacementTags, \ + BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \ + UserAttrs, UserVersion, UserVersionTag from '{}' where UserID = {}"; + + static constexpr std::string_view QueryByEmail = "SELECT \ + UserID, Tenant, NS, DisplayName, UserEmail, \ + AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\ + SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \ + System, PlacementName, PlacementStorageClass, PlacementTags, \ + BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \ + UserAttrs, UserVersion, UserVersionTag from '{}' where UserEmail = {}"; + + static constexpr std::string_view QueryByAccessKeys = "SELECT \ + UserID, Tenant, NS, DisplayName, UserEmail, \ + AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\ + SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \ + System, PlacementName, PlacementStorageClass, PlacementTags, \ + BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \ + UserAttrs, UserVersion, UserVersionTag from '{}' where AccessKeysID = {}"; + + static constexpr std::string_view QueryByUserID = "SELECT \ + UserID, Tenant, NS, DisplayName, UserEmail, \ + AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\ + SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \ + System, PlacementName, PlacementStorageClass, PlacementTags, \ + BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \ + UserAttrs, UserVersion, UserVersionTag \ + from '{}' where UserID = {}"; + + public: + virtual ~GetUserOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + if (params.op.query_str == "email") { + return fmt::format(QueryByEmail, params.user_table, + params.op.user.user_email); + } else if (params.op.query_str == "access_key") { + return 
fmt::format(QueryByAccessKeys, + params.user_table, + params.op.user.access_keys_id); + } else if (params.op.query_str == "user_id") { + return fmt::format(QueryByUserID, + params.user_table, + params.op.user.user_id); + } else { + return fmt::format(Query, params.user_table, + params.op.user.user_id); + } + } +}; + +class InsertBucketOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "INSERT OR REPLACE INTO '{}' \ + (BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \ + Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \ + HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \ + SwiftVersioning, SwiftVerLocation, \ + MdsearchConfig, NewBucketInstanceID, ObjectLock, \ + SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime) \ + VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, \ + {}, {}, {}, {}, {}, {}, {}, {}, {}, \ + {}, {}, {}, {}, {}, {}, {}, {}, {}, {})"; + + public: + virtual ~InsertBucketOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.bucket_table, + params.op.bucket.bucket_name, params.op.bucket.tenant, + params.op.bucket.marker, params.op.bucket.bucket_id, + params.op.bucket.size, params.op.bucket.size_rounded, + params.op.bucket.creation_time, params.op.bucket.count, + params.op.bucket.placement_name, params.op.bucket.placement_storage_class, + params.op.user.user_id, + params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj, + params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website, + params.op.bucket.website_conf, params.op.bucket.swift_versioning, + params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config, + params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock, + params.op.bucket.sync_policy_info_groups, params.op.bucket.bucket_attrs, + params.op.bucket.bucket_ver, params.op.bucket.bucket_ver_tag, + 
params.op.bucket.mtime); + } +}; + +class UpdateBucketOp: virtual public DBOp { + private: + // Updates Info, Mtime, Version + static constexpr std::string_view InfoQuery = + "UPDATE '{}' SET Tenant = {}, Marker = {}, BucketID = {}, CreationTime = {}, \ + Count = {}, PlacementName = {}, PlacementStorageClass = {}, OwnerID = {}, Flags = {}, \ + Zonegroup = {}, HasInstanceObj = {}, Quota = {}, RequesterPays = {}, HasWebsite = {}, \ + WebsiteConf = {}, SwiftVersioning = {}, SwiftVerLocation = {}, MdsearchConfig = {}, \ + NewBucketInstanceID = {}, ObjectLock = {}, SyncPolicyInfoGroups = {}, \ + BucketVersion = {}, Mtime = {} WHERE BucketName = {}"; + // Updates Attrs, OwnerID, Mtime, Version + static constexpr std::string_view AttrsQuery = + "UPDATE '{}' SET OwnerID = {}, BucketAttrs = {}, Mtime = {}, BucketVersion = {} \ + WHERE BucketName = {}"; + // Updates OwnerID, CreationTime, Mtime, Version + static constexpr std::string_view OwnerQuery = + "UPDATE '{}' SET OwnerID = {}, CreationTime = {}, Mtime = {}, BucketVersion = {} WHERE BucketName = {}"; + + public: + virtual ~UpdateBucketOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + if (params.op.query_str == "info") { + return fmt::format(InfoQuery, params.bucket_table, + params.op.bucket.tenant, params.op.bucket.marker, params.op.bucket.bucket_id, + params.op.bucket.creation_time, params.op.bucket.count, + params.op.bucket.placement_name, params.op.bucket.placement_storage_class, + params.op.user.user_id, + params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj, + params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website, + params.op.bucket.website_conf, params.op.bucket.swift_versioning, + params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config, + params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock, + params.op.bucket.sync_policy_info_groups, + params.op.bucket.bucket_ver, params.op.bucket.mtime, + 
params.op.bucket.bucket_name); + } + if (params.op.query_str == "attrs") { + return fmt::format(AttrsQuery, params.bucket_table, + params.op.user.user_id, params.op.bucket.bucket_attrs, + params.op.bucket.mtime, + params.op.bucket.bucket_ver, params.op.bucket.bucket_name); + } + if (params.op.query_str == "owner") { + return fmt::format(OwnerQuery, params.bucket_table, + params.op.user.user_id, params.op.bucket.creation_time, + params.op.bucket.mtime, + params.op.bucket.bucket_ver, params.op.bucket.bucket_name); + } + return ""; + } +}; + +class RemoveBucketOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' where BucketName = {}"; + + public: + virtual ~RemoveBucketOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.bucket_table, + params.op.bucket.bucket_name); + } +}; + +class GetBucketOp: virtual public DBOp { + private: + static constexpr std::string_view Query = "SELECT \ + BucketName, BucketTable.Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \ + Count, BucketTable.PlacementName, BucketTable.PlacementStorageClass, OwnerID, Flags, Zonegroup, \ + HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \ + SwiftVersioning, SwiftVerLocation, \ + MdsearchConfig, NewBucketInstanceID, ObjectLock, \ + SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime, NS \ + from '{}' as BucketTable INNER JOIN '{}' ON OwnerID = UserID where BucketName = {}"; + + public: + virtual ~GetBucketOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + //return fmt::format(Query, params.op.bucket.bucket_name, + // params.bucket_table, params.user_table); + return fmt::format(Query, + params.bucket_table, params.user_table, + params.op.bucket.bucket_name); + } +}; + +class ListUserBucketsOp: virtual public DBOp { + private: + // once we have stats also stored, may have to update this query to join + // these two tables. 
+ static constexpr std::string_view Query = "SELECT \ + BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \ + Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \ + HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \ + SwiftVersioning, SwiftVerLocation, \ + MdsearchConfig, NewBucketInstanceID, ObjectLock, \ + SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \ + FROM '{}' WHERE OwnerID = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}"; + + /* BucketNames are unique across users. Hence userid/OwnerID is not used as + * marker or for ordering here in the below query + */ + static constexpr std::string_view AllQuery = "SELECT \ + BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \ + Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \ + HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \ + SwiftVersioning, SwiftVerLocation, \ + MdsearchConfig, NewBucketInstanceID, ObjectLock, \ + SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \ + FROM '{}' WHERE BucketName > {} ORDER BY BucketName ASC LIMIT {}"; + + public: + virtual ~ListUserBucketsOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + if (params.op.query_str == "all") { + return fmt::format(AllQuery, params.bucket_table, + params.op.bucket.min_marker, + params.op.list_max_count); + } else { + return fmt::format(Query, params.bucket_table, + params.op.user.user_id, params.op.bucket.min_marker, + params.op.list_max_count); + } + } +}; + +class PutObjectOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "INSERT OR REPLACE INTO '{}' \ + (ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \ + Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \ + StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \ + AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \ + 
ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \ + ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \ + ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \ + TailPlacementRuleName, TailPlacementStorageClass, \ + ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \ + HeadData) \ + VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \ + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \ + {}, {}, {}, \ + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})"; + + public: + virtual ~PutObjectOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + return fmt::format(Query, + params.object_table, params.op.obj.obj_name, + params.op.obj.obj_instance, params.op.obj.obj_ns, + params.op.bucket.bucket_name, params.op.obj.acls, params.op.obj.index_ver, + params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch, + params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner, + params.op.obj.owner_display_name, params.op.obj.storage_class, + params.op.obj.appendable, params.op.obj.content_type, + params.op.obj.index_hash_source, params.op.obj.obj_size, + params.op.obj.accounted_size, params.op.obj.mtime, + params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag, + params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj, + params.op.obj.has_data, params.op.obj.is_versioned, + params.op.obj.version_num, + params.op.obj.pg_ver, params.op.obj.zone_short_id, + params.op.obj.obj_version, params.op.obj.obj_version_tag, + params.op.obj.obj_attrs, params.op.obj.head_size, + params.op.obj.max_head_size, params.op.obj.obj_id, + params.op.obj.tail_instance, + params.op.obj.head_placement_rule_name, + params.op.obj.head_placement_storage_class, + params.op.obj.tail_placement_rule_name, + params.op.obj.tail_placement_storage_class, + params.op.obj.manifest_part_objs, + params.op.obj.manifest_part_rules, params.op.obj.omap, + 
params.op.obj.is_multipart, params.op.obj.mp_parts, + params.op.obj.head_data); + } +}; + +class DeleteObjectOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {}"; + + public: + virtual ~DeleteObjectOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.object_table, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance); + } +}; + +class GetObjectOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "SELECT \ + ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \ + Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \ + StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \ + AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \ + ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \ + ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \ + ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \ + TailPlacementRuleName, TailPlacementStorageClass, \ + ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \ + HeadData from '{}' \ + where BucketName = {} and ObjName = {} and ObjInstance = {}"; + + public: + virtual ~GetObjectOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, + params.object_table, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance); + } +}; + +class ListBucketObjectsOp: virtual public DBOp { + private: + // once we have stats also stored, may have to update this query to join + // these two tables. 
+ static constexpr std::string_view Query = + "SELECT \ + ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \ + Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \ + StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \ + AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \ + ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \ + ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \ + ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \ + TailPlacementRuleName, TailPlacementStorageClass, \ + ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, HeadData from '{}' \ + where BucketName = {} and ObjName >= {} and ObjName LIKE {} ORDER BY ObjName ASC, VersionNum DESC LIMIT {}"; + public: + virtual ~ListBucketObjectsOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + /* XXX: Include obj_id, delim */ + return fmt::format(Query, + params.object_table, + params.op.bucket.bucket_name, + params.op.obj.min_marker, + params.op.obj.prefix, + params.op.list_max_count); + } +}; + +#define MAX_VERSIONED_OBJECTS 20 +class ListVersionedObjectsOp: virtual public DBOp { + private: + // once we have stats also stored, may have to update this query to join + // these two tables. 
+ static constexpr std::string_view Query = + "SELECT \ + ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \ + Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \ + StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \ + AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \ + ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \ + ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \ + ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \ + TailPlacementRuleName, TailPlacementStorageClass, \ + ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \ + HeadData from '{}' \ + where BucketName = {} and ObjName = {} ORDER BY VersionNum DESC LIMIT {}"; + public: + virtual ~ListVersionedObjectsOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + /* XXX: Include obj_id, delim */ + return fmt::format(Query, + params.object_table, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.list_max_count); + } +}; + +class UpdateObjectOp: virtual public DBOp { + private: + // Updates Omap + static constexpr std::string_view OmapQuery = + "UPDATE '{}' SET Omap = {}, Mtime = {} \ + where BucketName = {} and ObjName = {} and ObjInstance = {}"; + static constexpr std::string_view AttrsQuery = + "UPDATE '{}' SET ObjAttrs = {}, Mtime = {} \ + where BucketName = {} and ObjName = {} and ObjInstance = {}"; + static constexpr std::string_view MPQuery = + "UPDATE '{}' SET MPPartsList = {}, Mtime = {} \ + where BucketName = {} and ObjName = {} and ObjInstance = {}"; + static constexpr std::string_view MetaQuery = + "UPDATE '{}' SET \ + ObjNS = {}, ACLs = {}, IndexVer = {}, Tag = {}, Flags = {}, VersionedEpoch = {}, \ + ObjCategory = {}, Etag = {}, Owner = {}, OwnerDisplayName = {}, \ + StorageClass = {}, Appendable = {}, ContentType = {}, \ + IndexHashSource = {}, ObjSize = {}, AccountedSize = {}, Mtime = {}, \ + Epoch = {}, ObjTag = {}, TailTag = {}, 
WriteTag = {}, FakeTag = {}, \ + ShadowObj = {}, HasData = {}, IsVersioned = {}, VersionNum = {}, PGVer = {}, \ + ZoneShortID = {}, ObjVersion = {}, ObjVersionTag = {}, ObjAttrs = {}, \ + HeadSize = {}, MaxHeadSize = {}, ObjID = {}, TailInstance = {}, \ + HeadPlacementRuleName = {}, HeadPlacementRuleStorageClass = {}, \ + TailPlacementRuleName = {}, TailPlacementStorageClass = {}, \ + ManifestPartObjs = {}, ManifestPartRules = {}, Omap = {}, \ + IsMultipart = {}, MPPartsList = {}, HeadData = {} \ + WHERE ObjName = {} and ObjInstance = {} and BucketName = {}"; + + public: + virtual ~UpdateObjectOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + if (params.op.query_str == "omap") { + return fmt::format(OmapQuery, + params.object_table, params.op.obj.omap, + params.op.obj.mtime, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance); + } + if (params.op.query_str == "attrs") { + return fmt::format(AttrsQuery, + params.object_table, params.op.obj.obj_attrs, + params.op.obj.mtime, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance); + } + if (params.op.query_str == "mp") { + return fmt::format(MPQuery, + params.object_table, params.op.obj.mp_parts, + params.op.obj.mtime, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance); + } + if (params.op.query_str == "meta") { + return fmt::format(MetaQuery, + params.object_table, + params.op.obj.obj_ns, params.op.obj.acls, params.op.obj.index_ver, + params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch, + params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner, + params.op.obj.owner_display_name, params.op.obj.storage_class, + params.op.obj.appendable, params.op.obj.content_type, + params.op.obj.index_hash_source, params.op.obj.obj_size, + params.op.obj.accounted_size, params.op.obj.mtime, + params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag, + params.op.obj.write_tag, 
params.op.obj.fake_tag, params.op.obj.shadow_obj, + params.op.obj.has_data, params.op.obj.is_versioned, params.op.obj.version_num, + params.op.obj.pg_ver, params.op.obj.zone_short_id, + params.op.obj.obj_version, params.op.obj.obj_version_tag, + params.op.obj.obj_attrs, params.op.obj.head_size, + params.op.obj.max_head_size, params.op.obj.obj_id, + params.op.obj.tail_instance, + params.op.obj.head_placement_rule_name, + params.op.obj.head_placement_storage_class, + params.op.obj.tail_placement_rule_name, + params.op.obj.tail_placement_storage_class, + params.op.obj.manifest_part_objs, + params.op.obj.manifest_part_rules, params.op.obj.omap, + params.op.obj.is_multipart, params.op.obj.mp_parts, + params.op.obj.head_data, + params.op.obj.obj_name, params.op.obj.obj_instance, + params.op.bucket.bucket_name); + } + return ""; + } +}; + +class PutObjectDataOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "INSERT OR REPLACE INTO '{}' \ + (ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data) \ + VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})"; + + public: + virtual ~PutObjectDataOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, + params.objectdata_table, + params.op.obj.obj_name, params.op.obj.obj_instance, + params.op.obj.obj_ns, + params.op.bucket.bucket_name, + params.op.obj.obj_id, + params.op.obj_data.multipart_part_str, + params.op.obj_data.part_num, + params.op.obj_data.offset, + params.op.obj_data.size, + params.op.obj.mtime, + params.op.obj_data.data); + } +}; + +/* XXX: Recheck if this is really needed */ +class UpdateObjectDataOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "UPDATE '{}' \ + SET Mtime = {} WHERE ObjName = {} and ObjInstance = {} and \ + BucketName = {} and ObjID = {}"; + + public: + virtual ~UpdateObjectDataOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return 
fmt::format(Query, + params.objectdata_table, + params.op.obj.mtime, + params.op.obj.obj_name, params.op.obj.obj_instance, + params.op.bucket.bucket_name, + params.op.obj.obj_id); + } +}; + +class GetObjectDataOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "SELECT \ + ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data \ + from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {} ORDER BY MultipartPartStr, PartNum"; + + public: + virtual ~GetObjectDataOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, + params.objectdata_table, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance, + params.op.obj.obj_id); + } +}; + +class DeleteObjectDataOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {}"; + + public: + virtual ~DeleteObjectDataOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, + params.objectdata_table, + params.op.bucket.bucket_name, + params.op.obj.obj_name, + params.op.obj.obj_instance, + params.op.obj.obj_id); + } +}; + +class DeleteStaleObjectDataOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' WHERE (ObjName, ObjInstance, ObjID) NOT IN (SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING (ObjName, BucketName, ObjInstance, ObjID)) and Mtime < {}"; + + public: + virtual ~DeleteStaleObjectDataOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, + params.objectdata_table, + params.objectdata_table, + params.object_table, + params.op.obj.mtime); + } +}; + +class InsertLCEntryOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "INSERT OR REPLACE INTO '{}' \ + (LCIndex, 
BucketName, StartTime, Status) \ + VALUES ({}, {}, {}, {})"; + + public: + virtual ~InsertLCEntryOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + return fmt::format(Query, params.lc_entry_table, + params.op.lc_entry.index, params.op.lc_entry.bucket_name, + params.op.lc_entry.start_time, params.op.lc_entry.status); + } +}; + +class RemoveLCEntryOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' where LCIndex = {} and BucketName = {}"; + + public: + virtual ~RemoveLCEntryOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + return fmt::format(Query, params.lc_entry_table, + params.op.lc_entry.index, params.op.lc_entry.bucket_name); + } +}; + +class GetLCEntryOp: virtual public DBOp { + private: + static constexpr std::string_view Query = "SELECT \ + LCIndex, BucketName, StartTime, Status \ + from '{}' where LCIndex = {} and BucketName = {}"; + static constexpr std::string_view NextQuery = "SELECT \ + LCIndex, BucketName, StartTime, Status \ + from '{}' where LCIndex = {} and BucketName > {} ORDER BY BucketName ASC"; + + public: + virtual ~GetLCEntryOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + if (params.op.query_str == "get_next_entry") { + return fmt::format(NextQuery, params.lc_entry_table, + params.op.lc_entry.index, params.op.lc_entry.bucket_name); + } + // default + return fmt::format(Query, params.lc_entry_table, + params.op.lc_entry.index, params.op.lc_entry.bucket_name); + } +}; + +class ListLCEntriesOp: virtual public DBOp { + private: + static constexpr std::string_view Query = "SELECT \ + LCIndex, BucketName, StartTime, Status \ + FROM '{}' WHERE LCIndex = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}"; + + public: + virtual ~ListLCEntriesOp() {} + + static std::string Schema(DBOpPrepareParams &params) { + return fmt::format(Query, params.lc_entry_table, + params.op.lc_entry.index, params.op.lc_entry.min_marker, + params.op.list_max_count); + } +}; + 
+class InsertLCHeadOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "INSERT OR REPLACE INTO '{}' \ + (LCIndex, Marker, StartDate) \ + VALUES ({}, {}, {})"; + + public: + virtual ~InsertLCHeadOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.lc_head_table, + params.op.lc_head.index, params.op.lc_head.marker, + params.op.lc_head.start_date); + } +}; + +class RemoveLCHeadOp: virtual public DBOp { + private: + static constexpr std::string_view Query = + "DELETE from '{}' where LCIndex = {}"; + + public: + virtual ~RemoveLCHeadOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.lc_head_table, + params.op.lc_head.index); + } +}; + +class GetLCHeadOp: virtual public DBOp { + private: + static constexpr std::string_view Query = "SELECT \ + LCIndex, Marker, StartDate \ + from '{}' where LCIndex = {}"; + + public: + virtual ~GetLCHeadOp() {} + + static std::string Schema(DBOpPrepareParams ¶ms) { + return fmt::format(Query, params.lc_head_table, + params.op.lc_head.index); + } +}; + +/* taken from rgw_rados.h::RGWOLHInfo */ +struct DBOLHInfo { + rgw_obj target; + bool removed; + DBOLHInfo() : removed(false) {} + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(target, bl); + encode(removed, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(target, bl); + decode(removed, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(DBOLHInfo) + +class DB { + private: + const std::string db_name; + rgw::sal::Driver* driver; + const std::string user_table; + const std::string bucket_table; + const std::string quota_table; + const std::string lc_head_table; + const std::string lc_entry_table; + static std::map objectmap; + + protected: + void *db; + CephContext *cct; + const DoutPrefix dp; + uint64_t max_bucket_id = 0; + // XXX: default ObjStripeSize or ObjChunk size - 4M, make 
them configurable? + uint64_t ObjHeadSize = 1024; /* 1K - default head data size */ + uint64_t ObjChunkSize = (get_blob_limit() - 1000); /* 1000 to accommodate other fields */ /* NOTE(review): this initializer runs while the DB base sub-object is being constructed, so the virtual call resolves to DB::get_blob_limit(), which returns 0; the unsigned subtraction then wraps to a very large value. Confirm that backends reset ObjChunkSize after construction. */ + // Below mutex is to protect objectmap and other shared + // objects if any. + std::mutex mtx; + + public: + DB(std::string db_name, CephContext *_cct) : db_name(db_name), + user_table(db_name+"_user_table"), + bucket_table(db_name+"_bucket_table"), + quota_table(db_name+"_quota_table"), + lc_head_table(db_name+"_lc_head_table"), + lc_entry_table(db_name+"_lc_entry_table"), + cct(_cct), + dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ") + {} + /* DB() {}*/ + + DB(CephContext *_cct) : db_name("default_db"), + user_table(db_name+"_user_table"), + bucket_table(db_name+"_bucket_table"), + quota_table(db_name+"_quota_table"), + lc_head_table(db_name+"_lc_head_table"), + lc_entry_table(db_name+"_lc_entry_table"), + cct(_cct), + dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ") + {} + virtual ~DB() {} + + const std::string getDBname() { return db_name; } + const std::string getDBfile() { return db_name + ".db"; } + const std::string getUserTable() { return user_table; } + const std::string getBucketTable() { return bucket_table; } + const std::string getQuotaTable() { return quota_table; } + const std::string getLCHeadTable() { return lc_head_table; } + const std::string getLCEntryTable() { return lc_entry_table; } + const std::string getObjectTable(std::string bucket) { + return db_name+"_"+bucket+"_object_table"; } + const std::string getObjectDataTable(std::string bucket) { + return db_name+"_"+bucket+"_objectdata_table"; } + const std::string getObjectView(std::string bucket) { + return db_name+"_"+bucket+"_object_view"; } + const std::string getObjectTrigger(std::string bucket) { + return db_name+"_"+bucket+"_object_trigger"; } + + std::map getObjectMap(); + + struct DBOps dbops; // DB operations, make it private? 
+ + void set_driver(rgw::sal::Driver* _driver) { + driver = _driver; + } + + void set_context(CephContext *_cct) { + cct = _cct; + } + + CephContext *ctx() { return cct; } + const DoutPrefixProvider *get_def_dpp() { return &dp; } + + int Initialize(std::string logfile, int loglevel); + int Destroy(const DoutPrefixProvider *dpp); + int LockInit(const DoutPrefixProvider *dpp); + int LockDestroy(const DoutPrefixProvider *dpp); + int Lock(const DoutPrefixProvider *dpp); + int Unlock(const DoutPrefixProvider *dpp); + + int InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params); + int ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params); + std::shared_ptr getDBOp(const DoutPrefixProvider *dpp, std::string_view Op, const DBOpParams *params); + int objectmapInsert(const DoutPrefixProvider *dpp, std::string bucket, class ObjectOp* ptr); + int objectmapDelete(const DoutPrefixProvider *dpp, std::string bucket); + + /* Backend hooks. The ones with bodies are no-op defaults in this base class (0 / NULL); the ones marked "= 0" are pure virtual and must be provided by a concrete backend. */ virtual uint64_t get_blob_limit() { return 0; }; + virtual void *openDB(const DoutPrefixProvider *dpp) { return NULL; } + virtual int closeDB(const DoutPrefixProvider *dpp) { return 0; } + virtual int createTables(const DoutPrefixProvider *dpp) { return 0; } + virtual int InitializeDBOps(const DoutPrefixProvider *dpp) { return 0; } + virtual int InitPrepareParams(const DoutPrefixProvider *dpp, + DBOpPrepareParams &p_params, + DBOpParams* params) = 0; + virtual int createLCTables(const DoutPrefixProvider *dpp) = 0; + + virtual int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) = 0; + virtual int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) = 0; + virtual int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) = 0; + + int get_user(const DoutPrefixProvider *dpp, + const std::string& query_str, const std::string& query_str_val, + RGWUserInfo& uinfo, std::map *pattrs, + RGWObjVersionTracker *pobjv_tracker); + int store_user(const DoutPrefixProvider *dpp, + RGWUserInfo& uinfo, 
bool exclusive, std::map *pattrs, + RGWObjVersionTracker *pobjv_tracker, RGWUserInfo* pold_info); + int remove_user(const DoutPrefixProvider *dpp, + RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv_tracker); + int get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str, + const std::string& query_str_val, + RGWBucketInfo& info, rgw::sal::Attrs* pattrs, ceph::real_time* pmtime, + obj_version* pbucket_version); + int create_bucket(const DoutPrefixProvider *dpp, + const RGWUserInfo& owner, rgw_bucket& bucket, + const std::string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const std::string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + std::map& attrs, + RGWBucketInfo& info, + obj_version *pobjv, + obj_version *pep_objv, + real_time creation_time, + rgw_bucket *pmaster_bucket, + uint32_t *pmaster_num_shards, + optional_yield y, + bool exclusive); + + int next_bucket_id() { return ++max_bucket_id; }; + + int remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info); + int list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str, + rgw_user& user, + const std::string& marker, + const std::string& end_marker, + uint64_t max, + bool need_stats, + RGWUserBuckets *buckets, + bool *is_truncated); + int update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str, + RGWBucketInfo& info, bool exclusive, + const rgw_user* powner_id, std::map* pattrs, + ceph::real_time* pmtime, RGWObjVersionTracker* pobjv); + + uint64_t get_max_head_size() { return ObjHeadSize; } + uint64_t get_max_chunk_size() { return ObjChunkSize; } + void gen_rand_obj_instance_name(rgw_obj_key *target_key); + + // db raw obj string is of format - + // "____" + static constexpr std::string_view raw_obj_oid = "{0}_{1}_{2}_{3}_{4}"; + + std::string to_oid(std::string_view bucket, std::string_view obj_name, + std::string_view obj_instance, std::string_view obj_id, + std::string_view mp_str, uint64_t partnum) { + return 
fmt::format(raw_obj_oid, bucket, obj_name, obj_instance, obj_id, mp_str, partnum); + } + int from_oid(const std::string& oid, std::string& bucket, std::string& obj_name, std::string& obj_id, + std::string& obj_instance, + std::string& mp_str, uint64_t& partnum) { + // TODO: use ceph::split() from common/split.h + // XXX: doesn't this break if obj_name has underscores in it? + std::vector result; + boost::split(result, oid, boost::is_any_of("_")); + bucket = result[0]; + obj_name = result[1]; + obj_instance = result[2]; + obj_id = result[3]; + mp_str = result[4]; + partnum = stoi(result[5]); + + return 0; + } + + struct raw_obj { + DB* db; + + std::string bucket_name; + std::string obj_name; + std::string obj_instance; + std::string obj_ns; + std::string obj_id; + std::string multipart_part_str; + uint64_t part_num; + + std::string obj_table; + std::string obj_data_table; + + raw_obj(DB* _db) { + db = _db; + } + + raw_obj(DB* _db, std::string& _bname, std::string& _obj_name, std::string& _obj_instance, + std::string& _obj_ns, std::string& _obj_id, std::string _mp_part_str, int _part_num) { + db = _db; + bucket_name = _bname; + obj_name = _obj_name; + obj_instance = _obj_instance; + obj_ns = _obj_ns; + obj_id = _obj_id; + multipart_part_str = _mp_part_str; + part_num = _part_num; + + obj_table = bucket_name+".object.table"; + obj_data_table = bucket_name+".objectdata.table"; + } + + raw_obj(DB* _db, std::string& oid) { + int r; + + db = _db; + r = db->from_oid(oid, bucket_name, obj_name, obj_instance, obj_id, multipart_part_str, + part_num); + if (r < 0) { + multipart_part_str = "0.0"; + part_num = 0; + } + + obj_table = db->getObjectTable(bucket_name); + obj_data_table = db->getObjectDataTable(bucket_name); + } + + int InitializeParamsfromRawObj (const DoutPrefixProvider *dpp, DBOpParams* params); + + int read(const DoutPrefixProvider *dpp, int64_t ofs, uint64_t end, bufferlist& bl); + int write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs, uint64_t 
len, bufferlist& bl); + }; + + class GC : public Thread { + const DoutPrefixProvider *dpp; + DB *db; + /* Default time interval for GC + * XXX: Make below options configurable + * + * gc_interval: The time between successive gc thread runs + * gc_obj_min_wait: Min. time to wait before deleting any data post its creation. + * + */ + std::mutex mtx; + std::condition_variable cv; + bool stop_signalled = false; + uint32_t gc_interval = 24*60*60; //sec ; default: 24*60*60 + uint32_t gc_obj_min_wait = 60*60; //60*60sec default + std::string bucket_marker; + std::string user_marker; + + public: + GC(const DoutPrefixProvider *_dpp, DB* _db) : + dpp(_dpp), db(_db) {} + + void *entry() override; + + void signal_stop() { + std::lock_guard lk_guard(mtx); + stop_signalled = true; + cv.notify_one(); + } + + friend class DB; + }; + std::unique_ptr gc_worker; + + class Bucket { + friend class DB; + DB* store; + + RGWBucketInfo bucket_info; + + public: + Bucket(DB *_store, const RGWBucketInfo& _binfo) : store(_store), bucket_info(_binfo) {} + DB *get_store() { return store; } + rgw_bucket& get_bucket() { return bucket_info.bucket; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + + class List { + protected: + // absolute maximum number of objects that + // list_objects_(un)ordered can return + static constexpr int64_t bucket_list_objects_absolute_max = 25000; + + DB::Bucket *target; + rgw_obj_key next_marker; + + public: + + struct Params { + std::string prefix; + std::string delim; + rgw_obj_key marker; + rgw_obj_key end_marker; + std::string ns; + bool enforce_ns; + RGWAccessListFilter* access_list_filter; + RGWBucketListNameFilter force_check_filter; + bool list_versions; + bool allow_unordered; + + Params() : + enforce_ns(true), + access_list_filter(nullptr), + list_versions(false), + allow_unordered(false) + {} + } params; + + explicit List(DB::Bucket *_target) : target(_target) {} + + /* XXX: Handle ordered and unordered separately. 
+ * For now returning only ordered entries */ + int list_objects(const DoutPrefixProvider *dpp, int64_t max, + std::vector *result, + std::map *common_prefixes, bool *is_truncated); + rgw_obj_key& get_next_marker() { + return next_marker; + } + }; + }; + + class Object { + friend class DB; + DB* store; + + RGWBucketInfo bucket_info; + rgw_obj obj; + + RGWObjState obj_state; + std::string obj_id; + + bool versioning_disabled; + + bool bs_initialized; + + public: + Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info), + obj(_obj), + versioning_disabled(false), + bs_initialized(false) {} + + Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj, const std::string& _obj_id) : store(_store), bucket_info(_bucket_info), obj(_obj), obj_id(_obj_id) {} + + struct Read { + DB::Object *source; + + struct GetObjState { + rgw_obj obj; + } state; + + struct ConditionParams { + const ceph::real_time *mod_ptr; + const ceph::real_time *unmod_ptr; + bool high_precision_time; + uint32_t mod_zone_id; + uint64_t mod_pg_ver; + const char *if_match; + const char *if_nomatch; + + ConditionParams() : + mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0), + if_match(NULL), if_nomatch(NULL) {} + } conds; + + struct Params { + ceph::real_time *lastmod; + uint64_t *obj_size; + std::map *attrs; + rgw_obj *target_obj; + + Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr), + target_obj(nullptr) {} + } params; + + explicit Read(DB::Object *_source) : source(_source) {} + + int prepare(const DoutPrefixProvider *dpp); + static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); + int read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp); + int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb); + int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest); + }; + + struct Write { 
+ DB::Object *target; + RGWObjState obj_state; + std::string mp_part_str = "0.0"; // multipart num + + struct MetaParams { + ceph::real_time *mtime; + std::map* rmattrs; + const bufferlist *data; + RGWObjManifest *manifest; + const std::string *ptag; + std::list *remove_objs; + ceph::real_time set_mtime; + rgw_user owner; + RGWObjCategory category; + int flags; + const char *if_match; + const char *if_nomatch; + std::optional olh_epoch; + ceph::real_time delete_at; + bool canceled; + const std::string *user_data; + rgw_zone_set *zones_trace; + bool modify_tail; + bool completeMultipart; + bool appendable; + + MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL), + remove_objs(NULL), category(RGWObjCategory::Main), flags(0), + if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr), + modify_tail(false), completeMultipart(false), appendable(false) {} + } meta; + + explicit Write(DB::Object *_target) : target(_target) {} + + void set_mp_part_str(std::string _mp_part_str) { mp_part_str = _mp_part_str;} + int prepare(const DoutPrefixProvider* dpp); + int write_data(const DoutPrefixProvider* dpp, + bufferlist& data, uint64_t ofs); + int _do_write_meta(const DoutPrefixProvider *dpp, + uint64_t size, uint64_t accounted_size, + std::map& attrs, + bool assume_noent, bool modify_tail); + int write_meta(const DoutPrefixProvider *dpp, uint64_t size, + uint64_t accounted_size, std::map& attrs); + }; + + struct Delete { + DB::Object *target; + + struct DeleteParams { + rgw_user bucket_owner; + int versioning_status; + ACLOwner obj_owner; /* needed for creation of deletion marker */ + uint64_t olh_epoch; + std::string marker_version_id; + uint32_t bilog_flags; + std::list *remove_objs; + ceph::real_time expiration_time; + ceph::real_time unmod_since; + ceph::real_time mtime; /* for setting delete marker mtime */ + bool high_precision_time; + rgw_zone_set *zones_trace; + bool abortmp; + uint64_t parts_accounted_size; 
+ + DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {} + } params; + + struct DeleteResult { + bool delete_marker; + std::string version_id; + + DeleteResult() : delete_marker(false) {} + } result; + + explicit Delete(DB::Object *_target) : target(_target) {} + + int delete_obj(const DoutPrefixProvider *dpp); + int delete_obj_impl(const DoutPrefixProvider *dpp, DBOpParams& del_params); + int create_dm(const DoutPrefixProvider *dpp, DBOpParams& del_params); + }; + + /* XXX: the parameters may be subject to change. All we need is bucket name + * & obj name,instance - keys */ + int get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params); + int get_obj_state(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + bool follow_olh, RGWObjState **state); + int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh); + int list_versioned_objects(const DoutPrefixProvider *dpp, + std::list& list_entries); + + DB *get_store() { return store; } + rgw_obj& get_obj() { return obj; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + + int InitializeParamsfromObject(const DoutPrefixProvider *dpp, DBOpParams* params); + int set_attrs(const DoutPrefixProvider *dpp, std::map& setattrs, + std::map* rmattrs); + int transition(const DoutPrefixProvider *dpp, + const rgw_placement_rule& rule, const real_time& mtime, + uint64_t olh_epoch); + int obj_omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, bool must_exist); + int obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + std::map* vals); + int obj_omap_get_all(const DoutPrefixProvider *dpp, std::map *m); + int obj_omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, bool* pmore); + using 
iterate_obj_cb = int (*)(const DoutPrefixProvider*, const raw_obj&, off_t, off_t, + bool, RGWObjState*, void*); + int add_mp_part(const DoutPrefixProvider *dpp, RGWUploadPartInfo info); + int get_mp_parts_list(const DoutPrefixProvider *dpp, std::list& info); + + int iterate_obj(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, const rgw_obj& obj, + off_t ofs, off_t end, uint64_t max_chunk_size, + iterate_obj_cb cb, void *arg); + }; + int get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const raw_obj& read_obj, off_t obj_ofs, + off_t len, bool is_head_obj, + RGWObjState *astate, void *arg); + + int get_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry); + int get_next_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry); + int set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry); + int list_entries(const std::string& oid, const std::string& marker, + uint32_t max_entries, std::vector>& entries); + int rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry); + int get_head(const std::string& oid, std::unique_ptr* head); + int put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head); + int delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket, + uint32_t min_wait); + int createGC(const DoutPrefixProvider *_dpp); + int stopGC(); +}; + +struct db_get_obj_data { + DB* store; + RGWGetDataCB* client_cb = nullptr; + uint64_t offset; // next offset to write to client + + db_get_obj_data(DB* db, RGWGetDataCB* cb, uint64_t offset) : + store(db), client_cb(cb), offset(offset) {} + ~db_get_obj_data() {} +}; + +} } // namespace rgw::store diff --git a/src/rgw/driver/dbstore/common/dbstore_log.h b/src/rgw/driver/dbstore/common/dbstore_log.h new file mode 100644 index 000000000..416508369 --- /dev/null +++ b/src/rgw/driver/dbstore/common/dbstore_log.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "common/dout.h" + +#undef dout_prefix +#define dout_prefix *_dout << "rgw dbstore: " diff --git a/src/rgw/driver/dbstore/config/sqlite.cc b/src/rgw/driver/dbstore/config/sqlite.cc new file mode 100644 index 000000000..a1b217735 --- /dev/null +++ b/src/rgw/driver/dbstore/config/sqlite.cc @@ -0,0 +1,2070 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include + +#include + +#include + +#include "include/buffer.h" +#include "include/encoding.h" +#include "common/dout.h" +#include "common/random_string.h" +#include "rgw_zone.h" + +#include "common/connection_pool.h" +#include "sqlite/connection.h" +#include "sqlite/error.h" +#include "sqlite/statement.h" +#include "sqlite_schema.h" +#include "sqlite.h" + +#define dout_subsys ceph_subsys_rgw_dbstore + +namespace rgw::dbstore::config { + +struct Prefix : DoutPrefixPipe { + std::string_view prefix; + Prefix(const DoutPrefixProvider& dpp, std::string_view prefix) + : DoutPrefixPipe(dpp), prefix(prefix) {} + unsigned get_subsys() const override { return dout_subsys; } + void add_prefix(std::ostream& out) const override { + out << prefix; + } +}; + +namespace { + +// parameter names for prepared statement bindings +static constexpr const char* P1 = ":1"; +static constexpr const char* P2 = ":2"; +static constexpr const char* P3 = ":3"; +static constexpr const char* P4 = ":4"; +static constexpr const char* P5 = ":5"; +static constexpr const char* P6 = ":6"; + + +void read_text_rows(const DoutPrefixProvider* dpp, + 
const sqlite::stmt_execution& stmt, + std::span entries, + sal::ListResult& result) +{ + result.entries = sqlite::read_text_rows(dpp, stmt, entries); + if (result.entries.size() < entries.size()) { // end of listing + result.next.clear(); + } else { + result.next = result.entries.back(); + } +} + +struct RealmRow { + RGWRealm info; + int ver; + std::string tag; +}; + +void read_realm_row(const sqlite::stmt_execution& stmt, RealmRow& row) +{ + row.info.id = sqlite::column_text(stmt, 0); + row.info.name = sqlite::column_text(stmt, 1); + row.info.current_period = sqlite::column_text(stmt, 2); + row.info.epoch = sqlite::column_int(stmt, 3); + row.ver = sqlite::column_int(stmt, 4); + row.tag = sqlite::column_text(stmt, 5); +} + +void read_period_row(const sqlite::stmt_execution& stmt, RGWPeriod& row) +{ + // just read the Data column and decode everything else from that + std::string data = sqlite::column_text(stmt, 3); + + bufferlist bl = bufferlist::static_from_string(data); + auto p = bl.cbegin(); + decode(row, p); +} + +struct ZoneGroupRow { + RGWZoneGroup info; + int ver; + std::string tag; +}; + +void read_zonegroup_row(const sqlite::stmt_execution& stmt, ZoneGroupRow& row) +{ + std::string data = sqlite::column_text(stmt, 3); + row.ver = sqlite::column_int(stmt, 4); + row.tag = sqlite::column_text(stmt, 5); + + bufferlist bl = bufferlist::static_from_string(data); + auto p = bl.cbegin(); + decode(row.info, p); +} + +struct ZoneRow { + RGWZoneParams info; + int ver; + std::string tag; +}; + +void read_zone_row(const sqlite::stmt_execution& stmt, ZoneRow& row) +{ + std::string data = sqlite::column_text(stmt, 3); + row.ver = sqlite::column_int(stmt, 4); + row.tag = sqlite::column_text(stmt, 5); + + bufferlist bl = bufferlist::static_from_string(data); + auto p = bl.cbegin(); + decode(row.info, p); +} + +std::string generate_version_tag(CephContext* cct) +{ + static constexpr auto TAG_LEN = 24; + return gen_rand_alphanumeric(cct, TAG_LEN); +} + +using 
SQLiteConnectionHandle = ConnectionHandle; + +using SQLiteConnectionPool = ConnectionPool< + sqlite::Connection, sqlite::ConnectionFactory>; + +} // anonymous namespace + +class SQLiteImpl : public SQLiteConnectionPool { + public: + using SQLiteConnectionPool::SQLiteConnectionPool; +}; + + +SQLiteConfigStore::SQLiteConfigStore(std::unique_ptr impl) + : impl(std::move(impl)) +{ +} + +SQLiteConfigStore::~SQLiteConfigStore() = default; + + +// Realm + +class SQLiteRealmWriter : public sal::RealmWriter { + SQLiteImpl* impl; + int ver; + std::string tag; + std::string realm_id; + std::string realm_name; + public: + SQLiteRealmWriter(SQLiteImpl* impl, int ver, std::string tag, + std::string_view realm_id, std::string_view realm_name) + : impl(impl), ver(ver), tag(std::move(tag)), + realm_id(realm_id), realm_name(realm_name) + {} + + int write(const DoutPrefixProvider* dpp, optional_yield y, + const RGWRealm& info) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:realm_write "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after a conflict or delete + } + if (realm_id != info.id || realm_name != info.name) { + return -EINVAL; // can't modify realm id or name directly + } + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["realm_upd"]; + if (!stmt) { + const std::string sql = fmt::format(schema::realm_update5, + P1, P2, P3, P4, P5); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, info.current_period); + sqlite::bind_int(dpp, binding, P3, info.epoch); + sqlite::bind_int(dpp, binding, P4, ver); + sqlite::bind_text(dpp, binding, P5, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + // our version is no longer consistent, so later writes would fail too + impl = 
nullptr; + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm update failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::foreign_key_constraint) { + return -EINVAL; // refers to nonexistent CurrentPeriod + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + ++ver; + return 0; + } + + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWRealm& info, std::string_view new_name) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:realm_rename "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + if (realm_id != info.id || realm_name != info.name) { + return -EINVAL; // can't modify realm id or name directly + } + if (new_name.empty()) { + ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["realm_rename"]; + if (!stmt) { + const std::string sql = fmt::format(schema::realm_rename4, + P1, P2, P3, P4); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + sqlite::bind_text(dpp, binding, P2, new_name); + sqlite::bind_int(dpp, binding, P3, ver); + sqlite::bind_text(dpp, binding, P4, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + impl = nullptr; + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm rename failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::unique_constraint) { + return -EEXIST; // Name already taken + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + info.name = std::string{new_name}; + ++ver; + return 0; + } + + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + Prefix 
prefix{*dpp, "dbconfig:sqlite:realm_remove "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["realm_del"]; + if (!stmt) { + const std::string sql = fmt::format(schema::realm_delete3, P1, P2, P3); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + sqlite::bind_int(dpp, binding, P2, ver); + sqlite::bind_text(dpp, binding, P3, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + impl = nullptr; // prevent any further writes after delete + if (!::sqlite3_changes(conn->db.get())) { + return -ECANCELED; // VersionNumber/Tag mismatch + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; + } +}; // SQLiteRealmWriter + + +int SQLiteConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:write_default_realm_id "}; dpp = &prefix; + + if (realm_id.empty()) { + ldpp_dout(dpp, 0) << "requires a realm id" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["def_realm_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::default_realm_insert1, P1); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["def_realm_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::default_realm_upsert1, P1); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto 
reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default realm insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::primary_key_constraint) { + return -EEXIST; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string& realm_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["def_realm_sel"]; + if (!stmt) { + static constexpr std::string_view sql = schema::default_realm_select0; + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + realm_id = sqlite::column_text(reset, 0); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default realm select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y) + +{ + Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_realm_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["def_realm_del"]; + if (!stmt) { + static constexpr std::string_view sql = schema::default_realm_delete0; + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { + return -ENOENT; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default realm delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +int SQLiteConfigStore::create_realm(const 
DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWRealm& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:create_realm "}; dpp = &prefix; + + if (info.id.empty()) { + ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl; + return -EINVAL; + } + if (info.name.empty()) { + ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl; + return -EINVAL; + } + + int ver = 1; + auto tag = generate_version_tag(dpp->get_cct()); + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["realm_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::realm_insert4, + P1, P2, P3, P4); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["realm_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::realm_upsert4, + P1, P2, P3, P4); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, info.name); + sqlite::bind_int(dpp, binding, P3, ver); + sqlite::bind_text(dpp, binding, P4, tag); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::primary_key_constraint) { + return -EEXIST; // ID already taken + } else if (e.code() == sqlite::errc::unique_constraint) { + return -EEXIST; // Name already taken + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), ver, std::move(tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWRealm& info, + std::unique_ptr* 
writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_id "}; dpp = &prefix; + + if (realm_id.empty()) { + ldpp_dout(dpp, 0) << "requires a realm id" << dendl; + return -EINVAL; + } + + RealmRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["realm_sel_id"]; + if (!stmt) { + const std::string sql = fmt::format(schema::realm_select_id1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_realm_row(reset, row); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +static void realm_select_by_name(const DoutPrefixProvider* dpp, + sqlite::Connection& conn, + std::string_view realm_name, + RealmRow& row) +{ + auto& stmt = conn.statements["realm_sel_name"]; + if (!stmt) { + const std::string sql = fmt::format(schema::realm_select_name1, P1); + stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_name); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_realm_row(reset, row); +} + +int SQLiteConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_name "}; 
dpp = &prefix; + + if (realm_name.empty()) { + ldpp_dout(dpp, 0) << "requires a realm name" << dendl; + return -EINVAL; + } + + RealmRow row; + try { + auto conn = impl->get(dpp); + realm_select_by_name(dpp, *conn, realm_name, row); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_default_realm(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRealm& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm "}; dpp = &prefix; + + RealmRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["realm_sel_def"]; + if (!stmt) { + static constexpr std::string_view sql = schema::realm_select_default0; + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_realm_row(reset, row); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, + 
std::string_view realm_name, + std::string& realm_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_id "}; dpp = &prefix; + + if (realm_name.empty()) { + ldpp_dout(dpp, 0) << "requires a realm name" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + + RealmRow row; + realm_select_by_name(dpp, *conn, realm_name, row); + + realm_id = std::move(row.info.id); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + return 0; +} + +int SQLiteConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWPeriod& period) +{ + return -ENOTSUP; +} + +int SQLiteConfigStore::list_realm_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + sal::ListResult& result) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:list_realm_names "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["realm_sel_names"]; + if (!stmt) { + const std::string sql = fmt::format(schema::realm_select_names2, P1, P2); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, marker); + sqlite::bind_int(dpp, binding, P2, entries.size()); + + auto reset = sqlite::stmt_execution{stmt.get()}; + read_text_rows(dpp, reset, entries, result); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +// Period + +int SQLiteConfigStore::create_period(const DoutPrefixProvider* dpp, + optional_yield y, bool 
exclusive, + const RGWPeriod& info) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:create_period "}; dpp = &prefix; + + if (info.id.empty()) { + ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl; + return -EINVAL; + } + + bufferlist bl; + encode(info, bl); + const auto data = std::string_view{bl.c_str(), bl.length()}; + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["period_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::period_insert4, + P1, P2, P3, P4); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["period_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::period_upsert4, + P1, P2, P3, P4); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_int(dpp, binding, P2, info.epoch); + sqlite::bind_text(dpp, binding, P3, info.realm_id); + sqlite::bind_text(dpp, binding, P4, data); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "period insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::foreign_key_constraint) { + return -EINVAL; // refers to nonexistent RealmID + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +static void period_select_epoch(const DoutPrefixProvider* dpp, + sqlite::Connection& conn, + std::string_view id, uint32_t epoch, + RGWPeriod& row) +{ + auto& stmt = conn.statements["period_sel_epoch"]; + if (!stmt) { + const std::string sql = fmt::format(schema::period_select_epoch2, P1, P2); + stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, id); + sqlite::bind_int(dpp, binding, P2, 
epoch); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_period_row(reset, row); +} + +static void period_select_latest(const DoutPrefixProvider* dpp, + sqlite::Connection& conn, + std::string_view id, RGWPeriod& row) +{ + auto& stmt = conn.statements["period_sel_latest"]; + if (!stmt) { + const std::string sql = fmt::format(schema::period_select_latest1, P1); + stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_period_row(reset, row); +} + +int SQLiteConfigStore::read_period(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view period_id, + std::optional epoch, + RGWPeriod& info) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_period "}; dpp = &prefix; + + if (period_id.empty()) { + ldpp_dout(dpp, 0) << "requires a period id" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + if (epoch) { + period_select_epoch(dpp, *conn, period_id, *epoch, info); + } else { + period_select_latest(dpp, *conn, period_id, info); + } + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "period decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::delete_period(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view period_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:delete_period "}; dpp = &prefix; + + if (period_id.empty()) { + ldpp_dout(dpp, 0) << "requires a period id" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["period_del"]; + if (!stmt) 
{ + const std::string sql = fmt::format(schema::period_delete1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, period_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { + return -ENOENT; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "period delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::list_period_ids(const DoutPrefixProvider* dpp, + optional_yield y, + const std::string& marker, + std::span entries, + sal::ListResult& result) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:list_period_ids "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["period_sel_ids"]; + if (!stmt) { + const std::string sql = fmt::format(schema::period_select_ids2, P1, P2); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, marker); + sqlite::bind_int(dpp, binding, P2, entries.size()); + + auto reset = sqlite::stmt_execution{stmt.get()}; + read_text_rows(dpp, reset, entries, result); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +// ZoneGroup + +class SQLiteZoneGroupWriter : public sal::ZoneGroupWriter { + SQLiteImpl* impl; + int ver; + std::string tag; + std::string zonegroup_id; + std::string zonegroup_name; + public: + SQLiteZoneGroupWriter(SQLiteImpl* impl, int ver, std::string tag, + std::string_view zonegroup_id, + std::string_view zonegroup_name) + : impl(impl), ver(ver), tag(std::move(tag)), + zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name) + {} + + int 
write(const DoutPrefixProvider* dpp, optional_yield y, + const RGWZoneGroup& info) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_write "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + if (zonegroup_id != info.id || zonegroup_name != info.name) { + return -EINVAL; // can't modify zonegroup id or name directly + } + + bufferlist bl; + encode(info, bl); + const auto data = std::string_view{bl.c_str(), bl.length()}; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_upd"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zonegroup_update5, + P1, P2, P3, P4, P5); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, info.realm_id); + sqlite::bind_text(dpp, binding, P3, data); + sqlite::bind_int(dpp, binding, P4, ver); + sqlite::bind_text(dpp, binding, P5, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + impl = nullptr; + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup update failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::foreign_key_constraint) { + return -EINVAL; // refers to nonexistent RealmID + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; + } + + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWZoneGroup& info, std::string_view new_name) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_rename "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) { + return -EINVAL; // can't modify zonegroup id or name directly + } + if (new_name.empty()) 
{ + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_rename"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zonegroup_rename4, + P1, P2, P3, P4); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, new_name); + sqlite::bind_int(dpp, binding, P3, ver); + sqlite::bind_text(dpp, binding, P4, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + impl = nullptr; + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup rename failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::unique_constraint) { + return -EEXIST; // Name already taken + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + info.name = std::string{new_name}; + return 0; + } + + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_remove "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_del"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zonegroup_delete3, + P1, P2, P3); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, zonegroup_id); + sqlite::bind_int(dpp, binding, P2, ver); + sqlite::bind_text(dpp, binding, P3, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + impl = nullptr; + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + return 
-ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; + } +}; // SQLiteZoneGroupWriter + + +int SQLiteConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zonegroup_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zonegroup_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["def_zonegroup_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::default_zonegroup_insert2, + P1, P2); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["def_zonegroup_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::default_zonegroup_upsert2, + P1, P2); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + sqlite::bind_text(dpp, binding, P2, zonegroup_id); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default zonegroup insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zonegroup_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["def_zonegroup_sel"]; + if (!stmt) { + const std::string sql = fmt::format(schema::default_zonegroup_select1, P1); + stmt = sqlite::prepare_statement(dpp, 
conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + zonegroup_id = sqlite::column_text(reset, 0); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default zonegroup select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zonegroup_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["def_zonegroup_del"]; + if (!stmt) { + const std::string sql = fmt::format(schema::default_zonegroup_delete1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { + return -ENOENT; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default zonegroup delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +int SQLiteConfigStore::create_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneGroup& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:create_zonegroup "}; dpp = &prefix; + + if (info.id.empty()) { + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl; + return -EINVAL; + } + if (info.name.empty()) { + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl; + return -EINVAL; + } + + int ver = 1; + auto tag = 
generate_version_tag(dpp->get_cct()); + + bufferlist bl; + encode(info, bl); + const auto data = std::string_view{bl.c_str(), bl.length()}; + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["zonegroup_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::zonegroup_insert6, + P1, P2, P3, P4, P5, P6); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["zonegroup_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::zonegroup_upsert6, + P1, P2, P3, P4, P5, P6); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, info.name); + sqlite::bind_text(dpp, binding, P3, info.realm_id); + sqlite::bind_text(dpp, binding, P4, data); + sqlite::bind_int(dpp, binding, P5, ver); + sqlite::bind_text(dpp, binding, P6, tag); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::foreign_key_constraint) { + return -EINVAL; // refers to nonexistent RealmID + } else if (e.code() == sqlite::errc::primary_key_constraint) { + return -EEXIST; // ID already taken + } else if (e.code() == sqlite::errc::unique_constraint) { + return -EEXIST; // Name already taken + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), ver, std::move(tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_id, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_id "}; dpp = 
&prefix; + + if (zonegroup_id.empty()) { + ldpp_dout(dpp, 0) << "requires a zonegroup id" << dendl; + return -EINVAL; + } + + ZoneGroupRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_sel_id"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zonegroup_select_id1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, zonegroup_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_zonegroup_row(reset, row); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_name "}; dpp = &prefix; + + if (zonegroup_name.empty()) { + ldpp_dout(dpp, 0) << "requires a zonegroup name" << dendl; + return -EINVAL; + } + + ZoneGroupRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_sel_name"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zonegroup_select_name1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, zonegroup_name); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + 
+ read_zonegroup_row(reset, row); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup "}; dpp = &prefix; + + ZoneGroupRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_sel_def"]; + if (!stmt) { + static constexpr std::string_view sql = schema::zonegroup_select_default0; + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_zonegroup_row(reset, row); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp, + optional_yield y, + const std::string& marker, + std::span entries, + sal::ListResult& result) +{ + Prefix prefix{*dpp, 
"dbconfig:sqlite:list_zonegroup_names "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zonegroup_sel_names"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zonegroup_select_names2, P1, P2); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + auto reset = sqlite::stmt_execution{stmt.get()}; + + sqlite::bind_text(dpp, binding, P1, marker); + sqlite::bind_int(dpp, binding, P2, entries.size()); + + read_text_rows(dpp, reset, entries, result); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +// Zone + +class SQLiteZoneWriter : public sal::ZoneWriter { + SQLiteImpl* impl; + int ver; + std::string tag; + std::string zone_id; + std::string zone_name; + public: + SQLiteZoneWriter(SQLiteImpl* impl, int ver, std::string tag, + std::string_view zone_id, std::string_view zone_name) + : impl(impl), ver(ver), tag(std::move(tag)), + zone_id(zone_id), zone_name(zone_name) + {} + + int write(const DoutPrefixProvider* dpp, optional_yield y, + const RGWZoneParams& info) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:zone_write "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + if (zone_id != info.id || zone_name != info.name) { + return -EINVAL; // can't modify zone id or name directly + } + + bufferlist bl; + encode(info, bl); + const auto data = std::string_view{bl.c_str(), bl.length()}; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zone_upd"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zone_update5, + P1, P2, P3, P4, P5); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + 
sqlite::bind_text(dpp, binding, P2, info.realm_id); + sqlite::bind_text(dpp, binding, P3, data); + sqlite::bind_int(dpp, binding, P4, ver); + sqlite::bind_text(dpp, binding, P5, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + impl = nullptr; + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone update failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::foreign_key_constraint) { + return -EINVAL; // refers to nonexistent RealmID + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + ++ver; + return 0; + } + + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWZoneParams& info, std::string_view new_name) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:zone_rename "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + if (zone_id != info.id || zone_name != info.name) { + return -EINVAL; // can't modify zone id or name directly + } + if (new_name.empty()) { + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zone_rename"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zone_rename4, P1, P2, P2, P3); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, new_name); + sqlite::bind_int(dpp, binding, P3, ver); + sqlite::bind_text(dpp, binding, P4, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + impl = nullptr; + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone rename failed: " << 
e.what() << dendl; + if (e.code() == sqlite::errc::unique_constraint) { + return -EEXIST; // Name already taken + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + info.name = std::string{new_name}; + ++ver; + return 0; + } + + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + Prefix prefix{*dpp, "dbconfig:sqlite:zone_remove "}; dpp = &prefix; + + if (!impl) { + return -EINVAL; // can't write after conflict or delete + } + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zone_del"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zone_delete3, P1, P2, P3); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, zone_id); + sqlite::bind_int(dpp, binding, P2, ver); + sqlite::bind_text(dpp, binding, P3, tag); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + impl = nullptr; + if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch + return -ECANCELED; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; + } +}; // SQLiteZoneWriter + + +int SQLiteConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zone_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zone_id "}; dpp = &prefix; + + if (zone_id.empty()) { + ldpp_dout(dpp, 0) << "requires a zone id" << dendl; + return -EINVAL; + } + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["def_zone_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::default_zone_insert2, P1, P2); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } 
else { + stmt = &conn->statements["def_zone_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::default_zone_upsert2, P1, P2); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + sqlite::bind_text(dpp, binding, P2, zone_id); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default zone insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zone_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["def_zone_sel"]; + if (!stmt) { + const std::string sql = fmt::format(schema::default_zone_select1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + zone_id = sqlite::column_text(reset, 0); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default zone select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zone_id "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["def_zone_del"]; + if (!stmt) { + const std::string sql = 
fmt::format(schema::default_zone_delete1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval0(dpp, reset); + + if (!::sqlite3_changes(conn->db.get())) { + return -ENOENT; + } + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "default zone delete failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +int SQLiteConfigStore::create_zone(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneParams& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:create_zone "}; dpp = &prefix; + + if (info.id.empty()) { + ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl; + return -EINVAL; + } + if (info.name.empty()) { + ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl; + return -EINVAL; + } + + int ver = 1; + auto tag = generate_version_tag(dpp->get_cct()); + + bufferlist bl; + encode(info, bl); + const auto data = std::string_view{bl.c_str(), bl.length()}; + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["zone_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::zone_insert6, + P1, P2, P3, P4, P5, P6); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["zone_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::zone_upsert6, + P1, P2, P3, P4, P5, P6); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, info.id); + sqlite::bind_text(dpp, binding, P2, info.name); + sqlite::bind_text(dpp, binding, P3, info.realm_id); + sqlite::bind_text(dpp, binding, P4, data); + 
sqlite::bind_int(dpp, binding, P5, ver); + sqlite::bind_text(dpp, binding, P6, tag); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::foreign_key_constraint) { + return -EINVAL; // refers to nonexistent RealmID + } else if (e.code() == sqlite::errc::primary_key_constraint) { + return -EEXIST; // ID already taken + } else if (e.code() == sqlite::errc::unique_constraint) { + return -EEXIST; // Name already taken + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), ver, std::move(tag), info.id, info.name); + } + return 0; +} + +int SQLiteConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_id, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_id "}; dpp = &prefix; + + if (zone_id.empty()) { + ldpp_dout(dpp, 0) << "requires a zone id" << dendl; + return -EINVAL; + } + + ZoneRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zone_sel_id"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zone_select_id1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, zone_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_zone_row(reset, row); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, 
info.name); + } + return 0; +} + +/* Look up a zone by name. Returns 0 and fills info (plus *writer when writer is non-null) on success, -EINVAL for an empty name, -ENOENT when the statement finishes without a row, -EBUSY when sqlite reports busy, and -EIO for any other sqlite error. */ int SQLiteConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_name "}; dpp = &prefix; + + if (zone_name.empty()) { + ldpp_dout(dpp, 0) << "requires a zone name" << dendl; + return -EINVAL; + } + + ZoneRow row; + try { + auto conn = impl->get(dpp); + /* prepared on first use and cached on the connection under this key */ auto& stmt = conn->statements["zone_sel_name"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zone_select_name1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, zone_name); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_zone_row(reset, row); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +/* Read the zone referenced by the DefaultZones table. Returns 0, -ENOENT, -EBUSY or -EIO like the other zone readers. NOTE(review): realm_id is accepted but never bound; the statement joins Zones with DefaultZones without a RealmID filter and takes LIMIT 1, so when defaults exist for more than one realm the row returned is not tied to realm_id. Confirm whether the lookup should be restricted to the given realm. */ int SQLiteConfigStore::read_default_zone(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone "}; dpp = &prefix; + + ZoneRow row; + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zone_sel_def"]; + if (!stmt) { + static constexpr std::string_view sql = schema::zone_select_default0; + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + read_zone_row(reset, row); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << 
dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + + info = std::move(row.info); + if (writer) { + *writer = std::make_unique( + impl.get(), row.ver, std::move(row.tag), info.id, info.name); + } + return 0; +} + +/* List zone names through read_text_rows(). marker and entries.size() are bound to the two statement parameters. Returns 0 on success, -EBUSY when sqlite reports busy, and -EIO for any other sqlite error. */ int SQLiteConfigStore::list_zone_names(const DoutPrefixProvider* dpp, + optional_yield y, + const std::string& marker, + std::span entries, + sal::ListResult& result) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:list_zone_names "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["zone_sel_names"]; + if (!stmt) { + const std::string sql = fmt::format(schema::zone_select_names2, P1, P2); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, marker); + sqlite::bind_int(dpp, binding, P2, entries.size()); + + auto reset = sqlite::stmt_execution{stmt.get()}; + read_text_rows(dpp, reset, entries, result); + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + + +// PeriodConfig + +int SQLiteConfigStore::read_period_config(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWPeriodConfig& info) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:read_period_config "}; dpp = &prefix; + + try { + auto conn = impl->get(dpp); + auto& stmt = conn->statements["period_conf_sel"]; + if (!stmt) { + const std::string sql = fmt::format(schema::period_config_select1, P1); + stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + auto binding = sqlite::stmt_binding{stmt.get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + + auto reset = sqlite::stmt_execution{stmt.get()}; + sqlite::eval1(dpp, reset); + + std::string data = sqlite::column_text(reset, 0); + 
bufferlist bl = bufferlist::static_from_string(data); + auto p = bl.cbegin(); + decode(info, p); + + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "period config select failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::done) { + return -ENOENT; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; + } + return -EIO; + } + return 0; +} + +int SQLiteConfigStore::write_period_config(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + const RGWPeriodConfig& info) +{ + Prefix prefix{*dpp, "dbconfig:sqlite:write_period_config "}; dpp = &prefix; + + bufferlist bl; + encode(info, bl); + const auto data = std::string_view{bl.c_str(), bl.length()}; + + try { + auto conn = impl->get(dpp); + sqlite::stmt_ptr* stmt = nullptr; + if (exclusive) { + stmt = &conn->statements["period_conf_ins"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::period_config_insert2, P1, P2); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } else { + stmt = &conn->statements["period_conf_ups"]; + if (!*stmt) { + const std::string sql = fmt::format(schema::period_config_upsert2, P1, P2); + *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql); + } + } + auto binding = sqlite::stmt_binding{stmt->get()}; + sqlite::bind_text(dpp, binding, P1, realm_id); + sqlite::bind_text(dpp, binding, P2, data); + + auto reset = sqlite::stmt_execution{stmt->get()}; + sqlite::eval0(dpp, reset); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl; + return -EIO; + } catch (const sqlite::error& e) { + ldpp_dout(dpp, 20) << "period config insert failed: " << e.what() << dendl; + if (e.code() == sqlite::errc::primary_key_constraint) { + return -EEXIST; + } else if (e.code() == sqlite::errc::busy) { + return -EBUSY; 
+ } + return -EIO; + } + return 0; +} + +namespace { + +int version_cb(void* user, int count, char** values, char** names) +{ + if (count != 1) { + return EINVAL; + } + std::string_view name = names[0]; + if (name != "user_version") { + return EINVAL; + } + std::string_view value = values[0]; + auto result = std::from_chars(value.begin(), value.end(), + *reinterpret_cast(user)); + if (result.ec != std::errc{}) { + return static_cast(result.ec); + } + return 0; +} + +void apply_schema_migrations(const DoutPrefixProvider* dpp, sqlite3* db) +{ + sqlite::execute(dpp, db, "PRAGMA foreign_keys = ON", nullptr, nullptr); + + // initiate a transaction and read the current schema version + uint32_t version = 0; + sqlite::execute(dpp, db, "BEGIN; PRAGMA user_version", version_cb, &version); + + const uint32_t initial_version = version; + ldpp_dout(dpp, 4) << "current schema version " << version << dendl; + + // use the version as an index into schema::migrations + auto m = std::next(schema::migrations.begin(), version); + + for (; m != schema::migrations.end(); ++m, ++version) { + try { + sqlite::execute(dpp, db, m->up, nullptr, nullptr); + } catch (const sqlite::error&) { + ldpp_dout(dpp, -1) << "ERROR: schema migration failed on v" << version + << ": " << m->description << dendl; + throw; + } + } + + if (version > initial_version) { + // update the user_version and commit the transaction + const auto commit = fmt::format("PRAGMA user_version = {}; COMMIT", version); + sqlite::execute(dpp, db, commit.c_str(), nullptr, nullptr); + + ldpp_dout(dpp, 4) << "upgraded database schema to version " << version << dendl; + } else { + // nothing to commit + sqlite::execute(dpp, db, "ROLLBACK", nullptr, nullptr); + } +} + +} // anonymous namespace + + +auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri) + -> std::unique_ptr +{ + Prefix prefix{*dpp, "dbconfig:sqlite:create_sqlite_store "}; dpp = &prefix; + + // build the connection pool + int flags = 
SQLITE_OPEN_CREATE | SQLITE_OPEN_URI | SQLITE_OPEN_READWRITE | + SQLITE_OPEN_NOMUTEX; + auto factory = sqlite::ConnectionFactory{uri, flags}; + + // sqlite does not support concurrent writers. we enforce this limitation by + // using a connection pool of size=1 + static constexpr size_t max_connections = 1; + auto impl = std::make_unique(std::move(factory), max_connections); + + // open a connection to apply schema migrations + auto conn = impl->get(dpp); + apply_schema_migrations(dpp, conn->db.get()); + + return std::make_unique(std::move(impl)); +} + +} // namespace rgw::dbstore::config diff --git a/src/rgw/driver/dbstore/config/sqlite.h b/src/rgw/driver/dbstore/config/sqlite.h new file mode 100644 index 000000000..d79e04072 --- /dev/null +++ b/src/rgw/driver/dbstore/config/sqlite.h @@ -0,0 +1,172 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_sal_config.h" + +class DoutPrefixProvider; + +namespace rgw::dbstore::config { + +struct SQLiteImpl; + +class SQLiteConfigStore : public sal::ConfigStore { + public: + explicit SQLiteConfigStore(std::unique_ptr impl); + ~SQLiteConfigStore() override; + + int write_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id) override; + int read_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string& realm_id) override; + int delete_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y) override; + + int create_realm(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWRealm& info, + std::unique_ptr* writer) override; + int read_realm_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWRealm& info, + std::unique_ptr* writer) override; + int read_realm_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr* writer) override; + int read_default_realm(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRealm& info, + std::unique_ptr* writer) override; + int read_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view realm_name, + std::string& realm_id) override; + int realm_notify_new_period(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWPeriod& period) override; + int list_realm_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + sal::ListResult& result) override; + + int create_period(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWPeriod& info) override; + int read_period(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view period_id, + std::optional epoch, RGWPeriod& info) override; + int delete_period(const DoutPrefixProvider* dpp, + optional_yield y, + 
std::string_view period_id) override; + int list_period_ids(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + sal::ListResult& result) override; + + int write_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zonegroup_id) override; + int read_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zonegroup_id) override; + int delete_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) override; + + int create_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneGroup& info, + std::unique_ptr* writer) override; + int read_zonegroup_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_id, + RGWZoneGroup& info, + std::unique_ptr* writer) override; + int read_zonegroup_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer) override; + int read_default_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneGroup& info, + std::unique_ptr* writer) override; + int list_zonegroup_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + sal::ListResult& result) override; + + int write_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zone_id) override; + int read_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zone_id) override; + int delete_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) override; + + int create_zone(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const 
RGWZoneParams& info, + std::unique_ptr* writer) override; + int read_zone_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_id, + RGWZoneParams& info, + std::unique_ptr* writer) override; + int read_zone_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) override; + int read_default_zone(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneParams& info, + std::unique_ptr* writer) override; + int list_zone_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + sal::ListResult& result) override; + + int read_period_config(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWPeriodConfig& info) override; + int write_period_config(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + const RGWPeriodConfig& info) override; + + private: + std::unique_ptr impl; +}; // SQLiteConfigStore + + +auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri) + -> std::unique_ptr; + +} // namespace rgw::dbstore::config diff --git a/src/rgw/driver/dbstore/config/sqlite_schema.h b/src/rgw/driver/dbstore/config/sqlite_schema.h new file mode 100644 index 000000000..c8a8fce3e --- /dev/null +++ b/src/rgw/driver/dbstore/config/sqlite_schema.h @@ -0,0 +1,299 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include + +namespace rgw::dbstore::config::schema { + +struct Migration { + // human-readable description to help with debugging migration errors + const char* description = nullptr; + // series of sql statements to apply the schema migration + const char* up = nullptr; + // series of sql statements to undo the schema migration + const char* down = nullptr; +}; + +static constexpr std::initializer_list migrations {{ + .description = "create the initial ConfigStore tables", + .up = R"( +CREATE TABLE IF NOT EXISTS Realms ( + ID TEXT PRIMARY KEY NOT NULL, + Name TEXT UNIQUE NOT NULL, + CurrentPeriod TEXT, + Epoch INTEGER DEFAULT 0, + VersionNumber INTEGER, + VersionTag TEXT +); +CREATE TABLE IF NOT EXISTS Periods ( + ID TEXT NOT NULL, + Epoch INTEGER DEFAULT 0, + RealmID TEXT NOT NULL REFERENCES Realms (ID), + Data TEXT NOT NULL, + PRIMARY KEY (ID, Epoch) +); +CREATE TABLE IF NOT EXISTS PeriodConfigs ( + RealmID TEXT PRIMARY KEY NOT NULL REFERENCES Realms (ID), + Data TEXT NOT NULL +); +CREATE TABLE IF NOT EXISTS ZoneGroups ( + ID TEXT PRIMARY KEY NOT NULL, + Name TEXT UNIQUE NOT NULL, + RealmID TEXT NOT NULL REFERENCES Realms (ID), + Data TEXT NOT NULL, + VersionNumber INTEGER, + VersionTag TEXT +); +CREATE TABLE IF NOT EXISTS Zones ( + ID TEXT PRIMARY KEY NOT NULL, + Name TEXT UNIQUE NOT NULL, + RealmID TEXT NOT NULL REFERENCES Realms (ID), + Data TEXT NOT NULL, + VersionNumber INTEGER, + VersionTag TEXT +); +CREATE TABLE IF NOT EXISTS DefaultRealms ( + ID TEXT, + Empty TEXT PRIMARY KEY +); +CREATE TABLE IF NOT EXISTS DefaultZoneGroups ( + ID TEXT, + RealmID TEXT PRIMARY KEY REFERENCES Realms (ID) +); +CREATE TABLE IF NOT EXISTS DefaultZones ( + ID TEXT, + RealmID TEXT PRIMARY KEY REFERENCES Realms (ID) +); +)", + .down = R"( +DROP TABLE IF EXISTS Realms; +DROP TABLE IF EXISTS Periods; +DROP TABLE IF EXISTS PeriodConfigs; +DROP TABLE IF EXISTS ZoneGroups; +DROP TABLE IF EXISTS Zones; +DROP TABLE IF EXISTS DefaultRealms; +DROP TABLE IF 
EXISTS DefaultZoneGroups; +DROP TABLE IF EXISTS DefaultZones; +)" + } +}; + + +// DefaultRealms + +static constexpr const char* default_realm_insert1 = +"INSERT INTO DefaultRealms (ID, Empty) VALUES ({}, '')"; + +static constexpr const char* default_realm_upsert1 = +R"(INSERT INTO DefaultRealms (ID, Empty) VALUES ({0}, '') +ON CONFLICT(Empty) DO UPDATE SET ID = {0})"; + +static constexpr const char* default_realm_select0 = +"SELECT ID FROM DefaultRealms LIMIT 1"; + +static constexpr const char* default_realm_delete0 = +"DELETE FROM DefaultRealms"; + + +// Realms + +static constexpr const char* realm_update5 = +"UPDATE Realms SET CurrentPeriod = {1}, Epoch = {2}, VersionNumber = {3} + 1 \ +WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}"; + +static constexpr const char* realm_rename4 = +"UPDATE Realms SET Name = {1}, VersionNumber = {2} + 1 \ +WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}"; + +static constexpr const char* realm_delete3 = +"DELETE FROM Realms WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}"; + +static constexpr const char* realm_insert4 = +"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \ +VALUES ({}, {}, {}, {})"; + +static constexpr const char* realm_upsert4 = +"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \ +VALUES ({0}, {1}, {2}, {3}) \ +ON CONFLICT(ID) DO UPDATE SET Name = {1}, \ +VersionNumber = {2}, VersionTag = {3}"; + +static constexpr const char* realm_select_id1 = +"SELECT * FROM Realms WHERE ID = {} LIMIT 1"; + +static constexpr const char* realm_select_name1 = +"SELECT * FROM Realms WHERE Name = {} LIMIT 1"; + +static constexpr const char* realm_select_default0 = +"SELECT r.* FROM Realms r \ +INNER JOIN DefaultRealms d \ +ON d.ID = r.ID LIMIT 1"; + +static constexpr const char* realm_select_names2 = +"SELECT Name FROM Realms WHERE Name > {} \ +ORDER BY Name ASC LIMIT {}"; + + +// Periods + +static constexpr const char* period_insert4 = +"INSERT INTO Periods (ID, Epoch, RealmID, 
Data) \ +VALUES ({}, {}, {}, {})"; + +static constexpr const char* period_upsert4 = +"INSERT INTO Periods (ID, Epoch, RealmID, Data) \ +VALUES ({0}, {1}, {2}, {3}) \ +ON CONFLICT DO UPDATE SET RealmID = {2}, Data = {3}"; + +static constexpr const char* period_select_epoch2 = +"SELECT * FROM Periods WHERE ID = {} AND Epoch = {} LIMIT 1"; + +static constexpr const char* period_select_latest1 = +"SELECT * FROM Periods WHERE ID = {} ORDER BY Epoch DESC LIMIT 1"; + +static constexpr const char* period_delete1 = +"DELETE FROM Periods WHERE ID = {}"; + +static constexpr const char* period_select_ids2 = +"SELECT ID FROM Periods WHERE ID > {} ORDER BY ID ASC LIMIT {}"; + + +// DefaultZoneGroups + +static constexpr const char* default_zonegroup_insert2 = +"INSERT INTO DefaultZoneGroups (RealmID, ID) VALUES ({}, {})"; + +static constexpr const char* default_zonegroup_upsert2 = +"INSERT INTO DefaultZoneGroups (RealmID, ID) \ +VALUES ({0}, {1}) \ +ON CONFLICT(RealmID) DO UPDATE SET ID = {1}"; + +static constexpr const char* default_zonegroup_select1 = +"SELECT ID FROM DefaultZoneGroups WHERE RealmID = {}"; + +static constexpr const char* default_zonegroup_delete1 = +"DELETE FROM DefaultZoneGroups WHERE RealmID = {}"; + + +// ZoneGroups + +static constexpr const char* zonegroup_update5 = +"UPDATE ZoneGroups SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \ +WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}"; + +static constexpr const char* zonegroup_rename4 = +"UPDATE ZoneGroups SET Name = {1}, VersionNumber = {2} + 1 \ +WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}"; + +static constexpr const char* zonegroup_delete3 = +"DELETE FROM ZoneGroups WHERE ID = {} \ +AND VersionNumber = {} AND VersionTag = {}"; + +static constexpr const char* zonegroup_insert6 = +"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \ +VALUES ({}, {}, {}, {}, {}, {})"; + +static constexpr const char* zonegroup_upsert6 = +"INSERT INTO ZoneGroups (ID, 
Name, RealmID, Data, VersionNumber, VersionTag) \ +VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \ +ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \ +Data = {3}, VersionNumber = {4}, VersionTag = {5}"; + +static constexpr const char* zonegroup_select_id1 = +"SELECT * FROM ZoneGroups WHERE ID = {} LIMIT 1"; + +static constexpr const char* zonegroup_select_name1 = +"SELECT * FROM ZoneGroups WHERE Name = {} LIMIT 1"; + +static constexpr const char* zonegroup_select_default0 = +"SELECT z.* FROM ZoneGroups z \ +INNER JOIN DefaultZoneGroups d \ +ON d.ID = z.ID LIMIT 1"; + +static constexpr const char* zonegroup_select_names2 = +"SELECT Name FROM ZoneGroups WHERE Name > {} \ +ORDER BY Name ASC LIMIT {}"; + + +// DefaultZones + +static constexpr const char* default_zone_insert2 = +"INSERT INTO DefaultZones (RealmID, ID) VALUES ({}, {})"; + +static constexpr const char* default_zone_upsert2 = +"INSERT INTO DefaultZones (RealmID, ID) VALUES ({0}, {1}) \ +ON CONFLICT(RealmID) DO UPDATE SET ID = {1}"; + +static constexpr const char* default_zone_select1 = +"SELECT ID FROM DefaultZones WHERE RealmID = {}"; + +static constexpr const char* default_zone_delete1 = +"DELETE FROM DefaultZones WHERE RealmID = {}"; + + +// Zones + +static constexpr const char* zone_update5 = +"UPDATE Zones SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \ +WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}"; + +static constexpr const char* zone_rename4 = +"UPDATE Zones SET Name = {1}, VersionNumber = {2} + 1 \ +WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}"; + +static constexpr const char* zone_delete3 = +"DELETE FROM Zones WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}"; + +static constexpr const char* zone_insert6 = +"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \ +VALUES ({}, {}, {}, {}, {}, {})"; + +static constexpr const char* zone_upsert6 = +"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \ +VALUES ({0}, 
{1}, {2}, {3}, {4}, {5}) \ +ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \ +Data = {3}, VersionNumber = {4}, VersionTag = {5}"; + +static constexpr const char* zone_select_id1 = +"SELECT * FROM Zones WHERE ID = {} LIMIT 1"; + +static constexpr const char* zone_select_name1 = +"SELECT * FROM Zones WHERE Name = {} LIMIT 1"; + +static constexpr const char* zone_select_default0 = +"SELECT z.* FROM Zones z \ +INNER JOIN DefaultZones d \ +ON d.ID = z.ID LIMIT 1"; + +static constexpr const char* zone_select_names2 = +"SELECT Name FROM Zones WHERE Name > {} \ +ORDER BY Name ASC LIMIT {}"; + + +// PeriodConfigs + +static constexpr const char* period_config_insert2 = +"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({}, {})"; + +static constexpr const char* period_config_upsert2 = +"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({0}, {1}) \ +ON CONFLICT (RealmID) DO UPDATE SET Data = {1}"; + +static constexpr const char* period_config_select1 = +"SELECT Data FROM PeriodConfigs WHERE RealmID = {} LIMIT 1"; + +} // namespace rgw::dbstore::config::schema diff --git a/src/rgw/driver/dbstore/config/store.cc b/src/rgw/driver/dbstore/config/store.cc new file mode 100644 index 000000000..569a093b7 --- /dev/null +++ b/src/rgw/driver/dbstore/config/store.cc @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include + +#include + +#include "store.h" +#ifdef SQLITE_ENABLED +#include "sqlite.h" +#endif + +namespace rgw::dbstore { + +/* ConfigStore factory: picks the backend from the URI. When SQLITE_ENABLED is defined, a uri starting with "file:" is handed to config::create_sqlite_store(); every other uri, and every uri when SQLITE_ENABLED is not defined, throws std::runtime_error. */ auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri) + -> std::unique_ptr +{ +#ifdef SQLITE_ENABLED + if (uri.starts_with("file:")) { + return config::create_sqlite_store(dpp, uri); + } +#endif + throw std::runtime_error(fmt::format("unrecognized URI {}", uri)); +} + +} // namespace rgw::dbstore diff --git a/src/rgw/driver/dbstore/config/store.h b/src/rgw/driver/dbstore/config/store.h new file mode 100644 index 000000000..553d9f709 --- /dev/null +++ b/src/rgw/driver/dbstore/config/store.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include "rgw_sal_config.h" + +namespace rgw::dbstore { + +// ConfigStore factory +auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri) + -> std::unique_ptr; + +} // namespace rgw::dbstore diff --git a/src/rgw/driver/dbstore/dbstore_main.cc b/src/rgw/driver/dbstore/dbstore_main.cc new file mode 100644 index 000000000..4fff38ced --- /dev/null +++ b/src/rgw/driver/dbstore/dbstore_main.cc @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include + +#include "dbstore_mgr.h" +#include +#include + +using namespace std; +using namespace rgw::store; +using DB = rgw::store::DB; + +struct thr_args { + DB *dbs; + int thr_id; +}; + +void* process(void *arg) +{ + struct thr_args *t_args = (struct thr_args*)arg; + + DB *db = t_args->dbs; + int thr_id = t_args->thr_id; + int ret = -1; + + cout<<"Entered thread:"<get_def_dpp(); + + db->InitializeParams(dpp, ¶ms); + + params.op.user.uinfo.display_name = user1; + params.op.user.uinfo.user_id.tenant = "tenant"; + params.op.user.uinfo.user_id.id = user1; + params.op.user.uinfo.suspended = 123; + params.op.user.uinfo.max_buckets = 456; + params.op.user.uinfo.placement_tags.push_back("tags1"); + params.op.user.uinfo.placement_tags.push_back("tags2"); + + RGWAccessKey k1("id1", "key1"); + RGWAccessKey k2("id2", "key2"); + params.op.user.uinfo.access_keys.insert(make_pair("key1", k1)); + params.op.user.uinfo.access_keys.insert(make_pair("key2", k2)); + + ret = db->ProcessOp(dpp, "InsertUser", ¶ms); + cout << "InsertUser return value: " << ret << "\n"; + + DBOpParams params2 = {}; + params.op.user.uinfo.user_id.tenant = "tenant2"; + + db->InitializeParams(dpp, ¶ms2); + params2.op.user.uinfo.display_name = user1; + ret = db->ProcessOp(dpp, "GetUser", ¶ms2); + + cout << "GetUser return value: " << ret << "\n"; + + cout << "tenant: " << params2.op.user.uinfo.user_id.tenant << "\n"; + cout << "suspended: " << (int)params2.op.user.uinfo.suspended << "\n"; + + 
list::iterator it = params2.op.user.uinfo.placement_tags.begin(); + + while (it != params2.op.user.uinfo.placement_tags.end()) { + cout << "list = " << *it << "\n"; + it++; + } + + map::iterator it2 = params2.op.user.uinfo.access_keys.begin(); + + while (it2 != params2.op.user.uinfo.access_keys.end()) { + cout << "keys = " << it2->first << "\n"; + RGWAccessKey k = it2->second; + cout << "id = " << k.id << ", keys = " << k.key << "\n"; + it2++; + } + + params.op.bucket.info.bucket.name = bucketa; + db->ProcessOp(dpp, "InsertBucket", ¶ms); + + params.op.user.uinfo.display_name = user2; + params.op.user.uinfo.user_id.id = user2; + db->ProcessOp(dpp, "InsertUser", ¶ms); + + params.op.bucket.info.bucket.name = bucketb; + db->ProcessOp(dpp, "InsertBucket", ¶ms); + + db->ProcessOp(dpp, "GetUser", ¶ms); + db->ProcessOp(dpp, "GetBucket", ¶ms); + + db->ListAllUsers(dpp, ¶ms); + db->ListAllBuckets(dpp, ¶ms); + + params.op.bucket.info.bucket.name = bucketb; + + db->ProcessOp(dpp, "RemoveBucket", ¶ms); + + params.op.user.uinfo.user_id.id = user2; + db->ProcessOp(dpp, "RemoveUser", ¶ms); + + db->ListAllUsers(dpp, ¶ms); + db->ListAllBuckets(dpp, ¶ms); + cout<<"Exiting thread:"< args; + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, CINIT_FLAG_NO_MON_CONFIG, 1); + dbsm = new DBStoreManager(cct.get(), logfile, loglevel); + dbs = dbsm->getDB(tenant, true); + + cout<<"No. 
of threads being created = "<destroyAllHandles(); + + return 0; +} diff --git a/src/rgw/driver/dbstore/dbstore_mgr.cc b/src/rgw/driver/dbstore/dbstore_mgr.cc new file mode 100644 index 000000000..6835f526b --- /dev/null +++ b/src/rgw/driver/dbstore/dbstore_mgr.cc @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "dbstore_mgr.h" +#include "common/dbstore_log.h" + +#include + +static constexpr auto dout_subsys = ceph_subsys_rgw; + +using namespace std; + + +/* Given a tenant, find and return the DBStore handle. + * If not found and 'create' set to true, create one + * and return + */ +DB *DBStoreManager::getDB (string tenant, bool create) +{ + map::iterator iter; + DB *dbs = nullptr; + pair::iterator,bool> ret; + + if (tenant.empty()) + return default_db; + + if (DBStoreHandles.empty()) + goto not_found; + + iter = DBStoreHandles.find(tenant); + + if (iter != DBStoreHandles.end()) + return iter->second; + +not_found: + if (!create) + return nullptr; + + dbs = createDB(tenant); + + return dbs; +} + +/* Create DBStore instance */ +DB *DBStoreManager::createDB(std::string tenant) { + DB *dbs = nullptr; + pair::iterator,bool> ret; + const auto& db_path = g_conf().get_val("dbstore_db_dir"); + const auto& db_name = g_conf().get_val("dbstore_db_name_prefix") + "-" + tenant; + + auto db_full_path = std::filesystem::path(db_path) / db_name; + ldout(cct, 0) << "DB initialization full db_path("<Initialize("", -1) < 0) { + ldout(cct, 0) << "DB initialization failed for tenant("<(tenant, dbs)); + + /* + * Its safe to check for already existing entry (just + * incase other thread raced and created the entry) + */ + if (ret.second == false) { + /* Entry already created by another thread */ + delete dbs; + + dbs = ret.first->second; + } + + return dbs; +} + +void DBStoreManager::deleteDB(string tenant) { + map::iterator iter; + DB *dbs = nullptr; + + if (tenant.empty() || DBStoreHandles.empty()) + 
return; + + /* XXX: Check if we need to perform this operation under a lock */ + iter = DBStoreHandles.find(tenant); + + if (iter == DBStoreHandles.end()) + return; + + dbs = iter->second; + + DBStoreHandles.erase(iter); + dbs->Destroy(dbs->get_def_dpp()); + delete dbs; + + return; +} + +/* Delete the handle registered for the given DB. A null pointer is ignored. + * NOTE(review): DBStoreHandles is looked up by tenant elsewhere in this file; + * confirm that getDBname() returns that same key, otherwise the lookup in + * deleteDB(string) never matches and nothing is deleted. + */ +void DBStoreManager::deleteDB(DB *dbs) { + if (!dbs) + return; + + (void)deleteDB(dbs->getDBname()); +} + + +/* Destroy and free every DB handle held in DBStoreHandles, then empty the map. + * Calling it again afterwards is a no-op because the map is empty. + */ +void DBStoreManager::destroyAllHandles(){ + for (auto& handle : DBStoreHandles) { + DB *dbs = handle.second; + dbs->Destroy(dbs->get_def_dpp()); + delete dbs; + } + + DBStoreHandles.clear(); + + /* default_db comes from createDB(), which registers it in DBStoreHandles, + * so it was deleted by the loop above. Reset it so that getDB() does not + * hand out a dangling pointer. + */ + default_db = nullptr; +} + + diff --git a/src/rgw/driver/dbstore/dbstore_mgr.h b/src/rgw/driver/dbstore/dbstore_mgr.h new file mode 100644 index 000000000..77fc3aaf7 --- /dev/null +++ b/src/rgw/driver/dbstore/dbstore_mgr.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "common/ceph_context.h" +#include "common/dbstore.h" +#include "sqlite/sqliteDB.h" + +using namespace rgw::store; +using DB = rgw::store::DB; + +/* XXX: Should be a dbstore config option */ +const static std::string default_tenant = "default_ns"; + +class DBStoreManager { +private: + std::map DBStoreHandles; + DB *default_db = nullptr; + CephContext *cct; + +public: + DBStoreManager(CephContext *_cct): DBStoreHandles() { + cct = _cct; + default_db = createDB(default_tenant); + }; + DBStoreManager(CephContext *_cct, std::string logfile, int loglevel): DBStoreHandles() { + /* No ceph context. 
Create one with log args provided */ + cct = _cct; + cct->_log->set_log_file(logfile); + cct->_log->reopen_log_file(); + cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel); + default_db = createDB(default_tenant); + }; + ~DBStoreManager() { destroyAllHandles(); }; + + /* XXX: TBD based on testing + * 1) Lock to protect DBStoreHandles map. + * 2) Refcount of each DBStore to protect from + * being deleted while using it. + */ + DB* getDB () { return default_db; }; + DB* getDB (std::string tenant, bool create); + DB* createDB (std::string tenant); + void deleteDB (std::string tenant); + void deleteDB (DB* db); + void destroyAllHandles(); +}; diff --git a/src/rgw/driver/dbstore/sqlite/CMakeLists.txt b/src/rgw/driver/dbstore/sqlite/CMakeLists.txt new file mode 100644 index 000000000..909765e30 --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.14.0) +project(sqlite_db) + +find_package(SQLite3 REQUIRED) + +set(sqlite_db_srcs + sqliteDB.h + sqliteDB.cc) + +include_directories(${CMAKE_INCLUDE_DIR}) + +set(SQLITE_COMPILE_FLAGS "-DSQLITE_THREADSAFE=1") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SQLITE_COMPILE_FLAGS}") + +add_library(sqlite_db STATIC ${sqlite_db_srcs}) +target_link_libraries(sqlite_db sqlite3 dbstore_lib rgw_common) diff --git a/src/rgw/driver/dbstore/sqlite/connection.cc b/src/rgw/driver/dbstore/sqlite/connection.cc new file mode 100644 index 000000000..143a3a0d5 --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/connection.cc @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include "common/dout.h"
+#include "connection.h"
+#include "error.h"
+
+namespace rgw::dbstore::sqlite {
+// Open `filename` via sqlite3_open_v2() with `flags`; throws std::system_error on failure.
+db_ptr open_database(const char* filename, int flags)
+{
+  sqlite3* raw = nullptr;
+  const int result = ::sqlite3_open_v2(filename, &raw, flags, nullptr);
+  db_ptr db{raw}; // take ownership before checking: a failed open can still hand back a handle that must be closed
+  if (result != SQLITE_OK) {
+    throw std::system_error(result, sqlite::error_category());
+  }
+  (void) ::sqlite3_extended_result_codes(db.get(), 1); // request extended result codes
+  return db;
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/connection.h b/src/rgw/driver/dbstore/sqlite/connection.h
new file mode 100644
index 000000000..6088763fd
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/connection.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#pragma once + +#include +#include + +#include + +#include "sqlite/statement.h" + +class DoutPrefixProvider; + +namespace rgw::dbstore::sqlite { + +// owning sqlite3 pointer +struct db_deleter { + void operator()(sqlite3* p) const { ::sqlite3_close(p); } +}; +using db_ptr = std::unique_ptr; + + +// open the database file or throw on error +db_ptr open_database(const char* filename, int flags); + + +struct Connection { + db_ptr db; + // map of statements, prepared on first use + std::map statements; + + explicit Connection(db_ptr db) : db(std::move(db)) {} +}; + +// sqlite connection factory for ConnectionPool +class ConnectionFactory { + std::string uri; + int flags; + public: + ConnectionFactory(std::string uri, int flags) + : uri(std::move(uri)), flags(flags) {} + + auto operator()(const DoutPrefixProvider* dpp) + -> std::unique_ptr + { + auto db = open_database(uri.c_str(), flags); + return std::make_unique(std::move(db)); + } +}; + +} // namespace rgw::dbstore::sqlite diff --git a/src/rgw/driver/dbstore/sqlite/error.cc b/src/rgw/driver/dbstore/sqlite/error.cc new file mode 100644 index 000000000..5fe9eb0ae --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/error.cc @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include "error.h"
+
+namespace rgw::dbstore::sqlite {
+// Returns the single error category instance used for sqlite result codes.
+const std::error_category& error_category()
+{
+  struct category : std::error_category {
+    const char* name() const noexcept override {
+      return "dbstore:sqlite";
+    }
+    std::string message(int ev) const override {
+      return ::sqlite3_errstr(ev);
+    }
+    std::error_condition default_error_condition(int code) const noexcept override {
+      return {code & 0xFF, *this}; // low 8 bits are the primary code; bind to *this, not a temporary category(), since the condition keeps a reference to its category
+    }
+  };
+  static category instance;
+  return instance;
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/error.h b/src/rgw/driver/dbstore/sqlite/error.h
new file mode 100644
index 000000000..15396d8ca
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/error.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <system_error>
+#include <sqlite3.h>
+
+namespace rgw::dbstore::sqlite {
+
+// error category for sqlite extended result codes:
+// https://www.sqlite.org/rescode.html
+const std::error_category& error_category();
+
+
+// sqlite exception type that carries the extended error code and message
+class error : public std::runtime_error {
+  std::error_code ec;
+ public:
+  error(const char* errmsg, std::error_code ec)
+    : runtime_error(errmsg), ec(ec) {}
+  error(sqlite3* db, std::error_code ec) : error(::sqlite3_errmsg(db), ec) {}
+  error(sqlite3* db, int result) : error(db, {result, error_category()}) {}
+  error(sqlite3* db) : error(db, ::sqlite3_extended_errcode(db)) {}
+  std::error_code code() const { return ec; }
+};
+
+
+// sqlite error conditions for primary and extended result codes
+//
+// 'primary' error_conditions will match 'primary' error_codes as well as any
+// 'extended' error_codes whose lowest 8 bits match that primary code. for
+// example, the error_condition for SQLITE_CONSTRAINT will match the error_codes
+// SQLITE_CONSTRAINT and SQLITE_CONSTRAINT_*
+enum class errc {
+  // primary result codes
+  ok = SQLITE_OK,
+  busy = SQLITE_BUSY,
+  constraint = SQLITE_CONSTRAINT,
+  row = SQLITE_ROW,
+  done = SQLITE_DONE,
+
+  // extended result codes
+  primary_key_constraint = SQLITE_CONSTRAINT_PRIMARYKEY,
+  foreign_key_constraint = SQLITE_CONSTRAINT_FOREIGNKEY,
+  unique_constraint = SQLITE_CONSTRAINT_UNIQUE,
+
+  // ..add conditions as needed
+};
+
+inline std::error_code make_error_code(errc e)
+{
+  return {static_cast<int>(e), error_category()};
+}
+
+inline std::error_condition make_error_condition(errc e)
+{
+  return {static_cast<int>(e), error_category()};
+}
+
+} // namespace rgw::dbstore::sqlite
+
+namespace std {
+
+// enable implicit conversions from sqlite::errc to std::error_condition
+template<> struct is_error_condition_enum<
+    rgw::dbstore::sqlite::errc> : public true_type {};
+
+} // namespace std
diff --git 
a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc new file mode 100644 index 000000000..dc244c07b --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc @@ -0,0 +1,2996 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "sqliteDB.h" + +using namespace std; + +#define SQL_PREPARE(dpp, params, sdb, stmt, ret, Op) \ + do { \ + string schema; \ + schema = Schema(params); \ + sqlite3_prepare_v2 (*sdb, schema.c_str(), \ + -1, &stmt , NULL); \ + if (!stmt) { \ + ldpp_dout(dpp, 0) <<"failed to prepare statement " \ + <<"for Op("<(blob), blob_len); \ + \ + decode(param, b); \ + }while(0); + +#define SQL_EXECUTE(dpp, params, stmt, cbk, args...) \ + do{ \ + const std::lock_guard lk(((DBOp*)(this))->mtx); \ + if (!stmt) { \ + ret = Prepare(dpp, params); \ + } \ + \ + if (!stmt) { \ + ldpp_dout(dpp, 0) <<"No prepared statement "<< dendl; \ + goto out; \ + } \ + \ + ret = Bind(dpp, params); \ + if (ret) { \ + ldpp_dout(dpp, 0) <<"Bind parameters failed for stmt(" <op, stmt, cbk); \ + \ + Reset(dpp, stmt); \ + \ + if (ret) { \ + ldpp_dout(dpp, 0) <<"Execution failed for stmt(" <user_table.empty()) { + params->user_table = getUserTable(); + } + if (params->user_table.empty()) { + params->user_table = getUserTable(); + } + if (params->bucket_table.empty()) { + params->bucket_table = getBucketTable(); + } + if (params->quota_table.empty()) { + params->quota_table = getQuotaTable(); + } + if (params->lc_entry_table.empty()) { + params->lc_entry_table = getLCEntryTable(); + } + if (params->lc_head_table.empty()) { + params->lc_head_table = getLCHeadTable(); + } + + p_params.user_table = params->user_table; + p_params.bucket_table = params->bucket_table; + p_params.quota_table = params->quota_table; + p_params.lc_entry_table = params->lc_entry_table; + p_params.lc_head_table = params->lc_head_table; + + p_params.op.query_str = params->op.query_str; + + bucket = 
params->op.bucket.info.bucket.name; + + if (!bucket.empty()) { + if (params->object_table.empty()) { + params->object_table = getObjectTable(bucket); + } + if (params->objectdata_table.empty()) { + params->objectdata_table = getObjectDataTable(bucket); + } + if (params->object_view.empty()) { + params->object_view = getObjectView(bucket); + } + if (params->object_trigger.empty()) { + params->object_trigger = getObjectTrigger(bucket); + } + p_params.object_table = params->object_table; + p_params.objectdata_table = params->objectdata_table; + p_params.object_view = params->object_view; + } + + return 0; +} + +static int list_callback(void *None, int argc, char **argv, char **aname) +{ + int i; + for(i=0; i < argc; i++) { + string arg = argv[i] ? argv[i] : "NULL"; + cout<(&this->db, this->getDBname(), cct); + dbops.RemoveUser = make_shared(&this->db, this->getDBname(), cct); + dbops.GetUser = make_shared(&this->db, this->getDBname(), cct); + dbops.InsertBucket = make_shared(&this->db, this->getDBname(), cct); + dbops.UpdateBucket = make_shared(&this->db, this->getDBname(), cct); + dbops.RemoveBucket = make_shared(&this->db, this->getDBname(), cct); + dbops.GetBucket = make_shared(&this->db, this->getDBname(), cct); + dbops.ListUserBuckets = make_shared(&this->db, this->getDBname(), cct); + dbops.InsertLCEntry = make_shared(&this->db, this->getDBname(), cct); + dbops.RemoveLCEntry = make_shared(&this->db, this->getDBname(), cct); + dbops.GetLCEntry = make_shared(&this->db, this->getDBname(), cct); + dbops.ListLCEntries = make_shared(&this->db, this->getDBname(), cct); + dbops.InsertLCHead = make_shared(&this->db, this->getDBname(), cct); + dbops.RemoveLCHead = make_shared(&this->db, this->getDBname(), cct); + dbops.GetLCHead = make_shared(&this->db, this->getDBname(), cct); + + return 0; +} + +void *SQLiteDB::openDB(const DoutPrefixProvider *dpp) +{ + string dbname; + int rc = 0; + + dbname = getDBfile(); + if (dbname.empty()) { + ldpp_dout(dpp, 0)<<"dbname is NULL" 
<< dendl; + goto out; + } + + rc = sqlite3_open_v2(dbname.c_str(), (sqlite3**)&db, + SQLITE_OPEN_READWRITE | + SQLITE_OPEN_CREATE | + SQLITE_OPEN_FULLMUTEX, + NULL); + + if (rc) { + ldpp_dout(dpp, 0) <<"Cant open "<user_table); + + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeleteUserTable failed " << dendl; + + ldpp_dout(dpp, 20)<<"DeleteUserTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = DeleteTableSchema(params->bucket_table); + + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeletebucketTable failed " << dendl; + + ldpp_dout(dpp, 20)<<"DeletebucketTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = DeleteTableSchema(params->object_table); + + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeleteObjectTable failed " << dendl; + + ldpp_dout(dpp, 20)<<"DeleteObjectTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = DeleteTableSchema(params->objectdata_table); + + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeleteObjectDataTable failed " << dendl; + + ldpp_dout(dpp, 20)<<"DeleteObjectDataTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = DeleteTableSchema(params->quota_table); + + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeleteQuotaTable failed " << dendl; + + ldpp_dout(dpp, 20)<<"DeleteQuotaTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams 
*params) +{ + int ret = -1; + string schema; + + schema = DeleteTableSchema(params->lc_entry_table); + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeleteLCEntryTable failed " << dendl; + ldpp_dout(dpp, 20)<<"DeleteLCEntryTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = DeleteTableSchema(params->lc_head_table); + ret = exec(dpp, schema.c_str(), NULL); + if (ret) + ldpp_dout(dpp, 0)<<"DeleteLCHeadTable failed " << dendl; + ldpp_dout(dpp, 20)<<"DeleteLCHeadTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = ListTableSchema(params->user_table); + ret = exec(dpp, schema.c_str(), &list_callback); + if (ret) + ldpp_dout(dpp, 0)<<"GetUsertable failed " << dendl; + + ldpp_dout(dpp, 20)<<"GetUserTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + + schema = ListTableSchema(params->bucket_table); + + ret = exec(dpp, schema.c_str(), &list_callback); + if (ret) + ldpp_dout(dpp, 0)<<"Listbuckettable failed " << dendl; + + ldpp_dout(dpp, 20)<<"ListbucketTable suceeded " << dendl; + + return ret; +} + +int SQLiteDB::ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) +{ + int ret = -1; + string schema; + map::iterator iter; + map objectmap; + string bucket; + + objectmap = getObjectMap(); + + if (objectmap.empty()) + ldpp_dout(dpp, 20)<<"objectmap empty " << dendl; + + for (iter = objectmap.begin(); iter != objectmap.end(); ++iter) { + bucket = iter->first; + params->object_table = getObjectTable(bucket); + schema = ListTableSchema(params->object_table); + + ret = exec(dpp, schema.c_str(), &list_callback); + if (ret) + ldpp_dout(dpp, 0)<<"ListObjecttable failed " << dendl; + 
+ ldpp_dout(dpp, 20)<<"ListObjectTable suceeded " << dendl; + } + + return ret; +} + +int SQLObjectOp::InitializeObjectOps(string db_name, const DoutPrefixProvider *dpp) +{ + PutObject = make_shared(sdb, db_name, cct); + DeleteObject = make_shared(sdb, db_name, cct); + GetObject = make_shared(sdb, db_name, cct); + UpdateObject = make_shared(sdb, db_name, cct); + ListBucketObjects = make_shared(sdb, db_name, cct); + ListVersionedObjects = make_shared(sdb, db_name, cct); + PutObjectData = make_shared(sdb, db_name, cct); + UpdateObjectData = make_shared(sdb, db_name, cct); + GetObjectData = make_shared(sdb, db_name, cct); + DeleteObjectData = make_shared(sdb, db_name, cct); + DeleteStaleObjectData = make_shared(sdb, db_name, cct); + + return 0; +} + +int SQLInsertUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLInsertUser - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertUser"); +out: + return ret; +} + +int SQLInsertUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.tenant, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.tenant.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.ns, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.ns.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.display_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.display_name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_email, sdb); + 
SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_email.c_str(), sdb); + + if (!params->op.user.uinfo.access_keys.empty()) { + string access_key; + string key; + map::const_iterator it = + params->op.user.uinfo.access_keys.begin(); + const RGWAccessKey& k = it->second; + access_key = k.id; + key = k.key; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, access_key.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_secret, sdb); + SQL_BIND_TEXT(dpp, stmt, index, key.c_str(), sdb); + + } + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.access_keys, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.swift_keys, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.swift_keys, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.subusers, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.subusers, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.suspended, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.suspended, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.max_buckets, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.max_buckets, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.op_mask, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.op_mask, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_caps, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.caps, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.admin, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.admin, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.system, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.system, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_name, sdb); + SQL_BIND_TEXT(dpp, stmt, 
index, params->op.user.uinfo.default_placement.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_tags, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.placement_tags, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.bucket_quota, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.bucket_quota, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.temp_url_keys, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.temp_url_keys, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_quota, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.user_quota, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.type, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.type, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.mfa_ids, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.mfa_ids, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_attrs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.user_attrs, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.user.user_version.ver, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver_tag, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.user_version.tag.c_str(), sdb); + +out: + return rc; +} + +int SQLInsertUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLRemoveUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) 
{ + ldpp_dout(dpp, 0)<<"In SQLRemoveUser - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveUser"); +out: + return ret; +} + +int SQLRemoveUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + +out: + return rc; +} + +int SQLRemoveUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLGetUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLGetUser - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + if (params->op.query_str == "email") { + SQL_PREPARE(dpp, p_params, sdb, email_stmt, ret, "PrepareGetUser"); + } else if (params->op.query_str == "access_key") { + SQL_PREPARE(dpp, p_params, sdb, ak_stmt, ret, "PrepareGetUser"); + } else if (params->op.query_str == "user_id") { + SQL_PREPARE(dpp, p_params, sdb, userid_stmt, ret, "PrepareGetUser"); + } else { // by default by userid + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetUser"); + } +out: + return ret; +} + +int SQLGetUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.query_str == "email") { + SQL_BIND_INDEX(dpp, email_stmt, index, p_params.op.user.user_email, sdb); + SQL_BIND_TEXT(dpp, email_stmt, index, params->op.user.uinfo.user_email.c_str(), sdb); + } else if (params->op.query_str == "access_key") { + if (!params->op.user.uinfo.access_keys.empty()) { + string 
access_key; + map::const_iterator it = + params->op.user.uinfo.access_keys.begin(); + const RGWAccessKey& k = it->second; + access_key = k.id; + + SQL_BIND_INDEX(dpp, ak_stmt, index, p_params.op.user.access_keys_id, sdb); + SQL_BIND_TEXT(dpp, ak_stmt, index, access_key.c_str(), sdb); + } + } else if (params->op.query_str == "user_id") { + SQL_BIND_INDEX(dpp, userid_stmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, userid_stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + } else { // by default by userid + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + } + +out: + return rc; +} + +int SQLGetUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + if (params->op.query_str == "email") { + SQL_EXECUTE(dpp, params, email_stmt, list_user); + } else if (params->op.query_str == "access_key") { + SQL_EXECUTE(dpp, params, ak_stmt, list_user); + } else if (params->op.query_str == "user_id") { + SQL_EXECUTE(dpp, params, userid_stmt, list_user); + } else { // by default by userid + SQL_EXECUTE(dpp, params, stmt, list_user); + } + +out: + return ret; +} + +int SQLInsertBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLInsertBucket - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertBucket"); + +out: + return ret; +} + +int SQLInsertBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + // user_id here is copied as OwnerID in the bucket table. 
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.tenant, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.marker, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size_rounded, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size_rounded, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.creation_time, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.creation_time, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.count, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.count, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.flags, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.flags, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.zonegroup, sdb); + SQL_BIND_TEXT(dpp, stmt, index, 
params->op.bucket.info.zonegroup.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_instance_obj, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_instance_obj, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.quota, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.quota, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.requester_pays, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.requester_pays, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_website, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_website, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.website_conf, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.website_conf, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_versioning, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.swift_versioning, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_ver_location, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mdsearch_config, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.mdsearch_config, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.obj_lock, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.obj_lock, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.sync_policy, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_attrs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.bucket_attrs, sdb); + + 
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.bucket.bucket_version.ver, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver_tag, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.bucket_version.tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.mtime, sdb); + +out: + return rc; +} + +int SQLInsertBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + class SQLObjectOp *ObPtr = NULL; + string bucket_name = params->op.bucket.info.bucket.name; + struct DBOpPrepareParams p_params = PrepareParams; + + ObPtr = new SQLObjectOp(sdb, ctx()); + + objectmapInsert(dpp, bucket_name, ObPtr); + + SQL_EXECUTE(dpp, params, stmt, NULL); + + /* Once Bucket is inserted created corresponding object(&data) tables + */ + InitPrepareParams(dpp, p_params, params); + + (void)createObjectTable(dpp, params); + (void)createObjectDataTable(dpp, params); + (void)createObjectTableTrigger(dpp, params); +out: + return ret; +} + +int SQLUpdateBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLUpdateBucket - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + if (params->op.query_str == "attrs") { + SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateBucket"); + } else if (params->op.query_str == "owner") { + SQL_PREPARE(dpp, p_params, sdb, owner_stmt, ret, "PrepareUpdateBucket"); + } else if (params->op.query_str == "info") { + SQL_PREPARE(dpp, p_params, sdb, info_stmt, ret, "PrepareUpdateBucket"); + } else { + ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" << + params->op.query_str << "" << dendl; + goto out; + } + +out: + return ret; +} + +int SQLUpdateBucket::Bind(const 
DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + sqlite3_stmt** stmt = NULL; // Prepared statement + + /* All below fields for attrs */ + if (params->op.query_str == "attrs") { + stmt = &attrs_stmt; + } else if (params->op.query_str == "owner") { + stmt = &owner_stmt; + } else if (params->op.query_str == "info") { + stmt = &info_stmt; + } else { + ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" << + params->op.query_str << "" << dendl; + goto out; + } + + if (params->op.query_str == "attrs") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_attrs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.bucket_attrs, sdb); + } else if (params->op.query_str == "owner") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb); + } else if (params->op.query_str == "info") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.tenant, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.marker, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_id, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.count, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.ent.count, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, 
*stmt, index, p_params.op.bucket.placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.flags, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.flags, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.zonegroup, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_instance_obj, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_instance_obj, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.quota, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.quota, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.requester_pays, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.requester_pays, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_website, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_website, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.website_conf, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.website_conf, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_versioning, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.swift_versioning, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_ver_location, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mdsearch_config, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.mdsearch_config, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, 
p_params.op.bucket.obj_lock, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.obj_lock, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.sync_policy, sdb); + } + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_ver, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.bucket_version.ver, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.mtime, sdb); + +out: + return rc; +} + /* SQLUpdateBucket::Execute: picks the prepared statement from params->op.query_str ("attrs" uses attrs_stmt, "owner" uses owner_stmt, "info" uses info_stmt) and hands it to SQL_EXECUTE with a NULL callback argument. Any other query_str is logged at level 0 and the function returns ret unchanged from its initial -1. SQL_EXECUTE is a macro defined outside this view; it presumably assigns ret and may jump to the out label. */ +int SQLUpdateBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + sqlite3_stmt** stmt = NULL; // Prepared statement + + if (params->op.query_str == "attrs") { + stmt = &attrs_stmt; + } else if (params->op.query_str == "owner") { + stmt = &owner_stmt; + } else if (params->op.query_str == "info") { + stmt = &info_stmt; + } else { + ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" << + params->op.query_str << "" << dendl; + goto out; + } + + SQL_EXECUTE(dpp, params, *stmt, NULL); +out: + return ret; +} + /* SQLRemoveBucket::Prepare: logs and returns -1 when !*sdb. Otherwise it takes a local copy of PrepareParams, passes it through InitPrepareParams() and gives it to SQL_PREPARE together with stmt, ret and the label "PrepareRemoveBucket". Returns ret. */ +int SQLRemoveBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLRemoveBucket - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveBucket"); + +out: + return ret; +} + /* SQLRemoveBucket::Bind: the only value bound is the bucket name (params->op.bucket.info.bucket.name), via SQL_BIND_INDEX followed by SQL_BIND_TEXT. Returns rc, which starts at 0. */ +int SQLRemoveBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int 
rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + +out: + return rc; +} + /* SQLRemoveBucket::Execute: calls objectmapDelete() with the bucket name first, then runs stmt through SQL_EXECUTE with a NULL callback argument. Returns ret (initialised to -1; presumably updated by the macro). */ +int SQLRemoveBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + objectmapDelete(dpp, params->op.bucket.info.bucket.name); + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + /* SQLGetBucket::Prepare: logs and returns -1 when !*sdb; otherwise initialises a copy of PrepareParams and invokes SQL_PREPARE for stmt with the label "PrepareGetBucket". */ +int SQLGetBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLGetBucket - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetBucket"); + +out: + return ret; +} + /* SQLGetBucket::Bind: binds only the bucket name (SQL_BIND_INDEX then SQL_BIND_TEXT). Returns rc, which starts at 0. */ +int SQLGetBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + +out: + return rc; +} + /* SQLGetBucket::Execute: sets params->op.name to "GetBucket", allocates a SQLObjectOp(sdb, ctx()) with new and passes it to objectmapInsert() under the bucket name, then runs stmt through SQL_EXECUTE with list_bucket as the callback argument. NOTE(review): a new SQLObjectOp is allocated on every call and this function never deletes it; ownership after objectmapInsert() is not visible here - confirm it is released when an entry for the bucket already exists. */ +int SQLGetBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + class SQLObjectOp *ObPtr = NULL; + + params->op.name = "GetBucket"; + + ObPtr = new SQLObjectOp(sdb, ctx()); + + /* For the case when the server restarts, need to reinsert objectmap*/ + objectmapInsert(dpp, params->op.bucket.info.bucket.name, ObPtr); + SQL_EXECUTE(dpp, params, stmt, list_bucket); +out: + return ret; +} + /* SQLListUserBuckets::Prepare: logs and returns -1 when !*sdb. Otherwise prepares all_stmt when params->op.query_str is "all" and stmt for any other value; both calls pass the same label "PrepareListUserBuckets". */ +int SQLListUserBuckets::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLListUserBuckets - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + if (params->op.query_str == 
"all") { + SQL_PREPARE(dpp, p_params, sdb, all_stmt, ret, "PrepareListUserBuckets"); + }else { + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListUserBuckets"); + } + +out: + return ret; +} + /* SQLListUserBuckets::Bind: works on all_stmt for query_str "all" and on stmt otherwise. The user id is bound only when query_str is not "all"; min_marker and list_max_count are bound in both cases. Returns rc, which starts at 0. */ +int SQLListUserBuckets::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + sqlite3_stmt** pstmt = NULL; // Prepared statement + + if (params->op.query_str == "all") { + pstmt = &all_stmt; + } else { + pstmt = &stmt; + } + + if (params->op.query_str != "all") { + SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.user.user_id, sdb); + SQL_BIND_TEXT(dpp, *pstmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb); + } + + SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.bucket.min_marker, sdb); + SQL_BIND_TEXT(dpp, *pstmt, index, params->op.bucket.min_marker.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.list_max_count, sdb); + SQL_BIND_INT(dpp, *pstmt, index, params->op.list_max_count, sdb); + +out: + return rc; +} + /* SQLListUserBuckets::Execute: runs all_stmt for query_str "all" and stmt otherwise, passing list_bucket as the callback argument in both cases. Returns ret (initialised to -1). */ +int SQLListUserBuckets::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + if (params->op.query_str == "all") { + SQL_EXECUTE(dpp, params, all_stmt, list_bucket); + } else { + SQL_EXECUTE(dpp, params, stmt, list_bucket); + } +out: + return ret; +} + /* SQLPutObject::Prepare: logs and returns -1 when !*sdb; otherwise initialises a copy of PrepareParams and invokes SQL_PREPARE for stmt with the label "PreparePutObject". */ +int SQLPutObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLPutObject - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObject"); + +out: + return ret; +} + /* SQLPutObject::Bind: binds the object columns for the PutObject statement, one SQL_BIND_INDEX call followed by one SQL_BIND_TEXT / SQL_BIND_INT / SQL_ENCODE_BLOB_PARAM call per column. An empty params->op.obj.state.obj.key.instance is first overwritten with "null", which modifies the caller's params. Returns rc, which starts at 0. */ +int SQLPutObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + int VersionNum = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + 
params->op.obj.state.obj.key.instance = "null"; + } + /* SQLPutObject::Bind (continued). Object identity first: name, bucket name, instance and namespace, all bound as text. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb); + /* Fields read directly from params->op.obj; acls goes through SQL_ENCODE_BLOB_PARAM, the rest are bound as int or text. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.acls, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.acls, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_ver, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.index_ver, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tag, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.flags, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.flags, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.versioned_epoch, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.versioned_epoch, sdb); + /* category is cast to uint8_t before being bound as an int. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_category, sdb); + SQL_BIND_INT(dpp, stmt, index, (uint8_t)(params->op.obj.category), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.etag, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.etag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner_display_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner_display_name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.storage_class, sdb); 
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.appendable, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.appendable, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.content_type, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.content_type.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_hash_source, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb); + /* SQLPutObject::Bind (continued). The group from obj_size through has_data is read from params->op.obj.state; mtime, obj_tag and tail_tag go through SQL_ENCODE_BLOB_PARAM. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_size, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.size, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.accounted_size, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.accounted_size, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.epoch, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.epoch, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_tag, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.obj_tag, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_tag, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.tail_tag, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.write_tag, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.write_tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.fake_tag, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.fake_tag, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.shadow_obj, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.has_data, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.has_data, sdb); + + SQL_BIND_INDEX(dpp, stmt, 
index, p_params.op.obj.is_versioned, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_versioned, sdb); + /* NOTE(review): version_num is bound from the local VersionNum, which this function initialises to 0 and never assigns in its visible statements; params->op.obj.version_num is not used here. Confirm this is intended. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.version_num, sdb); + SQL_BIND_INT(dpp, stmt, index, VersionNum, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.pg_ver, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.pg_ver, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.zone_short_id, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.zone_short_id, sdb); + /* obj_version and obj_version_tag come from state.objv_tracker.read_version (ver and tag). */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version_tag, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_attrs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.attrset, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_size, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.head_size, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.max_head_size, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.max_head_size, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_instance.c_str(), sdb); + /* Head placement rule: name and storage class, both bound as text. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_rule_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, 
p_params.op.obj.tail_placement_rule_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb); + /* Remaining columns: objs, rules, omap, mp_parts and head_data are passed through SQL_ENCODE_BLOB_PARAM; is_multipart is bound as an int. */ + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_objs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.objs, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_rules, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.rules, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.omap, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.omap, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_multipart, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_multipart, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mp_parts, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.mp_parts, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_data, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.head_data, sdb); + +out: + return rc; +} + /* SQLPutObject::Execute: runs stmt through SQL_EXECUTE with a NULL callback argument and returns ret (initialised to -1; presumably updated by the macro). */ +int SQLPutObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + /* SQLDeleteObject::Prepare: logs and returns -1 when !*sdb; otherwise initialises a copy of PrepareParams and invokes SQL_PREPARE for stmt with the label "PrepareDeleteObject". */ +int SQLDeleteObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLDeleteObject - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObject"); + +out: + return ret; +} + /* SQLDeleteObject::Bind: replaces an empty object instance with "null" (modifying the caller's params), then binds bucket name, object name and object instance as text. Returns rc, which starts at 0. */ +int SQLDeleteObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if 
(params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); +out: + return rc; +} + /* SQLDeleteObject::Execute: runs stmt through SQL_EXECUTE with a NULL callback argument and returns ret (initialised to -1; presumably updated by the macro). */ +int SQLDeleteObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + /* SQLGetObject::Prepare: logs and returns -1 when !*sdb; otherwise initialises a copy of PrepareParams and invokes SQL_PREPARE for stmt with the label "PrepareGetObject". */ +int SQLGetObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLGetObject - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObject"); + +out: + return ret; +} + /* SQLGetObject::Bind: replaces an empty object instance with "null" (modifying the caller's params), then binds bucket name, object name and object instance as text. Returns rc, which starts at 0. */ +int SQLGetObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + +out: + return rc; +} + /* SQLGetObject::Execute: runs stmt through SQL_EXECUTE with list_object as the callback argument and returns ret (initialised to -1). */ +int SQLGetObject::Execute(const DoutPrefixProvider *dpp, 
struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, list_object); +out: + return ret; +} + +int SQLUpdateObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + struct DBOpParams copy = *params; + string bucket_name; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLUpdateObject - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + if (params->op.query_str == "omap") { + SQL_PREPARE(dpp, p_params, sdb, omap_stmt, ret, "PrepareUpdateObject"); + } else if (params->op.query_str == "attrs") { + SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateObject"); + } else if (params->op.query_str == "meta") { + SQL_PREPARE(dpp, p_params, sdb, meta_stmt, ret, "PrepareUpdateObject"); + } else if (params->op.query_str == "mp") { + SQL_PREPARE(dpp, p_params, sdb, mp_stmt, ret, "PrepareUpdateObject"); + } else { + ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" << + params->op.query_str << dendl; + goto out; + } + +out: + return ret; +} + +int SQLUpdateObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + sqlite3_stmt** stmt = NULL; // Prepared statement + + /* All below fields for attrs */ + if (params->op.query_str == "omap") { + stmt = &omap_stmt; + } else if (params->op.query_str == "attrs") { + stmt = &attrs_stmt; + } else if (params->op.query_str == "meta") { + stmt = &meta_stmt; + } else if (params->op.query_str == "mp") { + stmt = &mp_stmt; + } else { + ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" << + params->op.query_str << dendl; + goto out; + } + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, 
params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.mtime, sdb); + /* SQLUpdateObject::Bind (continued). Everything above is bound for every query_str. The four independent if-blocks below add the query-specific columns: "omap" adds omap, "attrs" adds obj_attrs, "mp" adds mp_parts, and "meta" adds the long list of metadata columns. */ + if (params->op.query_str == "omap") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb); + } + if (params->op.query_str == "attrs") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb); + } + if (params->op.query_str == "mp") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb); + } + if (params->op.query_str == "meta") { + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_ns, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.acls, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.acls, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_ver, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.index_ver, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tag, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.flags, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.flags, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.versioned_epoch, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.versioned_epoch, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, 
p_params.op.obj.obj_category, sdb); + SQL_BIND_INT(dpp, *stmt, index, (uint8_t)(params->op.obj.category), sdb); + /* SQLUpdateObject::Bind, still inside the query_str == "meta" branch: etag, owner and other descriptive fields, bound as text or int. */ + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.etag, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.etag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner_display_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner_display_name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.storage_class, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.appendable, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.appendable, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.content_type, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.content_type.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_hash_source, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb); + /* The following values are read from params->op.obj.state. mtime is not bound again in this branch. */ + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_size, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.size, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.accounted_size, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.accounted_size, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.epoch, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.epoch, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_tag, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.obj_tag, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_tag, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.tail_tag, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.write_tag, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, 
params->op.obj.state.write_tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.fake_tag, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.fake_tag, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.shadow_obj, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.has_data, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.has_data, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_versioned, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_versioned, sdb); + /* SQLUpdateObject::Bind, "meta" branch: here version_num is bound from params->op.obj.version_num. */ + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.version_num, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.version_num, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.pg_ver, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.pg_ver, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.zone_short_id, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.zone_short_id, sdb); + /* obj_version and obj_version_tag come from state.objv_tracker.read_version (ver and tag). */ + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version_tag, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_size, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.head_size, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.max_head_size, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.max_head_size, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_id, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.obj_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, 
index, p_params.op.obj.tail_instance, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_instance.c_str(), sdb); + /* SQLUpdateObject::Bind, "meta" branch: head and tail placement rule name / storage class, bound as text. */ + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_rule_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_rule_name, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_storage_class, sdb); + SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb); + /* objs, rules, omap, mp_parts and head_data are passed through SQL_ENCODE_BLOB_PARAM; is_multipart is bound as an int. */ + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_objs, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.objs, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_rules, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.rules, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_multipart, sdb); + SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_multipart, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb); + + SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_data, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.head_data, sdb); + } + +out: + return rc; +} + /* SQLUpdateObject::Execute: selects omap_stmt, attrs_stmt, meta_stmt or mp_stmt from params->op.query_str ("omap", "attrs", "meta", "mp") and runs it through SQL_EXECUTE with a NULL callback argument. Any other query_str is logged at level 0 and the initial ret of -1 is returned. */ +int SQLUpdateObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + sqlite3_stmt** stmt = NULL; // Prepared statement + + if (params->op.query_str == "omap") { + stmt = &omap_stmt; + } 
else if (params->op.query_str == "attrs") { + stmt = &attrs_stmt; + } else if (params->op.query_str == "meta") { + stmt = &meta_stmt; + } else if (params->op.query_str == "mp") { + stmt = &mp_stmt; + } else { + ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" << + params->op.query_str << dendl; + goto out; + } + + SQL_EXECUTE(dpp, params, *stmt, NULL); +out: + return ret; +} + /* SQLListBucketObjects::Prepare: logs and returns -1 when !*sdb; otherwise initialises a copy of PrepareParams and invokes SQL_PREPARE for stmt with the label "PrepareListBucketObjects". */ +int SQLListBucketObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLListBucketObjects - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListBucketObjects"); + +out: + return ret; +} + /* SQLListBucketObjects::Bind: binds bucket name, min_marker and prefix as text and list_max_count as an int. An empty object instance is overwritten with "null" in the caller's params even though no obj_instance value is bound in this function. Returns rc, which starts at 0. */ +int SQLListBucketObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.min_marker, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.min_marker.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.prefix, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.prefix.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb); + +out: + return rc; +} + /* SQLListBucketObjects::Execute: runs stmt through SQL_EXECUTE with list_object as the callback argument and returns ret (initialised to -1). */ +int SQLListBucketObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, list_object); +out: + return ret; +} + /* SQLListVersionedObjects::Prepare: logs and returns -1 when !*sdb; otherwise initialises a copy of PrepareParams and invokes SQL_PREPARE for stmt with the label "PrepareListVersionedObjects". */ +int SQLListVersionedObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + 
struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLListVersionedObjects - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListVersionedObjects"); + +out: + return ret; +} + +int SQLListVersionedObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb); + +out: + return rc; +} + +int SQLListVersionedObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, list_object); +out: + return ret; +} + +int SQLPutObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLPutObjectData - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObjectData"); + +out: + return ret; +} + +int SQLPutObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, 
p_params.op.obj.obj_name, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.part_num, sdb); + + SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.part_num, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.offset, sdb); + + SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.offset, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.data, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj_data.data, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.size, sdb); + + SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.size, sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.multipart_part_str, sdb); + + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj_data.multipart_part_str.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb); + +out: + return rc; +} + +int SQLPutObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLUpdateObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + 
ldpp_dout(dpp, 0)<<"In SQLUpdateObjectData - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareUpdateObjectData"); + +out: + return ret; +} + +int SQLUpdateObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb); + +out: + return rc; +} + +int SQLUpdateObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLGetObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLGetObjectData - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObjectData"); + +out: + return ret; +} + +int SQLGetObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct 
DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb); + +out: + return rc; +} + +int SQLGetObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, get_objectdata); +out: + return ret; +} + +int SQLDeleteObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLDeleteObjectData - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObjectData"); + +out: + return ret; +} + +int SQLDeleteObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + if (params->op.obj.state.obj.key.instance.empty()) { + params->op.obj.state.obj.key.instance = "null"; + } + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb); + + 
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb); + +out: + return rc; +} + +int SQLDeleteObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLDeleteStaleObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLDeleteStaleObjectData - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteStaleObjectData"); + +out: + return ret; +} + +int SQLDeleteStaleObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb); + +out: + return rc; +} + +int SQLDeleteStaleObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLInsertLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLInsertLCEntry - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCEntry"); + +out: + return ret; +} + +int SQLInsertLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct 
DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.status, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_status(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.start_time, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_start_time(), sdb); + +out: + return rc; +} + +int SQLInsertLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLRemoveLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLRemoveLCEntry - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCEntry"); + +out: + return ret; +} + +int SQLRemoveLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb); + +out: + return rc; +} + +int SQLRemoveLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + +int SQLGetLCEntry::Prepare(const 
DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + sqlite3_stmt** pstmt = NULL; // Prepared statement + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLGetLCEntry - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + if (params->op.query_str == "get_next_entry") { + pstmt = &next_stmt; + } else { + pstmt = &stmt; + } + SQL_PREPARE(dpp, p_params, sdb, *pstmt, ret, "PrepareGetLCEntry"); + +out: + return ret; +} + +int SQLGetLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + sqlite3_stmt** pstmt = NULL; // Prepared statement + + if (params->op.query_str == "get_next_entry") { + pstmt = &next_stmt; + } else { + pstmt = &stmt; + } + SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.index, sdb); + SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.index.c_str(), sdb); + + SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.bucket_name, sdb); + SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb); + +out: + return rc; +} + +int SQLGetLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + sqlite3_stmt** pstmt = NULL; // Prepared statement + + if (params->op.query_str == "get_next_entry") { + pstmt = &next_stmt; + } else { + pstmt = &stmt; + } + + SQL_EXECUTE(dpp, params, *pstmt, list_lc_entry); +out: + return ret; +} + +int SQLListLCEntries::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLListLCEntries - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListLCEntries"); + +out: + return ret; +} + +int SQLListLCEntries::Bind(const DoutPrefixProvider *dpp, struct 
DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.min_marker, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.min_marker.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb); + +out: + return rc; +} + +int SQLListLCEntries::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, list_lc_entry); +out: + return ret; +} + +int SQLInsertLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + struct DBOpPrepareParams p_params = PrepareParams; + + if (!*sdb) { + ldpp_dout(dpp, 0)<<"In SQLInsertLCHead - no db" << dendl; + goto out; + } + + InitPrepareParams(dpp, p_params, params); + + SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCHead"); + +out: + return ret; +} + +int SQLInsertLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int index = -1; + int rc = 0; + struct DBOpPrepareParams p_params = PrepareParams; + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.marker, sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.get_marker().c_str(), sdb); + + SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.start_date, sdb); + SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, static_cast(params->op.lc_head.head.start_date), sdb); + +out: + return rc; +} + +int SQLInsertLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params) +{ + int ret = -1; + + SQL_EXECUTE(dpp, params, stmt, NULL); +out: + return ret; +} + 
+// Prepare the statement used to remove a lifecycle (LC) head row.
+//
+// Returns -1 (after logging) when no database handle is available. Otherwise
+// copies the shared PrepareParams, lets InitPrepareParams() fill it in and
+// hands it to SQL_PREPARE under the name "PrepareRemoveLCHead".
+// NOTE(review): SQL_PREPARE is a macro defined outside this section; it
+// presumably assigns `ret` and may jump to `out` on failure - confirm.
+int SQLRemoveLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLRemoveLCHead - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCHead");
+
+out:
+  return ret;
+}
+
+// Bind the LC head index from `params` to the prepared remove statement.
+//
+// The placeholder is taken from p_params.op.lc_head.index and the value from
+// params->op.lc_head.index. Returns `rc`, which starts at 0.
+// NOTE(review): the SQL_BIND_* macros are defined elsewhere; they presumably
+// update `index`/`rc` and use the `out` label on error - confirm.
+int SQLRemoveLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+// Run the prepared remove statement; no row callback is supplied (NULL).
+// NOTE(review): `ret` starts at -1 and is presumably set by SQL_EXECUTE.
+int SQLRemoveLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+// Prepare the statement used to fetch a lifecycle (LC) head row.
+//
+// Same flow as the other Prepare() methods in this file: fail with -1 when
+// there is no database handle, otherwise initialise a copy of PrepareParams
+// and pass it to SQL_PREPARE under the name "PrepareGetLCHead".
+int SQLGetLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetLCHead - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetLCHead");
+
+out:
+  return ret;
+}
+
+// Bind the LC head index from `params` to the prepared get statement.
+// Returns `rc`, which starts at 0.
+int SQLGetLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+// Run the prepared get statement, passing list_lc_head as the row callback.
+//
+// params->op.lc_head.head is reset to a value-initialised object first, so
+// contents left over from before the call are not carried into the result.
+int SQLGetLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  // clear the params before fetching the entry
+  params->op.lc_head.head = {};
+  SQL_EXECUTE(dpp, params, stmt, list_lc_head);
+out:
+  return ret;
+}
diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.h b/src/rgw/driver/dbstore/sqlite/sqliteDB.h new
file mode 100644 index 000000000..ec0ef2bb2 --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/sqliteDB.h @@ -0,0 +1,551 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include "rgw/driver/dbstore/common/dbstore.h" + +using namespace rgw::store; + +class SQLiteDB : public DB, virtual public DBOp { + private: + sqlite3_mutex *mutex = NULL; + + protected: + CephContext *cct; + + public: + sqlite3_stmt *stmt = NULL; + DBOpPrepareParams PrepareParams; + + SQLiteDB(sqlite3 *dbi, std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) { + db = (void*)dbi; + } + SQLiteDB(std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) { + } + ~SQLiteDB() {} + + uint64_t get_blob_limit() override { return SQLITE_LIMIT_LENGTH; } + void *openDB(const DoutPrefixProvider *dpp) override; + int closeDB(const DoutPrefixProvider *dpp) override; + int InitializeDBOps(const DoutPrefixProvider *dpp) override; + + int InitPrepareParams(const DoutPrefixProvider *dpp, DBOpPrepareParams &p_params, + DBOpParams* params) override; + + int exec(const DoutPrefixProvider *dpp, const char *schema, + int (*callback)(void*,int,char**,char**)); + int Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt, + int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt)); + int Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt); + /* default value matches with sqliteDB style */ + + int createTables(const DoutPrefixProvider *dpp) override; + int createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params); + int 
createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params); + int createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params); + void populate_object_params(const DoutPrefixProvider *dpp, + struct DBOpPrepareParams& p_params, + struct DBOpParams* params, bool data); + + int createLCTables(const DoutPrefixProvider *dpp) override; + + int DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params); + int DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params); + + int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) override; + int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) override; + int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) override; +}; + +class SQLObjectOp : public ObjectOp { + private: + sqlite3 **sdb = NULL; + CephContext *cct; + + public: + SQLObjectOp(sqlite3 **sdbi, CephContext *_cct) : sdb(sdbi), cct(_cct) {}; + ~SQLObjectOp() {} + + int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp); +}; + +class SQLInsertUser : public SQLiteDB, public InsertUserOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLInsertUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLInsertUser() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class 
SQLRemoveUser : public SQLiteDB, public RemoveUserOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLRemoveUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLRemoveUser() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLGetUser : public SQLiteDB, public GetUserOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + sqlite3_stmt *email_stmt = NULL; // Prepared statement to query by useremail + sqlite3_stmt *ak_stmt = NULL; // Prepared statement to query by access_key_id + sqlite3_stmt *userid_stmt = NULL; // Prepared statement to query by user_id + + public: + SQLGetUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLGetUser() { + if (stmt) + sqlite3_finalize(stmt); + if (email_stmt) + sqlite3_finalize(email_stmt); + if (ak_stmt) + sqlite3_finalize(ak_stmt); + if (userid_stmt) + sqlite3_finalize(userid_stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLInsertBucket : public SQLiteDB, public InsertBucketOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLInsertBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLInsertBucket() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const 
DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLUpdateBucket : public SQLiteDB, public UpdateBucketOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *info_stmt = NULL; // Prepared statement + sqlite3_stmt *attrs_stmt = NULL; // Prepared statement + sqlite3_stmt *owner_stmt = NULL; // Prepared statement + + public: + SQLUpdateBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLUpdateBucket() { + if (info_stmt) + sqlite3_finalize(info_stmt); + if (attrs_stmt) + sqlite3_finalize(attrs_stmt); + if (owner_stmt) + sqlite3_finalize(owner_stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLRemoveBucket : public SQLiteDB, public RemoveBucketOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLRemoveBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLRemoveBucket() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLGetBucket : public SQLiteDB, public GetBucketOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLGetBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLGetBucket() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLListUserBuckets : public 
SQLiteDB, public ListUserBucketsOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + sqlite3_stmt *all_stmt = NULL; // Prepared statement + + public: + SQLListUserBuckets(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLListUserBuckets() { + if (stmt) + sqlite3_finalize(stmt); + if (all_stmt) + sqlite3_finalize(all_stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLPutObject : public SQLiteDB, public PutObjectOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLPutObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLPutObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLPutObject() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLDeleteObject : public SQLiteDB, public DeleteObjectOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLDeleteObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLDeleteObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLDeleteObject() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + 
+// SQLite-backed implementation of GetObjectOp.
+// Owns one prepared statement, which is finalized in the destructor.
+class SQLGetObject : public SQLiteDB, public GetObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLGetObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLGetObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLGetObject() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// SQLite-backed implementation of UpdateObjectOp.
+//
+// Holds one prepared statement per update flavour (omap, attrs, meta, mp).
+// The implementation picks the statement from params->op.query_str, so any
+// of the four may have been prepared by the time the object is destroyed.
+class SQLUpdateObject : public SQLiteDB, public UpdateObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *omap_stmt = NULL; // Prepared statement
+    sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
+    sqlite3_stmt *meta_stmt = NULL; // Prepared statement
+    sqlite3_stmt *mp_stmt = NULL; // Prepared statement
+
+  public:
+    SQLUpdateObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLUpdateObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    // Finalize every statement this op may have prepared. mp_stmt was
+    // previously left out here, which leaked it.
+    ~SQLUpdateObject() {
+      if (omap_stmt)
+        sqlite3_finalize(omap_stmt);
+      if (attrs_stmt)
+        sqlite3_finalize(attrs_stmt);
+      if (meta_stmt)
+        sqlite3_finalize(meta_stmt);
+      if (mp_stmt)
+        sqlite3_finalize(mp_stmt);
+    }
+
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLListBucketObjects : public SQLiteDB, public ListBucketObjectsOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLListBucketObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLListBucketObjects(sqlite3 **sdbi,
std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLListBucketObjects() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLListVersionedObjects : public SQLiteDB, public ListVersionedObjectsOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLListVersionedObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLListVersionedObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLListVersionedObjects() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLPutObjectData : public SQLiteDB, public PutObjectDataOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLPutObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLPutObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLPutObjectData() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLUpdateObjectData : public SQLiteDB, public UpdateObjectDataOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLUpdateObjectData(void **db, std::string 
db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLUpdateObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLUpdateObjectData() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLGetObjectData : public SQLiteDB, public GetObjectDataOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLGetObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLGetObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLGetObjectData() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLDeleteObjectData : public SQLiteDB, public DeleteObjectDataOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLDeleteObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLDeleteObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLDeleteObjectData() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLDeleteStaleObjectData : public SQLiteDB, public DeleteStaleObjectDataOp { + private: + sqlite3 **sdb = 
NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLDeleteStaleObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + SQLDeleteStaleObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {} + + ~SQLDeleteStaleObjectData() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLInsertLCEntry : public SQLiteDB, public InsertLCEntryOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLInsertLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLInsertLCEntry() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLRemoveLCEntry : public SQLiteDB, public RemoveLCEntryOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLRemoveLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLRemoveLCEntry() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLGetLCEntry : public SQLiteDB, public GetLCEntryOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + sqlite3_stmt *next_stmt = NULL; // Prepared statement + + public: + SQLGetLCEntry(void **db, 
std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLGetLCEntry() { + if (stmt) + sqlite3_finalize(stmt); + if (next_stmt) + sqlite3_finalize(next_stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLListLCEntries : public SQLiteDB, public ListLCEntriesOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLListLCEntries(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLListLCEntries() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLInsertLCHead : public SQLiteDB, public InsertLCHeadOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLInsertLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLInsertLCHead() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLRemoveLCHead : public SQLiteDB, public RemoveLCHeadOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLRemoveLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLRemoveLCHead() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int 
Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; + +class SQLGetLCHead : public SQLiteDB, public GetLCHeadOp { + private: + sqlite3 **sdb = NULL; + sqlite3_stmt *stmt = NULL; // Prepared statement + + public: + SQLGetLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {} + ~SQLGetLCHead() { + if (stmt) + sqlite3_finalize(stmt); + } + int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params); + int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); + int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); +}; diff --git a/src/rgw/driver/dbstore/sqlite/statement.cc b/src/rgw/driver/dbstore/sqlite/statement.cc new file mode 100644 index 000000000..dcf7dba9c --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/statement.cc @@ -0,0 +1,196 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/dout.h" +#include "error.h" +#include "statement.h" + +#define dout_subsys ceph_subsys_rgw_dbstore + +namespace rgw::dbstore::sqlite { + +// owning pointer to arbitrary memory allocated and returned by sqlite3 +struct sqlite_deleter { + template + void operator()(T* p) { ::sqlite3_free(p); } +}; +template +using sqlite_ptr = std::unique_ptr; + + +stmt_ptr prepare_statement(const DoutPrefixProvider* dpp, + sqlite3* db, std::string_view sql) +{ + sqlite3_stmt* stmt = nullptr; + int result = ::sqlite3_prepare_v2(db, sql.data(), sql.size(), &stmt, nullptr); + auto ec = std::error_code{result, sqlite::error_category()}; + if (ec != sqlite::errc::ok) { + const char* errmsg = ::sqlite3_errmsg(db); + ldpp_dout(dpp, 1) << "preparation failed: " << errmsg + << " (" << ec << ")\nstatement: " << sql << dendl; + throw sqlite::error(errmsg, ec); + } + return stmt_ptr{stmt}; +} + +static int bind_index(const DoutPrefixProvider* dpp, + const stmt_binding& stmt, const char* name) +{ + const int index = ::sqlite3_bind_parameter_index(stmt.get(), name); + if (index <= 0) { + ldpp_dout(dpp, 1) << "binding failed on parameter name=" + << name << dendl; + sqlite3* db = ::sqlite3_db_handle(stmt.get()); + throw sqlite::error(db); + } + return index; +} + +void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt, + const char* name, std::string_view value) +{ + const int index = bind_index(dpp, stmt, name); + + int result = ::sqlite3_bind_text(stmt.get(), index, value.data(), + value.size(), SQLITE_STATIC); + auto ec = std::error_code{result, sqlite::error_category()}; + if (ec != sqlite::errc::ok) { + ldpp_dout(dpp, 1) << "binding failed on parameter name=" + << name << " value=" << value << dendl; + sqlite3* db = ::sqlite3_db_handle(stmt.get()); + throw sqlite::error(db, ec); + } +} + +void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt, + const char* name, int value) +{ + const int index = bind_index(dpp, stmt, name); + + int 
result = ::sqlite3_bind_int(stmt.get(), index, value); + auto ec = std::error_code{result, sqlite::error_category()}; + if (ec != sqlite::errc::ok) { + ldpp_dout(dpp, 1) << "binding failed on parameter name=" + << name << " value=" << value << dendl; + sqlite3* db = ::sqlite3_db_handle(stmt.get()); + throw sqlite::error(db, ec); + } +} + +void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt) +{ + sqlite_ptr sql; + if (dpp->get_cct()->_conf->subsys.should_gather()) { + sql.reset(::sqlite3_expanded_sql(stmt.get())); + } + + const int result = ::sqlite3_step(stmt.get()); + auto ec = std::error_code{result, sqlite::error_category()}; + sqlite3* db = ::sqlite3_db_handle(stmt.get()); + + if (ec != sqlite::errc::done) { + const char* errmsg = ::sqlite3_errmsg(db); + ldpp_dout(dpp, 20) << "evaluation failed: " << errmsg + << " (" << ec << ")\nstatement: " << sql.get() << dendl; + throw sqlite::error(errmsg, ec); + } + ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl; +} + +void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt) +{ + sqlite_ptr sql; + if (dpp->get_cct()->_conf->subsys.should_gather()) { + sql.reset(::sqlite3_expanded_sql(stmt.get())); + } + + const int result = ::sqlite3_step(stmt.get()); + auto ec = std::error_code{result, sqlite::error_category()}; + if (ec != sqlite::errc::row) { + sqlite3* db = ::sqlite3_db_handle(stmt.get()); + const char* errmsg = ::sqlite3_errmsg(db); + ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec + << ")\nstatement: " << sql.get() << dendl; + throw sqlite::error(errmsg, ec); + } + ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl; +} + +int column_int(const stmt_execution& stmt, int column) +{ + return ::sqlite3_column_int(stmt.get(), column); +} + +std::string column_text(const stmt_execution& stmt, int column) +{ + const unsigned char* text = ::sqlite3_column_text(stmt.get(), column); + // may be NULL + if (text) { + const std::size_t size = 
::sqlite3_column_bytes(stmt.get(), column); + return {reinterpret_cast(text), size}; + } else { + return {}; + } +} + +auto read_text_rows(const DoutPrefixProvider* dpp, + const stmt_execution& stmt, + std::span entries) + -> std::span +{ + sqlite_ptr sql; + if (dpp->get_cct()->_conf->subsys.should_gather()) { + sql.reset(::sqlite3_expanded_sql(stmt.get())); + } + + std::size_t count = 0; + while (count < entries.size()) { + const int result = ::sqlite3_step(stmt.get()); + auto ec = std::error_code{result, sqlite::error_category()}; + if (ec == sqlite::errc::done) { + break; + } + if (ec != sqlite::errc::row) { + sqlite3* db = ::sqlite3_db_handle(stmt.get()); + const char* errmsg = ::sqlite3_errmsg(db); + ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec + << ")\nstatement: " << sql.get() << dendl; + throw sqlite::error(errmsg, ec); + } + entries[count] = column_text(stmt, 0); + ++count; + } + ldpp_dout(dpp, 20) << "statement evaluation produced " << count + << " results: " << sql.get() << dendl; + + return entries.first(count); +} + +void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query, + sqlite3_callback callback, void* arg) +{ + char* errmsg = nullptr; + const int result = ::sqlite3_exec(db, query, callback, arg, &errmsg); + auto ec = std::error_code{result, sqlite::error_category()}; + auto ptr = sqlite_ptr{errmsg}; // free on destruction + if (ec != sqlite::errc::ok) { + ldpp_dout(dpp, 1) << "query execution failed: " << errmsg << " (" << ec + << ")\nquery: " << query << dendl; + throw sqlite::error(errmsg, ec); + } + ldpp_dout(dpp, 20) << "query execution succeeded: " << query << dendl; +} + +} // namespace rgw::dbstore::sqlite diff --git a/src/rgw/driver/dbstore/sqlite/statement.h b/src/rgw/driver/dbstore/sqlite/statement.h new file mode 100644 index 000000000..98b4acfea --- /dev/null +++ b/src/rgw/driver/dbstore/sqlite/statement.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include + +#include + +class DoutPrefixProvider; + +namespace rgw::dbstore::sqlite { + +// owning sqlite3_stmt pointer +struct stmt_deleter { + void operator()(sqlite3_stmt* p) const { ::sqlite3_finalize(p); } +}; +using stmt_ptr = std::unique_ptr; + +// non-owning sqlite3_stmt pointer that clears binding state on destruction +struct stmt_binding_deleter { + void operator()(sqlite3_stmt* p) const { ::sqlite3_clear_bindings(p); } +}; +using stmt_binding = std::unique_ptr; + +// non-owning sqlite3_stmt pointer that clears execution state on destruction +struct stmt_execution_deleter { + void operator()(sqlite3_stmt* p) const { ::sqlite3_reset(p); } +}; +using stmt_execution = std::unique_ptr; + + +// prepare the sql statement or throw on error +stmt_ptr prepare_statement(const DoutPrefixProvider* dpp, + sqlite3* db, std::string_view sql); + +// bind an input string for the given parameter name +void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt, + const char* name, std::string_view value); + +// bind an input integer for the given parameter name +void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt, + const char* name, int value); + +// evaluate a prepared statement, expecting no result rows +void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt); + +// evaluate a prepared statement, expecting a single result row +void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt); + +// return the given column as an integer +int column_int(const stmt_execution& stmt, int column); + +// return the given column as text, or an 
empty string on NULL +std::string column_text(const stmt_execution& stmt, int column); + +// read the text column from each result row into the given entries, and return +// the sub-span of entries that contain results +auto read_text_rows(const DoutPrefixProvider* dpp, + const stmt_execution& stmt, + std::span entries) + -> std::span; + +// execute a raw query without preparing a statement. the optional callback +// can be used to read results +void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query, + sqlite3_callback callback, void* arg); + +} // namespace rgw::dbstore::sqlite diff --git a/src/rgw/driver/dbstore/tests/CMakeLists.txt b/src/rgw/driver/dbstore/tests/CMakeLists.txt new file mode 100644 index 000000000..4e60dcf5e --- /dev/null +++ b/src/rgw/driver/dbstore/tests/CMakeLists.txt @@ -0,0 +1,17 @@ +cmake_minimum_required(VERSION 3.14.0) +project(dbstore-tests) + +set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} gtest) + +set(dbstore_tests_srcs + dbstore_tests.cc) + +include_directories(${CMAKE_INCLUDE_DIR}) + +add_executable(unittest_dbstore_tests ${dbstore_tests_srcs}) +target_link_libraries(unittest_dbstore_tests ${CMAKE_LINK_LIBRARIES}) +add_ceph_unittest(unittest_dbstore_tests) + +add_executable(unittest_dbstore_mgr_tests dbstore_mgr_tests.cc) +target_link_libraries(unittest_dbstore_mgr_tests dbstore gtest_main) +add_ceph_unittest(unittest_dbstore_mgr_tests) diff --git a/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc new file mode 100644 index 000000000..02ecd9f15 --- /dev/null +++ b/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_context.h" +#include "rgw/driver/dbstore/dbstore_mgr.h" + +#include +#include +#include + +using namespace rgw; +namespace fs = std::filesystem; +const static std::string TEST_DIR = "rgw_dbstore_tests"; + 
+bool endsWith(const std::string &mainStr, const std::string &toMatch) +{ + if(mainStr.size() >= toMatch.size() && + mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0) + return true; + else + return false; +} + +class TestDBStoreManager : public ::testing::Test { +protected: + void SetUp() override { + ctx_ = std::make_shared(CEPH_ENTITY_TYPE_CLIENT); + g_ceph_context = ctx_.get(); + fs::current_path(fs::temp_directory_path()); + fs::create_directory(TEST_DIR); + } + + void TearDown() override { + fs::current_path(fs::temp_directory_path()); + fs::remove_all(TEST_DIR); + } + + std::string getTestDir() const { + auto test_dir = fs::temp_directory_path() / TEST_DIR; + return test_dir.string(); + } + + fs::path getDBFullPath(const std::string & base_dir, + const std::string & tenant) const { + auto db_path = ctx_->_conf.get_val("dbstore_db_dir"); + const auto& db_name = ctx_->_conf.get_val("dbstore_db_name_prefix") + "-" + tenant + ".db"; + + auto db_full_path = std::filesystem::path(db_path) / db_name; + auto db_full_path_test = fs::path(base_dir) / db_full_path; + return db_full_path_test; + } + + std::string getDBTenant(const std::string & base_dir, + const std::string & tenant) const { + auto db_name = ctx_->_conf.get_val("dbstore_db_name_prefix"); + db_name += "-" + tenant; + auto db_full_path = fs::path(base_dir) / db_name; + return db_full_path.string(); + } + + std::string getDBTenant(const std::string & tenant = default_tenant) const { + return getDBTenant(getTestDir(), tenant); + } + + fs::path getDBFullPath(const std::string & tenant) const { + return getDBFullPath(getTestDir(), tenant); + } + + fs::path getLogFilePath(const std::string & log_file) { + return fs::temp_directory_path() / log_file; + } + + std::shared_ptr getContext() const { + return ctx_; + } + + private: + std::shared_ptr ctx_; +}; + +TEST_F(TestDBStoreManager, BasicInstantiateUsingDBDir) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + 
EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant))); + auto dbstore_mgr = std::make_shared(getContext().get()); + EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant))); +} + +TEST_F(TestDBStoreManager, DBNamePrefix) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + std::string prefix = "testprefix"; + getContext()->_conf.set_val("dbstore_db_name_prefix", prefix); + + EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant))); + auto dbstore_mgr = std::make_shared(getContext().get()); + EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant))); + + // check that the database name contains the given prefix + std::string expected_db_name = prefix + "-" + default_tenant + ".db"; + EXPECT_TRUE(endsWith(getDBFullPath(default_tenant), expected_db_name)); +} + +TEST_F(TestDBStoreManager, BasicInstantiateSecondConstructor) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant))); + auto dbstore_mgr = std::make_shared(getContext().get(), getLogFilePath("test.log").string(), 10); + EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant))); +} + +TEST_F(TestDBStoreManager, TestDBName) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + auto dbstore_mgr = std::make_shared(getContext().get()); + auto db = dbstore_mgr->getDB(default_tenant, false); + ASSERT_NE(nullptr, db); + EXPECT_EQ(getDBTenant(), db->getDBname()); +} + + +TEST_F(TestDBStoreManager, TestDBNameDefaultDB) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + auto dbstore_mgr = std::make_shared(getContext().get()); + // passing an empty tenant should return the default_db + auto db = dbstore_mgr->getDB("", false); + ASSERT_NE(nullptr, db); + EXPECT_EQ(getDBTenant(), db->getDBname()); +} + +TEST_F(TestDBStoreManager, TestDBBadTenant) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + auto dbstore_mgr = std::make_shared(getContext().get()); + auto db = 
dbstore_mgr->getDB("does-not-exist", false); + ASSERT_EQ(nullptr, db); +} + +TEST_F(TestDBStoreManager, TestGetNewDB) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + auto dbstore_mgr = std::make_shared(getContext().get()); + + auto new_tenant_path = "new_tenant"; + auto db = dbstore_mgr->getDB(new_tenant_path, true); + ASSERT_NE(nullptr, db); + EXPECT_EQ(getDBTenant(new_tenant_path), db->getDBname()); +} + +TEST_F(TestDBStoreManager, TestDelete) { + getContext()->_conf.set_val("dbstore_db_dir", getTestDir()); + + auto dbstore_mgr = std::make_shared(getContext().get()); + dbstore_mgr->deleteDB(default_tenant); + auto db = dbstore_mgr->getDB(default_tenant, false); + ASSERT_EQ(nullptr, db); +} diff --git a/src/rgw/driver/dbstore/tests/dbstore_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_tests.cc new file mode 100644 index 000000000..27edb7b85 --- /dev/null +++ b/src/rgw/driver/dbstore/tests/dbstore_tests.cc @@ -0,0 +1,1417 @@ +#include "gtest/gtest.h" +#include +#include +#include +#include +#include +#include +#include +#include "rgw_common.h" + +using namespace std; +using DB = rgw::store::DB; + +vector args; + +namespace gtest { + class Environment* env; + + class Environment : public ::testing::Environment { + public: + Environment(): tenant("default_ns"), db(nullptr), + db_type("SQLite"), ret(-1) {} + + Environment(string tenantname, string db_typename): + tenant(tenantname), db(nullptr), + db_type(db_typename), ret(-1) {} + + virtual ~Environment() {} + + void SetUp() override { + cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE | CINIT_FLAG_NO_MON_CONFIG | CINIT_FLAG_NO_DAEMON_ACTIONS); + if (!db_type.compare("SQLite")) { + db = new SQLiteDB(tenant, cct.get()); + ASSERT_TRUE(db != nullptr); + ret = db->Initialize(logfile, loglevel); + ASSERT_GE(ret, 0); + } + } + + void TearDown() override { + if (!db) + return; + db->Destroy(db->get_def_dpp()); + delete db; + } + + 
string tenant; + DB *db; + string db_type; + int ret; + string logfile = "rgw_dbstore_tests.log"; + int loglevel = 30; + boost::intrusive_ptr cct; + }; +} + +ceph::real_time bucket_mtime = real_clock::now(); +string marker1; + +class DBGetDataCB : public RGWGetDataCB { + public: + bufferlist data_bl; + off_t data_ofs, data_len; + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + data_bl = bl; + data_ofs = bl_ofs; + data_len = bl_len; + return 0; + } +}; + +namespace { + + class DBStoreTest : public ::testing::Test { + protected: + int ret; + DB *db = nullptr; + string user1 = "user1"; + string user_id1 = "user_id1"; + string bucket1 = "bucket1"; + string object1 = "object1"; + string data = "Hello World"; + DBOpParams GlobalParams = {}; + const DoutPrefixProvider *dpp; + + DBStoreTest() {} + void SetUp() { + db = gtest::env->db; + ASSERT_TRUE(db != nullptr); + dpp = db->get_def_dpp(); + ASSERT_TRUE(dpp != nullptr); + + GlobalParams.op.user.uinfo.display_name = user1; + GlobalParams.op.user.uinfo.user_id.id = user_id1; + GlobalParams.op.bucket.info.bucket.name = bucket1; + GlobalParams.op.obj.state.obj.bucket = GlobalParams.op.bucket.info.bucket; + GlobalParams.op.obj.state.obj.key.name = object1; + GlobalParams.op.obj.state.obj.key.instance = "inst1"; + GlobalParams.op.obj.obj_id = "obj_id1"; + GlobalParams.op.obj_data.part_num = 0; + + /* As of now InitializeParams doesnt do anything + * special based on fop. Hence its okay to do + * global initialization once. 
+ */ + ret = db->InitializeParams(dpp, &GlobalParams); + ASSERT_EQ(ret, 0); + } + + void TearDown() { + } + + int write_object(const DoutPrefixProvider *dpp, DBOpParams params) { + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + DB::Object::Write write_op(&op_target); + map setattrs; + ret = write_op.prepare(dpp); + if (ret) + return ret; + + write_op.meta.mtime = &bucket_mtime; + write_op.meta.category = RGWObjCategory::Main; + write_op.meta.owner = params.op.user.uinfo.user_id; + + bufferlist b1 = params.op.obj.head_data; + write_op.meta.data = &b1; + + bufferlist b2; + encode("ACL", b2); + setattrs[RGW_ATTR_ACL] = b2; + + ret = write_op.write_meta(0, params.op.obj.state.size, b1.length()+1, setattrs); + return ret; + } + }; +} + +TEST_F(DBStoreTest, InsertUser) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.user.uinfo.user_id.tenant = "tenant"; + params.op.user.uinfo.user_email = "user1@dbstore.com"; + params.op.user.uinfo.suspended = 123; + params.op.user.uinfo.max_buckets = 456; + params.op.user.uinfo.placement_tags.push_back("tags"); + RGWAccessKey k1("id1", "key1"); + RGWAccessKey k2("id2", "key2"); + params.op.user.uinfo.access_keys["id1"] = k1; + params.op.user.uinfo.access_keys["id2"] = k2; + params.op.user.user_version.ver = 1; + params.op.user.user_version.tag = "UserTAG"; + + ret = db->ProcessOp(dpp, "InsertUser", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, GetUser) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ProcessOp(dpp, "GetUser", ¶ms); + ASSERT_EQ(ret, 0); + ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant"); + ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com"); + ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1"); + ASSERT_EQ(params.op.user.uinfo.suspended, 123); + ASSERT_EQ(params.op.user.uinfo.max_buckets, 456); + ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags"); + RGWAccessKey k; + map::iterator it2 = 
params.op.user.uinfo.access_keys.begin(); + k = it2->second; + ASSERT_EQ(k.id, "id1"); + ASSERT_EQ(k.key, "key1"); + it2++; + k = it2->second; + ASSERT_EQ(k.id, "id2"); + ASSERT_EQ(k.key, "key2"); + +} + +TEST_F(DBStoreTest, GetUserQuery) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.query_str = "email"; + params.op.user.uinfo.user_email = "user1@dbstore.com"; + + ret = db->ProcessOp(dpp, "GetUser", ¶ms); + ASSERT_EQ(ret, 0); + ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant"); + ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com"); + ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1"); + ASSERT_EQ(params.op.user.uinfo.suspended, 123); + ASSERT_EQ(params.op.user.uinfo.max_buckets, 456); + ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags"); + RGWAccessKey k; + map::iterator it2 = params.op.user.uinfo.access_keys.begin(); + k = it2->second; + ASSERT_EQ(k.id, "id1"); + ASSERT_EQ(k.key, "key1"); + it2++; + k = it2->second; + ASSERT_EQ(k.id, "id2"); + ASSERT_EQ(k.key, "key2"); + +} + +TEST_F(DBStoreTest, GetUserQueryByEmail) { + int ret = -1; + RGWUserInfo uinfo; + string email = "user1@dbstore.com"; + map attrs; + RGWObjVersionTracker objv; + + ret = db->get_user(dpp, "email", email, uinfo, &attrs, &objv); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uinfo.user_id.tenant, "tenant"); + ASSERT_EQ(uinfo.user_email, "user1@dbstore.com"); + ASSERT_EQ(uinfo.user_id.id, "user_id1"); + ASSERT_EQ(uinfo.suspended, 123); + ASSERT_EQ(uinfo.max_buckets, 456); + ASSERT_EQ(uinfo.placement_tags.back(), "tags"); + RGWAccessKey k; + map::iterator it2 = uinfo.access_keys.begin(); + k = it2->second; + ASSERT_EQ(k.id, "id1"); + ASSERT_EQ(k.key, "key1"); + it2++; + k = it2->second; + ASSERT_EQ(k.id, "id2"); + ASSERT_EQ(k.key, "key2"); + ASSERT_EQ(objv.read_version.ver, 1); +} + +TEST_F(DBStoreTest, GetUserQueryByAccessKey) { + int ret = -1; + RGWUserInfo uinfo; + string key = "id1"; + + ret = db->get_user(dpp, "access_key", key, uinfo, 
nullptr, nullptr); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uinfo.user_id.tenant, "tenant"); + ASSERT_EQ(uinfo.user_email, "user1@dbstore.com"); + ASSERT_EQ(uinfo.user_id.id, "user_id1"); + ASSERT_EQ(uinfo.suspended, 123); + ASSERT_EQ(uinfo.max_buckets, 456); + ASSERT_EQ(uinfo.placement_tags.back(), "tags"); + RGWAccessKey k; + map::iterator it2 = uinfo.access_keys.begin(); + k = it2->second; + ASSERT_EQ(k.id, "id1"); + ASSERT_EQ(k.key, "key1"); + it2++; + k = it2->second; + ASSERT_EQ(k.id, "id2"); + ASSERT_EQ(k.key, "key2"); +} + +TEST_F(DBStoreTest, StoreUser) { + struct DBOpParams params = GlobalParams; + int ret = -1; + RGWUserInfo uinfo, old_uinfo; + map attrs; + RGWObjVersionTracker objv_tracker; + + bufferlist attr1, attr2; + encode("attrs1", attr1); + attrs["attr1"] = attr1; + encode("attrs2", attr2); + attrs["attr2"] = attr2; + + uinfo.user_id.id = "user_id2"; + uinfo.user_id.tenant = "tenant"; + uinfo.user_email = "user2@dbstore.com"; + uinfo.suspended = 123; + uinfo.max_buckets = 456; + uinfo.placement_tags.push_back("tags"); + RGWAccessKey k1("id1", "key1"); + RGWAccessKey k2("id2", "key2"); + uinfo.access_keys["id1"] = k1; + uinfo.access_keys["id2"] = k2; + + /* non exclusive create..should create new one */ + ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo); + ASSERT_EQ(ret, 0); + ASSERT_EQ(old_uinfo.user_email, ""); + ASSERT_EQ(objv_tracker.read_version.ver, 1); + ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG"); + + /* invalid version number */ + objv_tracker.read_version.ver = 4; + ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo); + ASSERT_EQ(ret, -125); /* returns ECANCELED */ + ASSERT_EQ(old_uinfo.user_id.id, uinfo.user_id.id); + ASSERT_EQ(old_uinfo.user_email, uinfo.user_email); + + /* exclusive create..should not create new one */ + uinfo.user_email = "user2_new@dbstore.com"; + objv_tracker.read_version.ver = 1; + ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo); + 
ASSERT_EQ(ret, 0); + ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com"); + ASSERT_EQ(objv_tracker.read_version.ver, 1); + + ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo); + ASSERT_EQ(ret, 0); + ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com"); + ASSERT_EQ(objv_tracker.read_version.ver, 2); + ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG"); +} + +TEST_F(DBStoreTest, GetUserQueryByUserID) { + int ret = -1; + RGWUserInfo uinfo; + map attrs; + RGWObjVersionTracker objv; + + uinfo.user_id.tenant = "tenant"; + uinfo.user_id.id = "user_id2"; + + ret = db->get_user(dpp, "user_id", "user_id2", uinfo, &attrs, &objv); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uinfo.user_id.tenant, "tenant"); + ASSERT_EQ(uinfo.user_email, "user2_new@dbstore.com"); + ASSERT_EQ(uinfo.user_id.id, "user_id2"); + ASSERT_EQ(uinfo.suspended, 123); + ASSERT_EQ(uinfo.max_buckets, 456); + ASSERT_EQ(uinfo.placement_tags.back(), "tags"); + RGWAccessKey k; + map::iterator it = uinfo.access_keys.begin(); + k = it->second; + ASSERT_EQ(k.id, "id1"); + ASSERT_EQ(k.key, "key1"); + it++; + k = it->second; + ASSERT_EQ(k.id, "id2"); + ASSERT_EQ(k.key, "key2"); + + ASSERT_EQ(objv.read_version.ver, 2); + + bufferlist k1, k2; + string attr; + map::iterator it2 = attrs.begin(); + k1 = it2->second; + decode(attr, k1); + ASSERT_EQ(attr, "attrs1"); + it2++; + k2 = it2->second; + decode(attr, k2); + ASSERT_EQ(attr, "attrs2"); +} + +TEST_F(DBStoreTest, ListAllUsers) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ListAllUsers(dpp, ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, InsertBucket) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.bucket.info.bucket.name = "bucket1"; + params.op.bucket.info.bucket.tenant = "tenant"; + params.op.bucket.info.bucket.marker = "marker1"; + + params.op.bucket.ent.size = 1024; + + params.op.bucket.info.has_instance_obj = false; + params.op.bucket.bucket_version.ver = 1; + 
params.op.bucket.bucket_version.tag = "read_tag"; + + params.op.bucket.mtime = bucket_mtime; + + ret = db->ProcessOp(dpp, "InsertBucket", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, UpdateBucketAttrs) { + int ret = -1; + RGWBucketInfo info; + map attrs; + RGWObjVersionTracker objv; + + bufferlist aclbl, aclbl2; + encode("attrs1", aclbl); + attrs["attr1"] = aclbl; + encode("attrs2", aclbl2); + attrs["attr2"] = aclbl2; + + info.bucket.name = "bucket1"; + + /* invalid version number */ + objv.read_version.ver = 4; + ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv); + ASSERT_EQ(ret, -125); /* returns ECANCELED */ + + /* right version number */ + objv.read_version.ver = 1; + ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv); + ASSERT_EQ(ret, 0); + ASSERT_EQ(objv.read_version.ver, 2); +} + +TEST_F(DBStoreTest, UpdateBucketInfo) { + struct DBOpParams params = GlobalParams; + int ret = -1; + RGWBucketInfo info; + + params.op.bucket.info.bucket.name = "bucket1"; + + ret = db->ProcessOp(dpp, "GetBucket", ¶ms); + ASSERT_EQ(ret, 0); + + info = params.op.bucket.info; + + info.bucket.marker = "marker2"; + ret = db->update_bucket(dpp, "info", info, false, nullptr, nullptr, &bucket_mtime, nullptr); + ASSERT_EQ(ret, 0); + ASSERT_EQ(info.objv_tracker.read_version.ver, 3); +} + +TEST_F(DBStoreTest, GetBucket) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.bucket.info.bucket.name = "bucket1"; + ret = db->ProcessOp(dpp, "GetBucket", ¶ms); + ASSERT_EQ(ret, 0); + ASSERT_EQ(params.op.bucket.info.bucket.name, "bucket1"); + ASSERT_EQ(params.op.bucket.info.bucket.tenant, "tenant"); + ASSERT_EQ(params.op.bucket.info.bucket.marker, "marker2"); + ASSERT_EQ(params.op.bucket.ent.size, 1024); + ASSERT_EQ(params.op.bucket.ent.bucket.name, "bucket1"); + ASSERT_EQ(params.op.bucket.ent.bucket.tenant, "tenant"); + ASSERT_EQ(params.op.bucket.info.has_instance_obj, false); + 
ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.ver, 3); + ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.tag, "read_tag"); + ASSERT_EQ(params.op.bucket.mtime, bucket_mtime); + ASSERT_EQ(params.op.bucket.info.owner.id, "user_id1"); + bufferlist k, k2; + string acl; + map<std::string, bufferlist>::iterator it2 = params.op.bucket.bucket_attrs.begin(); + k = it2->second; + decode(acl, k); + ASSERT_EQ(acl, "attrs1"); + it2++; + k2 = it2->second; + decode(acl, k2); + ASSERT_EQ(acl, "attrs2"); +} +/* Call create_bucket() for bucket1..bucket5 with the same owner, placement rule and version; each call must return 0. */ +TEST_F(DBStoreTest, CreateBucket) { + struct DBOpParams params = GlobalParams; + int ret = -1; + RGWBucketInfo info; + RGWUserInfo owner; + rgw_bucket bucket; + obj_version objv; + rgw_placement_rule rule; + map<std::string, bufferlist> attrs; + + owner.user_id.id = "user_id1"; + bucket.name = "bucket1"; + bucket.tenant = "tenant"; + + objv.ver = 2; + objv.tag = "write_tag"; + + rule.name = "rule1"; + rule.storage_class = "sc1"; + + ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL, + attrs, info, &objv, NULL, bucket_mtime, NULL, NULL, + null_yield, false); + ASSERT_EQ(ret, 0); + bucket.name = "bucket2"; + ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL, + attrs, info, &objv, NULL, bucket_mtime, NULL, NULL, + null_yield, false); + ASSERT_EQ(ret, 0); + bucket.name = "bucket3"; + ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL, + attrs, info, &objv, NULL, bucket_mtime, NULL, NULL, + null_yield, false); + ASSERT_EQ(ret, 0); + bucket.name = "bucket4"; + ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL, + attrs, info, &objv, NULL, bucket_mtime, NULL, NULL, + null_yield, false); + ASSERT_EQ(ret, 0); + bucket.name = "bucket5"; + ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL, + attrs, info, &objv, NULL, bucket_mtime, NULL, NULL, + null_yield, false); + ASSERT_EQ(ret, 0); +} +/* Look bucket2 up by name and verify the values passed to create_bucket(); the bucket marker is saved in marker1. */ +TEST_F(DBStoreTest, GetBucketQueryByName) { + int ret = -1; + RGWBucketInfo binfo; + binfo.bucket.name = 
"bucket2"; + rgw::sal::Attrs attrs; + ceph::real_time mtime; + obj_version objv; + + ret = db->get_bucket_info(dpp, "name", "", binfo, &attrs, &mtime, &objv); + ASSERT_EQ(ret, 0); + ASSERT_EQ(binfo.bucket.name, "bucket2"); + ASSERT_EQ(binfo.bucket.tenant, "tenant"); + ASSERT_EQ(binfo.owner.id, "user_id1"); + ASSERT_EQ(binfo.objv_tracker.read_version.ver, 2); + ASSERT_EQ(binfo.objv_tracker.read_version.tag, "write_tag"); + ASSERT_EQ(binfo.zonegroup, "zid"); + ASSERT_EQ(binfo.creation_time, bucket_mtime); + ASSERT_EQ(binfo.placement_rule.name, "rule1"); + ASSERT_EQ(binfo.placement_rule.storage_class, "sc1"); + ASSERT_EQ(objv.ver, 2); + ASSERT_EQ(objv.tag, "write_tag"); + + marker1 = binfo.bucket.marker; +} +/* Page through the buckets of user_id1 two at a time, using the last bucket name of each page as the next marker. */ +TEST_F(DBStoreTest, ListUserBuckets) { + struct DBOpParams params = GlobalParams; + int ret = -1; + rgw_user owner; + int max = 2; + bool need_stats = true; + bool is_truncated = false; + RGWUserBuckets ulist; + + owner.id = "user_id1"; + + marker1 = ""; + do { + is_truncated = false; + ret = db->list_buckets(dpp, "", owner, marker1, "", max, need_stats, &ulist, + &is_truncated); + ASSERT_EQ(ret, 0); + + cout << "marker1 :" << marker1 << "\n"; + + cout << "is_truncated :" << is_truncated << "\n"; + + for (const auto& ent: ulist.get_buckets()) { + RGWBucketEnt e = ent.second; + cout << "###################### \n"; + cout << "ent.bucket.id : " << e.bucket.name << "\n"; + cout << "ent.bucket.marker : " << e.bucket.marker << "\n"; + cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n"; + cout << "ent.size : " << e.size << "\n"; + cout << "ent.rule.name : " << e.placement_rule.name << "\n"; + + marker1 = e.bucket.name; + } + ulist.clear(); + } while(is_truncated); +} +/* Hand bucket5 over to user_id2 via update_bucket("owner") and expect read version 3. */ +TEST_F(DBStoreTest, BucketChown) { + int ret = -1; + RGWBucketInfo info; + rgw_user user; + user.id = "user_id2"; + + info.bucket.name = "bucket5"; + + ret = db->update_bucket(dpp, "owner", info, false, &user, nullptr, &bucket_mtime, nullptr); + ASSERT_EQ(ret, 0); +
ASSERT_EQ(info.objv_tracker.read_version.ver, 3); +} +/* Smoke test: ListAllBuckets() must return 0. */ +TEST_F(DBStoreTest, ListAllBuckets) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ListAllBuckets(dpp, &params); + ASSERT_EQ(ret, 0); +} +/* Page through list_buckets() with the "all" query and a default-constructed owner, two entries per call. */ +TEST_F(DBStoreTest, ListAllBuckets2) { + struct DBOpParams params = GlobalParams; + int ret = -1; + rgw_user owner; + int max = 2; + bool need_stats = true; + bool is_truncated = false; + RGWUserBuckets ulist; + + marker1 = ""; + do { + is_truncated = false; + ret = db->list_buckets(dpp, "all", owner, marker1, "", max, need_stats, &ulist, + &is_truncated); + ASSERT_EQ(ret, 0); + + cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n"; + cout << "ownerID : " << owner.id << "\n"; + cout << "marker1 :" << marker1 << "\n"; + + cout << "is_truncated :" << is_truncated << "\n"; + + for (const auto& ent: ulist.get_buckets()) { + RGWBucketEnt e = ent.second; + cout << "###################### \n"; + cout << "ent.bucket.id : " << e.bucket.name << "\n"; + cout << "ent.bucket.marker : " << e.bucket.marker << "\n"; + cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n"; + cout << "ent.size : " << e.size << "\n"; + cout << "ent.rule.name : " << e.placement_rule.name << "\n"; + + marker1 = e.bucket.name; + } + ulist.clear(); + } while(is_truncated); +} +/* remove_bucket() on bucket5 must return 0. */ +TEST_F(DBStoreTest, RemoveBucketAPI) { + int ret = -1; + RGWBucketInfo info; + + info.bucket.name = "bucket5"; + + ret = db->remove_bucket(dpp, info); + ASSERT_EQ(ret, 0); +} +/* remove_user() must fail with -125 for read version 4 and succeed for read version 2. */ +TEST_F(DBStoreTest, RemoveUserAPI) { + int ret = -1; + RGWUserInfo uinfo; + RGWObjVersionTracker objv; + + uinfo.user_id.tenant = "tenant"; + uinfo.user_id.id = "user_id2"; + + /* invalid version number...should fail */ + objv.read_version.ver = 4; + ret = db->remove_user(dpp, uinfo, &objv); + ASSERT_EQ(ret, -125); + + objv.read_version.ver = 2; + ret = db->remove_user(dpp, uinfo, &objv); + ASSERT_EQ(ret, 0); +} +/* Store the default object from GlobalParams, then object2/inst2 and object3/inst3, through the raw PutObject op. */ +TEST_F(DBStoreTest, PutObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + +
params.op.obj.category = RGWObjCategory::Main; + params.op.obj.storage_class = "STANDARD"; + bufferlist b1; + encode("HELLO WORLD", b1); + cout<<"XXXXXXXXX Insert b1.length " << b1.length() << "\n"; + params.op.obj.head_data = b1; + params.op.obj.state.size = 12; + params.op.obj.state.is_olh = false; + ret = db->ProcessOp(dpp, "PutObject", ¶ms); + ASSERT_EQ(ret, 0); + + /* Insert another objects */ + params.op.obj.state.obj.key.name = "object2"; + params.op.obj.state.obj.key.instance = "inst2"; + ret = db->ProcessOp(dpp, "PutObject", ¶ms); + ASSERT_EQ(ret, 0); + + params.op.obj.state.obj.key.name = "object3"; + params.op.obj.state.obj.key.instance = "inst3"; + ret = db->ProcessOp(dpp, "PutObject", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, ListAllObjects) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ListAllObjects(dpp, ¶ms); + ASSERT_GE(ret, 0); +} + +TEST_F(DBStoreTest, GetObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ProcessOp(dpp, "GetObject", ¶ms); + ASSERT_EQ(ret, 0); + ASSERT_EQ(params.op.obj.category, RGWObjCategory::Main); + ASSERT_EQ(params.op.obj.storage_class, "STANDARD"); + string data; + decode(data, params.op.obj.head_data); + ASSERT_EQ(data, "HELLO WORLD"); + ASSERT_EQ(params.op.obj.state.size, 12); + cout << "versionNum :" << params.op.obj.version_num << "\n"; +} + +TEST_F(DBStoreTest, GetObjectState) { + struct DBOpParams params = GlobalParams; + int ret = -1; + RGWObjState* s; + + params.op.obj.state.obj.key.name = "object2"; + params.op.obj.state.obj.key.instance = "inst2"; + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + ret = op_target.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj, + false, &s); + ASSERT_EQ(ret, 0); + ASSERT_EQ(s->size, 12); + ASSERT_EQ(s->is_olh, false); + cout << "versionNum :" << params.op.obj.version_num << "\n"; + + /* Recheck with get_state API */ + ret = op_target.get_state(dpp, &s, false); + 
ASSERT_EQ(ret, 0); + ASSERT_EQ(s->size, 12); + ASSERT_EQ(s->is_olh, false); + cout << "versionNum :" << params.op.obj.version_num << "\n"; +} +/* Set ACL/LC/ETAG attrs on the default object, read them back, remove ACL and confirm only LC and ETAG remain. */ +TEST_F(DBStoreTest, ObjAttrs) { + struct DBOpParams params = GlobalParams; + int ret = -1; + map<std::string, bufferlist> setattrs; + map<std::string, bufferlist> rmattrs; + map<std::string, bufferlist> readattrs; + + bufferlist b1, b2, b3; + encode("ACL", b1); + setattrs[RGW_ATTR_ACL] = b1; + encode("LC", b2); + setattrs[RGW_ATTR_LC] = b2; + encode("ETAG", b3); + setattrs[RGW_ATTR_ETAG] = b3; + + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + /* Set some attrs */ + ret = op_target.set_attrs(dpp, setattrs, nullptr); + ASSERT_EQ(ret, 0); + + /* read those attrs */ + DB::Object::Read read_op(&op_target); + read_op.params.attrs = &readattrs; + ret = read_op.prepare(dpp); + ASSERT_EQ(ret, 0); + + string val; + decode(val, readattrs[RGW_ATTR_ACL]); + ASSERT_EQ(val, "ACL"); + decode(val, readattrs[RGW_ATTR_LC]); + ASSERT_EQ(val, "LC"); + decode(val, readattrs[RGW_ATTR_ETAG]); + ASSERT_EQ(val, "ETAG"); + + /* Remove some attrs */ + rmattrs[RGW_ATTR_ACL] = b1; + map<std::string, bufferlist> empty; + ret = op_target.set_attrs(dpp, empty, &rmattrs); + ASSERT_EQ(ret, 0); + + /* read those attrs */ + ret = read_op.prepare(dpp); + ASSERT_EQ(ret, 0); + + ASSERT_EQ(readattrs.count(RGW_ATTR_ACL), 0); + decode(val, readattrs[RGW_ATTR_LC]); + ASSERT_EQ(val, "LC"); + decode(val, readattrs[RGW_ATTR_ETAG]); + ASSERT_EQ(val, "ETAG"); +} +/* Overwrite object3/inst3 with new head data through the write_object() helper. */ +TEST_F(DBStoreTest, WriteObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + params.op.obj.state.obj.key.name = "object3"; + params.op.obj.state.obj.key.instance = "inst3"; + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + bufferlist b1; + encode("HELLO WORLD - Object3", b1); + params.op.obj.head_data = b1; + params.op.obj.state.size = 22; + + ret = write_object(dpp, params); + ASSERT_EQ(ret, 0); +} +/* Read object3/inst3 back through DB::Object::Read and verify the payload and the reported object size. */ +TEST_F(DBStoreTest, ReadObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + map<std::string, bufferlist> readattrs; +
params.op.obj.state.obj.key.name = "object3"; + params.op.obj.state.obj.key.instance = "inst3"; + uint64_t obj_size; + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + DB::Object::Read read_op(&op_target); + read_op.params.attrs = &readattrs; + read_op.params.obj_size = &obj_size; + ret = read_op.prepare(dpp); + ASSERT_EQ(ret, 0); + + bufferlist bl; + ret = read_op.read(0, 25, bl, dpp); + cout<<"XXXXXXXXX Insert bl.length " << bl.length() << "\n"; + ASSERT_EQ(ret, 25); + + string data; + decode(data, bl); + ASSERT_EQ(data, "HELLO WORLD - Object3"); + ASSERT_EQ(obj_size, 22); +} +/* Iterate over offsets 0..15 of the default object and verify what the DBGetDataCB callback collected. */ +TEST_F(DBStoreTest, IterateObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + map<std::string, bufferlist> readattrs; + uint64_t obj_size; + DBGetDataCB cb; + + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + DB::Object::Read read_op(&op_target); + read_op.params.attrs = &readattrs; + read_op.params.obj_size = &obj_size; + ret = read_op.prepare(dpp); + ASSERT_EQ(ret, 0); + + bufferlist bl; + ret = read_op.iterate(dpp, 0, 15, &cb); + ASSERT_EQ(ret, 0); + string data; + decode(data, cb.data_bl); + cout << "XXXXXXXXXX iterate data is " << data << ", bl_ofs = " << cb.data_ofs << ", bl_len = " << cb.data_len << "\n"; + ASSERT_EQ(data, "HELLO WORLD"); + ASSERT_EQ(cb.data_ofs, 0); + ASSERT_EQ(cb.data_len, 15); +} +/* Page through the bucket's objects two at a time, feeding get_next_marker() back in as the marker. */ +TEST_F(DBStoreTest, ListBucketObjects) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + int max = 2; + bool is_truncated = false; + rgw_obj_key marker1; + DB::Bucket target(db, params.op.bucket.info); + DB::Bucket::List list_op(&target); + + /* NOTE(review): element type restored after extraction stripped the template argument; confirm against DB::Bucket::List::list_objects(). */ + vector<rgw_bucket_dir_entry> dir_list; + + marker1.name = ""; + do { + is_truncated = false; + list_op.params.marker = marker1; + ret = list_op.list_objects(dpp, max, &dir_list, nullptr, &is_truncated); + ASSERT_EQ(ret, 0); + + cout << "marker1 :" << marker1.name << "\n"; + + cout << "is_truncated :" << is_truncated << "\n"; + + for (const auto& ent: dir_list) { + cls_rgw_obj_key key = ent.key; +
cout << "###################### \n"; + cout << "key.name : " << key.name << "\n"; + cout << "key.instance : " << key.instance << "\n"; + + marker1 = list_op.get_next_marker(); + } + dir_list.clear(); + } while(is_truncated); +} +/* Delete object2/inst2, then confirm that get_state() on it fails with -2. */ +TEST_F(DBStoreTest, DeleteObj) { + struct DBOpParams params = GlobalParams; + int ret = -1; + RGWObjState *s; + + /* delete object2 */ + params.op.obj.state.obj.key.name = "object2"; + params.op.obj.state.obj.key.instance = "inst2"; + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + DB::Object::Delete delete_op(&op_target); + ret = delete_op.delete_obj(dpp); + ASSERT_EQ(ret, 0); + + /* Should return ENOENT */ + ret = op_target.get_state(dpp, &s, false); + ASSERT_EQ(ret, -2); +} +/* Write three instances (inst1..inst3) of object1 with FLAG_CURRENT set, each with different head data and size. */ +TEST_F(DBStoreTest, WriteVersionedObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::string instances[] = {"inst1", "inst2", "inst3"}; + bufferlist b1; + + params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_CURRENT; + params.op.obj.state.obj.key.name = "object1"; + + /* Write versioned objects */ + DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj); + DB::Object::Write write_op(&op_target); /* NOTE(review): op_target and write_op are not referenced again in this test; the writes below go through write_object() */ + + /* Version1 */ + params.op.obj.state.obj.key.instance = instances[0]; + encode("HELLO WORLD", b1); + params.op.obj.head_data = b1; + params.op.obj.state.size = 12; + ret = write_object(dpp, params); + ASSERT_EQ(ret, 0); + + /* Version2 */ + params.op.obj.state.obj.key.instance = instances[1]; + b1.clear(); + encode("HELLO WORLD ABC", b1); + params.op.obj.head_data = b1; + params.op.obj.state.size = 16; + ret = write_object(dpp, params); + ASSERT_EQ(ret, 0); + + /* Version3 */ + params.op.obj.state.obj.key.instance = instances[2]; + b1.clear(); + encode("HELLO WORLD A", b1); + params.op.obj.head_data = b1; + params.op.obj.state.size = 14; + ret = write_object(dpp, params); + ASSERT_EQ(ret, 0); +} +/* List the versions of the default object and expect them in the order inst3, inst2, inst1. */ +TEST_F(DBStoreTest, ListVersionedObject) { + struct DBOpParams params = GlobalParams; + int ret = 
-1; + std::string instances[] = {"inst1", "inst2", "inst3"}; + int i = 0; + + /* list versioned objects */ + params.op.obj.state.obj.key.instance.clear(); + params.op.list_max_count = MAX_VERSIONED_OBJECTS; + ret = db->ProcessOp(dpp, "ListVersionedObjects", &params); + ASSERT_EQ(ret, 0); + + i = 2; + for (auto ent: params.op.obj.list_entries) { + + + ASSERT_EQ(ent.key.instance, instances[i]); + i--; + } +} +/* With no instance given the newest version (inst3) must be returned; an explicit instance must return that version. */ +TEST_F(DBStoreTest, ReadVersionedObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::string instances[] = {"inst1", "inst2", "inst3"}; + std::string data; + + /* read object.. should fetch latest version */ + RGWObjState* s; + params = GlobalParams; + params.op.obj.state.obj.key.instance.clear(); + DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj); + ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj, + true, &s); + ASSERT_EQ(ret, 0); + ASSERT_EQ(s->obj.key.instance, instances[2]); + decode(data, s->data); + ASSERT_EQ(data, "HELLO WORLD A"); + ASSERT_EQ(s->size, 14); + + /* read a particular non-current version */ + params.op.obj.state.obj.key.instance = instances[1]; + DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj); + ret = op_target3.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj, + true, &s); + ASSERT_EQ(ret, 0); + decode(data, s->data); + ASSERT_EQ(data, "HELLO WORLD ABC"); + ASSERT_EQ(s->size, 16); +} +/* Exercise delete-marker creation, delete-marker removal and deletion of the current version on a versioned object. */ +TEST_F(DBStoreTest, DeleteVersionedObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::string instances[] = {"inst1", "inst2", "inst3"}; + std::string data; + std::string dm_instance; + int i = 0; + + /* Delete object..should create delete marker */ + params.op.obj.state.obj.key.instance.clear(); + DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj); + DB::Object::Delete delete_op(&op_target); + delete_op.params.versioning_status |= BUCKET_VERSIONED; + + ret = delete_op.delete_obj(dpp); +
ASSERT_EQ(ret, 0); + + /* list versioned objects */ + params = GlobalParams; + params.op.obj.state.obj.key.instance.clear(); + params.op.list_max_count = MAX_VERSIONED_OBJECTS; + ret = db->ProcessOp(dpp, "ListVersionedObjects", ¶ms); + + i = 3; + for (auto ent: params.op.obj.list_entries) { + string is_delete_marker = (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER)? "true" : "false"; + cout << "ent.name: " << ent.key.name << ". ent.instance: " << ent.key.instance << " is_delete_marker = " << is_delete_marker << "\n"; + + if (i == 3) { + ASSERT_EQ(is_delete_marker, "true"); + dm_instance = ent.key.instance; + } else { + ASSERT_EQ(is_delete_marker, "false"); + ASSERT_EQ(ent.key.instance, instances[i]); + } + + i--; + } + + /* read object.. should return -ENOENT */ + RGWObjState* s; + params = GlobalParams; + params.op.obj.state.obj.key.instance.clear(); + DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj); + ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj, + true, &s); + ASSERT_EQ(ret, -ENOENT); + + /* Delete delete marker..should be able to read object now */ + params.op.obj.state.obj.key.instance = dm_instance; + DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj); + DB::Object::Delete delete_op2(&op_target3); + delete_op2.params.versioning_status |= BUCKET_VERSIONED; + + ret = delete_op2.delete_obj(dpp); + ASSERT_EQ(ret, 0); + + /* read object.. should fetch latest version */ + params = GlobalParams; + params.op.obj.state.obj.key.instance.clear(); + DB::Object op_target4(db, params.op.bucket.info, params.op.obj.state.obj); + ret = op_target4.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj, + true, &s); + ASSERT_EQ(s->obj.key.instance, instances[2]); + decode(data, s->data); + ASSERT_EQ(data, "HELLO WORLD A"); + ASSERT_EQ(s->size, 14); + + /* delete latest version using version-id. 
Next version should get promoted */ + params.op.obj.state.obj.key.instance = instances[2]; + DB::Object op_target5(db, params.op.bucket.info, params.op.obj.state.obj); + DB::Object::Delete delete_op3(&op_target5); + delete_op3.params.versioning_status |= BUCKET_VERSIONED; + + ret = delete_op3.delete_obj(dpp); + ASSERT_EQ(ret, 0); + + /* list versioned objects..only two versions should be present + * with second version marked as CURRENT */ + params = GlobalParams; + params.op.obj.state.obj.key.instance.clear(); + params.op.list_max_count = MAX_VERSIONED_OBJECTS; + ret = db->ProcessOp(dpp, "ListVersionedObjects", &params); + ASSERT_EQ(ret, 0); /* the entries checked below are only meaningful if the listing succeeded */ + i = 1; + for (auto ent: params.op.obj.list_entries) { + + if (i == 1) { + dm_instance = ent.key.instance; + } else { + ASSERT_EQ(ent.key.instance, instances[i]); + } + + i--; + } + +} +/* Store four omap entries (part1..part4) on the default object. */ +TEST_F(DBStoreTest, ObjectOmapSetVal) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + string val = "part1_val"; + bufferlist bl; + encode(val, bl); + ret = op_target.obj_omap_set_val_by_key(dpp, "part1", bl, false); + ASSERT_EQ(ret, 0); + + val = "part2_val"; + bl.clear(); + encode(val, bl); + ret = op_target.obj_omap_set_val_by_key(dpp, "part2", bl, false); + ASSERT_EQ(ret, 0); + + val = "part3_val"; + bl.clear(); + encode(val, bl); + ret = op_target.obj_omap_set_val_by_key(dpp, "part3", bl, false); + ASSERT_EQ(ret, 0); + + val = "part4_val"; + bl.clear(); + encode(val, bl); + ret = op_target.obj_omap_set_val_by_key(dpp, "part4", bl, false); + ASSERT_EQ(ret, 0); +} +/* Fetch part2 and part4 by key and verify exactly those two values come back. */ +TEST_F(DBStoreTest, ObjectOmapGetValsByKeys) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::set<std::string> keys; + std::map<std::string, bufferlist> vals; + + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + keys.insert("part2"); + keys.insert("part4"); + + ret = op_target.obj_omap_get_vals_by_keys(dpp, "", keys, &vals); + ASSERT_EQ(ret, 0); + ASSERT_EQ(vals.size(), 2); + + string val; +
decode(val, vals["part2"]); + ASSERT_EQ(val, "part2_val"); + decode(val, vals["part4"]); + ASSERT_EQ(val, "part4_val"); +} + +TEST_F(DBStoreTest, ObjectOmapGetAll) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::map vals; + + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + ret = op_target.obj_omap_get_all(dpp, &vals); + ASSERT_EQ(ret, 0); + ASSERT_EQ(vals.size(), 4); + + string val; + decode(val, vals["part1"]); + ASSERT_EQ(val, "part1_val"); + decode(val, vals["part2"]); + ASSERT_EQ(val, "part2_val"); + decode(val, vals["part3"]); + ASSERT_EQ(val, "part3_val"); + decode(val, vals["part4"]); + ASSERT_EQ(val, "part4_val"); +} + +TEST_F(DBStoreTest, ObjectOmapGetVals) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::set keys; + std::map vals; + bool pmore; + + DB::Object op_target(db, params.op.bucket.info, + params.op.obj.state.obj); + + ret = op_target.obj_omap_get_vals(dpp, "part3", 10, &vals, &pmore); + ASSERT_EQ(ret, 0); + ASSERT_EQ(vals.size(), 2); + + string val; + decode(val, vals["part3"]); + ASSERT_EQ(val, "part3_val"); + decode(val, vals["part4"]); + ASSERT_EQ(val, "part4_val"); +} + +TEST_F(DBStoreTest, PutObjectData) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.obj_data.part_num = 1; + params.op.obj_data.offset = 10; + params.op.obj_data.multipart_part_str = "2"; + bufferlist b1; + encode("HELLO WORLD", b1); + params.op.obj_data.data = b1; + params.op.obj_data.size = 12; + params.op.obj.state.mtime = real_clock::now(); + ret = db->ProcessOp(dpp, "PutObjectData", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, UpdateObjectData) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.obj.state.mtime = bucket_mtime; + ret = db->ProcessOp(dpp, "UpdateObjectData", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, GetObjectData) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.obj.state.obj.key.instance = 
"inst1"; + params.op.obj.state.obj.key.name = "object1"; + ret = db->ProcessOp(dpp, "GetObjectData", ¶ms); + ASSERT_EQ(ret, 0); + ASSERT_EQ(params.op.obj_data.part_num, 1); + ASSERT_EQ(params.op.obj_data.offset, 10); + ASSERT_EQ(params.op.obj_data.multipart_part_str, "2"); + ASSERT_EQ(params.op.obj.state.obj.key.instance, "inst1"); + ASSERT_EQ(params.op.obj.state.obj.key.name, "object1"); + ASSERT_EQ(params.op.obj.state.mtime, bucket_mtime); + string data; + decode(data, params.op.obj_data.data); + ASSERT_EQ(data, "HELLO WORLD"); +} + +TEST_F(DBStoreTest, DeleteObjectData) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ProcessOp(dpp, "DeleteObjectData", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, DeleteObject) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ProcessOp(dpp, "DeleteObject", ¶ms); + ASSERT_EQ(ret, 0); +} + +TEST_F(DBStoreTest, LCTables) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->createLCTables(dpp); + ASSERT_GE(ret, 0); +} + +TEST_F(DBStoreTest, LCHead) { + struct DBOpParams params = GlobalParams; + int ret = -1; + std::string index1 = "bucket1"; + std::string index2 = "bucket2"; + time_t lc_time = ceph_clock_now(); + std::unique_ptr head; + std::string ents[] = {"entry1", "entry2", "entry3"}; + rgw::sal::StoreLifecycle::StoreLCHead head1(lc_time, 0, ents[0]); + rgw::sal::StoreLifecycle::StoreLCHead head2(lc_time, 0, ents[1]); + rgw::sal::StoreLifecycle::StoreLCHead head3(lc_time, 0, ents[2]); + + ret = db->put_head(index1, head1); + ASSERT_EQ(ret, 0); + ret = db->put_head(index2, head2); + ASSERT_EQ(ret, 0); + + ret = db->get_head(index1, &head); + ASSERT_EQ(ret, 0); + ASSERT_EQ(head->get_marker(), "entry1"); + + ret = db->get_head(index2, &head); + ASSERT_EQ(ret, 0); + ASSERT_EQ(head->get_marker(), "entry2"); + + // update index1 + ret = db->put_head(index1, head3); + ASSERT_EQ(ret, 0); + ret = db->get_head(index1, &head); + ASSERT_EQ(ret, 0); + 
ASSERT_EQ(head->get_marker(), "entry3"); + +} +TEST_F(DBStoreTest, LCEntry) { + struct DBOpParams params = GlobalParams; + int ret = -1; + uint64_t lc_time = ceph_clock_now(); + std::string index1 = "lcindex1"; + std::string index2 = "lcindex2"; + typedef enum {lc_uninitial = 1, lc_complete} status; + std::string ents[] = {"bucket1", "bucket2", "bucket3", "bucket4"}; + /* NOTE(review): pointee type restored after extraction stripped the template argument; confirm against DB::get_entry(). */ + std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; + rgw::sal::StoreLifecycle::StoreLCEntry entry1(ents[0], lc_time, lc_uninitial); + rgw::sal::StoreLifecycle::StoreLCEntry entry2(ents[1], lc_time, lc_uninitial); + rgw::sal::StoreLifecycle::StoreLCEntry entry3(ents[2], lc_time, lc_uninitial); + rgw::sal::StoreLifecycle::StoreLCEntry entry4(ents[3], lc_time, lc_uninitial); + + /* NOTE(review): element type restored the same way; confirm against DB::list_entries(). */ + vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> lc_entries; + + ret = db->set_entry(index1, entry1); + ASSERT_EQ(ret, 0); + ret = db->set_entry(index1, entry2); + ASSERT_EQ(ret, 0); + ret = db->set_entry(index1, entry3); + ASSERT_EQ(ret, 0); + ret = db->set_entry(index2, entry4); + ASSERT_EQ(ret, 0); + + // get entry index1, entry1 + ret = db->get_entry(index1, ents[0], &entry); + ASSERT_EQ(ret, 0); + ASSERT_EQ(entry->get_status(), lc_uninitial); + ASSERT_EQ(entry->get_start_time(), lc_time); + + // get next entry index1, entry2 + ret = db->get_next_entry(index1, ents[1], &entry); + ASSERT_EQ(ret, 0); + ASSERT_EQ(entry->get_bucket(), ents[2]); + ASSERT_EQ(entry->get_status(), lc_uninitial); + ASSERT_EQ(entry->get_start_time(), lc_time); + + // update entry4 to entry5 + entry4.status = lc_complete; + ret = db->set_entry(index2, entry4); + ASSERT_EQ(ret, 0); + ret = db->get_entry(index2, ents[3], &entry); + ASSERT_EQ(ret, 0); + ASSERT_EQ(entry->get_status(), lc_complete); + + // list entries + ret = db->list_entries(index1, "", 5, lc_entries); + ASSERT_EQ(ret, 0); + for (const auto& ent: lc_entries) { + cout << "###################### \n"; + cout << "lc entry.bucket : " << ent->get_bucket() << "\n"; + cout << "lc entry.status : " << ent->get_status() << "\n"; + } + + // remove index1, entry3 + ret 
= db->rm_entry(index1, entry3); + ASSERT_EQ(ret, 0); + + // get next entry index1, entry2.. should be null + entry.reset(); /* reset() destroys the previous entry; release() would drop ownership without freeing it */ + ret = db->get_next_entry(index1, ents[1], &entry); + ASSERT_EQ(ret, 0); + ASSERT_EQ(entry.get(), nullptr); +} +/* The raw RemoveBucket op on the default bucket must return 0. */ +TEST_F(DBStoreTest, RemoveBucket) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ProcessOp(dpp, "RemoveBucket", &params); + ASSERT_EQ(ret, 0); +} +/* The raw RemoveUser op on the default user must return 0. */ +TEST_F(DBStoreTest, RemoveUser) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + ret = db->ProcessOp(dpp, "RemoveUser", &params); + ASSERT_EQ(ret, 0); +} +/* Insert the user "testid" with a fixed access key through the raw InsertUser op. */ +TEST_F(DBStoreTest, InsertTestIDUser) { + struct DBOpParams params = GlobalParams; + int ret = -1; + + params.op.user.uinfo.user_id.id = "testid"; + params.op.user.uinfo.display_name = "M. Tester"; + params.op.user.uinfo.user_id.tenant = "tenant"; + params.op.user.uinfo.user_email = "tester@ceph.com"; + RGWAccessKey k1("0555b35654ad1656d804", "h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=="); + params.op.user.uinfo.access_keys["0555b35654ad1656d804"] = k1; + params.op.user.user_version.ver = 1; + params.op.user.user_version.tag = "UserTAG"; + + ret = db->ProcessOp(dpp, "InsertUser", &params); + ASSERT_EQ(ret, 0); +} +/* Test entry point: optional argv[1]/argv[2] override the log file and log level used by the gtest environment. */ +int main(int argc, char **argv) +{ + int ret = -1; + string c_logfile = "rgw_dbstore_tests.log"; + int c_loglevel = 20; + + // format: ./dbstore-tests logfile loglevel + if (argc == 3) { + c_logfile = argv[1]; + c_loglevel = (atoi)(argv[2]); + cout << "logfile:" << c_logfile << ", loglevel set to " << c_loglevel << "\n"; + } + + ::testing::InitGoogleTest(&argc, argv); + + gtest::env = new gtest::Environment(); + gtest::env->logfile = c_logfile; + gtest::env->loglevel = c_loglevel; + ::testing::AddGlobalTestEnvironment(gtest::env); + + ret = RUN_ALL_TESTS(); + + return ret; +} diff --git a/src/rgw/driver/immutable_config/store.cc b/src/rgw/driver/immutable_config/store.cc new file mode 100644 index 000000000..8d3e0765f --- /dev/null +++ 
b/src/rgw/driver/immutable_config/store.cc @@ -0,0 +1,422 @@ +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_zone.h" +#include "store.h" + +namespace rgw::sal { + +ImmutableConfigStore::ImmutableConfigStore(const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone, + const RGWPeriodConfig& period_config) + : zonegroup(zonegroup), zone(zone), period_config(period_config) +{ +} + +// Realm +int ImmutableConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string& realm_id) +{ + return -ENOENT; +} + +int ImmutableConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y) +{ + return -EROFS; +} + +/* NOTE(review): the RealmWriter template arguments below were restored after extraction stripped them; confirm against rgw_sal_config.h. */ +int ImmutableConfigStore::create_realm(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWRealm& info, + std::unique_ptr<RealmWriter>* writer) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWRealm& info, + std::unique_ptr<RealmWriter>* writer) +{ + return -ENOENT; +} + +int ImmutableConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr<RealmWriter>* writer) +{ + return -ENOENT; +} + +int ImmutableConfigStore::read_default_realm(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRealm& info, + std::unique_ptr<RealmWriter>* writer) +{ + return -ENOENT; +} + +int ImmutableConfigStore::read_realm_id(const DoutPrefixProvider* dpp, + optional_yield 
y, std::string_view realm_name, + std::string& realm_id) +{ + return -ENOENT; +} + +int ImmutableConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWPeriod& period) +{ + return -ENOTSUP; +} + +/* NOTE(review): the span/ListResult/optional template arguments in this section were restored after extraction stripped them; confirm against rgw_sal_config.h. */ +int ImmutableConfigStore::list_realm_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span<std::string> entries, + ListResult<std::string>& result) +{ + result.next.clear(); + result.entries = entries.first(0); + return 0; +} + + +// Period +int ImmutableConfigStore::create_period(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWPeriod& info) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_period(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view period_id, + std::optional<uint32_t> epoch, RGWPeriod& info) +{ + return -ENOENT; +} + +int ImmutableConfigStore::delete_period(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view period_id) +{ + return -EROFS; +} + +int ImmutableConfigStore::list_period_ids(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span<std::string> entries, + ListResult<std::string>& result) +{ + result.next.clear(); + result.entries = entries.first(0); + return 0; +} + + +// ZoneGroup + +class ImmutableZoneGroupWriter : public ZoneGroupWriter { + public: + int write(const DoutPrefixProvider* dpp, optional_yield y, + const RGWZoneGroup& info) override + { + return -EROFS; + } + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWZoneGroup& info, std::string_view new_name) override + { + return -EROFS; + } + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + return -EROFS; + } +}; + +int ImmutableConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zonegroup_id) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, 
+ std::string_view realm_id, + std::string& zonegroup_id) +{ + if (!realm_id.empty()) { + return -ENOENT; + } + zonegroup_id = zonegroup.id; + return 0; +} + +int ImmutableConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) +{ + return -EROFS; +} + + +int ImmutableConfigStore::create_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneGroup& info, + std::unique_ptr<ZoneGroupWriter>* writer) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_id, + RGWZoneGroup& info, + std::unique_ptr<ZoneGroupWriter>* writer) +{ + if (zonegroup_id != zonegroup.id) { + return -ENOENT; + } + + info = zonegroup; + + if (writer) { + *writer = std::make_unique<ImmutableZoneGroupWriter>(); + } + return 0; +} +int ImmutableConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr<ZoneGroupWriter>* writer) +{ + if (zonegroup_name != zonegroup.name) { + return -ENOENT; + } + + info = zonegroup; + + if (writer) { + *writer = std::make_unique<ImmutableZoneGroupWriter>(); + } + return 0; +} + +int ImmutableConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneGroup& info, + std::unique_ptr<ZoneGroupWriter>* writer) +{ + info = zonegroup; + + if (writer) { + *writer = std::make_unique<ImmutableZoneGroupWriter>(); + } + return 0; +} + +/* NOTE(review): the span/ListResult element types were restored after extraction stripped them; confirm against rgw_sal_config.h. */ +int ImmutableConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span<std::string> entries, + ListResult<std::string>& result) +{ + if (!entries.empty() && marker < zonegroup.name) { /* entries[0] and first(1) need room for one element */ + entries[0] = zonegroup.name; + result.next = zonegroup.name; + result.entries = entries.first(1); + } else { + result.next.clear(); + result.entries = entries.first(0); + } + return 0; +} + +// Zone + +class ImmutableZoneWriter : public ZoneWriter { + public: + int write(const DoutPrefixProvider* dpp, optional_yield y, + const 
RGWZoneParams& info) override + { + return -EROFS; + } + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWZoneParams& info, std::string_view new_name) override + { + return -EROFS; + } + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + return -EROFS; + } +}; + +int ImmutableConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zone_id) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zone_id) +{ + if (realm_id.empty()) { + return -ENOENT; + } + zone_id = zone.id; + return 0; +} + +int ImmutableConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) +{ + return -EROFS; +} + + +int ImmutableConfigStore::create_zone(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneParams& info, + std::unique_ptr* writer) +{ + return -EROFS; +} + +int ImmutableConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_id, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + if (zone_id != zone.id) { + return -ENOENT; + } + + info = zone; + + if (writer) { + *writer = std::make_unique(); + } + return 0; +} + +int ImmutableConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + if (zone_name != zone.name) { + return -ENOENT; + } + + info = zone; + + if (writer) { + *writer = std::make_unique(); + } + return 0; +} + +int ImmutableConfigStore::read_default_zone(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + if (!realm_id.empty()) { + return -ENOENT; + } + + info = zone; + + if (writer) { + *writer = 
std::make_unique(); + } + return 0; +} + +int ImmutableConfigStore::list_zone_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) +{ + if (marker < zone.name) { + entries[0] = zone.name; + result.next = zone.name; + result.entries = entries.first(1); + } else { + result.next.clear(); + result.entries = entries.first(0); + } + return 0; +} + + +// PeriodConfig +int ImmutableConfigStore::read_period_config(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWPeriodConfig& info) +{ + if (!realm_id.empty()) { + return -ENOENT; + } + + info = period_config; + return 0; +} + +int ImmutableConfigStore::write_period_config(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + const RGWPeriodConfig& info) +{ + return -EROFS; +} + + +/// ImmutableConfigStore factory function +auto create_immutable_config_store(const DoutPrefixProvider* dpp, + const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone, + const RGWPeriodConfig& period_config) + -> std::unique_ptr +{ + return std::make_unique(zonegroup, zone, period_config); +} + +} // namespace rgw::sal diff --git a/src/rgw/driver/immutable_config/store.h b/src/rgw/driver/immutable_config/store.h new file mode 100644 index 000000000..9a1ac5f14 --- /dev/null +++ b/src/rgw/driver/immutable_config/store.h @@ -0,0 +1,180 @@ +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "rgw_sal_config.h" + +namespace rgw::sal { + +/// A read-only ConfigStore that serves the given default zonegroup and zone. 
+class ImmutableConfigStore : public ConfigStore { + public: + explicit ImmutableConfigStore(const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone, + const RGWPeriodConfig& period_config); + + // Realm + virtual int write_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id) override; + virtual int read_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string& realm_id) override; + virtual int delete_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y) override; + + virtual int create_realm(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWRealm& info, + std::unique_ptr* writer) override; + virtual int read_realm_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWRealm& info, + std::unique_ptr* writer) override; + virtual int read_realm_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr* writer) override; + virtual int read_default_realm(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRealm& info, + std::unique_ptr* writer) override; + virtual int read_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view realm_name, + std::string& realm_id) override; + virtual int realm_notify_new_period(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWPeriod& period) override; + virtual int list_realm_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) override; + + // Period + virtual int create_period(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWPeriod& info) override; + virtual int read_period(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view period_id, + std::optional epoch, RGWPeriod& info) override; + virtual int delete_period(const DoutPrefixProvider* dpp, + 
optional_yield y, + std::string_view period_id) override; + virtual int list_period_ids(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) override; + + // ZoneGroup + virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zonegroup_id) override; + virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zonegroup_id) override; + virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) override; + + virtual int create_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneGroup& info, + std::unique_ptr* writer) override; + virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_id, + RGWZoneGroup& info, + std::unique_ptr* writer) override; + virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer) override; + virtual int read_default_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneGroup& info, + std::unique_ptr* writer) override; + virtual int list_zonegroup_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) override; + + // Zone + virtual int write_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zone_id) override; + virtual int read_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zone_id) override; + virtual int delete_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + 
std::string_view realm_id) override; + + virtual int create_zone(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneParams& info, + std::unique_ptr* writer) override; + virtual int read_zone_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_id, + RGWZoneParams& info, + std::unique_ptr* writer) override; + virtual int read_zone_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) override; + virtual int read_default_zone(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneParams& info, + std::unique_ptr* writer) override; + virtual int list_zone_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) override; + + // PeriodConfig + virtual int read_period_config(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWPeriodConfig& info) override; + virtual int write_period_config(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + const RGWPeriodConfig& info) override; + + private: + const RGWZoneGroup zonegroup; + const RGWZoneParams zone; + const RGWPeriodConfig period_config; +}; // ImmutableConfigStore + + +/// ImmutableConfigStore factory function +auto create_immutable_config_store(const DoutPrefixProvider* dpp, + const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone, + const RGWPeriodConfig& period_config) + -> std::unique_ptr; + +} // namespace rgw::sal diff --git a/src/rgw/driver/json_config/store.cc b/src/rgw/driver/json_config/store.cc new file mode 100644 index 000000000..cf5adda25 --- /dev/null +++ b/src/rgw/driver/json_config/store.cc @@ -0,0 +1,177 @@ +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include "include/buffer.h" +#include "common/errno.h" +#include "common/ceph_json.h" +#include "rgw_zone.h" +#include "driver/immutable_config/store.h" +#include "store.h" + +namespace rgw::sal { + +namespace { + +struct DecodedConfig { + RGWZoneGroup zonegroup; + RGWZoneParams zone; + RGWPeriodConfig period_config; + + void decode_json(JSONObj *obj) + { + JSONDecoder::decode_json("zonegroup", zonegroup, obj); + JSONDecoder::decode_json("zone", zone, obj); + JSONDecoder::decode_json("period_config", period_config, obj); + } +}; + +static void parse_config(const DoutPrefixProvider* dpp, const char* filename) +{ + bufferlist bl; + std::string errmsg; + int r = bl.read_file(filename, &errmsg); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to read json config file '" << filename + << "': " << errmsg << dendl; + throw std::system_error(-r, std::system_category()); + } + + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + ldpp_dout(dpp, 0) << "failed to parse json config file" << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } + + DecodedConfig config; + try { + decode_json_obj(config, &p); + } catch (const JSONDecoder::err& e) { + ldpp_dout(dpp, 0) << "failed to decode JSON input: " << e.what() << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } +} + +void sanity_check_config(const DoutPrefixProvider* dpp, DecodedConfig& config) +{ + if (config.zonegroup.id.empty()) { + config.zonegroup.id = "default"; + } + if (config.zonegroup.name.empty()) { + config.zonegroup.name = "default"; + } + if (config.zonegroup.api_name.empty()) { + config.zonegroup.api_name = config.zonegroup.name; + } + + if (config.zone.id.empty()) { + config.zone.id = "default"; + } + if 
(config.zone.name.empty()) { + config.zone.name = "default"; + } + + // add default placement if it doesn't exist + rgw_pool pool; + RGWZonePlacementInfo placement; + placement.storage_classes.set_storage_class( + RGW_STORAGE_CLASS_STANDARD, &pool, nullptr); + config.zone.placement_pools.emplace("default-placement", + std::move(placement)); + + std::set pools; + int r = rgw::init_zone_pool_names(dpp, null_yield, pools, config.zone); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to set default zone pool names" << dendl; + throw std::system_error(-r, std::system_category()); + } + + // verify that config.zonegroup only contains config.zone + if (config.zonegroup.zones.size() > 1) { + ldpp_dout(dpp, 0) << "zonegroup cannot contain multiple zones" << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } + + if (config.zonegroup.zones.size() == 1) { + auto z = config.zonegroup.zones.begin(); + if (z->first != config.zone.id) { + ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id=" + << z->first << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } + if (z->second.id != config.zone.id) { + ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id=" + << z->second.id << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } + if (z->second.name != config.zone.name) { + ldpp_dout(dpp, 0) << "zonegroup contains unknown zone name=" + << z->second.name << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } + if (config.zonegroup.master_zone != config.zone.id) { + ldpp_dout(dpp, 0) << "zonegroup contains unknown master_zone=" + << config.zonegroup.master_zone << dendl; + throw std::system_error(make_error_code(std::errc::invalid_argument)); + } + } else { + // add the zone to the group + const bool is_master = true; + const bool read_only = false; + std::list endpoints; + std::list sync_from; + std::list sync_from_rm; + rgw::zone_features::set 
enable_features; + rgw::zone_features::set disable_features; + + enable_features.insert(rgw::zone_features::supported.begin(), + rgw::zone_features::supported.end()); + + int r = rgw::add_zone_to_group(dpp, config.zonegroup, config.zone, + &is_master, &read_only, endpoints, + nullptr, nullptr, sync_from, sync_from_rm, + nullptr, std::nullopt, + enable_features, disable_features); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to add zone to zonegroup: " + << cpp_strerror(r) << dendl; + throw std::system_error(-r, std::system_category()); + } + + config.zonegroup.enabled_features.insert(rgw::zone_features::enabled.begin(), + rgw::zone_features::enabled.end()); + } + + // insert the default placement target if it doesn't exist + auto target = RGWZoneGroupPlacementTarget{.name = "default-placement"}; + config.zonegroup.placement_targets.emplace(target.name, target); + if (config.zonegroup.default_placement.name.empty()) { + config.zonegroup.default_placement.name = target.name; + } +} + +} // anonymous namespace + +auto create_json_config_store(const DoutPrefixProvider* dpp, + const std::string& filename) + -> std::unique_ptr +{ + DecodedConfig config; + parse_config(dpp, filename.c_str()); + sanity_check_config(dpp, config); + return create_immutable_config_store(dpp, config.zonegroup, config.zone, + config.period_config); +} + +} // namespace rgw::sal diff --git a/src/rgw/driver/json_config/store.h b/src/rgw/driver/json_config/store.h new file mode 100644 index 000000000..4482f6716 --- /dev/null +++ b/src/rgw/driver/json_config/store.h @@ -0,0 +1,27 @@ +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "driver/immutable_config/store.h" + +namespace rgw::sal { + +/// Create an immutable ConfigStore by parsing the zonegroup and zone from the +/// given json filename. +auto create_json_config_store(const DoutPrefixProvider* dpp, + const std::string& filename) + -> std::unique_ptr; + +} // namespace rgw::sal diff --git a/src/rgw/driver/rados/cls_fifo_legacy.cc b/src/rgw/driver/rados/cls_fifo_legacy.cc new file mode 100644 index 000000000..f5bb485fa --- /dev/null +++ b/src/rgw/driver/rados/cls_fifo_legacy.cc @@ -0,0 +1,2539 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat + * Author: Adam C. Emerson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "include/rados/librados.hpp" + +#include "include/buffer.h" + +#include "common/async/yield_context.h" +#include "common/random_string.h" + +#include "cls/fifo/cls_fifo_types.h" +#include "cls/fifo/cls_fifo_ops.h" + +#include "cls_fifo_legacy.h" + +namespace rgw::cls::fifo { +namespace cb = ceph::buffer; +namespace fifo = rados::cls::fifo; + +using ceph::from_error_code; + +inline constexpr auto MAX_RACE_RETRIES = 10; + +void create_meta(lr::ObjectWriteOperation* op, + std::string_view id, + std::optional objv, + std::optional oid_prefix, + bool exclusive, + std::uint64_t max_part_size, + std::uint64_t max_entry_size) +{ + fifo::op::create_meta cm; + + cm.id = id; + cm.version = objv; + cm.oid_prefix = oid_prefix; + cm.max_part_size = max_part_size; + cm.max_entry_size = max_entry_size; + cm.exclusive = exclusive; + + cb::list in; + encode(cm, in); + op->exec(fifo::op::CLASS, fifo::op::CREATE_META, in); 
+} + +int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, + std::optional objv, fifo::info* info, + std::uint32_t* part_header_size, + std::uint32_t* part_entry_overhead, + uint64_t tid, optional_yield y, + bool probe) +{ + lr::ObjectReadOperation op; + fifo::op::get_meta gm; + gm.version = objv; + cb::list in; + encode(gm, in); + cb::list bl; + + op.exec(fifo::op::CLASS, fifo::op::GET_META, in, + &bl, nullptr); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y); + if (r >= 0) try { + fifo::op::get_meta_reply reply; + auto iter = bl.cbegin(); + decode(reply, iter); + if (info) *info = std::move(reply.info); + if (part_header_size) *part_header_size = reply.part_header_size; + if (part_entry_overhead) + *part_entry_overhead = reply.part_entry_overhead; + } catch (const cb::error& err) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " decode failed: " << err.what() + << " tid=" << tid << dendl; + r = from_error_code(err.code()); + } else if (!(probe && (r == -ENOENT || r == -ENODATA))) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " fifo::op::GET_META failed r=" << r << " tid=" << tid + << dendl; + } + return r; +}; + +namespace { +void update_meta(lr::ObjectWriteOperation* op, const fifo::objv& objv, + const fifo::update& update) +{ + fifo::op::update_meta um; + + um.version = objv; + um.tail_part_num = update.tail_part_num(); + um.head_part_num = update.head_part_num(); + um.min_push_part_num = update.min_push_part_num(); + um.max_push_part_num = update.max_push_part_num(); + um.journal_entries_add = std::move(update).journal_entries_add(); + um.journal_entries_rm = std::move(update).journal_entries_rm(); + + cb::list in; + encode(um, in); + op->exec(fifo::op::CLASS, fifo::op::UPDATE_META, in); +} + +void part_init(lr::ObjectWriteOperation* op, fifo::data_params params) +{ + fifo::op::init_part ip; + + ip.params = params; + + cb::list in; + encode(ip, in); + 
op->exec(fifo::op::CLASS, fifo::op::INIT_PART, in); +} + +int push_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, + std::deque data_bufs, std::uint64_t tid, + optional_yield y) +{ + lr::ObjectWriteOperation op; + fifo::op::push_part pp; + + op.assert_exists(); + + pp.data_bufs = data_bufs; + pp.total_len = 0; + + for (const auto& bl : data_bufs) + pp.total_len += bl.length(); + + cb::list in; + encode(pp, in); + auto retval = 0; + op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in, nullptr, &retval); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y, lr::OPERATION_RETURNVEC); + if (r < 0) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " fifo::op::PUSH_PART failed r=" << r + << " tid=" << tid << dendl; + return r; + } + if (retval < 0) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " error handling response retval=" << retval + << " tid=" << tid << dendl; + } + return retval; +} + +void push_part(lr::IoCtx& ioctx, const std::string& oid, + std::deque data_bufs, std::uint64_t tid, + lr::AioCompletion* c) +{ + lr::ObjectWriteOperation op; + fifo::op::push_part pp; + + pp.data_bufs = data_bufs; + pp.total_len = 0; + + for (const auto& bl : data_bufs) + pp.total_len += bl.length(); + + cb::list in; + encode(pp, in); + op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in); + auto r = ioctx.aio_operate(oid, c, &op, lr::OPERATION_RETURNVEC); + ceph_assert(r >= 0); +} + +void trim_part(lr::ObjectWriteOperation* op, + std::uint64_t ofs, bool exclusive) +{ + fifo::op::trim_part tp; + + tp.ofs = ofs; + tp.exclusive = exclusive; + + cb::list in; + encode(tp, in); + op->exec(fifo::op::CLASS, fifo::op::TRIM_PART, in); +} + +int list_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, + std::uint64_t ofs, std::uint64_t max_entries, + std::vector* entries, + bool* more, bool* full_part, + std::uint64_t tid, optional_yield y) +{ + lr::ObjectReadOperation op; + fifo::op::list_part 
lp; + + lp.ofs = ofs; + lp.max_entries = max_entries; + + cb::list in; + encode(lp, in); + cb::list bl; + op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in, &bl, nullptr); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y); + if (r >= 0) try { + fifo::op::list_part_reply reply; + auto iter = bl.cbegin(); + decode(reply, iter); + if (entries) *entries = std::move(reply.entries); + if (more) *more = reply.more; + if (full_part) *full_part = reply.full_part; + } catch (const cb::error& err) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " decode failed: " << err.what() + << " tid=" << tid << dendl; + r = from_error_code(err.code()); + } else if (r != -ENOENT) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid + << dendl; + } + return r; +} + +struct list_entry_completion : public lr::ObjectOperationCompletion { + CephContext* cct; + int* r_out; + std::vector* entries; + bool* more; + bool* full_part; + std::uint64_t tid; + + list_entry_completion(CephContext* cct, int* r_out, std::vector* entries, + bool* more, bool* full_part, std::uint64_t tid) + : cct(cct), r_out(r_out), entries(entries), more(more), + full_part(full_part), tid(tid) {} + virtual ~list_entry_completion() = default; + void handle_completion(int r, bufferlist& bl) override { + if (r >= 0) try { + fifo::op::list_part_reply reply; + auto iter = bl.cbegin(); + decode(reply, iter); + if (entries) *entries = std::move(reply.entries); + if (more) *more = reply.more; + if (full_part) *full_part = reply.full_part; + } catch (const cb::error& err) { + lderr(cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " decode failed: " << err.what() + << " tid=" << tid << dendl; + r = from_error_code(err.code()); + } else if (r < 0) { + lderr(cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid + << dendl; + } + if (r_out) *r_out = r; + } +}; + 
+lr::ObjectReadOperation list_part(CephContext* cct, + std::uint64_t ofs, + std::uint64_t max_entries, + int* r_out, + std::vector* entries, + bool* more, bool* full_part, + std::uint64_t tid) +{ + lr::ObjectReadOperation op; + fifo::op::list_part lp; + + lp.ofs = ofs; + lp.max_entries = max_entries; + + cb::list in; + encode(lp, in); + op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in, + new list_entry_completion(cct, r_out, entries, more, full_part, + tid)); + return op; +} + +int get_part_info(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, + fifo::part_header* header, + std::uint64_t tid, optional_yield y) +{ + lr::ObjectReadOperation op; + fifo::op::get_part_info gpi; + + cb::list in; + cb::list bl; + encode(gpi, in); + op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in, &bl, nullptr); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y); + if (r >= 0) try { + fifo::op::get_part_info_reply reply; + auto iter = bl.cbegin(); + decode(reply, iter); + if (header) *header = std::move(reply.header); + } catch (const cb::error& err) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " decode failed: " << err.what() + << " tid=" << tid << dendl; + r = from_error_code(err.code()); + } else { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid + << dendl; + } + return r; +} + +struct partinfo_completion : public lr::ObjectOperationCompletion { + CephContext* cct; + int* rp; + fifo::part_header* h; + std::uint64_t tid; + partinfo_completion(CephContext* cct, int* rp, fifo::part_header* h, + std::uint64_t tid) : + cct(cct), rp(rp), h(h), tid(tid) { + } + virtual ~partinfo_completion() = default; + void handle_completion(int r, bufferlist& bl) override { + if (r >= 0) try { + fifo::op::get_part_info_reply reply; + auto iter = bl.cbegin(); + decode(reply, iter); + if (h) *h = std::move(reply.header); + } catch (const cb::error& err) { + r = 
from_error_code(err.code()); + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " decode failed: " << err.what() + << " tid=" << tid << dendl; + } else { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid + << dendl; + } + if (rp) { + *rp = r; + } + } +}; + +lr::ObjectReadOperation get_part_info(CephContext* cct, + fifo::part_header* header, + std::uint64_t tid, int* r = 0) +{ + lr::ObjectReadOperation op; + fifo::op::get_part_info gpi; + + cb::list in; + cb::list bl; + encode(gpi, in); + op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in, + new partinfo_completion(cct, r, header, tid)); + return op; +} +} + +std::optional FIFO::to_marker(std::string_view s) +{ + marker m; + if (s.empty()) { + m.num = info.tail_part_num; + m.ofs = 0; + return m; + } + + auto pos = s.find(':'); + if (pos == s.npos) { + return std::nullopt; + } + + auto num = s.substr(0, pos); + auto ofs = s.substr(pos + 1); + + auto n = ceph::parse(num); + if (!n) { + return std::nullopt; + } + m.num = *n; + auto o = ceph::parse(ofs); + if (!o) { + return std::nullopt; + } + m.ofs = *o; + return m; +} + +int FIFO::apply_update(const DoutPrefixProvider *dpp, + fifo::info* info, + const fifo::objv& objv, + const fifo::update& update, + std::uint64_t tid) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::unique_lock l(m); + if (objv != info->version) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " version mismatch, canceling: tid=" << tid << dendl; + return -ECANCELED; + } + + info->apply_update(update); + return {}; +} + +int FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update, + fifo::objv version, bool* pcanceled, + std::uint64_t tid, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectWriteOperation op; + bool canceled = false; 
+ update_meta(&op, version, update); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r >= 0 || r == -ECANCELED) { + canceled = (r == -ECANCELED); + if (!canceled) { + r = apply_update(dpp, &info, version, update, tid); + if (r < 0) canceled = true; + } + if (canceled) { + r = read_meta(dpp, tid, y); + canceled = r < 0 ? false : true; + } + } + if (pcanceled) *pcanceled = canceled; + if (canceled) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled: tid=" << tid << dendl; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " returning error: r=" << r << " tid=" << tid << dendl; + } + return r; +} + +struct Updater : public Completion { + FIFO* fifo; + fifo::update update; + fifo::objv version; + bool reread = false; + bool* pcanceled = nullptr; + std::uint64_t tid; + Updater(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super, + const fifo::update& update, fifo::objv version, + bool* pcanceled, std::uint64_t tid) + : Completion(dpp, super), fifo(fifo), update(update), version(version), + pcanceled(pcanceled) {} + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + if (reread) + handle_reread(dpp, std::move(p), r); + else + handle_update(dpp, std::move(p), r); + } + + void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " handling async update_meta: tid=" + << tid << dendl; + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " update failed: r=" << r << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + bool canceled = (r == -ECANCELED); + if (!canceled) { + int r = fifo->apply_update(dpp, &fifo->info, version, update, tid); + if (r < 0) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " 
update failed, marking canceled: r=" << r + << " tid=" << tid << dendl; + canceled = true; + } + } + if (canceled) { + reread = true; + fifo->read_meta(dpp, tid, call(std::move(p))); + return; + } + if (pcanceled) + *pcanceled = false; + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " completing: tid=" << tid << dendl; + complete(std::move(p), 0); + } + + void handle_reread(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " handling async read_meta: tid=" + << tid << dendl; + if (r < 0 && pcanceled) { + *pcanceled = false; + } else if (r >= 0 && pcanceled) { + *pcanceled = true; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " failed dispatching read_meta: r=" << r << " tid=" + << tid << dendl; + } else { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " completing: tid=" << tid << dendl; + } + complete(std::move(p), r); + } +}; + +void FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update, + fifo::objv version, bool* pcanceled, + std::uint64_t tid, lr::AioCompletion* c) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectWriteOperation op; + update_meta(&op, info.version, update); + auto updater = std::make_unique(dpp, this, c, update, version, pcanceled, + tid); + auto r = ioctx.aio_operate(oid, Updater::call(std::move(updater)), &op); + assert(r >= 0); +} + +int FIFO::create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid, + optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectWriteOperation op; + op.create(false); /* We don't need exclusivity, part_init ensures + we're creating from the same journal entry. 
*/ + std::unique_lock l(m); + part_init(&op, info.params); + auto oid = info.part_oid(part_num); + l.unlock(); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " part_init failed: r=" << r << " tid=" + << tid << dendl; + } + return r; +} + +int FIFO::remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid, + optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectWriteOperation op; + op.remove(); + std::unique_lock l(m); + auto oid = info.part_oid(part_num); + l.unlock(); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " remove failed: r=" << r << " tid=" + << tid << dendl; + } + return r; +} + +int FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::vector processed; + + std::unique_lock l(m); + auto tmpjournal = info.journal; + auto new_tail = info.tail_part_num; + auto new_head = info.head_part_num; + auto new_max = info.max_push_part_num; + l.unlock(); + + int r = 0; + for (auto& entry : tmpjournal) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " processing entry: entry=" << entry << " tid=" << tid + << dendl; + switch (entry.op) { + using enum fifo::journal_entry::Op; + case create: + r = create_part(dpp, entry.part_num, tid, y); + if (entry.part_num > new_max) { + new_max = entry.part_num; + } + break; + case set_head: + r = 0; + if (entry.part_num > new_head) { + new_head = entry.part_num; + } + break; + case remove: + r = remove_part(dpp, entry.part_num, tid, y); + if (r == -ENOENT) r = 0; + if (entry.part_num >= new_tail) { + new_tail = entry.part_num + 1; + } + break; + default: + ldpp_dout(dpp, -1) << 
__PRETTY_FUNCTION__ << ":" << __LINE__ + << " unknown journaled op: entry=" << entry << " tid=" + << tid << dendl; + return -EIO; + } + + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " processing entry failed: entry=" << entry + << " r=" << r << " tid=" << tid << dendl; + return -r; + } + + processed.push_back(std::move(entry)); + } + + // Postprocess + bool canceled = true; + + for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " postprocessing: i=" << i << " tid=" << tid << dendl; + + std::optional tail_part_num; + std::optional head_part_num; + std::optional max_part_num; + + std::unique_lock l(m); + auto objv = info.version; + if (new_tail > tail_part_num) tail_part_num = new_tail; + if (new_head > info.head_part_num) head_part_num = new_head; + if (new_max > info.max_push_part_num) max_part_num = new_max; + l.unlock(); + + if (processed.empty() && + !tail_part_num && + !max_part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " nothing to update any more: i=" << i << " tid=" + << tid << dendl; + canceled = false; + break; + } + auto u = fifo::update().tail_part_num(tail_part_num) + .head_part_num(head_part_num).max_push_part_num(max_part_num) + .journal_entries_rm(processed); + r = _update_meta(dpp, u, objv, &canceled, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _update_meta failed: update=" << u + << " r=" << r << " tid=" << tid << dendl; + break; + } + + if (canceled) { + std::vector new_processed; + std::unique_lock l(m); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " update canceled, retrying: i=" << i << " tid=" + << tid << dendl; + for (auto& e : processed) { + if (info.journal.contains(e)) { + new_processed.push_back(e); + } + } + processed = std::move(new_processed); + } + } + if (r == 0 && canceled) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ 
<< ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + r = -ECANCELED; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " failed, r=: " << r << " tid=" << tid << dendl; + } + return r; +} + +int FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, + std::int64_t new_part_num, bool is_head, + std::uint64_t tid, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::unique_lock l(m); + using enum fifo::journal_entry::Op; + std::vector jentries{{ create, new_part_num }}; + if (info.journal.contains({create, new_part_num}) && + (!is_head || info.journal.contains({set_head, new_part_num}))) { + l.unlock(); + ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " new part journaled, but not processed: tid=" + << tid << dendl; + auto r = process_journal(dpp, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " process_journal failed: r=" << r << " tid=" << tid << dendl; + } + return r; + } + auto version = info.version; + + if (is_head) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " needs new head: tid=" << tid << dendl; + jentries.push_back({ set_head, new_part_num }); + } + l.unlock(); + + int r = 0; + bool canceled = true; + for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) { + canceled = false; + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " updating metadata: i=" << i << " tid=" << tid << dendl; + auto u = fifo::update{}.journal_entries_add(jentries); + r = _update_meta(dpp, u, version, &canceled, tid, y); + if (r >= 0 && canceled) { + std::unique_lock l(m); + version = info.version; + auto found = (info.journal.contains({create, new_part_num}) || + info.journal.contains({set_head, new_part_num})); + if ((info.max_push_part_num >= new_part_num && + info.head_part_num >= new_part_num)) { + ldpp_dout(dpp, 20) << 
__PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, but journaled and processed: i=" << i + << " tid=" << tid << dendl; + return 0; + } + if (found) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, journaled but not processed: i=" << i + << " tid=" << tid << dendl; + canceled = false; + } + l.unlock(); + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _update_meta failed: update=" << u << " r=" << r + << " tid=" << tid << dendl; + return r; + } + } + if (canceled) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + return -ECANCELED; + } + r = process_journal(dpp, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " process_journal failed: r=" << r << " tid=" << tid << dendl; + } + return r; +} + +int FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, + std::int64_t new_head_part_num, + std::uint64_t tid, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::unique_lock l(m); + auto max_push_part_num = info.max_push_part_num; + auto version = info.version; + l.unlock(); + + int r = 0; + if (max_push_part_num < new_head_part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " need new part: tid=" << tid << dendl; + r = _prepare_new_part(dpp, new_head_part_num, true, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _prepare_new_part failed: r=" << r + << " tid=" << tid << dendl; + return r; + } + std::unique_lock l(m); + if (info.max_push_part_num < new_head_part_num) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " inconsistency, push part less than head part: " + << " tid=" << tid << dendl; + return -EIO; + } + l.unlock(); + return 0; + } + + using enum fifo::journal_entry::Op; + fifo::journal_entry 
jentry; + jentry.op = set_head; + jentry.part_num = new_head_part_num; + + r = 0; + bool canceled = true; + for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) { + canceled = false; + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " updating metadata: i=" << i << " tid=" << tid << dendl; + auto u = fifo::update{}.journal_entries_add({{ jentry }}); + r = _update_meta(dpp, u, version, &canceled, tid, y); + if (r >= 0 && canceled) { + std::unique_lock l(m); + auto found = (info.journal.contains({create, new_head_part_num}) || + info.journal.contains({set_head, new_head_part_num})); + version = info.version; + if ((info.head_part_num >= new_head_part_num)) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, but journaled and processed: i=" << i + << " tid=" << tid << dendl; + return 0; + } + if (found) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, journaled but not processed: i=" << i + << " tid=" << tid << dendl; + canceled = false; + } + l.unlock(); + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _update_meta failed: update=" << u << " r=" << r + << " tid=" << tid << dendl; + return r; + } + } + if (canceled) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + return -ECANCELED; + } + r = process_journal(dpp, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " process_journal failed: r=" << r << " tid=" << tid << dendl; + } + return r; +} + +struct NewPartPreparer : public Completion { + FIFO* f; + std::vector jentries; + int i = 0; + std::int64_t new_part_num; + bool canceled = false; + uint64_t tid; + + NewPartPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super, + std::vector jentries, + std::int64_t new_part_num, + std::uint64_t tid) + : Completion(dpp, super), f(f), jentries(std::move(jentries)), + 
new_part_num(new_part_num), tid(tid) {} + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _update_meta failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + + if (canceled) { + using enum fifo::journal_entry::Op; + std::unique_lock l(f->m); + auto found = (f->info.journal.contains({create, new_part_num}) || + f->info.journal.contains({set_head, new_part_num})); + auto max_push_part_num = f->info.max_push_part_num; + auto head_part_num = f->info.head_part_num; + auto version = f->info.version; + l.unlock(); + if ((max_push_part_num >= new_part_num && + head_part_num >= new_part_num)) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, but journaled and processed: i=" << i + << " tid=" << tid << dendl; + complete(std::move(p), 0); + return; + } + if (i >= MAX_RACE_RETRIES) { + complete(std::move(p), -ECANCELED); + return; + } + if (!found) { + ++i; + f->_update_meta(dpp, fifo::update{} + .journal_entries_add(jentries), + version, &canceled, tid, call(std::move(p))); + return; + } else { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, journaled but not processed: i=" << i + << " tid=" << tid << dendl; + canceled = false; + } + // Fall through. We still need to process the journal. 
+ } + f->process_journal(dpp, tid, super()); + return; + } +}; + +void FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, + bool is_head, std::uint64_t tid, lr::AioCompletion* c) +{ + std::unique_lock l(m); + using enum fifo::journal_entry::Op; + std::vector jentries{{create, new_part_num}}; + if (info.journal.contains({create, new_part_num}) && + (!is_head || info.journal.contains({set_head, new_part_num}))) { + l.unlock(); + ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " new part journaled, but not processed: tid=" + << tid << dendl; + process_journal(dpp, tid, c); + return; + } + auto version = info.version; + + if (is_head) { + jentries.push_back({ set_head, new_part_num }); + } + l.unlock(); + + auto n = std::make_unique(dpp, this, c, jentries, + new_part_num, tid); + auto np = n.get(); + _update_meta(dpp, fifo::update{}.journal_entries_add(jentries), version, + &np->canceled, tid, NewPartPreparer::call(std::move(n))); +} + +struct NewHeadPreparer : public Completion { + FIFO* f; + int i = 0; + bool newpart; + std::int64_t new_head_part_num; + bool canceled = false; + std::uint64_t tid; + + NewHeadPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super, + bool newpart, std::int64_t new_head_part_num, + std::uint64_t tid) + : Completion(dpp, super), f(f), newpart(newpart), + new_head_part_num(new_head_part_num), tid(tid) {} + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + if (newpart) + handle_newpart(std::move(p), r); + else + handle_update(dpp, std::move(p), r); + } + + void handle_newpart(Ptr&& p, int r) { + if (r < 0) { + lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _prepare_new_part failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + std::unique_lock l(f->m); + if (f->info.max_push_part_num < new_head_part_num) { + l.unlock(); + lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _prepare_new_part 
failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), -EIO); + } else { + l.unlock(); + complete(std::move(p), 0); + } + } + + void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _update_meta failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + + if (canceled) { + using enum fifo::journal_entry::Op; + std::unique_lock l(f->m); + auto found = (f->info.journal.contains({create, new_head_part_num }) || + f->info.journal.contains({set_head, new_head_part_num })); + auto head_part_num = f->info.head_part_num; + auto version = f->info.version; + + l.unlock(); + if ((head_part_num >= new_head_part_num)) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, but journaled and processed: i=" << i + << " tid=" << tid << dendl; + complete(std::move(p), 0); + return; + } + if (i >= MAX_RACE_RETRIES) { + complete(std::move(p), -ECANCELED); + return; + } + if (!found) { + ++i; + fifo::journal_entry jentry; + jentry.op = set_head; + jentry.part_num = new_head_part_num; + f->_update_meta(dpp, fifo::update{} + .journal_entries_add({{jentry}}), + version, &canceled, tid, call(std::move(p))); + return; + } else { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced, journaled but not processed: i=" << i + << " tid=" << tid << dendl; + canceled = false; + } + // Fall through. We still need to process the journal. 
+ } + f->process_journal(dpp, tid, super()); + return; + } +}; + +void FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num, + std::uint64_t tid, lr::AioCompletion* c) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::unique_lock l(m); + auto max_push_part_num = info.max_push_part_num; + auto version = info.version; + l.unlock(); + + if (max_push_part_num < new_head_part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " need new part: tid=" << tid << dendl; + auto n = std::make_unique(dpp, this, c, true, new_head_part_num, + tid); + _prepare_new_part(dpp, new_head_part_num, true, tid, + NewHeadPreparer::call(std::move(n))); + } else { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " updating head: tid=" << tid << dendl; + auto n = std::make_unique(dpp, this, c, false, new_head_part_num, + tid); + auto np = n.get(); + using enum fifo::journal_entry::Op; + fifo::journal_entry jentry; + jentry.op = set_head; + jentry.part_num = new_head_part_num; + _update_meta(dpp, fifo::update{}.journal_entries_add({{jentry}}), version, + &np->canceled, tid, NewHeadPreparer::call(std::move(n))); + } +} + +int FIFO::push_entries(const DoutPrefixProvider *dpp, const std::deque& data_bufs, + std::uint64_t tid, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::unique_lock l(m); + auto head_part_num = info.head_part_num; + const auto part_oid = info.part_oid(head_part_num); + l.unlock(); + + auto r = push_part(dpp, ioctx, part_oid, data_bufs, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " push_part failed: r=" << r << " tid=" << tid << dendl; + } + return r; +} + +void FIFO::push_entries(const std::deque& data_bufs, + std::uint64_t tid, lr::AioCompletion* c) +{ + std::unique_lock l(m); + auto head_part_num = 
info.head_part_num; + const auto part_oid = info.part_oid(head_part_num); + l.unlock(); + + push_part(ioctx, part_oid, data_bufs, tid, c); +} + +int FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs, + bool exclusive, std::uint64_t tid, + optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectWriteOperation op; + std::unique_lock l(m); + const auto part_oid = info.part_oid(part_num); + l.unlock(); + rgw::cls::fifo::trim_part(&op, ofs, exclusive); + auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " trim_part failed: r=" << r << " tid=" << tid << dendl; + } + return 0; +} + +void FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs, + bool exclusive, std::uint64_t tid, + lr::AioCompletion* c) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectWriteOperation op; + std::unique_lock l(m); + const auto part_oid = info.part_oid(part_num); + l.unlock(); + rgw::cls::fifo::trim_part(&op, ofs, exclusive); + auto r = ioctx.aio_operate(part_oid, c, &op); + ceph_assert(r >= 0); +} + +int FIFO::open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr* fifo, + optional_yield y, std::optional objv, + bool probe) +{ + ldpp_dout(dpp, 20) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering" << dendl; + fifo::info info; + std::uint32_t size; + std::uint32_t over; + int r = get_meta(dpp, ioctx, std::move(oid), objv, &info, &size, &over, 0, y, + probe); + if (r < 0) { + if (!(probe && (r == -ENOENT || r == -ENODATA))) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " get_meta failed: r=" << r << dendl; + } + return r; + } + std::unique_ptr f(new FIFO(std::move(ioctx), oid)); + f->info = info; + f->part_header_size = size; + 
f->part_entry_overhead = over; + // If there are journal entries, process them, in case + // someone crashed mid-transaction. + if (!info.journal.empty()) { + ldpp_dout(dpp, 20) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " processing leftover journal" << dendl; + r = f->process_journal(dpp, 0, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " process_journal failed: r=" << r << dendl; + return r; + } + } + *fifo = std::move(f); + return 0; +} + +int FIFO::create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr* fifo, + optional_yield y, std::optional objv, + std::optional oid_prefix, + bool exclusive, std::uint64_t max_part_size, + std::uint64_t max_entry_size) +{ + ldpp_dout(dpp, 20) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering" << dendl; + lr::ObjectWriteOperation op; + create_meta(&op, oid, objv, oid_prefix, exclusive, max_part_size, + max_entry_size); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " create_meta failed: r=" << r << dendl; + return r; + } + r = open(dpp, std::move(ioctx), std::move(oid), fifo, y, objv); + return r; +} + +int FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + fifo::info _info; + std::uint32_t _phs; + std::uint32_t _peo; + + auto r = get_meta(dpp, ioctx, oid, std::nullopt, &_info, &_phs, &_peo, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " get_meta failed: r=" << r << " tid=" << tid << dendl; + return r; + } + std::unique_lock l(m); + // We have a newer version already! 
+ if (_info.version.same_or_later(this->info.version)) { + info = std::move(_info); + part_header_size = _phs; + part_entry_overhead = _peo; + } + return 0; +} + +int FIFO::read_meta(const DoutPrefixProvider *dpp, optional_yield y) { + std::unique_lock l(m); + auto tid = ++next_tid; + l.unlock(); + return read_meta(dpp, tid, y); +} + +struct Reader : public Completion { + FIFO* fifo; + cb::list bl; + std::uint64_t tid; + Reader(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super, std::uint64_t tid) + : Completion(dpp, super), fifo(fifo), tid(tid) {} + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + if (r >= 0) try { + fifo::op::get_meta_reply reply; + auto iter = bl.cbegin(); + decode(reply, iter); + std::unique_lock l(fifo->m); + if (reply.info.version.same_or_later(fifo->info.version)) { + fifo->info = std::move(reply.info); + fifo->part_header_size = reply.part_header_size; + fifo->part_entry_overhead = reply.part_entry_overhead; + } + } catch (const cb::error& err) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " failed to decode response err=" << err.what() + << " tid=" << tid << dendl; + r = from_error_code(err.code()); + } else { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " read_meta failed r=" << r + << " tid=" << tid << dendl; + } + complete(std::move(p), r); + } +}; + +void FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + lr::ObjectReadOperation op; + fifo::op::get_meta gm; + cb::list in; + encode(gm, in); + auto reader = std::make_unique(dpp, this, c, tid); + auto rp = reader.get(); + auto r = ioctx.aio_exec(oid, Reader::call(std::move(reader)), fifo::op::CLASS, + fifo::op::GET_META, in, &rp->bl); + assert(r >= 0); +} + +const 
fifo::info& FIFO::meta() const { + return info; +} + +std::pair FIFO::get_part_layout_info() const { + return {part_header_size, part_entry_overhead}; +} + +int FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, optional_yield y) { + return push(dpp, std::vector{ bl }, y); +} + +void FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, lr::AioCompletion* c) { + push(dpp, std::vector{ bl }, c); +} + +int FIFO::push(const DoutPrefixProvider *dpp, const std::vector& data_bufs, optional_yield y) +{ + std::unique_lock l(m); + auto tid = ++next_tid; + auto max_entry_size = info.params.max_entry_size; + auto need_new_head = info.need_new_head(); + auto head_part_num = info.head_part_num; + l.unlock(); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + if (data_bufs.empty()) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " empty push, returning success tid=" << tid << dendl; + return 0; + } + + // Validate sizes + for (const auto& bl : data_bufs) { + if (bl.length() > max_entry_size) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entry bigger than max_entry_size tid=" << tid << dendl; + return -E2BIG; + } + } + + int r = 0; + if (need_new_head) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " need new head tid=" << tid << dendl; + r = _prepare_new_head(dpp, head_part_num + 1, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _prepare_new_head failed: r=" << r + << " tid=" << tid << dendl; + return r; + } + } + + std::deque remaining(data_bufs.begin(), data_bufs.end()); + std::deque batch; + + uint64_t batch_len = 0; + auto retries = 0; + bool canceled = true; + while ((!remaining.empty() || !batch.empty()) && + (retries <= MAX_RACE_RETRIES)) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " preparing push: remaining=" << remaining.size() + << " batch=" << 
batch.size() << " retries=" << retries + << " tid=" << tid << dendl; + std::unique_lock l(m); + head_part_num = info.head_part_num; + auto max_part_size = info.params.max_part_size; + auto overhead = part_entry_overhead; + l.unlock(); + + while (!remaining.empty() && + (remaining.front().length() + batch_len <= max_part_size)) { + /* We can send entries with data_len up to max_entry_size, + however, we want to also account the overhead when + dealing with multiple entries. Previous check doesn't + account for overhead on purpose. */ + batch_len += remaining.front().length() + overhead; + batch.push_back(std::move(remaining.front())); + remaining.pop_front(); + } + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " prepared push: remaining=" << remaining.size() + << " batch=" << batch.size() << " retries=" << retries + << " batch_len=" << batch_len + << " tid=" << tid << dendl; + + auto r = push_entries(dpp, batch, tid, y); + if (r == -ERANGE) { + canceled = true; + ++retries; + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " need new head tid=" << tid << dendl; + r = _prepare_new_head(dpp, head_part_num + 1, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " prepare_new_head failed: r=" << r + << " tid=" << tid << dendl; + return r; + } + r = 0; + continue; + } + if (r == -ENOENT) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " racing client trimmed part, rereading metadata " + << "tid=" << tid << dendl; + canceled = true; + ++retries; + r = read_meta(dpp, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " read_meta failed: r=" << r + << " tid=" << tid << dendl; + return r; + } + r = 0; + continue; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " push_entries failed: r=" << r + << " tid=" << tid << dendl; + return r; + } + // Made forward progress! 
+ canceled = false; + retries = 0; + batch_len = 0; + if (r == ssize(batch)) { + batch.clear(); + } else { + batch.erase(batch.begin(), batch.begin() + r); + for (const auto& b : batch) { + batch_len += b.length() + part_entry_overhead; + } + } + } + if (canceled) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + return -ECANCELED; + } + return 0; +} + +struct Pusher : public Completion { + FIFO* f; + std::deque remaining; + std::deque batch; + int i = 0; + std::int64_t head_part_num; + std::uint64_t tid; + enum { pushing, new_heading, meta_reading } state = pushing; + + void prep_then_push(const DoutPrefixProvider *dpp, Ptr&& p, const unsigned successes) { + std::unique_lock l(f->m); + auto max_part_size = f->info.params.max_part_size; + auto part_entry_overhead = f->part_entry_overhead; + head_part_num = f->info.head_part_num; + l.unlock(); + + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " preparing push: remaining=" << remaining.size() + << " batch=" << batch.size() << " i=" << i + << " tid=" << tid << dendl; + + uint64_t batch_len = 0; + if (successes > 0) { + if (successes == batch.size()) { + batch.clear(); + } else { + batch.erase(batch.begin(), batch.begin() + successes); + for (const auto& b : batch) { + batch_len += b.length() + part_entry_overhead; + } + } + } + + if (batch.empty() && remaining.empty()) { + complete(std::move(p), 0); + return; + } + + while (!remaining.empty() && + (remaining.front().length() + batch_len <= max_part_size)) { + + /* We can send entries with data_len up to max_entry_size, + however, we want to also account the overhead when + dealing with multiple entries. Previous check doesn't + account for overhead on purpose. 
*/ + batch_len += remaining.front().length() + part_entry_overhead; + batch.push_back(std::move(remaining.front())); + remaining.pop_front(); + } + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " prepared push: remaining=" << remaining.size() + << " batch=" << batch.size() << " i=" << i + << " batch_len=" << batch_len + << " tid=" << tid << dendl; + push(std::move(p)); + } + + void push(Ptr&& p) { + f->push_entries(batch, tid, call(std::move(p))); + } + + void new_head(const DoutPrefixProvider *dpp, Ptr&& p) { + state = new_heading; + f->_prepare_new_head(dpp, head_part_num + 1, tid, call(std::move(p))); + } + + void read_meta(const DoutPrefixProvider *dpp, Ptr&& p) { + ++i; + state = meta_reading; + f->read_meta(dpp, tid, call(std::move(p))); + } + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + switch (state) { + case pushing: + if (r == -ERANGE) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " need new head tid=" << tid << dendl; + new_head(dpp, std::move(p)); + return; + } + if (r == -ENOENT) { + if (i > MAX_RACE_RETRIES) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " racing client deleted part, but we're out" + << " of retries: tid=" << tid << dendl; + complete(std::move(p), r); + } + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " racing client deleted part: tid=" << tid << dendl; + read_meta(dpp, std::move(p)); + return; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " push_entries failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + i = 0; // We've made forward progress, so reset the race counter! 
+ prep_then_push(dpp, std::move(p), r); + break; + + case new_heading: + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " prepare_new_head failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + state = pushing; + handle_new_head(dpp, std::move(p), r); + break; + + case meta_reading: + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " read_meta failed: r=" << r + << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + state = pushing; + prep_then_push(dpp, std::move(p), r); + break; + } + } + + void handle_new_head(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + if (r == -ECANCELED) { + if (p->i == MAX_RACE_RETRIES) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + complete(std::move(p), -ECANCELED); + return; + } + ++p->i; + } else if (r) { + complete(std::move(p), r); + return; + } + + if (p->batch.empty()) { + prep_then_push(dpp, std::move(p), 0); + return; + } else { + push(std::move(p)); + return; + } + } + + Pusher(const DoutPrefixProvider *dpp, FIFO* f, std::deque&& remaining, + std::int64_t head_part_num, std::uint64_t tid, + lr::AioCompletion* super) + : Completion(dpp, super), f(f), remaining(std::move(remaining)), + head_part_num(head_part_num), tid(tid) {} +}; + +void FIFO::push(const DoutPrefixProvider *dpp, const std::vector& data_bufs, + lr::AioCompletion* c) +{ + std::unique_lock l(m); + auto tid = ++next_tid; + auto max_entry_size = info.params.max_entry_size; + auto need_new_head = info.need_new_head(); + auto head_part_num = info.head_part_num; + l.unlock(); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + auto p = std::make_unique(dpp, this, std::deque(data_bufs.begin(), data_bufs.end()), + head_part_num, tid, c); + // Validate sizes + for (const auto& bl : data_bufs) { + if (bl.length() 
> max_entry_size) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entry bigger than max_entry_size tid=" << tid << dendl; + Pusher::complete(std::move(p), -E2BIG); + return; + } + } + + if (data_bufs.empty() ) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " empty push, returning success tid=" << tid << dendl; + Pusher::complete(std::move(p), 0); + return; + } + + if (need_new_head) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " need new head tid=" << tid << dendl; + p->new_head(dpp, std::move(p)); + } else { + p->prep_then_push(dpp, std::move(p), 0); + } +} + +int FIFO::list(const DoutPrefixProvider *dpp, int max_entries, + std::optional markstr, + std::vector* presult, bool* pmore, + optional_yield y) +{ + std::unique_lock l(m); + auto tid = ++next_tid; + std::int64_t part_num = info.tail_part_num; + l.unlock(); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::uint64_t ofs = 0; + if (markstr) { + auto marker = to_marker(*markstr); + if (!marker) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " invalid marker string: " << markstr + << " tid= "<< tid << dendl; + return -EINVAL; + } + part_num = marker->num; + ofs = marker->ofs; + } + + std::vector result; + result.reserve(max_entries); + bool more = false; + + std::vector entries; + int r = 0; + while (max_entries > 0) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " max_entries=" << max_entries << " tid=" << tid << dendl; + bool part_more = false; + bool part_full = false; + + std::unique_lock l(m); + auto part_oid = info.part_oid(part_num); + l.unlock(); + + r = list_part(dpp, ioctx, part_oid, ofs, max_entries, &entries, + &part_more, &part_full, tid, y); + if (r == -ENOENT) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " missing part, rereading metadata" + << " tid= "<< tid << dendl; + r = read_meta(dpp, tid, y); 
+ if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " read_meta failed: r=" << r + << " tid= "<< tid << dendl; + return r; + } + if (part_num < info.tail_part_num) { + /* raced with trim? restart */ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " raced with trim, restarting: tid=" << tid << dendl; + max_entries += result.size(); + result.clear(); + std::unique_lock l(m); + part_num = info.tail_part_num; + l.unlock(); + ofs = 0; + continue; + } + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " assuming part was not written yet, so end of data: " + << "tid=" << tid << dendl; + more = false; + r = 0; + break; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " list_entries failed: r=" << r + << " tid= "<< tid << dendl; + return r; + } + more = part_full || part_more; + for (auto& entry : entries) { + list_entry e; + e.data = std::move(entry.data); + e.marker = marker{part_num, entry.ofs}.to_string(); + e.mtime = entry.mtime; + result.push_back(std::move(e)); + --max_entries; + if (max_entries == 0) + break; + } + entries.clear(); + if (max_entries > 0 && + part_more) { + } + + if (!part_full) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " head part is not full, so we can assume we're done: " + << "tid=" << tid << dendl; + break; + } + if (!part_more) { + ++part_num; + ofs = 0; + } + } + if (presult) + *presult = std::move(result); + if (pmore) + *pmore = more; + return 0; +} + +int FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y) +{ + bool overshoot = false; + auto marker = to_marker(markstr); + if (!marker) { + return -EINVAL; + } + auto part_num = marker->num; + auto ofs = marker->ofs; + std::unique_lock l(m); + auto tid = ++next_tid; + auto hn = info.head_part_num; + const auto max_part_size = info.params.max_part_size; + if (part_num > hn) { + l.unlock(); + auto r = 
read_meta(dpp, tid, y); + if (r < 0) { + return r; + } + l.lock(); + auto hn = info.head_part_num; + if (part_num > hn) { + overshoot = true; + part_num = hn; + ofs = max_part_size; + } + } + if (part_num < info.tail_part_num) { + return -ENODATA; + } + auto pn = info.tail_part_num; + l.unlock(); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + + int r = 0; + while (pn < part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " pn=" << pn << " tid=" << tid << dendl; + std::unique_lock l(m); + l.unlock(); + r = trim_part(dpp, pn, max_part_size, false, tid, y); + if (r < 0 && r == -ENOENT) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " trim_part failed: r=" << r + << " tid= "<< tid << dendl; + return r; + } + ++pn; + } + r = trim_part(dpp, part_num, ofs, exclusive, tid, y); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " trim_part failed: r=" << r + << " tid= "<< tid << dendl; + return r; + } + + l.lock(); + auto tail_part_num = info.tail_part_num; + auto objv = info.version; + l.unlock(); + bool canceled = tail_part_num < part_num; + int retries = 0; + while ((tail_part_num < part_num) && + canceled && + (retries <= MAX_RACE_RETRIES)) { + r = _update_meta(dpp, fifo::update{}.tail_part_num(part_num), objv, &canceled, + tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " _update_meta failed: r=" << r + << " tid= "<< tid << dendl; + return r; + } + if (canceled) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled: retries=" << retries + << " tid=" << tid << dendl; + l.lock(); + tail_part_num = info.tail_part_num; + objv = info.version; + l.unlock(); + ++retries; + } + } + if (canceled) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + return -EIO; + } + return 
overshoot ? -ENODATA : 0; +} + +struct Trimmer : public Completion { + FIFO* fifo; + std::int64_t part_num; + std::uint64_t ofs; + std::int64_t pn; + bool exclusive; + std::uint64_t tid; + bool update = false; + bool reread = false; + bool canceled = false; + bool overshoot = false; + int retries = 0; + + Trimmer(const DoutPrefixProvider *dpp, FIFO* fifo, std::int64_t part_num, std::uint64_t ofs, std::int64_t pn, + bool exclusive, lr::AioCompletion* super, std::uint64_t tid) + : Completion(dpp, super), fifo(fifo), part_num(part_num), ofs(ofs), pn(pn), + exclusive(exclusive), tid(tid) {} + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + + if (reread) { + reread = false; + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " read_meta failed: r=" + << r << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + std::unique_lock l(fifo->m); + auto hn = fifo->info.head_part_num; + const auto max_part_size = fifo->info.params.max_part_size; + const auto tail_part_num = fifo->info.tail_part_num; + l.unlock(); + if (part_num > hn) { + part_num = hn; + ofs = max_part_size; + overshoot = true; + } + if (part_num < tail_part_num) { + complete(std::move(p), -ENODATA); + return; + } + pn = tail_part_num; + if (pn < part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " pn=" << pn << " tid=" << tid << dendl; + fifo->trim_part(dpp, pn++, max_part_size, false, tid, + call(std::move(p))); + } else { + update = true; + canceled = tail_part_num < part_num; + fifo->trim_part(dpp, part_num, ofs, exclusive, tid, call(std::move(p))); + } + return; + } + + if (r == -ENOENT) { + r = 0; + } + + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << (update ? 
" update_meta " : " trim ") << "failed: r=" + << r << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } + + if (!update) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " handling preceding trim callback: tid=" << tid << dendl; + retries = 0; + if (pn < part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " pn=" << pn << " tid=" << tid << dendl; + std::unique_lock l(fifo->m); + const auto max_part_size = fifo->info.params.max_part_size; + l.unlock(); + fifo->trim_part(dpp, pn++, max_part_size, false, tid, + call(std::move(p))); + return; + } + + std::unique_lock l(fifo->m); + const auto tail_part_num = fifo->info.tail_part_num; + l.unlock(); + update = true; + canceled = tail_part_num < part_num; + fifo->trim_part(dpp, part_num, ofs, exclusive, tid, call(std::move(p))); + return; + } + + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " handling update-needed callback: tid=" << tid << dendl; + std::unique_lock l(fifo->m); + auto tail_part_num = fifo->info.tail_part_num; + auto objv = fifo->info.version; + l.unlock(); + if ((tail_part_num < part_num) && + canceled) { + if (retries > MAX_RACE_RETRIES) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" << tid << dendl; + complete(std::move(p), -EIO); + return; + } + ++retries; + fifo->_update_meta(dpp, fifo::update{} + .tail_part_num(part_num), objv, &canceled, + tid, call(std::move(p))); + } else { + complete(std::move(p), overshoot ? 
-ENODATA : 0); + } + } +}; + +void FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, + lr::AioCompletion* c) { + auto marker = to_marker(markstr); + auto realmark = marker.value_or(::rgw::cls::fifo::marker{}); + std::unique_lock l(m); + const auto hn = info.head_part_num; + const auto max_part_size = info.params.max_part_size; + const auto pn = info.tail_part_num; + const auto part_oid = info.part_oid(pn); + auto tid = ++next_tid; + l.unlock(); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + auto trimmer = std::make_unique(dpp, this, realmark.num, realmark.ofs, + pn, exclusive, c, tid); + if (!marker) { + Trimmer::complete(std::move(trimmer), -EINVAL); + return; + } + ++trimmer->pn; + auto ofs = marker->ofs; + if (marker->num > hn) { + trimmer->reread = true; + read_meta(dpp, tid, Trimmer::call(std::move(trimmer))); + return; + } + if (pn < marker->num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " pn=" << pn << " tid=" << tid << dendl; + ofs = max_part_size; + } else { + trimmer->update = true; + } + trim_part(dpp, pn, ofs, exclusive, tid, Trimmer::call(std::move(trimmer))); +} + +int FIFO::get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, + fifo::part_header* header, + optional_yield y) +{ + std::unique_lock l(m); + const auto part_oid = info.part_oid(part_num); + auto tid = ++next_tid; + l.unlock(); + auto r = rgw::cls::fifo::get_part_info(dpp, ioctx, part_oid, header, tid, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " get_part_info failed: r=" + << r << " tid=" << tid << dendl; + } + return r; +} + +void FIFO::get_part_info(int64_t part_num, + fifo::part_header* header, + lr::AioCompletion* c) +{ + std::unique_lock l(m); + const auto part_oid = info.part_oid(part_num); + auto tid = ++next_tid; + l.unlock(); + auto op = rgw::cls::fifo::get_part_info(cct, header, tid); + auto r = 
ioctx.aio_operate(part_oid, c, &op, nullptr); + ceph_assert(r >= 0); +} + +struct InfoGetter : Completion { + FIFO* fifo; + fifo::part_header header; + fu2::function f; + std::uint64_t tid; + bool headerread = false; + + InfoGetter(const DoutPrefixProvider *dpp, FIFO* fifo, fu2::function f, + std::uint64_t tid, lr::AioCompletion* super) + : Completion(dpp, super), fifo(fifo), f(std::move(f)), tid(tid) {} + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + if (!headerread) { + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " read_meta failed: r=" + << r << " tid=" << tid << dendl; + if (f) + f(r, {}); + complete(std::move(p), r); + return; + } + + auto info = fifo->meta(); + auto hpn = info.head_part_num; + if (hpn < 0) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " no head, returning empty partinfo r=" + << r << " tid=" << tid << dendl; + if (f) + f(0, {}); + complete(std::move(p), r); + return; + } + headerread = true; + auto op = rgw::cls::fifo::get_part_info(fifo->cct, &header, tid); + std::unique_lock l(fifo->m); + auto oid = fifo->info.part_oid(hpn); + l.unlock(); + r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op, + nullptr); + ceph_assert(r >= 0); + return; + } + + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " get_part_info failed: r=" + << r << " tid=" << tid << dendl; + } + + if (f) + f(r, std::move(header)); + complete(std::move(p), r); + return; + } +}; + +void FIFO::get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function f, + lr::AioCompletion* c) +{ + std::unique_lock l(m); + auto tid = ++next_tid; + l.unlock(); + auto ig = std::make_unique(dpp, this, std::move(f), tid, c); + read_meta(dpp, tid, InfoGetter::call(std::move(ig))); +} + +struct JournalProcessor : public Completion { +private: + FIFO* const fifo; + + std::vector processed; + decltype(fifo->info.journal) journal; + decltype(journal)::iterator iter; + std::int64_t 
new_tail; + std::int64_t new_head; + std::int64_t new_max; + int race_retries = 0; + bool first_pp = true; + bool canceled = false; + std::uint64_t tid; + + enum { + entry_callback, + pp_callback, + } state; + + void create_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + state = entry_callback; + lr::ObjectWriteOperation op; + op.create(false); /* We don't need exclusivity, part_init ensures + we're creating from the same journal entry. */ + std::unique_lock l(fifo->m); + part_init(&op, fifo->info.params); + auto oid = fifo->info.part_oid(part_num); + l.unlock(); + auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op); + ceph_assert(r >= 0); + return; + } + + void remove_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + state = entry_callback; + lr::ObjectWriteOperation op; + op.remove(); + std::unique_lock l(fifo->m); + auto oid = fifo->info.part_oid(part_num); + l.unlock(); + auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op); + ceph_assert(r >= 0); + return; + } + + void finish_je(const DoutPrefixProvider *dpp, Ptr&& p, int r, + const fifo::journal_entry& entry) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " finishing entry: entry=" << entry + << " tid=" << tid << dendl; + + using enum fifo::journal_entry::Op; + if (entry.op == remove && r == -ENOENT) + r = 0; + + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " processing entry failed: entry=" << entry + << " r=" << r << " tid=" << tid << dendl; + complete(std::move(p), r); + return; + } else { + switch (entry.op) { + case unknown: + case set_head: + // Can't happen. Filtered out in process. 
+ complete(std::move(p), -EIO); + return; + + case create: + if (entry.part_num > new_max) { + new_max = entry.part_num; + } + break; + case remove: + if (entry.part_num >= new_tail) { + new_tail = entry.part_num + 1; + } + break; + } + processed.push_back(entry); + } + ++iter; + process(dpp, std::move(p)); + } + + void postprocess(const DoutPrefixProvider *dpp, Ptr&& p) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + if (processed.empty()) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " nothing to update any more: race_retries=" + << race_retries << " tid=" << tid << dendl; + complete(std::move(p), 0); + return; + } + pp_run(dpp, std::move(p), 0, false); + } + +public: + + JournalProcessor(const DoutPrefixProvider *dpp, FIFO* fifo, std::uint64_t tid, lr::AioCompletion* super) + : Completion(dpp, super), fifo(fifo), tid(tid) { + std::unique_lock l(fifo->m); + journal = fifo->info.journal; + iter = journal.begin(); + new_tail = fifo->info.tail_part_num; + new_head = fifo->info.head_part_num; + new_max = fifo->info.max_push_part_num; + } + + void pp_run(const DoutPrefixProvider *dpp, Ptr&& p, int r, bool canceled) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + std::optional tail_part_num; + std::optional head_part_num; + std::optional max_part_num; + + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " failed, r=: " << r << " tid=" << tid << dendl; + complete(std::move(p), r); + } + + + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " postprocessing: race_retries=" + << race_retries << " tid=" << tid << dendl; + + if (!first_pp && r == 0 && !canceled) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " nothing to update any more: race_retries=" + << race_retries << " tid=" << tid << dendl; + complete(std::move(p), 0); + return; + } + + first_pp = false; + + 
if (canceled) { + if (race_retries >= MAX_RACE_RETRIES) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " canceled too many times, giving up: tid=" + << tid << dendl; + complete(std::move(p), -ECANCELED); + return; + } + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " update canceled, retrying: race_retries=" + << race_retries << " tid=" << tid << dendl; + + ++race_retries; + + std::vector new_processed; + std::unique_lock l(fifo->m); + for (auto& e : processed) { + if (fifo->info.journal.contains(e)) { + new_processed.push_back(e); + } + } + processed = std::move(new_processed); + } + + std::unique_lock l(fifo->m); + auto objv = fifo->info.version; + if (new_tail > fifo->info.tail_part_num) { + tail_part_num = new_tail; + } + + if (new_head > fifo->info.head_part_num) { + head_part_num = new_head; + } + + if (new_max > fifo->info.max_push_part_num) { + max_part_num = new_max; + } + l.unlock(); + + if (processed.empty() && + !tail_part_num && + !max_part_num) { + /* nothing to update anymore */ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " nothing to update any more: race_retries=" + << race_retries << " tid=" << tid << dendl; + complete(std::move(p), 0); + return; + } + state = pp_callback; + fifo->_update_meta(dpp, fifo::update{} + .tail_part_num(tail_part_num) + .head_part_num(head_part_num) + .max_push_part_num(max_part_num) + .journal_entries_rm(processed), + objv, &this->canceled, tid, call(std::move(p))); + return; + } + + JournalProcessor(const JournalProcessor&) = delete; + JournalProcessor& operator =(const JournalProcessor&) = delete; + JournalProcessor(JournalProcessor&&) = delete; + JournalProcessor& operator =(JournalProcessor&&) = delete; + + void process(const DoutPrefixProvider *dpp, Ptr&& p) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + while (iter != journal.end()) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << 
__LINE__ + << " processing entry: entry=" << *iter + << " tid=" << tid << dendl; + const auto entry = *iter; + switch (entry.op) { + using enum fifo::journal_entry::Op; + case create: + create_part(dpp, std::move(p), entry.part_num); + return; + case set_head: + if (entry.part_num > new_head) { + new_head = entry.part_num; + } + processed.push_back(entry); + ++iter; + continue; + case remove: + remove_part(dpp, std::move(p), entry.part_num); + return; + default: + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " unknown journaled op: entry=" << entry << " tid=" + << tid << dendl; + complete(std::move(p), -EIO); + return; + } + } + postprocess(dpp, std::move(p)); + return; + } + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " entering: tid=" << tid << dendl; + switch (state) { + case entry_callback: + finish_je(dpp, std::move(p), r, *iter); + return; + case pp_callback: + auto c = canceled; + canceled = false; + pp_run(dpp, std::move(p), r, c); + return; + } + + abort(); + } + +}; + +void FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c) { + auto p = std::make_unique(dpp, this, tid, c); + p->process(dpp, std::move(p)); +} + +struct Lister : Completion { + FIFO* f; + std::vector result; + bool more = false; + std::int64_t part_num; + std::uint64_t ofs; + int max_entries; + int r_out = 0; + std::vector entries; + bool part_more = false; + bool part_full = false; + std::vector* entries_out; + bool* more_out; + std::uint64_t tid; + + bool read = false; + + void complete(Ptr&& p, int r) { + if (r >= 0) { + if (more_out) *more_out = more; + if (entries_out) *entries_out = std::move(result); + } + Completion::complete(std::move(p), r); + } + +public: + Lister(const DoutPrefixProvider *dpp, FIFO* f, std::int64_t part_num, std::uint64_t ofs, int max_entries, + std::vector* entries_out, bool* more_out, + std::uint64_t tid, 
lr::AioCompletion* super) + : Completion(dpp, super), f(f), part_num(part_num), ofs(ofs), max_entries(max_entries), + entries_out(entries_out), more_out(more_out), tid(tid) { + result.reserve(max_entries); + } + + Lister(const Lister&) = delete; + Lister& operator =(const Lister&) = delete; + Lister(Lister&&) = delete; + Lister& operator =(Lister&&) = delete; + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + if (read) + handle_read(std::move(p), r); + else + handle_list(dpp, std::move(p), r); + } + + void list(Ptr&& p) { + if (max_entries > 0) { + part_more = false; + part_full = false; + entries.clear(); + + std::unique_lock l(f->m); + auto part_oid = f->info.part_oid(part_num); + l.unlock(); + + read = false; + auto op = list_part(f->cct, ofs, max_entries, &r_out, + &entries, &part_more, &part_full, tid); + f->ioctx.aio_operate(part_oid, call(std::move(p)), &op, nullptr); + } else { + complete(std::move(p), 0); + } + } + + void handle_read(Ptr&& p, int r) { + read = false; + if (r >= 0) r = r_out; + r_out = 0; + + if (r < 0) { + complete(std::move(p), r); + return; + } + + if (part_num < f->info.tail_part_num) { + /* raced with trim? 
restart */ + max_entries += result.size(); + result.clear(); + part_num = f->info.tail_part_num; + ofs = 0; + list(std::move(p)); + return; + } + /* assuming part was not written yet, so end of data */ + more = false; + complete(std::move(p), 0); + return; + } + + void handle_list(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + if (r >= 0) r = r_out; + r_out = 0; + std::unique_lock l(f->m); + auto part_oid = f->info.part_oid(part_num); + l.unlock(); + if (r == -ENOENT) { + read = true; + f->read_meta(dpp, tid, call(std::move(p))); + return; + } + if (r < 0) { + complete(std::move(p), r); + return; + } + + more = part_full || part_more; + for (auto& entry : entries) { + list_entry e; + e.data = std::move(entry.data); + e.marker = marker{part_num, entry.ofs}.to_string(); + e.mtime = entry.mtime; + result.push_back(std::move(e)); + } + max_entries -= entries.size(); + entries.clear(); + if (max_entries > 0 && part_more) { + list(std::move(p)); + return; + } + + if (!part_full) { /* head part is not full */ + complete(std::move(p), 0); + return; + } + ++part_num; + ofs = 0; + list(std::move(p)); + } +}; + +void FIFO::list(const DoutPrefixProvider *dpp, int max_entries, + std::optional markstr, + std::vector* out, + bool* more, + lr::AioCompletion* c) { + std::unique_lock l(m); + auto tid = ++next_tid; + std::int64_t part_num = info.tail_part_num; + l.unlock(); + std::uint64_t ofs = 0; + std::optional<::rgw::cls::fifo::marker> marker; + + if (markstr) { + marker = to_marker(*markstr); + if (marker) { + part_num = marker->num; + ofs = marker->ofs; + } + } + + auto ls = std::make_unique(dpp, this, part_num, ofs, max_entries, out, + more, tid, c); + if (markstr && !marker) { + auto l = ls.get(); + l->complete(std::move(ls), -EINVAL); + } else { + ls->list(std::move(ls)); + } +} +} diff --git a/src/rgw/driver/rados/cls_fifo_legacy.h b/src/rgw/driver/rados/cls_fifo_legacy.h new file mode 100644 index 000000000..b0a68157e --- /dev/null +++ 
b/src/rgw/driver/rados/cls_fifo_legacy.h @@ -0,0 +1,334 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat + * Author: Adam C. Emerson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "include/rados/librados.hpp" +#include "include/buffer.h" +#include "include/function2.hpp" + +#include "common/async/yield_context.h" + +#include "cls/fifo/cls_fifo_types.h" +#include "cls/fifo/cls_fifo_ops.h" + +#include "librados/AioCompletionImpl.h" + +#include "rgw_tools.h" + +namespace rgw::cls::fifo { +namespace cb = ceph::buffer; +namespace fifo = rados::cls::fifo; +namespace lr = librados; + +inline constexpr std::uint64_t default_max_part_size = 4 * 1024 * 1024; +inline constexpr std::uint64_t default_max_entry_size = 32 * 1024; + +void create_meta(lr::ObjectWriteOperation* op, std::string_view id, + std::optional objv, + std::optional oid_prefix, + bool exclusive = false, + std::uint64_t max_part_size = default_max_part_size, + std::uint64_t max_entry_size = default_max_entry_size); +int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, + std::optional objv, fifo::info* info, + std::uint32_t* part_header_size, + std::uint32_t* part_entry_overhead, + std::uint64_t tid, optional_yield y, + bool probe = false); +struct marker { + std::int64_t num = 0; + std::uint64_t ofs = 0; + + marker() = default; + marker(std::int64_t num, std::uint64_t ofs) : num(num), ofs(ofs) {} + static marker max() { + return { std::numeric_limits::max(), + std::numeric_limits::max() }; + } + + std::string to_string() { + return 
fmt::format("{:0>20}:{:0>20}", num, ofs); + } +}; + +struct list_entry { + cb::list data; + std::string marker; + ceph::real_time mtime; +}; + +using part_info = fifo::part_header; + +/// This is an implementation of FIFO using librados to facilitate +/// backports. Please see /src/neorados/cls/fifo.h for full +/// information. +/// +/// This library uses optional_yield. Please see +/// /src/common/async/yield_context.h. In summary, optional_yield +/// contains either a spawn::yield_context (in which case the current +/// coroutine is suspended until completion) or null_yield (in which +/// case the current thread is blocked until completion.) +/// +/// Please see the librados documentation for information on +/// AioCompletion and IoCtx. + +class FIFO { + friend struct Reader; + friend struct Updater; + friend struct Trimmer; + friend struct InfoGetter; + friend struct Pusher; + friend struct NewPartPreparer; + friend struct NewHeadPreparer; + friend struct JournalProcessor; + friend struct Lister; + + mutable lr::IoCtx ioctx; + CephContext* cct = static_cast(ioctx.cct()); + const std::string oid; + std::mutex m; + std::uint64_t next_tid = 0; + + fifo::info info; + + std::uint32_t part_header_size = 0xdeadbeef; + std::uint32_t part_entry_overhead = 0xdeadbeef; + + std::optional to_marker(std::string_view s); + + FIFO(lr::IoCtx&& ioc, + std::string oid) + : ioctx(std::move(ioc)), oid(oid) {} + + int apply_update(const DoutPrefixProvider *dpp, + fifo::info* info, + const fifo::objv& objv, + const fifo::update& update, + std::uint64_t tid); + int _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update, + fifo::objv version, bool* pcanceled, + std::uint64_t tid, optional_yield y); + void _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update, + fifo::objv version, bool* pcanceled, + std::uint64_t tid, lr::AioCompletion* c); + int create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid, + optional_yield y); + int 
remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid, + optional_yield y); + int process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y); + void process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c); + int _prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, bool is_head, std::uint64_t tid, optional_yield y); + void _prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, bool is_head, std::uint64_t tid, lr::AioCompletion* c); + int _prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num, + std::uint64_t tid, optional_yield y); + void _prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num, std::uint64_t tid, lr::AioCompletion* c); + int push_entries(const DoutPrefixProvider *dpp, const std::deque& data_bufs, + std::uint64_t tid, optional_yield y); + void push_entries(const std::deque& data_bufs, + std::uint64_t tid, lr::AioCompletion* c); + int trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs, + bool exclusive, std::uint64_t tid, optional_yield y); + void trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs, + bool exclusive, std::uint64_t tid, lr::AioCompletion* c); + + /// Force refresh of metadata, yielding/blocking style + int read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y); + /// Force refresh of metadata, with a librados Completion + void read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c); + +public: + + FIFO(const FIFO&) = delete; + FIFO& operator =(const FIFO&) = delete; + FIFO(FIFO&&) = delete; + FIFO& operator =(FIFO&&) = delete; + + /// Open an existing FIFO. 
+ static int open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context + std::string oid, //< OID for metadata object + std::unique_ptr* fifo, //< OUT: Pointer to FIFO object + optional_yield y, //< Optional yield context + /// Operation will fail if FIFO is not at this version + std::optional objv = std::nullopt, + /// Probing for existence, don't print errors if we + /// can't find it. + bool probe = false); + /// Create a new or open an existing FIFO. + static int create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context + std::string oid, //< OID for metadata object + std::unique_ptr* fifo, //< OUT: Pointer to FIFO object + optional_yield y, //< Optional yield context + /// Operation will fail if the FIFO exists and is + /// not of this version. + std::optional objv = std::nullopt, + /// Prefix for all objects + std::optional oid_prefix = std::nullopt, + /// Fail if the FIFO already exists + bool exclusive = false, + /// Maximum allowed size of parts + std::uint64_t max_part_size = default_max_part_size, + /// Maximum allowed size of entries + std::uint64_t max_entry_size = default_max_entry_size); + + /// Force refresh of metadata, yielding/blocking style + int read_meta(const DoutPrefixProvider *dpp, optional_yield y); + /// Get currently known metadata + const fifo::info& meta() const; + /// Get partition header and entry overhead size + std::pair get_part_layout_info() const; + /// Push an entry to the FIFO + int push(const DoutPrefixProvider *dpp, + const cb::list& bl, //< Entry to push + optional_yield y //< Optional yield + ); + /// Push an entry to the FIFO + void push(const DoutPrefixProvider *dpp, const cb::list& bl, //< Entry to push + lr::AioCompletion* c //< Async Completion + ); + /// Push entries to the FIFO + int push(const DoutPrefixProvider *dpp, + const std::vector& data_bufs, //< Entries to push + optional_yield y //< Optional yield + ); + /// Push entries to the FIFO + void push(const DoutPrefixProvider *dpp, const 
std::vector& data_bufs, //< Entries to push + lr::AioCompletion* c //< Async Completion + ); + /// List entries + int list(const DoutPrefixProvider *dpp, + int max_entries, //< Maximum entries to list + /// Point after which to begin listing. Start at tail if null + std::optional markstr, + std::vector* out, //< OUT: entries + /// OUT: True if more entries in FIFO beyond the last returned + bool* more, + optional_yield y //< Optional yield + ); + /// List entries, librados AioCompletion style + void list(const DoutPrefixProvider *dpp, + int max_entries, //< Maximum entries to list + /// Point after which to begin listing. Start at tail if null + std::optional markstr, + std::vector* out, //< OUT: entries + /// OUT: True if more entries in FIFO beyond the last returned + bool* more, + lr::AioCompletion* c //< Async Completion + ); + /// Trim entries, coroutine/block style + int trim(const DoutPrefixProvider *dpp, + std::string_view markstr, //< Position to which to trim, inclusive + bool exclusive, //< If true, do not trim the target entry + //< itself, just all those before it. + optional_yield y //< Optional yield + ); + /// Trim entries, librados AioCompletion style + void trim(const DoutPrefixProvider *dpp, + std::string_view markstr, //< Position to which to trim, inclusive + bool exclusive, //< If true, do not trim the target entry + //< itself, just all those before it. + lr::AioCompletion* c //< librados AIO Completion + ); + /// Get part info + int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, //< Part number + fifo::part_header* header, //< OUT: Information + optional_yield y //< Optional yield + ); + /// Get part info + void get_part_info(int64_t part_num, //< Part number + fifo::part_header* header, //< OUT: Information + lr::AioCompletion* c //< AIO Completion + ); + /// A convenience method to fetch the part information for the FIFO + /// head, using librados::AioCompletion, since + /// librados::AioCompletions compose lousily.
+  void get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function< //< Function to receive info
+                       void(int r, fifo::part_header&&)>,
+                     lr::AioCompletion* c //< AIO Completion
+    );
+};
+
+/// CRTP-style helper for chaining librados asynchronous operations.
+///
+/// `T` is the derived operation type. The code below requires it to provide
+/// `handle(const DoutPrefixProvider*, Ptr&&, int r)`, which cb() invokes each
+/// time the operation started through call() completes.
+///
+/// `_super` is the caller's completion. The constructor takes a reference on
+/// it (`pc->get()`) and the destructor drops that reference unless complete()
+/// has already handed it on. `_cur` is the completion of the operation that
+/// is currently in flight, if any.
+template<typename T>
+struct Completion {
+private:
+  const DoutPrefixProvider *_dpp;
+  lr::AioCompletion* _cur = nullptr;
+  lr::AioCompletion* _super;
+public:
+
+  /// Owning handle to the derived operation object.
+  using Ptr = std::unique_ptr<T>;
+
+  /// Completion of the operation currently in flight, or nullptr.
+  lr::AioCompletion* cur() const {
+    return _cur;
+  }
+  /// The caller's completion, or nullptr once complete() has consumed it.
+  lr::AioCompletion* super() const {
+    return _super;
+  }
+
+  /// Takes a reference on `super` for the lifetime of this object.
+  Completion(const DoutPrefixProvider *dpp, lr::AioCompletion* super) : _dpp(dpp), _super(super) {
+    super->pc->get();
+  }
+
+  /// Drops the reference on `_super` (if still held) and releases `_cur`.
+  ~Completion() {
+    if (_super) {
+      _super->pc->put();
+    }
+    if (_cur)
+      _cur->release();
+    _super = nullptr;
+    _cur = nullptr;
+  }
+
+  // The only times that aio_operate can return an error are:
+  // 1. The completion contains a null pointer. This should just
+  //    crash, and in our case it does.
+  // 2. An attempt is made to write to a snapshot. RGW doesn't use
+  //    snapshots, so we don't care.
+  //
+  // So we will just assert that initiating an Aio operation succeeds
+  // and not worry about recovering.
+
+  /// Create the completion for the next asynchronous step and return it.
+  ///
+  /// Ownership of `p` is released here; cb() re-wraps the raw pointer in a
+  /// Ptr when the returned completion fires.
+  static lr::AioCompletion* call(Ptr&& p) {
+    p->_cur = lr::Rados::aio_create_completion(static_cast<void*>(p.get()),
+                                               &cb);
+    auto c = p->_cur;
+    p.release();
+    return c;
+  }
+  /// Finish the whole chain: complete the caller's completion with `r`.
+  ///
+  /// `_super` is cleared first so the destructor does not put() it again.
+  static void complete(Ptr&& p, int r) {
+    auto c = p->_super;
+    p->_super = nullptr;
+    rgw_complete_aio_completion(c, r);
+  }
+
+  /// librados callback: collect the result of the finished step, release its
+  /// completion and pass ownership of the operation to T::handle().
+  static void cb(lr::completion_t, void* arg) {
+    auto t = static_cast<T*>(arg);
+    auto r = t->_cur->get_return_value();
+    t->_cur->release();
+    t->_cur = nullptr;
+    t->handle(t->_dpp, Ptr(t), r);
+  }
+};
+
+}
diff --git a/src/rgw/driver/rados/config/impl.cc b/src/rgw/driver/rados/config/impl.cc
new file mode 100644
index 000000000..f1b2befad
--- /dev/null
+++ b/src/rgw/driver/rados/config/impl.cc
@@ -0,0 +1,129 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "impl.h"
+
+#include "common/async/yield_context.h"
+#include "common/errno.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+
+namespace rgw::rados {
+
+// default pool names
+constexpr std::string_view default_zone_root_pool = "rgw.root";
+constexpr std::string_view default_zonegroup_root_pool = "rgw.root";
+constexpr std::string_view default_realm_root_pool = "rgw.root";
+constexpr std::string_view default_period_root_pool = "rgw.root";
+
+// Build an rgw_pool from the configured name, falling back to default_name
+// when the configured name is empty.
+static rgw_pool default_pool(std::string_view name,
+                             std::string_view default_name)
+{
+  return std::string{name_or_default(name, default_name)};
+}
+
+// Resolve the four root pools once from the configuration; the members are
+// const, so later configuration changes are not picked up.
+ConfigImpl::ConfigImpl(const ceph::common::ConfigProxy& conf)
+  : realm_pool(default_pool(conf->rgw_realm_root_pool,
+                            default_realm_root_pool)),
+    period_pool(default_pool(conf->rgw_period_root_pool,
+                             default_period_root_pool)),
+    zonegroup_pool(default_pool(conf->rgw_zonegroup_root_pool,
+                                default_zonegroup_root_pool)),
+    zone_pool(default_pool(conf->rgw_zone_root_pool,
+                           default_zone_root_pool))
+{
+}
+
+// Read the whole object pool:oid into bl. When objv is given, it prepares the
+// read operation first. Returns a negative error code on failure.
+int ConfigImpl::read(const DoutPrefixProvider* dpp, optional_yield y,
+                     const rgw_pool& pool, const std::string& oid,
+                     bufferlist& bl, RGWObjVersionTracker* objv)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+  librados::ObjectReadOperation op;
+  if (objv) {
+    objv->prepare_op_for_read(&op);
+  }
+  op.read(0, 0, &bl, nullptr);
+  return rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+}
+
+// Replace the contents of pool:oid with bl in a single write operation.
+// `create` selects the existence precondition (see enum Create). When objv is
+// given, it prepares the operation and apply_write() is called on success.
+int ConfigImpl::write(const DoutPrefixProvider* dpp, optional_yield y,
+                      const rgw_pool& pool, const std::string& oid,
+                      Create create, const bufferlist& bl,
+                      RGWObjVersionTracker* objv)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  switch (create) {
+    case Create::MustNotExist: op.create(true); break;
+    case Create::MayExist: op.create(false); break;
+    case Create::MustExist: op.assert_exists(); break;
+  }
+  if (objv) {
+    objv->prepare_op_for_write(&op);
+  }
+  op.write_full(bl);
+
+  r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r >= 0 && objv) {
+    objv->apply_write();
+  }
+  return r;
+}
+
+// Remove pool:oid. When objv is given, it prepares the operation and
+// apply_write() is called on success.
+int ConfigImpl::remove(const DoutPrefixProvider* dpp, optional_yield y,
+                       const rgw_pool& pool, const std::string& oid,
+                       RGWObjVersionTracker* objv)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  if (objv) {
+    objv->prepare_op_for_write(&op);
+  }
+  op.remove();
+
+  r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r >= 0 && objv) {
+    objv->apply_write();
+  }
+  return r;
+}
+
+// Send bl as a notify on pool:oid via rgw_rados_notify(). The reply payload
+// is not collected (nullptr is passed for it).
+int ConfigImpl::notify(const DoutPrefixProvider* dpp, optional_yield y,
+                       const rgw_pool& pool, const std::string& oid,
+                       bufferlist& bl, uint64_t timeout_ms)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+  return rgw_rados_notify(dpp, ioctx, oid, bl, timeout_ms, nullptr, y);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/impl.h b/src/rgw/driver/rados/config/impl.h
new file mode 100644
index 000000000..3aed451f9
--- /dev/null
+++ b/src/rgw/driver/rados/config/impl.h
@@ -0,0 +1,139 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "rgw_basic_types.h"
+#include "rgw_tools.h"
+#include "rgw_sal_config.h"
+
+namespace rgw::rados {
+
+// write options that control object creation
+enum class Create {
+  MustNotExist, // fail with EEXIST if the object already exists
+  MayExist, // create if the object didn't exist, overwrite if it did
+  MustExist, // fail with ENOENT if the object doesn't exist
+};
+
+// Shared rados client and the pools that hold realm/period/zonegroup/zone
+// configuration objects, plus the raw and typed object helpers the config
+// store is built from.
+struct ConfigImpl {
+  librados::Rados rados;
+
+  const rgw_pool realm_pool;
+  const rgw_pool period_pool;
+  const rgw_pool zonegroup_pool;
+  const rgw_pool zone_pool;
+
+  ConfigImpl(const ceph::common::ConfigProxy& conf);
+
+  // read the raw contents of pool:oid into bl
+  int read(const DoutPrefixProvider* dpp, optional_yield y,
+           const rgw_pool& pool, const std::string& oid,
+           bufferlist& bl, RGWObjVersionTracker* objv);
+
+  // read pool:oid and decode it into data; returns -EIO if decoding throws
+  // buffer::error
+  template <typename T>
+  int read(const DoutPrefixProvider* dpp, optional_yield y,
+           const rgw_pool& pool, const std::string& oid,
+           T& data, RGWObjVersionTracker* objv)
+  {
+    bufferlist bl;
+    int r = read(dpp, y, pool, oid, bl, objv);
+    if (r < 0) {
+      return r;
+    }
+    try {
+      auto p = bl.cbegin();
+      decode(data, p);
+    } catch (const buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from "
+          << pool << ":" << oid << dendl;
+      return -EIO;
+    }
+    return 0;
+  }
+
+  // write bl as the full contents of pool:oid, subject to `create`
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const rgw_pool& pool, const std::string& oid, Create create,
+            const bufferlist& bl, RGWObjVersionTracker* objv);
+
+  // encode data and write it as the full contents of pool:oid
+  template <typename T>
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const rgw_pool& pool, const std::string& oid, Create create,
+            const T& data, RGWObjVersionTracker* objv)
+  {
+    bufferlist bl;
+    encode(data, bl);
+
+    return write(dpp, y, pool, oid, create, bl, objv);
+  }
+
+  // remove pool:oid
+  int remove(const DoutPrefixProvider* dpp, optional_yield y,
+             const rgw_pool& pool, const std::string& oid,
+             RGWObjVersionTracker* objv);
+
+  // List objects in `pool`, starting at the cursor encoded in `marker`.
+  //
+  // `filter` maps each object id to the entry to report; an empty result
+  // means "skip this object". At most entries.size() entries are stored.
+  // result.entries is the filled prefix of `entries`; result.next is the
+  // cursor to resume from, or empty when the listing reached the end.
+  // Returns -EINVAL if `marker` does not parse, and -EIO if the object
+  // iterator throws.
+  int list(const DoutPrefixProvider* dpp, optional_yield y,
+           const rgw_pool& pool, const std::string& marker,
+           std::regular_invocable<std::string> auto filter,
+           std::span<std::string> entries,
+           sal::ListResult<std::string>& result)
+  {
+    librados::IoCtx ioctx;
+    int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+    if (r < 0) {
+      return r;
+    }
+    librados::ObjectCursor oc;
+    if (!oc.from_str(marker)) {
+      ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
+      return -EINVAL;
+    }
+    std::size_t count = 0;
+    try {
+      auto iter = ioctx.nobjects_begin(oc);
+      const auto end = ioctx.nobjects_end();
+      for (; count < entries.size() && iter != end; ++iter) {
+        std::string entry = filter(iter->get_oid());
+        if (!entry.empty()) {
+          entries[count++] = std::move(entry);
+        }
+      }
+      if (iter == end) {
+        result.next.clear();
+      } else {
+        result.next = iter.get_cursor().to_str();
+      }
+    } catch (const std::exception& e) {
+      ldpp_dout(dpp, 10) << "NObjectIterator exception " << e.what() << dendl;
+      return -EIO;
+    }
+    result.entries = entries.first(count);
+    return 0;
+  }
+
+  // send bl as a notify on pool:oid
+  int notify(const DoutPrefixProvider* dpp, optional_yield y,
+             const rgw_pool& pool, const std::string& oid,
+             bufferlist& bl, uint64_t timeout_ms);
+};
+
+// return `name` unless it is empty, in which case return `default_name`
+inline std::string_view name_or_default(std::string_view name,
+                                        std::string_view default_name)
+{
+  if (!name.empty()) {
+    return name;
+  }
+  return default_name;
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period.cc b/src/rgw/driver/rados/config/period.cc
new file mode 100644
index 000000000..bc3fa27e7
--- /dev/null
+++ b/src/rgw/driver/rados/config/period.cc
@@ -0,0 +1,230 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// period oids
+constexpr std::string_view period_info_oid_prefix = "periods.";
+constexpr std::string_view period_latest_epoch_info_oid = ".latest_epoch";
+constexpr std::string_view period_staging_suffix = ":staging";
+
+// oid of the period info object: "periods.<id>.<epoch>", or "periods.<id>"
+// for a staging period
+static std::string period_oid(std::string_view period_id, uint32_t epoch)
+{
+  // omit the epoch for the staging period
+  if (period_id.ends_with(period_staging_suffix)) {
+    return string_cat_reserve(period_info_oid_prefix, period_id);
+  }
+  return fmt::format("{}{}.{}", period_info_oid_prefix, period_id, epoch);
+}
+
+// oid of the object that records the latest epoch of a period; the suffix
+// comes from rgw_period_latest_epoch_info_oid when that option is set
+static std::string latest_epoch_oid(const ceph::common::ConfigProxy& conf,
+                                    std::string_view period_id)
+{
+  return string_cat_reserve(
+      period_info_oid_prefix, period_id,
+      name_or_default(conf->rgw_period_latest_epoch_info_oid,
+                      period_latest_epoch_info_oid));
+}
+
+// read the latest epoch of period_id into `epoch`; `epoch` is left untouched
+// on failure
+static int read_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                             ConfigImpl* impl, std::string_view period_id,
+                             uint32_t& epoch, RGWObjVersionTracker* objv)
+{
+  const auto& pool = impl->period_pool;
+  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+  RGWPeriodLatestEpochInfo latest;
+  int r = impl->read(dpp, y, pool, latest_oid, latest, objv);
+  if (r >= 0) {
+    epoch = latest.epoch;
+  }
+  return r;
+}
+
+// write `epoch` as the latest epoch of period_id; with `exclusive` the write
+// fails if the object already exists
+static int write_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                              ConfigImpl* impl, bool exclusive,
+                              std::string_view period_id, uint32_t epoch,
+                              RGWObjVersionTracker* objv)
+{
+  const auto& pool = impl->period_pool;
+  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  RGWPeriodLatestEpochInfo latest{epoch};
+  return impl->write(dpp, y, pool, latest_oid, create, latest, objv);
+}
+
+// remove the latest-epoch object of period_id
+static int delete_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                               ConfigImpl* impl, std::string_view period_id,
+                               RGWObjVersionTracker* objv)
+{
+  const auto& pool = impl->period_pool;
+  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+  return impl->remove(dpp, y, pool, latest_oid, objv);
+}
+
+// Advance the latest epoch of period_id to `epoch` with a read-modify-write
+// loop guarded by the object version.
+//
+// Returns 0 on success, -EEXIST if the stored epoch is already >= `epoch`,
+// -ECANCELED if MAX_RETRIES attempts all lost a race, or another negative
+// error from the read/write.
+static int update_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                               ConfigImpl* impl, std::string_view period_id,
+                               uint32_t epoch)
+{
+  static constexpr int MAX_RETRIES = 20;
+
+  for (int i = 0; i < MAX_RETRIES; i++) {
+    uint32_t existing_epoch = 0;
+    RGWObjVersionTracker objv;
+    bool exclusive = false;
+
+    // read existing epoch
+    int r = read_latest_epoch(dpp, y, impl, period_id, existing_epoch, &objv);
+    if (r == -ENOENT) {
+      // use an exclusive create to set the epoch atomically
+      exclusive = true;
+      objv.generate_new_write_ver(dpp->get_cct());
+      ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch
+          << " for period=" << period_id << dendl;
+    } else if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl;
+      return r;
+    } else if (epoch <= existing_epoch) {
+      r = -EEXIST; // fail with EEXIST if epoch is not newer
+      ldpp_dout(dpp, 10) << "found existing latest_epoch " << existing_epoch
+          << " >= given epoch " << epoch << ", returning r=" << r << dendl;
+      return r;
+    } else {
+      ldpp_dout(dpp, 20) << "updating latest_epoch from " << existing_epoch
+          << " -> " << epoch << " on period=" << period_id << dendl;
+    }
+
+    r = write_latest_epoch(dpp, y, impl, exclusive, period_id, epoch, &objv);
+    if (r == -EEXIST) {
+      continue; // exclusive create raced with another update, retry
+    } else if (r == -ECANCELED) {
+      continue; // write raced with a conflicting version, retry
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl;
+      return r;
+    }
+    return 0; // return success
+  }
+
+  return -ECANCELED; // fail after max retries
+}
+
+// Write the period info object for info's id/epoch, then try to advance the
+// latest epoch. Returns -EINVAL for an empty id or a zero epoch.
+int RadosConfigStore::create_period(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    const RGWPeriod& info)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_epoch() == 0) {
+    ldpp_dout(dpp, 0) << "period cannot have an empty epoch" << dendl;
+    return -EINVAL;
+  }
+  const auto& pool = impl->period_pool;
+  const auto info_oid = period_oid(info.get_id(), info.get_epoch());
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // best effort: the result is deliberately ignored, so an epoch that is not
+  // newer than the stored one (-EEXIST) does not fail the create
+  (void) update_latest_epoch(dpp, y, impl.get(), info.get_id(), info.get_epoch());
+  return 0;
+}
+
+// Read the period info for period_id at `epoch`, or at the stored latest
+// epoch when `epoch` is not given.
+int RadosConfigStore::read_period(const DoutPrefixProvider* dpp,
+                                  optional_yield y,
+                                  std::string_view period_id,
+                                  std::optional<uint32_t> epoch,
+                                  RGWPeriod& info)
+{
+  int r = 0;
+  if (!epoch) {
+    epoch = 0;
+    r = read_latest_epoch(dpp, y, impl.get(), period_id, *epoch, nullptr);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  const auto& pool = impl->period_pool;
+  const auto info_oid = period_oid(period_id, *epoch);
+  return impl->read(dpp, y, pool, info_oid, info, nullptr);
+}
+
+// Remove every epoch of period_id from 0 through the stored latest epoch,
+// then the latest-epoch object itself.
+int RadosConfigStore::delete_period(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string_view period_id)
+{
+  const auto& pool = impl->period_pool;
+
+  // read the latest_epoch
+  uint32_t latest_epoch = 0;
+  RGWObjVersionTracker latest_objv;
+  int r = read_latest_epoch(dpp, y, impl.get(), period_id,
+                            latest_epoch, &latest_objv);
+  if (r < 0 && r != -ENOENT) { // just delete epoch=0 on ENOENT
+    ldpp_dout(dpp, 0) << "failed to read latest epoch for period "
+        << period_id << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  for (uint32_t epoch = 0; epoch <= latest_epoch; epoch++) {
+    const auto info_oid = period_oid(period_id, epoch);
+    r = impl->remove(dpp, y, pool, info_oid, nullptr);
+    if (r < 0 && r != -ENOENT) { // ignore ENOENT
+      ldpp_dout(dpp, 0) << "failed to delete period " << info_oid
+          << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  // NOTE(review): on the ENOENT path above the latest-epoch object did not
+  // exist, so this remove presumably reports ENOENT to the caller as well;
+  // confirm that is the intended result.
+  return delete_latest_epoch(dpp, y, impl.get(), period_id, &latest_objv);
+}
+
+// List period ids by scanning the period pool for latest-epoch objects and
+// stripping the "periods." prefix and ".latest_epoch" suffix.
+int RadosConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const std::string& marker,
+                                      std::span<std::string> entries,
+                                      sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->period_pool;
+  // NOTE(review): this matches the built-in suffix only, whereas
+  // latest_epoch_oid() honours rgw_period_latest_epoch_info_oid; with a
+  // non-default setting no object would match here. Confirm.
+  constexpr auto prefix = [] (std::string oid) -> std::string {
+      if (!oid.starts_with(period_info_oid_prefix)) {
+        return {};
+      }
+      if (!oid.ends_with(period_latest_epoch_info_oid)) {
+        return {};
+      }
+      // trim the prefix and suffix
+      const std::size_t count = oid.size() -
+          period_info_oid_prefix.size() -
+          period_latest_epoch_info_oid.size();
+      return oid.substr(period_info_oid_prefix.size(), count);
+    };
+
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period_config.cc b/src/rgw/driver/rados/config/period_config.cc
new file mode 100644
index 000000000..ec984ebdc
--- /dev/null
+++ b/src/rgw/driver/rados/config/period_config.cc
@@ -0,0 +1,55 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// period config oids
+constexpr std::string_view period_config_prefix = "period_config.";
+constexpr std::string_view period_config_realm_default = "default";
+
+// oid of the period config object for a realm: "period_config.<realm_id>",
+// with "default" substituted for an empty realm id
+std::string period_config_oid(std::string_view realm_id)
+{
+  if (realm_id.empty()) {
+    realm_id = period_config_realm_default;
+  }
+  return string_cat_reserve(period_config_prefix, realm_id);
+}
+
+// read the period config of realm_id from the period pool
+int RadosConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view realm_id,
+                                         RGWPeriodConfig& info)
+{
+  const auto& pool = impl->period_pool;
+  const auto oid = period_config_oid(realm_id);
+  return impl->read(dpp, y, pool, oid, info, nullptr);
+}
+
+// write the period config of realm_id; with `exclusive` the write fails if
+// the object already exists
+int RadosConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+                                          optional_yield y, bool exclusive,
+                                          std::string_view realm_id,
+                                          const RGWPeriodConfig& info)
+{
+  const auto& pool = impl->period_pool;
+  const auto oid = period_config_oid(realm_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  return impl->write(dpp, y, pool, oid, create, info, nullptr);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/realm.cc b/src/rgw/driver/rados/config/realm.cc
new file mode 100644
index 000000000..331e0ffd2
--- /dev/null
+++ b/src/rgw/driver/rados/config/realm.cc
@@ -0,0 +1,364 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// realm oids
+constexpr std::string_view realm_names_oid_prefix = "realms_names.";
+constexpr std::string_view realm_info_oid_prefix = "realms.";
+constexpr std::string_view realm_control_oid_suffix = ".control";
+constexpr std::string_view default_realm_info_oid = "default.realm";
+
+// "realms.<id>": the realm info object
+static std::string realm_info_oid(std::string_view realm_id)
+{
+  return string_cat_reserve(realm_info_oid_prefix, realm_id);
+}
+// "realms_names.<name>": the name -> id index object
+static std::string realm_name_oid(std::string_view realm_name)
+{
+  return string_cat_reserve(realm_names_oid_prefix, realm_name);
+}
+// "realms.<id>.control": the watch/notify control object
+static std::string realm_control_oid(std::string_view realm_id)
+{
+  return string_cat_reserve(realm_info_oid_prefix, realm_id,
+                            realm_control_oid_suffix);
+}
+// oid of the default-realm pointer; rgw_default_realm_info_oid overrides the
+// built-in name when set
+static std::string default_realm_oid(const ceph::common::ConfigProxy& conf)
+{
+  return std::string{name_or_default(conf->rgw_default_realm_info_oid,
+                                     default_realm_info_oid)};
+}
+
+
+// record realm_id as the default realm
+int RadosConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y, bool exclusive,
+                                             std::string_view realm_id)
+{
+  const auto& pool = impl->realm_pool;
+  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  default_info.default_id = realm_id;
+
+  return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
+}
+
+// read the default realm id; realm_id is left untouched on failure
+int RadosConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string& realm_id)
+{
+  const auto& pool = impl->realm_pool;
+  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
+  if (r >= 0) {
+    realm_id = default_info.default_id;
+  }
+  return r;
+}
+
+// remove the default-realm pointer object
+int RadosConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                              optional_yield y)
+{
+  const auto& pool = impl->realm_pool;
+  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+
+  return impl->remove(dpp, y, pool, oid, nullptr);
+}
+
+
+// Writer bound to one realm. It carries the object version captured when the
+// realm was created or read, and passes it to every write/remove of the
+// realm info object.
+class RadosRealmWriter : public sal::RealmWriter {
+  ConfigImpl* impl;
+  RGWObjVersionTracker objv;
+  std::string realm_id;
+  std::string realm_name;
+ public:
+  RadosRealmWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+                   std::string_view realm_id, std::string_view realm_name)
+    : impl(impl), objv(std::move(objv)),
+      realm_id(realm_id), realm_name(realm_name)
+  {
+  }
+
+  // overwrite the realm info; the id and name must match this writer's realm
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWRealm& info) override
+  {
+    if (realm_id != info.get_id() || realm_name != info.get_name()) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+
+    const auto& pool = impl->realm_pool;
+    const auto info_oid = realm_info_oid(info.get_id());
+    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+  }
+
+  // Rename the realm: link the new name exclusively, rewrite the info with
+  // the new name, then unlink the old name (best effort).
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWRealm& info, std::string_view new_name) override
+  {
+    if (realm_id != info.get_id() || realm_name != info.get_name()) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    const auto& pool = impl->realm_pool;
+    const auto name = RGWNameToId{info.get_id()};
+    const auto info_oid = realm_info_oid(info.get_id());
+    const auto old_oid = realm_name_oid(info.get_name());
+    const auto new_oid = realm_name_oid(new_name);
+
+    // link the new name
+    RGWObjVersionTracker new_objv;
+    new_objv.generate_new_write_ver(dpp->get_cct());
+    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+                        name, &new_objv);
+    if (r < 0) {
+      return r;
+    }
+
+    // write the info with updated name
+    // NOTE(review): if the write below fails, `info` keeps the new name even
+    // though the stored realm and this writer still use the old one.
+    info.set_name(std::string{new_name});
+    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+    if (r < 0) {
+      // on failure, unlink the new name
+      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+      return r;
+    }
+
+    // unlink the old name
+    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+    realm_name = new_name;
+    return 0;
+  }
+
+  // remove the realm info, then its name index and control objects (the
+  // latter two best effort)
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    const auto& pool = impl->realm_pool;
+    const auto info_oid = realm_info_oid(realm_id);
+    int r = impl->remove(dpp, y, pool, info_oid, &objv);
+    if (r < 0) {
+      return r;
+    }
+    const auto name_oid = realm_name_oid(realm_name);
+    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+    const auto control_oid = realm_control_oid(realm_id);
+    (void) impl->remove(dpp, y, pool, control_oid, nullptr);
+    return 0;
+  }
+}; // RadosRealmWriter
+
+
+// Create a realm: write the info object, the name index and the control
+// object, undoing the earlier writes if a later one fails. On success,
+// *writer (when requested) is bound to the new realm.
+int RadosConfigStore::create_realm(const DoutPrefixProvider* dpp,
+                                   optional_yield y, bool exclusive,
+                                   const RGWRealm& info,
+                                   std::unique_ptr<sal::RealmWriter>* writer)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_name().empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  const auto& pool = impl->realm_pool;
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  // write the realm info
+  const auto info_oid = realm_info_oid(info.get_id());
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // write the realm name
+  const auto name_oid = realm_name_oid(info.get_name());
+  const auto name = RGWNameToId{info.get_id()};
+  RGWObjVersionTracker name_objv;
+  name_objv.generate_new_write_ver(dpp->get_cct());
+
+  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  // create control object for watch/notify
+  const auto control_oid = realm_control_oid(info.get_id());
+  bufferlist empty_bl;
+  r = impl->write(dpp, y, pool, control_oid, Create::MayExist,
+                  empty_bl, nullptr);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, name_oid, &name_objv);
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+// read a realm by id, optionally returning a writer bound to it
+int RadosConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       std::string_view realm_id,
+                                       RGWRealm& info,
+                                       std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+  const auto info_oid = realm_info_oid(realm_id);
+  RGWObjVersionTracker objv;
+  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+// read a realm by name (name index -> id -> info), optionally returning a
+// writer bound to it
+int RadosConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view realm_name,
+                                         RGWRealm& info,
+                                         std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+
+  // look up realm id by name
+  RGWNameToId name;
+  const auto name_oid = realm_name_oid(realm_name);
+  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = realm_info_oid(name.obj_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+// read the default realm (default pointer -> id -> info), optionally
+// returning a writer bound to it
+int RadosConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         RGWRealm& info,
+                                         std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+
+  // read default realm id
+  RGWDefaultSystemMetaObjInfo default_info;
+  const auto default_oid = default_realm_oid(dpp->get_cct()->_conf);
+  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = realm_info_oid(default_info.default_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+// resolve a realm name to its id through the name index
+int RadosConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string_view realm_name,
+                                    std::string& realm_id)
+{
+  const auto& pool = impl->realm_pool;
+  RGWNameToId name;
+
+  // look up realm id by name
+  const auto name_oid = realm_name_oid(realm_name);
+  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+  realm_id = std::move(name.obj_id);
+  return 0;
+}
+
+// notify watchers of the realm's control object about a new period
+int RadosConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                              optional_yield y,
+                                              const RGWPeriod& period)
+{
+  const auto& pool = impl->realm_pool;
+  const auto control_oid = realm_control_oid(period.get_realm());
+
+  bufferlist bl;
+  using ceph::encode;
+  // push the period to dependent zonegroups/zones
+  encode(RGWRealmNotify::ZonesNeedPeriod, bl);
+  encode(period, bl);
+  // reload the gateway with the new period
+  encode(RGWRealmNotify::Reload, bl);
+
+  constexpr uint64_t timeout_ms = 0;
+  return impl->notify(dpp, y, pool, control_oid, bl, timeout_ms);
+}
+
+// list realm names by scanning the realm pool for name index objects and
+// stripping the "realms_names." prefix
+int RadosConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       const std::string& marker,
+                                       std::span<std::string> entries,
+                                       sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->realm_pool;
+  constexpr auto prefix = [] (std::string oid) -> std::string {
+      if (!oid.starts_with(realm_names_oid_prefix)) {
+        return {};
+      }
+      return oid.substr(realm_names_oid_prefix.size());
+    };
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.cc b/src/rgw/driver/rados/config/store.cc
new file mode 100644
index 000000000..ec2b034a8
--- /dev/null
+++ b/src/rgw/driver/rados/config/store.cc
@@ -0,0 +1,52 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "impl.h"
+#include "store.h"
+
+namespace rgw::rados {
+
+RadosConfigStore::RadosConfigStore(std::unique_ptr<ConfigImpl> impl)
+  : impl(std::move(impl))
+{
+}
+
+// defined here, where ConfigImpl is a complete type (store.h only
+// forward-declares it)
+RadosConfigStore::~RadosConfigStore() = default;
+
+
+// Build a RadosConfigStore: resolve the pools from the configuration, then
+// initialize and connect the rados client. Returns nullptr (after logging)
+// if either step fails.
+auto create_config_store(const DoutPrefixProvider* dpp)
+  -> std::unique_ptr<RadosConfigStore>
+{
+  auto impl = std::make_unique<ConfigImpl>(dpp->get_cct()->_conf);
+
+  // initialize a Rados client
+  int r = impl->rados.init_with_context(dpp->get_cct());
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "Rados client initialization failed with "
+        << cpp_strerror(-r) << dendl;
+    return nullptr;
+  }
+  r = impl->rados.connect();
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "Rados client connection failed with "
+        << cpp_strerror(-r) << dendl;
+    return nullptr;
+  }
+
+  return std::make_unique<RadosConfigStore>(std::move(impl));
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.h b/src/rgw/driver/rados/config/store.h
new file mode 100644
index 000000000..1b93a803d
--- /dev/null
+++ b/src/rgw/driver/rados/config/store.h
@@ -0,0 +1,182 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+// NOTE(review): the three header names below were stripped from the text.
+// <memory> and <string> are needed by the declarations in this file; <list>
+// is an assumption. Confirm against the upstream header.
+#include <list>
+#include <memory>
+#include <string>
+#include "rgw_common.h"
+#include "rgw_sal_config.h"
+
+class DoutPrefixProvider;
+class optional_yield;
+
+namespace rgw::rados {
+
+struct ConfigImpl;
+
+/// sal::ConfigStore implementation that keeps realm, period, zonegroup and
+/// zone configuration in rados objects. All state lives in ConfigImpl.
+class RadosConfigStore : public sal::ConfigStore {
+ public:
+  explicit RadosConfigStore(std::unique_ptr<ConfigImpl> impl);
+  virtual ~RadosConfigStore() override;
+
+  // Realm
+  virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y, bool exclusive,
+                                     std::string_view realm_id) override;
+  virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string& realm_id) override;
+  virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                      optional_yield y) override;
+
+  virtual int create_realm(const DoutPrefixProvider* dpp,
+                           optional_yield y, bool exclusive,
+                           const RGWRealm& info,
+                           std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+                               optional_yield y,
+                               std::string_view realm_id,
+                               RGWRealm& info,
+                               std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_name,
+                                 RGWRealm& info,
+                                 std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_default_realm(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 RGWRealm& info,
+                                 std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_id(const DoutPrefixProvider* dpp,
+                            optional_yield y, std::string_view realm_name,
+                            std::string& realm_id) override;
+  virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const RGWPeriod& period) override;
+  virtual int list_realm_names(const DoutPrefixProvider* dpp,
+                               optional_yield y, const std::string& marker,
+                               std::span<std::string> entries,
+                               sal::ListResult<std::string>& result) override;
+
+  // Period
+  virtual int create_period(const DoutPrefixProvider* dpp,
+                            optional_yield y, bool exclusive,
+                            const RGWPeriod& info) override;
+  virtual int read_period(const DoutPrefixProvider* dpp,
+                          optional_yield y, std::string_view period_id,
+                          std::optional<uint32_t> epoch, RGWPeriod& info) override;
+  virtual int delete_period(const DoutPrefixProvider* dpp,
+                            optional_yield y,
+                            std::string_view period_id) override;
+  virtual int list_period_ids(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              sal::ListResult<std::string>& result) override;
+
+  // ZoneGroup
+  virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                         optional_yield y, bool exclusive,
+                                         std::string_view realm_id,
+                                         std::string_view zonegroup_id) override;
+  virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        std::string& zonegroup_id) override;
+  virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view realm_id) override;
+
+  virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+                               optional_yield y, bool exclusive,
+                               const RGWZoneGroup& info,
+                               std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view zonegroup_id,
+                                   RGWZoneGroup& info,
+                                   std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view zonegroup_name,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                   optional_yield y, const std::string& marker,
+                                   std::span<std::string> entries,
+                                   sal::ListResult<std::string>& result) override;
+
+  // Zone
+  virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    std::string_view realm_id,
+                                    std::string_view zone_id) override;
+  virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view realm_id,
+                                   std::string& zone_id) override;
+  virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id) override;
+
+  virtual int create_zone(const DoutPrefixProvider* dpp,
+                          optional_yield y, bool exclusive,
+                          const RGWZoneParams& info,
+                          std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+                              optional_yield y,
+                              std::string_view zone_id,
+                              RGWZoneParams& info,
+                              std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view zone_name,
+                                RGWZoneParams& info,
+                                std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_default_zone(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view realm_id,
+                                RGWZoneParams& info,
+                                std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int list_zone_names(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              sal::ListResult<std::string>& result) override;
+
+  // PeriodConfig
+  virtual int read_period_config(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_id,
+                                 RGWPeriodConfig& info) override;
+  virtual int write_period_config(const DoutPrefixProvider* dpp,
+                                  optional_yield y, bool exclusive,
+                                  std::string_view realm_id,
+                                  const RGWPeriodConfig& info) override;
+
+ private:
+  std::unique_ptr<ConfigImpl> impl;
+}; // RadosConfigStore
+
+
+/// RadosConfigStore factory function
+auto create_config_store(const DoutPrefixProvider* dpp)
+  -> std::unique_ptr<RadosConfigStore>;
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zone.cc b/src/rgw/driver/rados/config/zone.cc
new file mode 100644
index 000000000..e06c1606c
--- /dev/null
+++ b/src/rgw/driver/rados/config/zone.cc
@@ -0,0 +1,312 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/dout.h" +#include "common/errno.h" +#include "rgw_zone.h" +#include "driver/rados/config/store.h" + +#include "impl.h" + +namespace rgw::rados { + +// zone oids +constexpr std::string_view zone_info_oid_prefix = "zone_info."; +constexpr std::string_view zone_names_oid_prefix = "zone_names."; + +std::string zone_info_oid(std::string_view zone_id) +{ + return string_cat_reserve(zone_info_oid_prefix, zone_id); +} +std::string zone_name_oid(std::string_view zone_id) +{ + return string_cat_reserve(zone_names_oid_prefix, zone_id); +} +std::string default_zone_oid(const ceph::common::ConfigProxy& conf, + std::string_view realm_id) +{ + return fmt::format("{}.{}", conf->rgw_default_zone_info_oid, realm_id); +} + + +int RadosConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + bool exclusive, + std::string_view realm_id, + std::string_view zone_id) +{ + const auto& pool = impl->zone_pool; + const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id); + const auto create = exclusive ? 
Create::MustNotExist : Create::MayExist; + + RGWDefaultSystemMetaObjInfo default_info; + default_info.default_id = zone_id; + + return impl->write(dpp, y, pool, default_oid, create, default_info, nullptr); +} + +int RadosConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zone_id) +{ + const auto& pool = impl->zone_pool; + const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id); + + RGWDefaultSystemMetaObjInfo default_info; + int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr); + if (r >= 0) { + zone_id = default_info.default_id; + } + return r; +} + +int RadosConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) +{ + const auto& pool = impl->zone_pool; + const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id); + + return impl->remove(dpp, y, pool, default_oid, nullptr); +} + + +class RadosZoneWriter : public sal::ZoneWriter { + ConfigImpl* impl; + RGWObjVersionTracker objv; + std::string zone_id; + std::string zone_name; + public: + RadosZoneWriter(ConfigImpl* impl, RGWObjVersionTracker objv, + std::string_view zone_id, std::string_view zone_name) + : impl(impl), objv(std::move(objv)), + zone_id(zone_id), zone_name(zone_name) + { + } + + int write(const DoutPrefixProvider* dpp, optional_yield y, + const RGWZoneParams& info) override + { + if (zone_id != info.get_id() || zone_name != info.get_name()) { + return -EINVAL; // can't modify zone id or name directly + } + + const auto& pool = impl->zone_pool; + const auto info_oid = zone_info_oid(info.get_id()); + return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv); + } + + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWZoneParams& info, std::string_view new_name) override + { + if (zone_id != info.get_id() || zone_name != info.get_name()) { + return -EINVAL; // can't modify zone id or name 
directly + } + if (new_name.empty()) { + ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl; + return -EINVAL; + } + + const auto& pool = impl->zone_pool; + const auto name = RGWNameToId{info.get_id()}; + const auto info_oid = zone_info_oid(info.get_id()); + const auto old_oid = zone_name_oid(info.get_name()); + const auto new_oid = zone_name_oid(new_name); + + // link the new name + RGWObjVersionTracker new_objv; + new_objv.generate_new_write_ver(dpp->get_cct()); + int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist, + name, &new_objv); + if (r < 0) { + return r; + } + + // write the info with updated name + info.set_name(std::string{new_name}); + r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv); + if (r < 0) { + // on failure, unlink the new name + (void) impl->remove(dpp, y, pool, new_oid, &new_objv); + return r; + } + + // unlink the old name + (void) impl->remove(dpp, y, pool, old_oid, nullptr); + + zone_name = new_name; + return 0; + } + + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + const auto& pool = impl->zone_pool; + const auto info_oid = zone_info_oid(zone_id); + int r = impl->remove(dpp, y, pool, info_oid, &objv); + if (r < 0) { + return r; + } + const auto name_oid = zone_name_oid(zone_name); + (void) impl->remove(dpp, y, pool, name_oid, nullptr); + return 0; + } +}; // RadosZoneWriter + + +int RadosConfigStore::create_zone(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneParams& info, + std::unique_ptr* writer) +{ + if (info.get_id().empty()) { + ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl; + return -EINVAL; + } + if (info.get_name().empty()) { + ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl; + return -EINVAL; + } + + const auto& pool = impl->zone_pool; + const auto create = exclusive ? 
Create::MustNotExist : Create::MayExist; + + // write the zone info + const auto info_oid = zone_info_oid(info.get_id()); + RGWObjVersionTracker objv; + objv.generate_new_write_ver(dpp->get_cct()); + + int r = impl->write(dpp, y, pool, info_oid, create, info, &objv); + if (r < 0) { + return r; + } + + // write the zone name + const auto name_oid = zone_name_oid(info.get_name()); + const auto name = RGWNameToId{info.get_id()}; + RGWObjVersionTracker name_objv; + name_objv.generate_new_write_ver(dpp->get_cct()); + + r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv); + if (r < 0) { + (void) impl->remove(dpp, y, pool, info_oid, &objv); + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_id, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + const auto& pool = impl->zone_pool; + const auto info_oid = zone_info_oid(zone_id); + RGWObjVersionTracker objv; + + int r = impl->read(dpp, y, pool, info_oid, info, &objv); + if (r < 0) { + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + const auto& pool = impl->zone_pool; + + // look up zone id by name + const auto name_oid = zone_name_oid(zone_name); + RGWNameToId name; + int r = impl->read(dpp, y, pool, name_oid, name, nullptr); + if (r < 0) { + return r; + } + + const auto info_oid = zone_info_oid(name.obj_id); + RGWObjVersionTracker objv; + r = impl->read(dpp, y, pool, info_oid, info, &objv); + if (r < 0) { + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), 
info.get_name()); + } + return 0; +} + +int RadosConfigStore::read_default_zone(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + const auto& pool = impl->zone_pool; + + // read default zone id + const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id); + RGWDefaultSystemMetaObjInfo default_info; + int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr); + if (r < 0) { + return r; + } + + const auto info_oid = zone_info_oid(default_info.default_id); + RGWObjVersionTracker objv; + r = impl->read(dpp, y, pool, info_oid, info, &objv); + if (r < 0) { + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::list_zone_names(const DoutPrefixProvider* dpp, + optional_yield y, + const std::string& marker, + std::span entries, + sal::ListResult& result) +{ + const auto& pool = impl->zone_pool; + constexpr auto prefix = [] (std::string oid) -> std::string { + if (!oid.starts_with(zone_names_oid_prefix)) { + return {}; + } + return oid.substr(zone_names_oid_prefix.size()); + }; + return impl->list(dpp, y, pool, marker, prefix, entries, result); +} + +} // namespace rgw::rados diff --git a/src/rgw/driver/rados/config/zonegroup.cc b/src/rgw/driver/rados/config/zonegroup.cc new file mode 100644 index 000000000..1766a68ce --- /dev/null +++ b/src/rgw/driver/rados/config/zonegroup.cc @@ -0,0 +1,315 @@ +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/dout.h" +#include "common/errno.h" +#include "rgw_zone.h" +#include "driver/rados/config/store.h" + +#include "impl.h" + +namespace rgw::rados { + +// zonegroup oids +constexpr std::string_view zonegroup_names_oid_prefix = "zonegroups_names."; +constexpr std::string_view zonegroup_info_oid_prefix = "zonegroup_info."; +constexpr std::string_view default_zonegroup_info_oid = "default.zonegroup"; + +static std::string zonegroup_info_oid(std::string_view zonegroup_id) +{ + return string_cat_reserve(zonegroup_info_oid_prefix, zonegroup_id); +} +static std::string zonegroup_name_oid(std::string_view zonegroup_id) +{ + return string_cat_reserve(zonegroup_names_oid_prefix, zonegroup_id); +} +static std::string default_zonegroup_oid(const ceph::common::ConfigProxy& conf, + std::string_view realm_id) +{ + const auto prefix = name_or_default(conf->rgw_default_zonegroup_info_oid, + default_zonegroup_info_oid); + return fmt::format("{}.{}", prefix, realm_id); +} + + +int RadosConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + bool exclusive, + std::string_view realm_id, + std::string_view zonegroup_id) +{ + const auto& pool = impl->zonegroup_pool; + const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id); + const auto create = exclusive ? 
Create::MustNotExist : Create::MayExist; + + RGWDefaultSystemMetaObjInfo default_info; + default_info.default_id = zonegroup_id; + + return impl->write(dpp, y, pool, oid, create, default_info, nullptr); +} + +int RadosConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zonegroup_id) +{ + const auto& pool = impl->zonegroup_pool; + const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id); + + RGWDefaultSystemMetaObjInfo default_info; + int r = impl->read(dpp, y, pool, oid, default_info, nullptr); + if (r >= 0) { + zonegroup_id = default_info.default_id; + } + return r; +} + +int RadosConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) +{ + const auto& pool = impl->zonegroup_pool; + const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id); + return impl->remove(dpp, y, pool, oid, nullptr); +} + + +class RadosZoneGroupWriter : public sal::ZoneGroupWriter { + ConfigImpl* impl; + RGWObjVersionTracker objv; + std::string zonegroup_id; + std::string zonegroup_name; + public: + RadosZoneGroupWriter(ConfigImpl* impl, RGWObjVersionTracker objv, + std::string_view zonegroup_id, + std::string_view zonegroup_name) + : impl(impl), objv(std::move(objv)), + zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name) + { + } + + int write(const DoutPrefixProvider* dpp, optional_yield y, + const RGWZoneGroup& info) override + { + if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) { + return -EINVAL; // can't modify zonegroup id or name directly + } + + const auto& pool = impl->zonegroup_pool; + const auto info_oid = zonegroup_info_oid(info.get_id()); + return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv); + } + + int rename(const DoutPrefixProvider* dpp, optional_yield y, + RGWZoneGroup& info, std::string_view new_name) override + { + if (zonegroup_id != 
info.get_id() || zonegroup_name != info.get_name()) { + return -EINVAL; // can't modify zonegroup id or name directly + } + if (new_name.empty()) { + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl; + return -EINVAL; + } + + const auto& pool = impl->zonegroup_pool; + const auto name = RGWNameToId{info.get_id()}; + const auto info_oid = zonegroup_info_oid(info.get_id()); + const auto old_oid = zonegroup_name_oid(info.get_name()); + const auto new_oid = zonegroup_name_oid(new_name); + + // link the new name + RGWObjVersionTracker new_objv; + new_objv.generate_new_write_ver(dpp->get_cct()); + int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist, + name, &new_objv); + if (r < 0) { + return r; + } + + // write the info with updated name + info.set_name(std::string{new_name}); + r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv); + if (r < 0) { + // on failure, unlink the new name + (void) impl->remove(dpp, y, pool, new_oid, &new_objv); + return r; + } + + // unlink the old name + (void) impl->remove(dpp, y, pool, old_oid, nullptr); + + zonegroup_name = new_name; + return 0; + } + + int remove(const DoutPrefixProvider* dpp, optional_yield y) override + { + const auto& pool = impl->zonegroup_pool; + const auto info_oid = zonegroup_info_oid(zonegroup_id); + int r = impl->remove(dpp, y, pool, info_oid, &objv); + if (r < 0) { + return r; + } + const auto name_oid = zonegroup_name_oid(zonegroup_name); + (void) impl->remove(dpp, y, pool, name_oid, nullptr); + return 0; + } +}; // RadosZoneGroupWriter + + +int RadosConfigStore::create_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneGroup& info, + std::unique_ptr* writer) +{ + if (info.get_id().empty()) { + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl; + return -EINVAL; + } + if (info.get_name().empty()) { + ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl; + return -EINVAL; + } + + const auto& 
pool = impl->zonegroup_pool; + const auto create = exclusive ? Create::MustNotExist : Create::MayExist; + + // write the zonegroup info + const auto info_oid = zonegroup_info_oid(info.get_id()); + RGWObjVersionTracker objv; + objv.generate_new_write_ver(dpp->get_cct()); + + int r = impl->write(dpp, y, pool, info_oid, create, info, &objv); + if (r < 0) { + return r; + } + + // write the zonegroup name + const auto name_oid = zonegroup_name_oid(info.get_name()); + const auto name = RGWNameToId{info.get_id()}; + RGWObjVersionTracker name_objv; + name_objv.generate_new_write_ver(dpp->get_cct()); + + r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv); + if (r < 0) { + (void) impl->remove(dpp, y, pool, info_oid, &objv); + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_id, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + const auto& pool = impl->zonegroup_pool; + const auto info_oid = zonegroup_info_oid(zonegroup_id); + RGWObjVersionTracker objv; + + int r = impl->read(dpp, y, pool, info_oid, info, &objv); + if (r < 0) { + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + const auto& pool = impl->zonegroup_pool; + + // look up zonegroup id by name + RGWNameToId name; + const auto name_oid = zonegroup_name_oid(zonegroup_name); + int r = impl->read(dpp, y, pool, name_oid, name, nullptr); + if (r < 0) { + return r; + } + + const auto info_oid = zonegroup_info_oid(name.obj_id); + RGWObjVersionTracker objv; + r = impl->read(dpp, y, pool, info_oid, info, 
&objv); + if (r < 0) { + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + const auto& pool = impl->zonegroup_pool; + + // read default zonegroup id + RGWDefaultSystemMetaObjInfo default_info; + const auto default_oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id); + int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr); + if (r < 0) { + return r; + } + + const auto info_oid = zonegroup_info_oid(default_info.default_id); + RGWObjVersionTracker objv; + r = impl->read(dpp, y, pool, info_oid, info, &objv); + if (r < 0) { + return r; + } + + if (writer) { + *writer = std::make_unique( + impl.get(), std::move(objv), info.get_id(), info.get_name()); + } + return 0; +} + +int RadosConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp, + optional_yield y, + const std::string& marker, + std::span entries, + sal::ListResult& result) +{ + const auto& pool = impl->zonegroup_pool; + constexpr auto prefix = [] (std::string oid) -> std::string { + if (!oid.starts_with(zonegroup_names_oid_prefix)) { + return {}; + } + return oid.substr(zonegroup_names_oid_prefix.size()); + }; + return impl->list(dpp, y, pool, marker, prefix, entries, result); +} + +} // namespace rgw::rados diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc new file mode 100644 index 000000000..32cd1ccf9 --- /dev/null +++ b/src/rgw/driver/rados/rgw_bucket.cc @@ -0,0 +1,3316 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_acl_s3.h" +#include "rgw_tag_s3.h" + +#include "rgw_bucket.h" +#include "rgw_op.h" +#include "rgw_bucket_sync.h" + +#include "services/svc_zone.h" +#include 
"services/svc_bucket.h" +#include "services/svc_user.h" + +#include "rgw_reshard.h" + +// stolen from src/cls/version/cls_version.cc +#define VERSION_ATTR "ceph.objclass.version" + +#include "cls/user/cls_user_types.h" + +#include "rgw_sal_rados.h" + +#define dout_subsys ceph_subsys_rgw + +// seconds for timeout during RGWBucket::check_object_index +constexpr uint64_t BUCKET_TAG_QUICK_TIMEOUT = 30; + +using namespace std; + +// these values are copied from cls/rgw/cls_rgw.cc +static const string BI_OLH_ENTRY_NS_START = "\x80" "1001_"; +static const string BI_INSTANCE_ENTRY_NS_START = "\x80" "1000_"; + +// number of characters that we should allow to be buffered by the formatter +// before flushing (used by index check methods with dump_keys=true) +static constexpr int FORMATTER_LEN_FLUSH_THRESHOLD = 4 * 1024 * 1024; + +// default number of entries to list with each bucket listing call +// (use marker to bridge between calls) +static constexpr size_t listing_max_entries = 1000; + +/* + * The tenant_name is always returned on purpose. May be empty, of course. 
+ */ +static void parse_bucket(const string& bucket, + string *tenant_name, + string *bucket_name, + string *bucket_instance = nullptr /* optional */) +{ + /* + * expected format: [tenant/]bucket:bucket_instance + */ + int pos = bucket.find('/'); + if (pos >= 0) { + *tenant_name = bucket.substr(0, pos); + } else { + tenant_name->clear(); + } + string bn = bucket.substr(pos + 1); + pos = bn.find (':'); + if (pos < 0) { + *bucket_name = std::move(bn); + return; + } + *bucket_name = bn.substr(0, pos); + if (bucket_instance) { + *bucket_instance = bn.substr(pos + 1); + } + + /* + * deal with the possible tenant:bucket:bucket_instance case + */ + if (tenant_name->empty()) { + pos = bucket_instance->find(':'); + if (pos >= 0) { + *tenant_name = *bucket_name; + *bucket_name = bucket_instance->substr(0, pos); + *bucket_instance = bucket_instance->substr(pos + 1); + } + } +} + +static void dump_mulipart_index_results(list& objs_to_unlink, + Formatter *f) +{ + for (const auto& o : objs_to_unlink) { + f->dump_string("object", o.name); + } +} + +void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User& user, + bool fix, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + rgw::sal::BucketList user_buckets; + string marker; + + CephContext *cct = driver->ctx(); + + size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk; + + do { + int ret = user.list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y); + if (ret < 0) { + ldout(driver->ctx(), 0) << "failed to read user buckets: " + << cpp_strerror(-ret) << dendl; + return; + } + + map>& buckets = user_buckets.get_buckets(); + for (auto i = buckets.begin(); + i != buckets.end(); + ++i) { + marker = i->first; + + auto& bucket = i->second; + + std::unique_ptr actual_bucket; + int r = driver->get_bucket(dpp, &user, user.get_tenant(), bucket->get_name(), &actual_bucket, y); + if (r < 0) { + ldout(driver->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl; + continue; + 
} + + if (actual_bucket->get_name().compare(bucket->get_name()) != 0 || + actual_bucket->get_tenant().compare(bucket->get_tenant()) != 0 || + actual_bucket->get_marker().compare(bucket->get_marker()) != 0 || + actual_bucket->get_bucket_id().compare(bucket->get_bucket_id()) != 0) { + cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl; + if (fix) { + cout << "fixing" << std::endl; + r = actual_bucket->chown(dpp, user, y); + if (r < 0) { + cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl; + } + } + } + } + } while (user_buckets.is_truncated()); +} + +// returns true if entry is in the empty namespace. note: function +// type conforms to type RGWBucketListNameFilter +bool rgw_bucket_object_check_filter(const std::string& oid) +{ + const static std::string empty_ns; + rgw_obj_key key; // thrown away but needed for parsing + return rgw_obj_key::oid_to_key_in_ns(oid, &key, empty_ns); +} + +int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key) +{ + if (key.instance.empty()) { + key.instance = "null"; + } + + std::unique_ptr object = bucket->get_object(key); + + return object->delete_object(dpp, null_yield); +} + +static void set_err_msg(std::string *sink, std::string msg) +{ + if (sink && !msg.empty()) + *sink = msg; +} + +int RGWBucket::init(rgw::sal::Driver* _driver, RGWBucketAdminOpState& op_state, + optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg) +{ + if (!_driver) { + set_err_msg(err_msg, "no storage!"); + return -EINVAL; + } + + driver = _driver; + + std::string bucket_name = op_state.get_bucket_name(); + + if (bucket_name.empty() && op_state.get_user_id().empty()) + return -EINVAL; + + user = driver->get_user(op_state.get_user_id()); + std::string tenant = user->get_tenant(); + + // split possible tenant/name + auto pos = bucket_name.find('/'); + if (pos != string::npos) { + tenant = bucket_name.substr(0, pos); + 
bucket_name = bucket_name.substr(pos + 1); + } + + int r = driver->get_bucket(dpp, user.get(), tenant, bucket_name, &bucket, y); + if (r < 0) { + set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket_name); + return r; + } + + op_state.set_bucket(bucket->clone()); + + if (!rgw::sal::User::empty(user.get())) { + r = user->load_user(dpp, y); + if (r < 0) { + set_err_msg(err_msg, "failed to fetch user info"); + return r; + } + } + + op_state.display_name = user->get_display_name(); + + clear_failure(); + return 0; +} + +bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver, + const string& marker, const string& bucket_id, rgw_bucket* bucket_out) +{ + void *handle = NULL; + bool truncated = false; + string s; + + int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + driver->meta_list_keys_complete(handle); + return -ret; + } + do { + list keys; + ret = driver->meta_list_keys_next(dpp, handle, 1000, keys, &truncated); + if (ret < 0) { + cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + driver->meta_list_keys_complete(handle); + return -ret; + } + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + s = *iter; + ret = rgw_bucket_parse_bucket_key(cct, s, bucket_out, nullptr); + if (ret < 0) { + continue; + } + if (bucket_id == bucket_out->bucket_id) { + driver->meta_list_keys_complete(handle); + return true; + } + } + } while (truncated); + driver->meta_list_keys_complete(handle); + return false; +} + +int RGWBucket::chown(RGWBucketAdminOpState& op_state, const string& marker, + optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg) +{ + /* User passed in by rgw_admin is the new user; get the current user and set it in + * the bucket */ + std::unique_ptr old_user = driver->get_user(bucket->get_info().owner); + 
bucket->set_owner(old_user.get()); + + return rgw_chown_bucket_and_objects(driver, bucket.get(), user.get(), marker, err_msg, dpp, y); +} + +int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg) +{ + bucket = op_state.get_bucket()->clone(); + + bucket->get_info().quota = op_state.quota; + int r = bucket->put_info(dpp, false, real_time()); + if (r < 0) { + set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r)); + return r; + } + return r; +} + +int RGWBucket::remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg) +{ + std::string object_name = op_state.get_object_name(); + + rgw_obj_key key(object_name); + + bucket = op_state.get_bucket()->clone(); + + int ret = rgw_remove_object(dpp, driver, bucket.get(), key); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove object" + cpp_strerror(-ret)); + return ret; + } + + return 0; +} + +static void dump_bucket_index(const vector& objs, Formatter *f) +{ + for (auto iter = objs.begin(); iter != objs.end(); ++iter) { + f->dump_string("object", iter->key.name); + } +} + +static void dump_bucket_usage(map& stats, Formatter *formatter) +{ + map::iterator iter; + + formatter->open_object_section("usage"); + for (iter = stats.begin(); iter != stats.end(); ++iter) { + RGWStorageStats& s = iter->second; + formatter->open_object_section(to_string(iter->first)); + s.dump(formatter); + formatter->close_section(); + } + formatter->close_section(); +} + +static void dump_index_check(map existing_stats, + map calculated_stats, + Formatter *formatter) +{ + formatter->open_object_section("check_result"); + formatter->open_object_section("existing_header"); + dump_bucket_usage(existing_stats, formatter); + formatter->close_section(); + formatter->open_object_section("calculated_header"); + dump_bucket_usage(calculated_stats, formatter); + formatter->close_section(); + formatter->close_section(); +} + 
+int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp, + std::string *err_msg) +{ + const bool fix_index = op_state.will_fix_index(); + + bucket = op_state.get_bucket()->clone(); + + rgw::sal::Bucket::ListParams params; + params.list_versions = true; + params.ns = RGW_OBJ_NS_MULTIPART; + + std::map meta_objs; + std::map all_objs; + bool is_truncated; + do { + rgw::sal::Bucket::ListResults results; + int r = bucket->list(dpp, params, listing_max_entries, results, null_yield); + if (r < 0) { + set_err_msg(err_msg, "failed to list objects in bucket=" + bucket->get_name() + + " err=" + cpp_strerror(-r)); + + return r; + } + is_truncated = results.is_truncated; + + for (const auto& o : results.objs) { + rgw_obj_index_key key = o.key; + rgw_obj obj(bucket->get_key(), key); + std::string oid = obj.get_oid(); + + int pos = oid.find_last_of('.'); + if (pos < 0) { + /* obj has no suffix */ + all_objs[key] = oid; + } else { + /* obj has suffix */ + std::string name = oid.substr(0, pos); + std::string suffix = oid.substr(pos + 1); + + if (suffix.compare("meta") == 0) { + meta_objs[name] = true; + } else { + all_objs[key] = name; + } + } + } + } while (is_truncated); + + std::list objs_to_unlink; + Formatter *f = flusher.get_formatter(); + + f->open_array_section("invalid_multipart_entries"); + + for (const auto& o : all_objs) { + const std::string& name = o.second; + if (meta_objs.find(name) == meta_objs.end()) { + objs_to_unlink.push_back(o.first); + } + + if (objs_to_unlink.size() > listing_max_entries) { + if (fix_index) { + // note: under rados this removes directly from rados index objects + int r = bucket->remove_objs_from_index(dpp, objs_to_unlink); + if (r < 0) { + set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " + + cpp_strerror(-r)); + return r; + } + } + + dump_mulipart_index_results(objs_to_unlink, f); + flusher.flush(); + objs_to_unlink.clear(); + } + } + + 
if (fix_index) { + // note: under rados this removes directly from rados index objects + int r = bucket->remove_objs_from_index(dpp, objs_to_unlink); + if (r < 0) { + set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " + + cpp_strerror(-r)); + + return r; + } + } + + dump_mulipart_index_results(objs_to_unlink, f); + f->close_section(); + flusher.flush(); + + return 0; +} + +int RGWBucket::check_object_index(const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y, + std::string *err_msg) +{ + + bool fix_index = op_state.will_fix_index(); + + if (!fix_index) { + set_err_msg(err_msg, "check-objects flag requires fix index enabled"); + return -EINVAL; + } + + // use a quicker/shorter tag timeout during this process + bucket->set_tag_timeout(dpp, BUCKET_TAG_QUICK_TIMEOUT); + + rgw::sal::Bucket::ListResults results; + results.is_truncated = true; + + Formatter *formatter = flusher.get_formatter(); + formatter->open_object_section("objects"); + + while (results.is_truncated) { + rgw::sal::Bucket::ListParams params; + params.marker = results.next_marker; + params.force_check_filter = rgw_bucket_object_check_filter; + + int r = bucket->list(dpp, params, listing_max_entries, results, y); + + if (r == -ENOENT) { + break; + } else if (r < 0) { + set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r)); + } + + dump_bucket_index(results.objs, formatter); + flusher.flush(); + } + + formatter->close_section(); + + // restore normal tag timeout for bucket + bucket->set_tag_timeout(dpp, 0); + + return 0; +} + +/** + * Loops over all olh entries in a bucket shard and finds ones with + * exists=false and pending_removal=true. If the pending log is empty on + * these entries, they were left behind after the last remaining version of + * an object was deleted or after an incomplete upload. 
This was known to + * happen historically due to concurrency conflicts among requests referencing + * the same object key. If op_state.fix_index is true, we continue where the + * request left off by calling RGWRados::clear_olh. If the pending log is not + * empty, we attempt to apply it. + */ +static int check_index_olh(rgw::sal::RadosStore* const rados_store, + rgw::sal::Bucket* const bucket, + const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const int shard, + uint64_t* const count_out, + optional_yield y) +{ + string marker = BI_OLH_ENTRY_NS_START; + bool is_truncated = true; + list entries; + + RGWObjectCtx obj_ctx(rados_store); + RGWRados* store = rados_store->getRados(); + RGWRados::BucketShard bs(store); + + int ret = bs.init(dpp, bucket->get_info(), bucket->get_info().layout.current_index, shard); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR bs.init(bucket=" << bucket << "): " << cpp_strerror(-ret) << dendl; + return ret; + } + + *count_out = 0; + do { + entries.clear(); + ret = store->bi_list(bs, "", marker, -1, &entries, &is_truncated); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR bi_list(): " << cpp_strerror(-ret) << dendl; + break; + } + list::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_cls_bi_entry& entry = *iter; + marker = entry.idx; + if (entry.type != BIIndexType::OLH) { + is_truncated = false; + break; + } + rgw_bucket_olh_entry olh_entry; + auto iiter = entry.data.cbegin(); + try { + decode(olh_entry, iiter); + } catch (buffer::error& err) { + ldpp_dout(dpp, -1) << "ERROR failed to decode olh entry for key: " << entry.idx << dendl; + continue; + } + if (olh_entry.exists || !olh_entry.pending_removal) { + continue; + } + if (op_state.will_fix_index()) { + rgw_obj obj(bucket->get_key(), olh_entry.key.name); + if (olh_entry.pending_log.empty()) { + ret = store->clear_olh(dpp, obj_ctx, obj, bucket->get_info(), olh_entry.tag, olh_entry.epoch, y); + if 
(ret < 0) { + ldpp_dout(dpp, -1) << "ERROR failed to clear olh for: " << olh_entry.key.name << " clear_olh(): " << cpp_strerror(-ret) << dendl; + continue; + } + } else { + std::unique_ptr object = bucket->get_object({olh_entry.key.name}); + RGWObjState *state; + ret = object->get_obj_state(dpp, &state, y, false); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR failed to get state for: " << olh_entry.key.name << " get_obj_state(): " << cpp_strerror(-ret) << dendl; + continue; + } + ret = store->update_olh(dpp, obj_ctx, state, bucket->get_info(), obj); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR failed to update olh for: " << olh_entry.key.name << " update_olh(): " << cpp_strerror(-ret) << dendl; + continue; + } + } + } + if (op_state.dump_keys) { + flusher.get_formatter()->dump_string("", olh_entry.key.name); + if (flusher.get_formatter()->get_len() > FORMATTER_LEN_FLUSH_THRESHOLD) { + flusher.flush(); + } + } + *count_out += 1; + } + } while (is_truncated); + flusher.flush(); + return 0; +} + + +/** + * Spawns separate coroutines to check each bucket shard for leftover + * olh entries (and remove them if op_state.fix_index is true). + */ +int RGWBucket::check_index_olh(rgw::sal::RadosStore* const rados_store, + const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + const RGWBucketInfo& bucket_info = get_bucket_info(); + if ((bucket_info.versioning_status() & BUCKET_VERSIONED) == 0) { + ldpp_dout(dpp, 0) << "WARNING: this command is only applicable to versioned buckets" << dendl; + return 0; + } + + Formatter* formatter = flusher.get_formatter(); + if (op_state.dump_keys) { + formatter->open_array_section(""); + } + + const int max_shards = rgw::num_shards(bucket_info.layout.current_index); + std::string verb = op_state.will_fix_index() ? 
"removed" : "found"; + uint64_t count_out = 0; + + boost::asio::io_context context; + int next_shard = 0; + + const int max_aio = std::max(1, op_state.get_max_aio()); + + for (int i=0; i= max_shards) { + return; + } + optional_yield y(context, yield); + uint64_t shard_count; + int r = ::check_index_olh(rados_store, &*bucket, dpp, op_state, flusher, shard, &shard_count, y); + if (r < 0) { + ldpp_dout(dpp, -1) << "NOTICE: error processing shard " << shard << + " check_index_olh(): " << r << dendl; + } + count_out += shard_count; + if (!op_state.hide_progress) { + ldpp_dout(dpp, 1) << "NOTICE: finished shard " << shard << " (" << shard_count << + " entries " << verb << ")" << dendl; + } + } + }); + } + try { + context.run(); + } catch (const std::system_error& e) { + return -e.code().value(); + } + if (!op_state.hide_progress) { + ldpp_dout(dpp, 1) << "NOTICE: finished all shards (" << count_out << + " entries " << verb << ")" << dendl; + } + if (op_state.dump_keys) { + formatter->close_section(); + flusher.flush(); + } + return 0; +} + +/** + * Indicates whether a versioned bucket instance entry is listable in the + * index. It does this by looping over all plain entries with prefix equal to + * the key name, and checking whether any have an instance ID matching the one + * on the specified key. The existence of an instance entry without a matching + * plain entry indicates that the object was uploaded successfully, but the + * request exited prior to linking the object into the index (via the creation + * of a plain entry). 
+ */ +static int is_versioned_instance_listable(const DoutPrefixProvider *dpp, + RGWRados::BucketShard& bs, + const cls_rgw_obj_key& key, + bool& listable, + optional_yield y) +{ + const std::string empty_delim; + cls_rgw_obj_key marker; + rgw_cls_list_ret result; + listable = false; + + do { + librados::ObjectReadOperation op; + cls_rgw_bucket_list_op(op, marker, key.name, empty_delim, 1000, + true, &result); + bufferlist ibl; + int r = bs.bucket_obj.operate(dpp, &op, &ibl, y); + if (r < 0) { + return r; + } + + for (auto const& entry : result.dir.m) { + if (entry.second.key == key) { + listable = true; + return 0; + } + marker = entry.second.key; + } + } while (result.is_truncated); + return 0; +} + +/** + * Loops over all instance entries in a bucket shard and finds ones with + * versioned_epoch=0 and an mtime that is earlier than op_state.min_age + * relative to the current time. These entries represent objects that were + * uploaded successfully but were not successfully linked into the object + * index. As an extra precaution, we also verify that these entries are indeed + * non listable (have no corresponding plain entry in the index). We can assume + * that clients received an error response for the associated upload requests + * since the bucket index linking transaction did not complete. Therefore, if + * op_state.fix_index is true, we remove the object that is associated with the + * instance entry. 
+ */ +static int check_index_unlinked(rgw::sal::RadosStore* const rados_store, + rgw::sal::Bucket* const bucket, + const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const int shard, + uint64_t* const count_out, + optional_yield y) +{ + string marker = BI_INSTANCE_ENTRY_NS_START; + bool is_truncated = true; + list entries; + + RGWObjectCtx obj_ctx(rados_store); + RGWRados* store = rados_store->getRados(); + RGWRados::BucketShard bs(store); + + int ret = bs.init(dpp, bucket->get_info(), bucket->get_info().layout.current_index, shard); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR bs.init(bucket=" << bucket << "): " << cpp_strerror(-ret) << dendl; + return ret; + } + + ceph::real_clock::time_point now = ceph::real_clock::now(); + ceph::real_clock::time_point not_after = now - op_state.min_age; + + *count_out = 0; + do { + entries.clear(); + ret = store->bi_list(bs, "", marker, -1, &entries, &is_truncated); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR bi_list(): " << cpp_strerror(-ret) << dendl; + break; + } + list::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_cls_bi_entry& entry = *iter; + marker = entry.idx; + if (entry.type != BIIndexType::Instance) { + is_truncated = false; + break; + } + rgw_bucket_dir_entry dir_entry; + auto iiter = entry.data.cbegin(); + try { + decode(dir_entry, iiter); + } catch (buffer::error& err) { + ldpp_dout(dpp, -1) << "ERROR failed to decode instance entry for key: " << + entry.idx << dendl; + continue; + } + if (dir_entry.versioned_epoch != 0 || dir_entry.meta.mtime > not_after) { + continue; + } + bool listable; + ret = is_versioned_instance_listable(dpp, bs, dir_entry.key, listable, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR is_versioned_instance_listable(key='" << + dir_entry.key << "'): " << cpp_strerror(-ret) << dendl; + continue; + } + if (listable) { + continue; + } + if (op_state.will_fix_index()) { + rgw_obj_key 
key(dir_entry.key.name, dir_entry.key.instance); + ret = rgw_remove_object(dpp, rados_store, bucket, key); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR rgw_remove_obj(key='" << + dir_entry.key << "'): " << cpp_strerror(-ret) << dendl; + continue; + } + } + if (op_state.dump_keys) { + Formatter* const formatter = flusher.get_formatter(); + formatter->open_object_section("object_instance"); + formatter->dump_string("name", dir_entry.key.name); + formatter->dump_string("instance", dir_entry.key.instance); + formatter->close_section(); + if (formatter->get_len() > FORMATTER_LEN_FLUSH_THRESHOLD) { + flusher.flush(); + } + } + *count_out += 1; + } + } while (is_truncated); + flusher.flush(); + return 0; +} + +/** + * Spawns separate coroutines to check each bucket shard for unlinked + * instance entries (and remove them if op_state.fix_index is true). + */ +int RGWBucket::check_index_unlinked(rgw::sal::RadosStore* const rados_store, + const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + const RGWBucketInfo& bucket_info = get_bucket_info(); + if ((bucket_info.versioning_status() & BUCKET_VERSIONED) == 0) { + ldpp_dout(dpp, 0) << "WARNING: this command is only applicable to versioned buckets" << dendl; + return 0; + } + + Formatter* formatter = flusher.get_formatter(); + if (op_state.dump_keys) { + formatter->open_array_section(""); + } + + const int max_shards = rgw::num_shards(bucket_info.layout.current_index); + std::string verb = op_state.will_fix_index() ? 
"removed" : "found"; + uint64_t count_out = 0; + + int max_aio = std::max(1, op_state.get_max_aio()); + int next_shard = 0; + boost::asio::io_context context; + for (int i=0; i= max_shards) { + return; + } + uint64_t shard_count; + optional_yield y {context, yield}; + int r = ::check_index_unlinked(rados_store, &*bucket, dpp, op_state, flusher, shard, &shard_count, y); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: error processing shard " << shard << + " check_index_unlinked(): " << r << dendl; + } + count_out += shard_count; + if (!op_state.hide_progress) { + ldpp_dout(dpp, 1) << "NOTICE: finished shard " << shard << " (" << shard_count << + " entries " << verb << ")" << dendl; + } + } + }); + } + try { + context.run(); + } catch (const std::system_error& e) { + return -e.code().value(); + } + + if (!op_state.hide_progress) { + ldpp_dout(dpp, 1) << "NOTICE: finished all shards (" << count_out << + " entries " << verb << ")" << dendl; + } + if (op_state.dump_keys) { + formatter->close_section(); + flusher.flush(); + } + return 0; +} + +int RGWBucket::check_index(const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + map& existing_stats, + map& calculated_stats, + std::string *err_msg) +{ + bool fix_index = op_state.will_fix_index(); + + int r = bucket->check_index(dpp, existing_stats, calculated_stats); + if (r < 0) { + set_err_msg(err_msg, "failed to check index error=" + cpp_strerror(-r)); + return r; + } + + if (fix_index) { + r = bucket->rebuild_index(dpp); + if (r < 0) { + set_err_msg(err_msg, "failed to rebuild index err=" + cpp_strerror(-r)); + return r; + } + } + + return 0; +} + +int RGWBucket::sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg) +{ + if (!driver->is_meta_master()) { + set_err_msg(err_msg, "ERROR: failed to update bucket sync: only allowed on meta master zone"); + return -EINVAL; + } + bool sync = op_state.will_sync_bucket(); + if (sync) { + bucket->get_info().flags &= 
~BUCKET_DATASYNC_DISABLED; + } else { + bucket->get_info().flags |= BUCKET_DATASYNC_DISABLED; + } + + // when writing this metadata, RGWSI_BucketIndex_RADOS::handle_overwrite() + // will write the corresponding datalog and bilog entries + int r = bucket->put_info(dpp, false, real_time()); + if (r < 0) { + set_err_msg(err_msg, "ERROR: failed writing bucket instance info:" + cpp_strerror(-r)); + return r; + } + + return 0; +} + + +int RGWBucket::policy_bl_to_stream(bufferlist& bl, ostream& o) +{ + RGWAccessControlPolicy_S3 policy(g_ceph_context); + int ret = decode_bl(bl, policy); + if (ret < 0) { + ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl; + } + policy.to_xml(o); + return 0; +} + +int rgw_object_get_attr(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, rgw::sal::Object* obj, + const char* attr_name, bufferlist& out_bl, optional_yield y) +{ + std::unique_ptr rop = obj->get_read_op(); + + return rop->get_attr(dpp, attr_name, out_bl, y); +} + +int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp) +{ + int ret; + std::string object_name = op_state.get_object_name(); + + bucket = op_state.get_bucket()->clone(); + + if (!object_name.empty()) { + bufferlist bl; + std::unique_ptr obj = bucket->get_object(rgw_obj_key(object_name)); + + ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_ACL, bl, y); + if (ret < 0){ + return ret; + } + + ret = decode_bl(bl, policy); + if (ret < 0) { + ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl; + } + return ret; + } + + map::iterator aiter = bucket->get_attrs().find(RGW_ATTR_ACL); + if (aiter == bucket->get_attrs().end()) { + return -ENOENT; + } + + ret = decode_bl(aiter->second, policy); + if (ret < 0) { + ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl; + } + + return ret; +} + + +int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, 
RGWBucketAdminOpState& op_state, + RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp) +{ + RGWBucket bucket; + + int ret = bucket.init(driver, op_state, null_yield, dpp); + if (ret < 0) + return ret; + + ret = bucket.get_policy(op_state, policy, null_yield, dpp); + if (ret < 0) + return ret; + + return 0; +} + +/* Wrappers to facilitate RESTful interface */ + + +int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp) +{ + RGWAccessControlPolicy policy(driver->ctx()); + + int ret = get_policy(driver, op_state, policy, dpp); + if (ret < 0) + return ret; + + Formatter *formatter = flusher.get_formatter(); + + flusher.start(0); + + formatter->open_object_section("policy"); + policy.dump(formatter); + formatter->close_section(); + + flusher.flush(); + + return 0; +} + +int RGWBucketAdminOp::dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + ostream& os, const DoutPrefixProvider *dpp) +{ + RGWAccessControlPolicy_S3 policy(driver->ctx()); + + int ret = get_policy(driver, op_state, policy, dpp); + if (ret < 0) + return ret; + + policy.to_xml(os); + + return 0; +} + +int RGWBucketAdminOp::unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp) +{ + RGWBucket bucket; + + int ret = bucket.init(driver, op_state, null_yield, dpp); + if (ret < 0) + return ret; + + return static_cast(driver)->ctl()->bucket->unlink_bucket(op_state.get_user_id(), op_state.get_bucket()->get_info().bucket, null_yield, dpp, true); +} + +int RGWBucketAdminOp::link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err) +{ + if (!op_state.is_user_op()) { + set_err_msg(err, "empty user id"); + return -EINVAL; + } + + RGWBucket bucket; + int ret = bucket.init(driver, op_state, null_yield, dpp, err); + if (ret < 0) + return ret; + + string bucket_id = op_state.get_bucket_id(); + 
std::string display_name = op_state.get_user_display_name(); + std::unique_ptr loc_bucket; + std::unique_ptr old_bucket; + + loc_bucket = op_state.get_bucket()->clone(); + + if (!bucket_id.empty() && bucket_id != loc_bucket->get_bucket_id()) { + set_err_msg(err, + "specified bucket id does not match " + loc_bucket->get_bucket_id()); + return -EINVAL; + } + + old_bucket = loc_bucket->clone(); + + loc_bucket->get_key().tenant = op_state.get_user_id().tenant; + + if (!op_state.new_bucket_name.empty()) { + auto pos = op_state.new_bucket_name.find('/'); + if (pos != string::npos) { + loc_bucket->get_key().tenant = op_state.new_bucket_name.substr(0, pos); + loc_bucket->get_key().name = op_state.new_bucket_name.substr(pos + 1); + } else { + loc_bucket->get_key().name = op_state.new_bucket_name; + } + } + + RGWObjVersionTracker objv_tracker; + RGWObjVersionTracker old_version = loc_bucket->get_info().objv_tracker; + + map::iterator aiter = loc_bucket->get_attrs().find(RGW_ATTR_ACL); + if (aiter == loc_bucket->get_attrs().end()) { + // should never happen; only pre-argonaut buckets lacked this. + ldpp_dout(dpp, 0) << "WARNING: can't bucket link because no acl on bucket=" << old_bucket << dendl; + set_err_msg(err, + "While crossing the Anavros you have displeased the goddess Hera." + " You must sacrifice your ancient bucket " + loc_bucket->get_bucket_id()); + return -EINVAL; + } + bufferlist& aclbl = aiter->second; + RGWAccessControlPolicy policy; + ACLOwner owner; + try { + auto iter = aclbl.cbegin(); + decode(policy, iter); + owner = policy.get_owner(); + } catch (buffer::error& e) { + set_err_msg(err, "couldn't decode policy"); + return -EIO; + } + + int r = static_cast(driver)->ctl()->bucket->unlink_bucket(owner.get_id(), old_bucket->get_info().bucket, null_yield, dpp, false); + if (r < 0) { + set_err_msg(err, "could not unlink policy from user " + owner.get_id().to_str()); + return r; + } + + // now update the user for the bucket... 
+ if (display_name.empty()) { + ldpp_dout(dpp, 0) << "WARNING: user " << op_state.get_user_id() << " has no display name set" << dendl; + } + + RGWAccessControlPolicy policy_instance; + policy_instance.create_default(op_state.get_user_id(), display_name); + owner = policy_instance.get_owner(); + + aclbl.clear(); + policy_instance.encode(aclbl); + + bool exclusive = false; + loc_bucket->get_info().owner = op_state.get_user_id(); + if (*loc_bucket != *old_bucket) { + loc_bucket->get_info().bucket = loc_bucket->get_key(); + loc_bucket->get_info().objv_tracker.version_for_read()->ver = 0; + exclusive = true; + } + + r = loc_bucket->put_info(dpp, exclusive, ceph::real_time()); + if (r < 0) { + set_err_msg(err, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r)); + return r; + } + + /* link to user */ + RGWBucketEntryPoint ep; + ep.bucket = loc_bucket->get_info().bucket; + ep.owner = op_state.get_user_id(); + ep.creation_time = loc_bucket->get_info().creation_time; + ep.linked = true; + rgw::sal::Attrs ep_attrs; + rgw_ep_info ep_data{ep, ep_attrs}; + + r = static_cast(driver)->ctl()->bucket->link_bucket(op_state.get_user_id(), loc_bucket->get_info().bucket, loc_bucket->get_info().creation_time, null_yield, dpp, true, &ep_data); + if (r < 0) { + set_err_msg(err, "failed to relink bucket"); + return r; + } + + if (*loc_bucket != *old_bucket) { + // like RGWRados::delete_bucket -- excepting no bucket_index work. 
+ r = static_cast(driver)->ctl()->bucket->remove_bucket_entrypoint_info( + old_bucket->get_key(), null_yield, dpp, + RGWBucketCtl::Bucket::RemoveParams() + .set_objv_tracker(&ep_data.ep_objv)); + if (r < 0) { + set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name()); + return r; + } + r = static_cast(driver)->ctl()->bucket->remove_bucket_instance_info( + old_bucket->get_key(), old_bucket->get_info(), + null_yield, dpp, + RGWBucketCtl::BucketInstance::RemoveParams() + .set_objv_tracker(&ep_data.ep_objv)); + if (r < 0) { + set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name()); + return r; + } + } + + return 0; +} + +int RGWBucketAdminOp::chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const string& marker, const DoutPrefixProvider *dpp, string *err) +{ + RGWBucket bucket; + + int ret = bucket.init(driver, op_state, null_yield, dpp, err); + if (ret < 0) + return ret; + + return bucket.chown(op_state, marker, null_yield, dpp, err); + +} + +int RGWBucketAdminOp::check_index_olh(rgw::sal::RadosStore* store, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp) +{ + RGWBucket bucket; + int ret = bucket.init(store, op_state, null_yield, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "bucket.init(): " << ret << dendl; + return ret; + } + flusher.start(0); + ret = bucket.check_index_olh(store, dpp, op_state, flusher); + if (ret < 0) { + ldpp_dout(dpp, -1) << "check_index_olh(): " << ret << dendl; + return ret; + } + flusher.flush(); + return 0; +} + +int RGWBucketAdminOp::check_index_unlinked(rgw::sal::RadosStore* store, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp) +{ + flusher.start(0); + RGWBucket bucket; + int ret = bucket.init(store, op_state, null_yield, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "bucket.init(): " << ret << dendl; + return ret; + } + 
ret = bucket.check_index_unlinked(store, dpp, op_state, flusher); + if (ret < 0) { + ldpp_dout(dpp, -1) << "check_index_unlinked(): " << ret << dendl; + return ret; + } + flusher.flush(); + return 0; +} + +int RGWBucketAdminOp::check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp) +{ + int ret; + map existing_stats; + map calculated_stats; + + + RGWBucket bucket; + + ret = bucket.init(driver, op_state, null_yield, dpp); + if (ret < 0) + return ret; + + Formatter *formatter = flusher.get_formatter(); + flusher.start(0); + formatter->open_object_section("bucket_check"); + + ret = bucket.check_bad_index_multipart(op_state, flusher, dpp); + if (ret < 0) + return ret; + + if (op_state.will_check_objects()) { + ret = bucket.check_object_index(dpp, op_state, flusher, y); + if (ret < 0) + return ret; + } + + ret = bucket.check_index(dpp, op_state, existing_stats, calculated_stats); + if (ret < 0) + return ret; + + dump_index_check(existing_stats, calculated_stats, formatter); + + formatter->close_section(); + flusher.flush(); + + return 0; +} + +int RGWBucketAdminOp::remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + optional_yield y, const DoutPrefixProvider *dpp, + bool bypass_gc, bool keep_index_consistent) +{ + std::unique_ptr bucket; + std::unique_ptr user = driver->get_user(op_state.get_user_id()); + + int ret = driver->get_bucket(dpp, user.get(), user->get_tenant(), op_state.get_bucket_name(), + &bucket, y); + if (ret < 0) + return ret; + + if (bypass_gc) + ret = bucket->remove_bucket_bypass_gc(op_state.get_max_aio(), keep_index_consistent, y, dpp); + else + ret = bucket->remove_bucket(dpp, op_state.will_delete_children(), + false, nullptr, y); + + return ret; +} + +int RGWBucketAdminOp::remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp) +{ + RGWBucket bucket; + + int ret = bucket.init(driver, 
op_state, null_yield, dpp); + if (ret < 0) + return ret; + + return bucket.remove_object(dpp, op_state); +} + +int RGWBucketAdminOp::sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err_msg) +{ + RGWBucket bucket; + int ret = bucket.init(driver, op_state, null_yield, dpp, err_msg); + if (ret < 0) + { + return ret; + } + return bucket.sync(op_state, dpp, err_msg); +} + +static int bucket_stats(rgw::sal::Driver* driver, + const std::string& tenant_name, + const std::string& bucket_name, + Formatter *formatter, + const DoutPrefixProvider *dpp) +{ + std::unique_ptr bucket; + map stats; + + int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield); + if (ret < 0) { + return ret; + } + + const RGWBucketInfo& bucket_info = bucket->get_info(); + + const auto& index = bucket->get_info().get_current_index(); + if (is_layout_indexless(index)) { + cerr << "error, indexless buckets do not maintain stats; bucket=" << + bucket->get_name() << std::endl; + return -EINVAL; + } + + std::string bucket_ver, master_ver; + std::string max_marker; + ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, &max_marker); + if (ret < 0) { + cerr << "error getting bucket stats bucket=" << bucket->get_name() << " ret=" << ret << std::endl; + return ret; + } + + utime_t ut(bucket->get_modification_time()); + utime_t ctime_ut(bucket->get_creation_time()); + + formatter->open_object_section("stats"); + formatter->dump_string("bucket", bucket->get_name()); + formatter->dump_int("num_shards", + bucket->get_info().layout.current_index.layout.normal.num_shards); + formatter->dump_string("tenant", bucket->get_tenant()); + formatter->dump_string("zonegroup", bucket->get_info().zonegroup); + formatter->dump_string("placement_rule", bucket->get_info().placement_rule.to_str()); + ::encode_json("explicit_placement", bucket->get_key().explicit_placement, formatter); + 
formatter->dump_string("id", bucket->get_bucket_id()); + formatter->dump_string("marker", bucket->get_marker()); + formatter->dump_stream("index_type") << bucket->get_info().layout.current_index.layout.type; + formatter->dump_bool("versioned", bucket_info.versioned()); + formatter->dump_bool("versioning_enabled", bucket_info.versioning_enabled()); + formatter->dump_bool("object_lock_enabled", bucket_info.obj_lock_enabled()); + formatter->dump_bool("mfa_enabled", bucket_info.mfa_enabled()); + ::encode_json("owner", bucket->get_info().owner, formatter); + formatter->dump_string("ver", bucket_ver); + formatter->dump_string("master_ver", master_ver); + ut.gmtime(formatter->dump_stream("mtime")); + ctime_ut.gmtime(formatter->dump_stream("creation_time")); + formatter->dump_string("max_marker", max_marker); + dump_bucket_usage(stats, formatter); + encode_json("bucket_quota", bucket->get_info().quota, formatter); + + // bucket tags + auto iter = bucket->get_attrs().find(RGW_ATTR_TAGS); + if (iter != bucket->get_attrs().end()) { + RGWObjTagSet_S3 tagset; + bufferlist::const_iterator piter{&iter->second}; + try { + tagset.decode(piter); + tagset.dump(formatter); + } catch (buffer::error& err) { + cerr << "ERROR: caught buffer:error, couldn't decode TagSet" << std::endl; + } + } + + // TODO: bucket CORS + // TODO: bucket LC + formatter->close_section(); + + return 0; +} + +int RGWBucketAdminOp::limit_check(rgw::sal::Driver* driver, + RGWBucketAdminOpState& op_state, + const std::list& user_ids, + RGWFormatterFlusher& flusher, optional_yield y, + const DoutPrefixProvider *dpp, + bool warnings_only) +{ + int ret = 0; + const size_t max_entries = + driver->ctx()->_conf->rgw_list_buckets_max_chunk; + + const size_t safe_max_objs_per_shard = + driver->ctx()->_conf->rgw_safe_max_objects_per_shard; + + uint16_t shard_warn_pct = + driver->ctx()->_conf->rgw_shard_warning_threshold; + if (shard_warn_pct > 100) + shard_warn_pct = 90; + + Formatter *formatter = flusher.get_formatter(); 
+ flusher.start(0); + + formatter->open_array_section("users"); + + for (const auto& user_id : user_ids) { + + formatter->open_object_section("user"); + formatter->dump_string("user_id", user_id); + formatter->open_array_section("buckets"); + + string marker; + rgw::sal::BucketList buckets; + do { + std::unique_ptr user = driver->get_user(rgw_user(user_id)); + + ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y); + + if (ret < 0) + return ret; + + map>& m_buckets = buckets.get_buckets(); + + for (const auto& iter : m_buckets) { + auto& bucket = iter.second; + uint64_t num_objects = 0; + + marker = bucket->get_name(); /* Casey's location for marker update, + * as we may now not reach the end of + * the loop body */ + + ret = bucket->load_bucket(dpp, y); + if (ret < 0) + continue; + + const auto& index = bucket->get_info().get_current_index(); + if (is_layout_indexless(index)) { + continue; // indexless buckets don't have stats + } + + /* need stats for num_entries */ + string bucket_ver, master_ver; + std::map stats; + ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, nullptr); + + if (ret < 0) + continue; + + for (const auto& s : stats) { + num_objects += s.second.num_objects; + } + + const uint32_t num_shards = rgw::num_shards(index.layout.normal); + uint64_t objs_per_shard = + (num_shards) ? 
num_objects/num_shards : num_objects; + { + bool warn; + stringstream ss; + uint64_t fill_pct = objs_per_shard * 100 / safe_max_objs_per_shard; + if (fill_pct > 100) { + ss << "OVER " << fill_pct << "%"; + warn = true; + } else if (fill_pct >= shard_warn_pct) { + ss << "WARN " << fill_pct << "%"; + warn = true; + } else { + ss << "OK"; + warn = false; + } + + if (warn || !warnings_only) { + formatter->open_object_section("bucket"); + formatter->dump_string("bucket", bucket->get_name()); + formatter->dump_string("tenant", bucket->get_tenant()); + formatter->dump_int("num_objects", num_objects); + formatter->dump_int("num_shards", num_shards); + formatter->dump_int("objects_per_shard", objs_per_shard); + formatter->dump_string("fill_status", ss.str()); + formatter->close_section(); + } + } + } + formatter->flush(cout); + } while (buckets.is_truncated()); /* foreach: bucket */ + + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + + } /* foreach: user_id */ + + formatter->close_section(); + formatter->flush(cout); + + return ret; +} /* RGWBucketAdminOp::limit_check */ + +int RGWBucketAdminOp::info(rgw::sal::Driver* driver, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWBucket bucket; + int ret = 0; + const std::string& bucket_name = op_state.get_bucket_name(); + if (!bucket_name.empty()) { + ret = bucket.init(driver, op_state, y, dpp); + if (-ENOENT == ret) + return -ERR_NO_SUCH_BUCKET; + else if (ret < 0) + return ret; + } + + Formatter *formatter = flusher.get_formatter(); + flusher.start(0); + + CephContext *cct = driver->ctx(); + + const size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk; + + const bool show_stats = op_state.will_fetch_stats(); + const rgw_user& user_id = op_state.get_user_id(); + if (op_state.is_user_op()) { + formatter->open_array_section("buckets"); + + rgw::sal::BucketList buckets; + std::unique_ptr user = 
driver->get_user(op_state.get_user_id()); + std::string marker; + const std::string empty_end_marker; + constexpr bool no_need_stats = false; // set need_stats to false + + do { + ret = user->list_buckets(dpp, marker, empty_end_marker, max_entries, + no_need_stats, buckets, y); + if (ret < 0) { + return ret; + } + + const std::string* marker_cursor = nullptr; + map>& m = buckets.get_buckets(); + + for (const auto& i : m) { + const std::string& obj_name = i.first; + if (!bucket_name.empty() && bucket_name != obj_name) { + continue; + } + + if (show_stats) { + bucket_stats(driver, user_id.tenant, obj_name, formatter, dpp); + } else { + formatter->dump_string("bucket", obj_name); + } + + marker_cursor = &obj_name; + } // for loop + if (marker_cursor) { + marker = *marker_cursor; + } + + flusher.flush(); + } while (buckets.is_truncated()); + + formatter->close_section(); + } else if (!bucket_name.empty()) { + ret = bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp); + if (ret < 0) { + return ret; + } + } else { + void *handle = nullptr; + bool truncated = true; + + formatter->open_array_section("buckets"); + ret = driver->meta_list_keys_init(dpp, "bucket", string(), &handle); + while (ret == 0 && truncated) { + std::list buckets; + constexpr int max_keys = 1000; + ret = driver->meta_list_keys_next(dpp, handle, max_keys, buckets, + &truncated); + for (auto& bucket_name : buckets) { + if (show_stats) { + bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp); + } else { + formatter->dump_string("bucket", bucket_name); + } + } + } + driver->meta_list_keys_complete(handle); + + formatter->close_section(); + } + + flusher.flush(); + + return 0; +} + +int RGWBucketAdminOp::set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp) +{ + RGWBucket bucket; + + int ret = bucket.init(driver, op_state, null_yield, dpp); + if (ret < 0) + return ret; + return bucket.set_quota(op_state, dpp); +} + +inline auto 
split_tenant(const std::string& bucket_name){ + auto p = bucket_name.find('/'); + if(p != std::string::npos) { + return std::make_pair(bucket_name.substr(0,p), bucket_name.substr(p+1)); + } + return std::make_pair(std::string(), bucket_name); +} + +using bucket_instance_ls = std::vector; +void get_stale_instances(rgw::sal::Driver* driver, const std::string& bucket_name, + const vector& lst, + bucket_instance_ls& stale_instances, + const DoutPrefixProvider *dpp) +{ + + bucket_instance_ls other_instances; +// first iterate over the entries, and pick up the done buckets; these +// are guaranteed to be stale + for (const auto& bucket_instance : lst){ + RGWBucketInfo binfo; + std::unique_ptr bucket; + rgw_bucket rbucket; + rgw_bucket_parse_bucket_key(driver->ctx(), bucket_instance, &rbucket, nullptr); + int r = driver->get_bucket(dpp, nullptr, rbucket, &bucket, null_yield); + if (r < 0){ + // this can only happen if someone deletes us right when we're processing + ldpp_dout(dpp, -1) << "Bucket instance is invalid: " << bucket_instance + << cpp_strerror(-r) << dendl; + continue; + } + binfo = bucket->get_info(); + if (binfo.reshard_status == cls_rgw_reshard_status::DONE) + stale_instances.emplace_back(std::move(binfo)); + else { + other_instances.emplace_back(std::move(binfo)); + } + } + + // Read the cur bucket info, if the bucket doesn't exist we can simply return + // all the instances + auto [tenant, bname] = split_tenant(bucket_name); + RGWBucketInfo cur_bucket_info; + std::unique_ptr cur_bucket; + int r = driver->get_bucket(dpp, nullptr, tenant, bname, &cur_bucket, null_yield); + if (r < 0) { + if (r == -ENOENT) { + // bucket doesn't exist, everything is stale then + stale_instances.insert(std::end(stale_instances), + std::make_move_iterator(other_instances.begin()), + std::make_move_iterator(other_instances.end())); + } else { + // all bets are off if we can't read the bucket, just return the sureshot stale instances + ldpp_dout(dpp, -1) << "error: reading bucket 
info for bucket: " + << bname << cpp_strerror(-r) << dendl; + } + return; + } + + // Don't process further in this round if bucket is resharding + cur_bucket_info = cur_bucket->get_info(); + if (cur_bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) + return; + + other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(), + [&cur_bucket_info](const RGWBucketInfo& b){ + return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id || + b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id); + }), + other_instances.end()); + + // check if there are still instances left + if (other_instances.empty()) { + return; + } + + // Now we have a bucket with instances where the reshard status is none, this + // usually happens when the reshard process couldn't complete, lockdown the + // bucket and walk through these instances to make sure no one else interferes + // with these + { + RGWBucketReshardLock reshard_lock(static_cast(driver), cur_bucket->get_info(), true); + r = reshard_lock.lock(dpp); + if (r < 0) { + // most likely bucket is under reshard, return the sureshot stale instances + ldpp_dout(dpp, 5) << __func__ + << "failed to take reshard lock; reshard underway likey" << dendl; + return; + } + auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} ); + // this should be fast enough that we may not need to renew locks and check + // exit status?, should we read the values of the instances again? 
+ stale_instances.insert(std::end(stale_instances), + std::make_move_iterator(other_instances.begin()), + std::make_move_iterator(other_instances.end())); + } + + return; +} + +static int process_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp, + std::function process_f) +{ + std::string marker; + void *handle; + Formatter *formatter = flusher.get_formatter(); + static constexpr auto default_max_keys = 1000; + + int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + bool truncated; + + formatter->open_array_section("keys"); + auto g = make_scope_guard([&driver, &handle, &formatter]() { + driver->meta_list_keys_complete(handle); + formatter->close_section(); // keys + formatter->flush(cout); + }); + + do { + list keys; + + ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + return ret; + } if (ret != -ENOENT) { + // partition the list of buckets by buckets as the listing is un sorted, + // since it would minimize the reads to bucket_info + std::unordered_map> bucket_instance_map; + for (auto &key: keys) { + auto pos = key.find(':'); + if(pos != std::string::npos) + bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key)); + } + for (const auto& kv: bucket_instance_map) { + bucket_instance_ls stale_lst; + get_stale_instances(driver, kv.first, kv.second, stale_lst, dpp); + process_f(stale_lst, formatter, driver); + } + } + } while (truncated); + + return 0; +} + +int RGWBucketAdminOp::list_stale_instances(rgw::sal::Driver* driver, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp) +{ + auto process_f = [](const bucket_instance_ls& lst, + 
Formatter *formatter, + rgw::sal::Driver*){ + for (const auto& binfo: lst) + formatter->dump_string("key", binfo.bucket.get_key()); + }; + return process_stale_instances(driver, op_state, flusher, dpp, process_f); +} + + +int RGWBucketAdminOp::clear_stale_instances(rgw::sal::Driver* driver, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp) +{ + auto process_f = [dpp](const bucket_instance_ls& lst, + Formatter *formatter, + rgw::sal::Driver* driver){ + for (const auto &binfo: lst) { + std::unique_ptr bucket; + driver->get_bucket(nullptr, binfo, &bucket); + int ret = bucket->purge_instance(dpp); + if (ret == 0){ + auto md_key = "bucket.instance:" + binfo.bucket.get_key(); + ret = driver->meta_remove(dpp, md_key, null_yield); + } + formatter->open_object_section("delete_status"); + formatter->dump_string("bucket_instance", binfo.bucket.get_key()); + formatter->dump_int("status", -ret); + formatter->close_section(); + } + }; + + return process_stale_instances(driver, op_state, flusher, dpp, process_f); +} + +static int fix_single_bucket_lc(rgw::sal::Driver* driver, + const std::string& tenant_name, + const std::string& bucket_name, + const DoutPrefixProvider *dpp) +{ + std::unique_ptr bucket; + int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield); + if (ret < 0) { + // TODO: Should we handle the case where the bucket could've been removed between + // listing and fetching? + return ret; + } + + return rgw::lc::fix_lc_shard_entry(dpp, driver, driver->get_rgwlc()->get_lc(), bucket.get()); +} + +static void format_lc_status(Formatter* formatter, + const std::string& tenant_name, + const std::string& bucket_name, + int status) +{ + formatter->open_object_section("bucket_entry"); + std::string entry = tenant_name.empty() ? 
bucket_name : tenant_name + "/" + bucket_name; + formatter->dump_string("bucket", entry); + formatter->dump_int("status", status); + formatter->close_section(); // bucket_entry +} + +static void process_single_lc_entry(rgw::sal::Driver* driver, + Formatter *formatter, + const std::string& tenant_name, + const std::string& bucket_name, + const DoutPrefixProvider *dpp) +{ + int ret = fix_single_bucket_lc(driver, tenant_name, bucket_name, dpp); + format_lc_status(formatter, tenant_name, bucket_name, -ret); +} + +int RGWBucketAdminOp::fix_lc_shards(rgw::sal::Driver* driver, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp) +{ + std::string marker; + void *handle; + Formatter *formatter = flusher.get_formatter(); + static constexpr auto default_max_keys = 1000; + + bool truncated; + if (const std::string& bucket_name = op_state.get_bucket_name(); + ! bucket_name.empty()) { + const rgw_user user_id = op_state.get_user_id(); + process_single_lc_entry(driver, formatter, user_id.tenant, bucket_name, dpp); + formatter->flush(cout); + } else { + int ret = driver->meta_list_keys_init(dpp, "bucket", marker, &handle); + if (ret < 0) { + std::cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + { + formatter->open_array_section("lc_fix_status"); + auto sg = make_scope_guard([&driver, &handle, &formatter](){ + driver->meta_list_keys_complete(handle); + formatter->close_section(); // lc_fix_status + formatter->flush(cout); + }); + do { + list keys; + ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + std::cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + return ret; + } if (ret != -ENOENT) { + for (const auto &key:keys) { + auto [tenant_name, bucket_name] = split_tenant(key); + process_single_lc_entry(driver, formatter, tenant_name, bucket_name, dpp); + } + } + formatter->flush(cout); // regularly 
flush every 1k entries + } while (truncated); + } + + } + return 0; + +} + +static bool has_object_expired(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + rgw::sal::Bucket* bucket, + const rgw_obj_key& key, utime_t& delete_at) +{ + std::unique_ptr obj = bucket->get_object(key); + bufferlist delete_at_bl; + + int ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_DELETE_AT, delete_at_bl, null_yield); + if (ret < 0) { + return false; // no delete at attr, proceed + } + + ret = decode_bl(delete_at_bl, delete_at); + if (ret < 0) { + return false; // failed to parse + } + + if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) { + return true; + } + + return false; +} + +static int fix_bucket_obj_expiry(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + rgw::sal::Bucket* bucket, + RGWFormatterFlusher& flusher, bool dry_run) +{ + if (bucket->get_key().bucket_id == bucket->get_key().marker) { + ldpp_dout(dpp, -1) << "Not a resharded bucket skipping" << dendl; + return 0; // not a resharded bucket, move along + } + + Formatter *formatter = flusher.get_formatter(); + formatter->open_array_section("expired_deletion_status"); + auto sg = make_scope_guard([&formatter] { + formatter->close_section(); + formatter->flush(std::cout); + }); + + rgw::sal::Bucket::ListParams params; + rgw::sal::Bucket::ListResults results; + + params.list_versions = bucket->versioned(); + params.allow_unordered = true; + + do { + int ret = bucket->list(dpp, params, listing_max_entries, results, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR failed to list objects in the bucket" << dendl; + return ret; + } + for (const auto& obj : results.objs) { + rgw_obj_key key(obj.key); + utime_t delete_at; + if (has_object_expired(dpp, driver, bucket, key, delete_at)) { + formatter->open_object_section("object_status"); + formatter->dump_string("object", key.name); + formatter->dump_stream("delete_at") << delete_at; + + if (!dry_run) { + ret = 
rgw_remove_object(dpp, driver, bucket, key); + formatter->dump_int("status", ret); + } + + formatter->close_section(); // object_status + } + } + formatter->flush(cout); // regularly flush every 1k entries + } while (results.is_truncated); + + return 0; +} + +int RGWBucketAdminOp::fix_obj_expiry(rgw::sal::Driver* driver, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp, bool dry_run) +{ + RGWBucket admin_bucket; + int ret = admin_bucket.init(driver, op_state, null_yield, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "failed to initialize bucket" << dendl; + return ret; + } + std::unique_ptr bucket; + ret = driver->get_bucket(nullptr, admin_bucket.get_bucket_info(), &bucket); + if (ret < 0) { + return ret; + } + + return fix_bucket_obj_expiry(dpp, driver, bucket.get(), flusher, dry_run); +} + +void RGWBucketCompleteInfo::dump(Formatter *f) const { + encode_json("bucket_info", info, f); + encode_json("attrs", attrs, f); +} + +void RGWBucketCompleteInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket_info", info, obj); + JSONDecoder::decode_json("attrs", attrs, obj); +} + +class RGWBucketMetadataHandler : public RGWBucketMetadataHandlerBase { +public: + struct Svc { + RGWSI_Bucket *bucket{nullptr}; + } svc; + + struct Ctl { + RGWBucketCtl *bucket{nullptr}; + } ctl; + + RGWBucketMetadataHandler() {} + + void init(RGWSI_Bucket *bucket_svc, + RGWBucketCtl *bucket_ctl) override { + base_init(bucket_svc->ctx(), + bucket_svc->get_ep_be_handler().get()); + svc.bucket = bucket_svc; + ctl.bucket = bucket_ctl; + } + + string get_type() override { return "bucket"; } + + RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override { + RGWBucketEntryPoint be; + + try { + decode_json_obj(be, jo); + } catch (JSONDecoder::err& e) { + return nullptr; + } + + return new RGWBucketEntryMetadataObject(be, objv, mtime); + } + + int do_get(RGWSI_MetaBackend_Handler::Op *op, 
string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override { + RGWObjVersionTracker ot; + RGWBucketEntryPoint be; + + real_time mtime; + map attrs; + + RGWSI_Bucket_EP_Ctx ctx(op->ctx()); + + int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &ot, &mtime, &attrs, y, dpp); + if (ret < 0) + return ret; + + RGWBucketEntryMetadataObject *mdo = new RGWBucketEntryMetadataObject(be, ot.read_version, mtime, std::move(attrs)); + + *obj = mdo; + + return 0; + } + + int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) override; + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + RGWBucketEntryPoint be; + + real_time orig_mtime; + + RGWSI_Bucket_EP_Ctx ctx(op->ctx()); + + int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &orig_mtime, nullptr, y, dpp); + if (ret < 0) + return ret; + + /* + * We're unlinking the bucket but we don't want to update the entrypoint here - we're removing + * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal + * will incorrectly fail. 
+ */ + ret = ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false); + if (ret < 0) { + ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl; + } + + ret = svc.bucket->remove_bucket_entrypoint_info(ctx, entry, &objv_tracker, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl; + } + /* idempotent */ + return 0; + } + + int call(std::function f) { + return call(nullopt, f); + } + + int call(std::optional bectx_params, + std::function f) { + return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) { + RGWSI_Bucket_EP_Ctx ctx(op->ctx()); + return f(ctx); + }); + } +}; + +class RGWMetadataHandlerPut_Bucket : public RGWMetadataHandlerPut_SObj +{ + RGWBucketMetadataHandler *bhandler; + RGWBucketEntryMetadataObject *obj; +public: + RGWMetadataHandlerPut_Bucket(RGWBucketMetadataHandler *_handler, + RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, + RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone), + bhandler(_handler) { + obj = static_cast(_obj); + } + ~RGWMetadataHandlerPut_Bucket() {} + + void encode_obj(bufferlist *bl) override { + obj->get_ep().encode(*bl); + } + + int put_checked(const DoutPrefixProvider *dpp) override; + int put_post(const DoutPrefixProvider *dpp) override; +}; + +int RGWBucketMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) +{ + RGWMetadataHandlerPut_Bucket put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone); + return do_put_operate(&put_op, dpp); +} + +int RGWMetadataHandlerPut_Bucket::put_checked(const DoutPrefixProvider *dpp) +{ + RGWBucketEntryMetadataObject 
*orig_obj = static_cast(old_obj); + + if (orig_obj) { + obj->set_pattrs(&orig_obj->get_attrs()); + } + + auto& be = obj->get_ep(); + auto mtime = obj->get_mtime(); + auto pattrs = obj->get_pattrs(); + + RGWSI_Bucket_EP_Ctx ctx(op->ctx()); + + return bhandler->svc.bucket->store_bucket_entrypoint_info(ctx, entry, + be, + false, + mtime, + pattrs, + &objv_tracker, + y, + dpp); +} + +int RGWMetadataHandlerPut_Bucket::put_post(const DoutPrefixProvider *dpp) +{ + auto& be = obj->get_ep(); + + int ret; + + /* link bucket */ + if (be.linked) { + ret = bhandler->ctl.bucket->link_bucket(be.owner, be.bucket, be.creation_time, y, dpp, false); + } else { + ret = bhandler->ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false); + } + + return ret; +} + +static void get_md5_digest(const RGWBucketEntryPoint *be, string& md5_digest) { + + char md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + bufferlist bl; + + Formatter *f = new JSONFormatter(false); + be->dump(f); + f->flush(bl); + + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + hash.Update((const unsigned char *)bl.c_str(), bl.length()); + hash.Final(m); + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, md5); + + delete f; + + md5_digest = md5; +} + +#define ARCHIVE_META_ATTR RGW_ATTR_PREFIX "zone.archive.info" + +struct archive_meta_info { + rgw_bucket orig_bucket; + + bool from_attrs(CephContext *cct, map& attrs) { + auto iter = attrs.find(ARCHIVE_META_ATTR); + if (iter == attrs.end()) { + return false; + } + + auto bliter = iter->second.cbegin(); + try { + decode(bliter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode archive meta info" << dendl; + return false; + } + + return true; + } + + void store_in_attrs(map& attrs) const { + encode(attrs[ARCHIVE_META_ATTR]); + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(orig_bucket, bl); + 
ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(orig_bucket, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(archive_meta_info) + +class RGWArchiveBucketMetadataHandler : public RGWBucketMetadataHandler { +public: + RGWArchiveBucketMetadataHandler() {} + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + auto cct = svc.bucket->ctx(); + + RGWSI_Bucket_EP_Ctx ctx(op->ctx()); + + ldpp_dout(dpp, 5) << "SKIP: bucket removal is not allowed on archive zone: bucket:" << entry << " ... proceeding to rename" << dendl; + + string tenant_name, bucket_name; + parse_bucket(entry, &tenant_name, &bucket_name); + rgw_bucket entry_bucket; + entry_bucket.tenant = tenant_name; + entry_bucket.name = bucket_name; + + real_time mtime; + + /* read original entrypoint */ + + RGWBucketEntryPoint be; + map attrs; + int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &mtime, &attrs, y, dpp); + if (ret < 0) { + return ret; + } + + string bi_meta_name = RGWSI_Bucket::get_bi_meta_key(be.bucket); + + /* read original bucket instance info */ + + map attrs_m; + ceph::real_time orig_mtime; + RGWBucketInfo old_bi; + + ret = ctl.bucket->read_bucket_instance_info(be.bucket, &old_bi, y, dpp, RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(&orig_mtime) + .set_attrs(&attrs_m)); + if (ret < 0) { + return ret; + } + + archive_meta_info ami; + + if (!ami.from_attrs(svc.bucket->ctx(), attrs_m)) { + ami.orig_bucket = old_bi.bucket; + ami.store_in_attrs(attrs_m); + } + + /* generate a new bucket instance. 
We could have avoided this if we could just point a new + * bucket entry point to the old bucket instance, however, due to limitation in the way + * we index buckets under the user, bucket entrypoint and bucket instance of the same + * bucket need to have the same name, so we need to copy the old bucket instance into + * to a new entry with the new name + */ + + string new_bucket_name; + + RGWBucketInfo new_bi = old_bi; + RGWBucketEntryPoint new_be = be; + + string md5_digest; + + get_md5_digest(&new_be, md5_digest); + new_bucket_name = ami.orig_bucket.name + "-deleted-" + md5_digest; + + new_bi.bucket.name = new_bucket_name; + new_bi.objv_tracker.clear(); + + new_be.bucket.name = new_bucket_name; + + ret = ctl.bucket->store_bucket_instance_info(new_be.bucket, new_bi, y, dpp, RGWBucketCtl::BucketInstance::PutParams() + .set_exclusive(false) + .set_mtime(orig_mtime) + .set_attrs(&attrs_m) + .set_orig_info(&old_bi)); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket instance info for bucket=" << new_bi.bucket << " ret=" << ret << dendl; + return ret; + } + + /* store a new entrypoint */ + + RGWObjVersionTracker ot; + ot.generate_new_write_ver(cct); + + ret = svc.bucket->store_bucket_entrypoint_info(ctx, RGWSI_Bucket::get_entrypoint_meta_key(new_be.bucket), + new_be, true, mtime, &attrs, nullptr, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl; + return ret; + } + + /* link new bucket */ + + ret = ctl.bucket->link_bucket(new_be.owner, new_be.bucket, new_be.creation_time, y, dpp, false); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to link new bucket for bucket=" << new_be.bucket << " ret=" << ret << dendl; + return ret; + } + + /* clean up old stuff */ + + ret = ctl.bucket->unlink_bucket(be.owner, entry_bucket, y, dpp, false); + if (ret < 0) { + ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << 
dendl; + } + + // if (ret == -ECANCELED) it means that there was a race here, and someone + // wrote to the bucket entrypoint just before we removed it. The question is + // whether it was a newly created bucket entrypoint ... in which case we + // should ignore the error and move forward, or whether it is a higher version + // of the same bucket instance ... in which we should retry + ret = svc.bucket->remove_bucket_entrypoint_info(ctx, + RGWSI_Bucket::get_entrypoint_meta_key(be.bucket), + &objv_tracker, + y, + dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl; + return ret; + } + + ret = ctl.bucket->remove_bucket_instance_info(be.bucket, old_bi, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl; + } + + + /* idempotent */ + + return 0; + } + + int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) override { + if (entry.find("-deleted-") != string::npos) { + RGWObjVersionTracker ot; + RGWMetadataObject *robj; + int ret = do_get(op, entry, &robj, y, dpp); + if (ret != -ENOENT) { + if (ret < 0) { + return ret; + } + ot.read_version = robj->get_version(); + delete robj; + + ret = do_remove(op, entry, ot, y, dpp); + if (ret < 0) { + return ret; + } + } + } + + return RGWBucketMetadataHandler::do_put(op, entry, obj, + objv_tracker, y, dpp, type, from_remote_zone); + } + +}; + +class RGWBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandlerBase { + int read_bucket_instance_entry(RGWSI_Bucket_BI_Ctx& ctx, + const string& entry, + RGWBucketCompleteInfo *bi, + ceph::real_time *pmtime, + optional_yield y, + const DoutPrefixProvider *dpp) { + return svc.bucket->read_bucket_instance_info(ctx, + entry, + &bi->info, + pmtime, &bi->attrs, + y, + dpp); + } 
+ +public: + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_Bucket *bucket{nullptr}; + RGWSI_BucketIndex *bi{nullptr}; + } svc; + + rgw::sal::Driver* driver; + + RGWBucketInstanceMetadataHandler(rgw::sal::Driver* driver) + : driver(driver) {} + + void init(RGWSI_Zone *zone_svc, + RGWSI_Bucket *bucket_svc, + RGWSI_BucketIndex *bi_svc) override { + base_init(bucket_svc->ctx(), + bucket_svc->get_bi_be_handler().get()); + svc.zone = zone_svc; + svc.bucket = bucket_svc; + svc.bi = bi_svc; + } + + string get_type() override { return "bucket.instance"; } + + RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override { + RGWBucketCompleteInfo bci; + + try { + decode_json_obj(bci, jo); + } catch (JSONDecoder::err& e) { + return nullptr; + } + + return new RGWBucketInstanceMetadataObject(bci, objv, mtime); + } + + int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override { + RGWBucketCompleteInfo bci; + real_time mtime; + + RGWSI_Bucket_BI_Ctx ctx(op->ctx()); + + int ret = svc.bucket->read_bucket_instance_info(ctx, entry, &bci.info, &mtime, &bci.attrs, y, dpp); + if (ret < 0) + return ret; + + RGWBucketInstanceMetadataObject *mdo = new RGWBucketInstanceMetadataObject(bci, bci.info.objv_tracker.read_version, mtime); + + *obj = mdo; + + return 0; + } + + int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp, + RGWMDLogSyncType sync_type, bool from_remote_zone) override; + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + RGWBucketCompleteInfo bci; + + RGWSI_Bucket_BI_Ctx ctx(op->ctx()); + + int ret = read_bucket_instance_entry(ctx, entry, &bci, nullptr, y, dpp); + if (ret < 0 && ret != -ENOENT) + return ret; 
+ + return svc.bucket->remove_bucket_instance_info(ctx, entry, bci.info, &bci.info.objv_tracker, y, dpp); + } + + int call(std::function f) { + return call(nullopt, f); + } + + int call(std::optional bectx_params, + std::function f) { + return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) { + RGWSI_Bucket_BI_Ctx ctx(op->ctx()); + return f(ctx); + }); + } +}; + +class RGWMetadataHandlerPut_BucketInstance : public RGWMetadataHandlerPut_SObj +{ + CephContext *cct; + RGWBucketInstanceMetadataHandler *bihandler; + RGWBucketInstanceMetadataObject *obj; +public: + RGWMetadataHandlerPut_BucketInstance(CephContext *_cct, + RGWBucketInstanceMetadataHandler *_handler, + RGWSI_MetaBackend_Handler::Op *_op, string& entry, + RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, + RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, _op, entry, _obj, objv_tracker, y, type, from_remote_zone), + cct(_cct), bihandler(_handler) { + obj = static_cast(_obj); + + auto& bci = obj->get_bci(); + obj->set_pattrs(&bci.attrs); + } + + void encode_obj(bufferlist *bl) override { + obj->get_bucket_info().encode(*bl); + } + + int put_check(const DoutPrefixProvider *dpp) override; + int put_checked(const DoutPrefixProvider *dpp) override; + int put_post(const DoutPrefixProvider *dpp) override; +}; + +int RGWBucketInstanceMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, + string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) +{ + RGWMetadataHandlerPut_BucketInstance put_op(svc.bucket->ctx(), this, op, entry, obj, + objv_tracker, y, type, from_remote_zone); + return do_put_operate(&put_op, dpp); +} + +void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout, + const RGWZone& zone, + std::optional shards, + std::optional type) { + layout.current_index.gen = 0; + 
layout.current_index.layout.normal.hash_type = rgw::BucketHashType::Mod; + + layout.current_index.layout.type = + type.value_or(rgw::BucketIndexType::Normal); + + if (shards) { + layout.current_index.layout.normal.num_shards = *shards; + } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) { + layout.current_index.layout.normal.num_shards = + cct->_conf->rgw_override_bucket_index_max_shards; + } else { + layout.current_index.layout.normal.num_shards = + zone.bucket_index_max_shards; + } + + if (layout.current_index.layout.type == rgw::BucketIndexType::Normal) { + layout.logs.push_back(log_layout_from_index(0, layout.current_index)); + } +} + +int RGWMetadataHandlerPut_BucketInstance::put_check(const DoutPrefixProvider *dpp) +{ + int ret; + + RGWBucketCompleteInfo& bci = obj->get_bci(); + + RGWBucketInstanceMetadataObject *orig_obj = static_cast(old_obj); + + RGWBucketCompleteInfo *old_bci = (orig_obj ? &orig_obj->get_bci() : nullptr); + + const bool exists = (!!orig_obj); + + if (from_remote_zone) { + // don't sync bucket layout changes + if (!exists) { + // replace peer's layout with default-constructed, then apply our defaults + bci.info.layout = rgw::BucketLayout{}; + init_default_bucket_layout(cct, bci.info.layout, + bihandler->svc.zone->get_zone(), + std::nullopt, std::nullopt); + } else { + bci.info.layout = old_bci->info.layout; + } + } + + if (!exists || old_bci->info.bucket.bucket_id != bci.info.bucket.bucket_id) { + /* a new bucket, we need to select a new bucket placement for it */ + string tenant_name; + string bucket_name; + string bucket_instance; + parse_bucket(entry, &tenant_name, &bucket_name, &bucket_instance); + + RGWZonePlacementInfo rule_info; + bci.info.bucket.name = bucket_name; + bci.info.bucket.bucket_id = bucket_instance; + bci.info.bucket.tenant = tenant_name; + // if the sync module never writes data, don't require the zone to specify all placement targets + if (bihandler->svc.zone->sync_module_supports_writes()) { + ret = 
bihandler->svc.zone->select_bucket_location_by_rule(dpp, bci.info.placement_rule, &rule_info, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: select_bucket_placement() returned " << ret << dendl; + return ret; + } + } + bci.info.layout.current_index.layout.type = rule_info.index_type; + } else { + /* always keep bucket versioning enabled on archive zone */ + if (bihandler->driver->get_zone()->get_tier_type() == "archive") { + bci.info.flags = (bci.info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED; + } + /* existing bucket, keep its placement */ + bci.info.bucket.explicit_placement = old_bci->info.bucket.explicit_placement; + bci.info.placement_rule = old_bci->info.placement_rule; + } + + /* record the read version (if any), store the new version */ + bci.info.objv_tracker.read_version = objv_tracker.read_version; + bci.info.objv_tracker.write_version = objv_tracker.write_version; + + return 0; +} + +int RGWMetadataHandlerPut_BucketInstance::put_checked(const DoutPrefixProvider *dpp) +{ + RGWBucketInstanceMetadataObject *orig_obj = static_cast(old_obj); + + RGWBucketInfo *orig_info = (orig_obj ? &orig_obj->get_bucket_info() : nullptr); + + auto& info = obj->get_bucket_info(); + auto mtime = obj->get_mtime(); + auto pattrs = obj->get_pattrs(); + + RGWSI_Bucket_BI_Ctx ctx(op->ctx()); + + return bihandler->svc.bucket->store_bucket_instance_info(ctx, + entry, + info, + orig_info, + false, + mtime, + pattrs, + y, + dpp); +} + +int RGWMetadataHandlerPut_BucketInstance::put_post(const DoutPrefixProvider *dpp) +{ + RGWBucketCompleteInfo& bci = obj->get_bci(); + + objv_tracker = bci.info.objv_tracker; + + int ret = bihandler->svc.bi->init_index(dpp, bci.info, bci.info.layout.current_index); + if (ret < 0) { + return ret; + } + + /* update lifecycle policy */ + { + std::unique_ptr bucket; + ret = bihandler->driver->get_bucket(nullptr, bci.info, &bucket); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << " failed to get_bucket(...) 
for " + << bci.info.bucket.name + << dendl; + return ret; + } + + auto lc = bihandler->driver->get_rgwlc(); + + auto lc_it = bci.attrs.find(RGW_ATTR_LC); + if (lc_it != bci.attrs.end()) { + ldpp_dout(dpp, 20) << "set lc config for " << bci.info.bucket.name << dendl; + ret = lc->set_bucket_config(bucket.get(), bci.attrs, nullptr); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << " failed to set lc config for " + << bci.info.bucket.name + << dendl; + return ret; + } + + } else { + ldpp_dout(dpp, 20) << "remove lc config for " << bci.info.bucket.name << dendl; + ret = lc->remove_bucket_config(bucket.get(), bci.attrs, false /* cannot merge attrs */); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << " failed to remove lc config for " + << bci.info.bucket.name + << dendl; + return ret; + } + } + } /* update lc */ + + return STATUS_APPLIED; +} + +class RGWArchiveBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandler { +public: + RGWArchiveBucketInstanceMetadataHandler(rgw::sal::Driver* driver) + : RGWBucketInstanceMetadataHandler(driver) {} + + // N.B. 
replication of lifecycle policy relies on logic in RGWBucketInstanceMetadataHandler::do_put(...), override with caution + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override { + ldpp_dout(dpp, 0) << "SKIP: bucket instance removal is not allowed on archive zone: bucket.instance:" << entry << dendl; + return 0; + } +}; + +RGWBucketCtl::RGWBucketCtl(RGWSI_Zone *zone_svc, + RGWSI_Bucket *bucket_svc, + RGWSI_Bucket_Sync *bucket_sync_svc, + RGWSI_BucketIndex *bi_svc, + RGWSI_User* user_svc) + : cct(zone_svc->ctx()) +{ + svc.zone = zone_svc; + svc.bucket = bucket_svc; + svc.bucket_sync = bucket_sync_svc; + svc.bi = bi_svc; + svc.user = user_svc; +} + +void RGWBucketCtl::init(RGWUserCtl *user_ctl, + RGWBucketMetadataHandler *_bm_handler, + RGWBucketInstanceMetadataHandler *_bmi_handler, + RGWDataChangesLog *datalog, + const DoutPrefixProvider *dpp) +{ + ctl.user = user_ctl; + + bm_handler = _bm_handler; + bmi_handler = _bmi_handler; + + bucket_be_handler = bm_handler->get_be_handler(); + bi_be_handler = bmi_handler->get_be_handler(); + + datalog->set_bucket_filter( + [this](const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp) { + return bucket_exports_data(bucket, y, dpp); + }); +} + +int RGWBucketCtl::call(std::function f) { + return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ep_ctx) { + return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& bi_ctx) { + RGWSI_Bucket_X_Ctx ctx{ep_ctx, bi_ctx}; + return f(ctx); + }); + }); +} + +int RGWBucketCtl::read_bucket_entrypoint_info(const rgw_bucket& bucket, + RGWBucketEntryPoint *info, + optional_yield y, const DoutPrefixProvider *dpp, + const Bucket::GetParams& params) +{ + return bm_handler->call(params.bectx_params, [&](RGWSI_Bucket_EP_Ctx& ctx) { + return svc.bucket->read_bucket_entrypoint_info(ctx, + RGWSI_Bucket::get_entrypoint_meta_key(bucket), + info, + params.objv_tracker, + params.mtime, + 
params.attrs, + y, + dpp, + params.cache_info, + params.refresh_version); + }); +} + +int RGWBucketCtl::store_bucket_entrypoint_info(const rgw_bucket& bucket, + RGWBucketEntryPoint& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const Bucket::PutParams& params) +{ + return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) { + return svc.bucket->store_bucket_entrypoint_info(ctx, + RGWSI_Bucket::get_entrypoint_meta_key(bucket), + info, + params.exclusive, + params.mtime, + params.attrs, + params.objv_tracker, + y, + dpp); + }); +} + +int RGWBucketCtl::remove_bucket_entrypoint_info(const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp, + const Bucket::RemoveParams& params) +{ + return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) { + return svc.bucket->remove_bucket_entrypoint_info(ctx, + RGWSI_Bucket::get_entrypoint_meta_key(bucket), + params.objv_tracker, + y, + dpp); + }); +} + +int RGWBucketCtl::read_bucket_instance_info(const rgw_bucket& bucket, + RGWBucketInfo *info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::GetParams& params) +{ + int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) { + return svc.bucket->read_bucket_instance_info(ctx, + RGWSI_Bucket::get_bi_meta_key(bucket), + info, + params.mtime, + params.attrs, + y, + dpp, + params.cache_info, + params.refresh_version); + }); + + if (ret < 0) { + return ret; + } + + if (params.objv_tracker) { + *params.objv_tracker = info->objv_tracker; + } + + return 0; +} + +int RGWBucketCtl::read_bucket_info(const rgw_bucket& bucket, + RGWBucketInfo *info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::GetParams& params, + RGWObjVersionTracker *ep_objv_tracker) +{ + const rgw_bucket *b = &bucket; + + std::optional ep; + + if (b->bucket_id.empty()) { + ep.emplace(); + + int r = read_bucket_entrypoint_info(*b, &(*ep), y, dpp, RGWBucketCtl::Bucket::GetParams() + 
.set_bectx_params(params.bectx_params) + .set_objv_tracker(ep_objv_tracker)); + if (r < 0) { + return r; + } + + b = &ep->bucket; + } + + int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) { + return svc.bucket->read_bucket_instance_info(ctx, + RGWSI_Bucket::get_bi_meta_key(*b), + info, + params.mtime, + params.attrs, + y, dpp, + params.cache_info, + params.refresh_version); + }); + + if (ret < 0) { + return ret; + } + + if (params.objv_tracker) { + *params.objv_tracker = info->objv_tracker; + } + + return 0; +} + +int RGWBucketCtl::do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const rgw_bucket& bucket, + RGWBucketInfo& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::PutParams& params) +{ + if (params.objv_tracker) { + info.objv_tracker = *params.objv_tracker; + } + + return svc.bucket->store_bucket_instance_info(ctx, + RGWSI_Bucket::get_bi_meta_key(bucket), + info, + params.orig_info, + params.exclusive, + params.mtime, + params.attrs, + y, + dpp); +} + +int RGWBucketCtl::store_bucket_instance_info(const rgw_bucket& bucket, + RGWBucketInfo& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::PutParams& params) +{ + return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) { + return do_store_bucket_instance_info(ctx, bucket, info, y, dpp, params); + }); +} + +int RGWBucketCtl::remove_bucket_instance_info(const rgw_bucket& bucket, + RGWBucketInfo& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::RemoveParams& params) +{ + if (params.objv_tracker) { + info.objv_tracker = *params.objv_tracker; + } + + return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) { + return svc.bucket->remove_bucket_instance_info(ctx, + RGWSI_Bucket::get_bi_meta_key(bucket), + info, + &info.objv_tracker, + y, + dpp); + }); +} + +int RGWBucketCtl::do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx, + RGWBucketInfo& info, + RGWBucketInfo *orig_info, + bool 
exclusive, real_time mtime, + obj_version *pep_objv, + map *pattrs, + bool create_entry_point, + optional_yield y, const DoutPrefixProvider *dpp) +{ + bool create_head = !info.has_instance_obj || create_entry_point; + + int ret = svc.bucket->store_bucket_instance_info(ctx.bi, + RGWSI_Bucket::get_bi_meta_key(info.bucket), + info, + orig_info, + exclusive, + mtime, pattrs, + y, dpp); + if (ret < 0) { + return ret; + } + + if (!create_head) + return 0; /* done! */ + + RGWBucketEntryPoint entry_point; + entry_point.bucket = info.bucket; + entry_point.owner = info.owner; + entry_point.creation_time = info.creation_time; + entry_point.linked = true; + RGWObjVersionTracker ot; + if (pep_objv && !pep_objv->tag.empty()) { + ot.write_version = *pep_objv; + } else { + ot.generate_new_write_ver(cct); + if (pep_objv) { + *pep_objv = ot.write_version; + } + } + ret = svc.bucket->store_bucket_entrypoint_info(ctx.ep, + RGWSI_Bucket::get_entrypoint_meta_key(info.bucket), + entry_point, + exclusive, + mtime, + pattrs, + &ot, + y, + dpp); + if (ret < 0) + return ret; + + return 0; +} +int RGWBucketCtl::convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx, + const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWBucketEntryPoint entry_point; + real_time ep_mtime; + RGWObjVersionTracker ot; + map attrs; + RGWBucketInfo info; + auto cct = svc.bucket->ctx(); + + ldpp_dout(dpp, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket << dendl; + + int ret = svc.bucket->read_bucket_entrypoint_info(ctx.ep, + RGWSI_Bucket::get_entrypoint_meta_key(bucket), + &entry_point, &ot, &ep_mtime, &attrs, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket << dendl; + return ret; + } + + if (!entry_point.has_bucket_info) { + /* already converted! 
*/ + return 0; + } + + info = entry_point.old_bucket_info; + + ot.generate_new_write_ver(cct); + + ret = do_store_linked_bucket_info(ctx, info, nullptr, false, ep_mtime, &ot.write_version, &attrs, true, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl; + return ret; + } + + return 0; +} + +int RGWBucketCtl::set_bucket_instance_attrs(RGWBucketInfo& bucket_info, + map& attrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + return call([&](RGWSI_Bucket_X_Ctx& ctx) { + rgw_bucket& bucket = bucket_info.bucket; + + if (!bucket_info.has_instance_obj) { + /* an old bucket object, need to convert it */ + int ret = convert_old_bucket_info(ctx, bucket, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed converting old bucket info: " << ret << dendl; + return ret; + } + } + + return do_store_bucket_instance_info(ctx.bi, + bucket, + bucket_info, + y, + dpp, + BucketInstance::PutParams().set_attrs(&attrs) + .set_objv_tracker(objv_tracker) + .set_orig_info(&bucket_info)); + }); +} + + +int RGWBucketCtl::link_bucket(const rgw_user& user_id, + const rgw_bucket& bucket, + ceph::real_time creation_time, + optional_yield y, + const DoutPrefixProvider *dpp, + bool update_entrypoint, + rgw_ep_info *pinfo) +{ + return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) { + return do_link_bucket(ctx, user_id, bucket, creation_time, + update_entrypoint, pinfo, y, dpp); + }); +} + +int RGWBucketCtl::do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx, + const rgw_user& user_id, + const rgw_bucket& bucket, + ceph::real_time creation_time, + bool update_entrypoint, + rgw_ep_info *pinfo, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + int ret; + + RGWBucketEntryPoint ep; + RGWObjVersionTracker ot; + RGWObjVersionTracker& rot = (pinfo) ? 
pinfo->ep_objv : ot; + map attrs, *pattrs = nullptr; + string meta_key; + + if (update_entrypoint) { + meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket); + if (pinfo) { + ep = pinfo->ep; + pattrs = &pinfo->attrs; + } else { + ret = svc.bucket->read_bucket_entrypoint_info(ctx, + meta_key, + &ep, &rot, + nullptr, &attrs, + y, dpp); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() returned: " + << cpp_strerror(-ret) << dendl; + } + pattrs = &attrs; + } + } + + ret = svc.user->add_bucket(dpp, user_id, bucket, creation_time, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user directory:" + << " user=" << user_id + << " bucket=" << bucket + << " err=" << cpp_strerror(-ret) + << dendl; + goto done_err; + } + + if (!update_entrypoint) + return 0; + + ep.linked = true; + ep.owner = user_id; + ep.bucket = bucket; + ret = svc.bucket->store_bucket_entrypoint_info( + ctx, meta_key, ep, false, real_time(), pattrs, &rot, y, dpp); + if (ret < 0) + goto done_err; + + return 0; + +done_err: + int r = do_unlink_bucket(ctx, user_id, bucket, true, y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed unlinking bucket on error cleanup: " + << cpp_strerror(-r) << dendl; + } + return ret; +} + +int RGWBucketCtl::unlink_bucket(const rgw_user& user_id, const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp, bool update_entrypoint) +{ + return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) { + return do_unlink_bucket(ctx, user_id, bucket, update_entrypoint, y, dpp); + }); +} + +int RGWBucketCtl::do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx, + const rgw_user& user_id, + const rgw_bucket& bucket, + bool update_entrypoint, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + int ret = svc.user->remove_bucket(dpp, user_id, bucket, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: error removing bucket from directory: " + << cpp_strerror(-ret)<< dendl; + } + + if (!update_entrypoint) + 
return 0; + + RGWBucketEntryPoint ep; + RGWObjVersionTracker ot; + map attrs; + string meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket); + ret = svc.bucket->read_bucket_entrypoint_info(ctx, meta_key, &ep, &ot, nullptr, &attrs, y, dpp); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + + if (!ep.linked) + return 0; + + if (ep.owner != user_id) { + ldpp_dout(dpp, 0) << "bucket entry point user mismatch, can't unlink bucket: " << ep.owner << " != " << user_id << dendl; + return -EINVAL; + } + + ep.linked = false; + return svc.bucket->store_bucket_entrypoint_info(ctx, meta_key, ep, false, real_time(), &attrs, &ot, y, dpp); +} + +int RGWBucketCtl::read_bucket_stats(const rgw_bucket& bucket, + RGWBucketEnt *result, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + return call([&](RGWSI_Bucket_X_Ctx& ctx) { + return svc.bucket->read_bucket_stats(ctx, bucket, result, y, dpp); + }); +} + +int RGWBucketCtl::read_buckets_stats(map& m, + optional_yield y, const DoutPrefixProvider *dpp) +{ + return call([&](RGWSI_Bucket_X_Ctx& ctx) { + return svc.bucket->read_buckets_stats(ctx, m, y, dpp); + }); +} + +int RGWBucketCtl::sync_user_stats(const DoutPrefixProvider *dpp, + const rgw_user& user_id, + const RGWBucketInfo& bucket_info, + optional_yield y, + RGWBucketEnt* pent) +{ + RGWBucketEnt ent; + if (!pent) { + pent = &ent; + } + int r = svc.bi->read_stats(dpp, bucket_info, pent, y); + if (r < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): failed to read bucket stats (r=" << r << ")" << dendl; + return r; + } + + return svc.user->flush_bucket_stats(dpp, user_id, *pent, y); +} + +int RGWBucketCtl::get_sync_policy_handler(std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *phandler, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + int r = call([&](RGWSI_Bucket_X_Ctx& ctx) { + return svc.bucket_sync->get_policy_handler(ctx, zone, bucket, phandler, y, dpp); + }); + if (r < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): 
failed to get policy handler for bucket=" << bucket << " (r=" << r << ")" << dendl; + return r; + } + return 0; +} + +int RGWBucketCtl::bucket_exports_data(const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + + RGWBucketSyncPolicyHandlerRef handler; + + int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp); + if (r < 0) { + return r; + } + + return handler->bucket_exports_data(); +} + +int RGWBucketCtl::bucket_imports_data(const rgw_bucket& bucket, + optional_yield y, const DoutPrefixProvider *dpp) +{ + + RGWBucketSyncPolicyHandlerRef handler; + + int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp); + if (r < 0) { + return r; + } + + return handler->bucket_imports_data(); +} + +RGWBucketMetadataHandlerBase* RGWBucketMetaHandlerAllocator::alloc() +{ + return new RGWBucketMetadataHandler(); +} + +RGWBucketInstanceMetadataHandlerBase* RGWBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver) +{ + return new RGWBucketInstanceMetadataHandler(driver); +} + +RGWBucketMetadataHandlerBase* RGWArchiveBucketMetaHandlerAllocator::alloc() +{ + return new RGWArchiveBucketMetadataHandler(); +} + +RGWBucketInstanceMetadataHandlerBase* RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver) +{ + return new RGWArchiveBucketInstanceMetadataHandler(driver); +} + + +void RGWBucketEntryPoint::generate_test_instances(list& o) +{ + RGWBucketEntryPoint *bp = new RGWBucketEntryPoint(); + init_bucket(&bp->bucket, "tenant", "bucket", "pool", ".index.pool", "marker", "10"); + bp->owner = "owner"; + bp->creation_time = ceph::real_clock::from_ceph_timespec({ceph_le32(2), ceph_le32(3)}); + + o.push_back(bp); + o.push_back(new RGWBucketEntryPoint); +} + +void RGWBucketEntryPoint::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("owner", owner, f); + utime_t ut(creation_time); + encode_json("creation_time", ut, f); + encode_json("linked", linked, f); + 
encode_json("has_bucket_info", has_bucket_info, f); + if (has_bucket_info) { + encode_json("old_bucket_info", old_bucket_info, f); + } +} + +void RGWBucketEntryPoint::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket", bucket, obj); + JSONDecoder::decode_json("owner", owner, obj); + utime_t ut; + JSONDecoder::decode_json("creation_time", ut, obj); + creation_time = ut.to_real_time(); + JSONDecoder::decode_json("linked", linked, obj); + JSONDecoder::decode_json("has_bucket_info", has_bucket_info, obj); + if (has_bucket_info) { + JSONDecoder::decode_json("old_bucket_info", old_bucket_info, obj); + } +} + diff --git a/src/rgw/driver/rados/rgw_bucket.h b/src/rgw/driver/rados/rgw_bucket.h new file mode 100644 index 000000000..c13e737ce --- /dev/null +++ b/src/rgw/driver/rados/rgw_bucket.h @@ -0,0 +1,766 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include +#include + +#include "include/types.h" +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_metadata.h" +#include "rgw/rgw_bucket.h" + +#include "rgw_string.h" +#include "rgw_sal.h" + +#include "common/Formatter.h" +#include "common/lru_map.h" +#include "common/ceph_time.h" + +#include "rgw_formats.h" + +#include "services/svc_bucket_types.h" +#include "services/svc_bucket_sync.h" + +// define as static when RGWBucket implementation completes +extern void rgw_get_buckets_obj(const rgw_user& user_id, std::string& buckets_obj_id); + +class RGWSI_Meta; +class RGWBucketMetadataHandler; +class RGWBucketInstanceMetadataHandler; +class RGWUserCtl; +class RGWBucketCtl; +class RGWZone; +struct RGWZoneParams; + +// this is used as a filter to RGWRados::cls_bucket_list_ordered; it +// conforms to the type RGWBucketListNameFilter +extern bool rgw_bucket_object_check_filter(const std::string& oid); + +void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout, + const 
RGWZone& zone, + std::optional shards, + std::optional type); + +struct RGWBucketCompleteInfo { + RGWBucketInfo info; + std::map attrs; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +class RGWBucketEntryMetadataObject : public RGWMetadataObject { + RGWBucketEntryPoint ep; + std::map attrs; +public: + RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m) : ep(_ep) { + objv = v; + mtime = m; + set_pattrs (&attrs); + } + RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m, std::map&& _attrs) : + ep(_ep), attrs(std::move(_attrs)) { + objv = v; + mtime = m; + set_pattrs (&attrs); + } + + void dump(Formatter *f) const override { + ep.dump(f); + } + + RGWBucketEntryPoint& get_ep() { + return ep; + } + + std::map& get_attrs() { + return attrs; + } +}; + +class RGWBucketInstanceMetadataObject : public RGWMetadataObject { + RGWBucketCompleteInfo info; +public: + RGWBucketInstanceMetadataObject() {} + RGWBucketInstanceMetadataObject(RGWBucketCompleteInfo& i, const obj_version& v, real_time m) : info(i) { + objv = v; + mtime = m; + } + + void dump(Formatter *f) const override { + info.dump(f); + } + + void decode_json(JSONObj *obj) { + info.decode_json(obj); + } + + RGWBucketCompleteInfo& get_bci() { + return info; + } + RGWBucketInfo& get_bucket_info() { + return info.info; + } +}; + +/** + * store a list of the user's buckets, with associated functions. + */ +class RGWUserBuckets { + std::map buckets; + +public: + RGWUserBuckets() = default; + RGWUserBuckets(RGWUserBuckets&&) = default; + + RGWUserBuckets& operator=(const RGWUserBuckets&) = default; + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(buckets, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(buckets, bl); + } + /** + * Check if the user owns a bucket by the given name. 
+ */ + bool owns(std::string& name) { + std::map::iterator iter; + iter = buckets.find(name); + return (iter != buckets.end()); + } + + /** + * Add a (created) bucket to the user's bucket list. + */ + void add(const RGWBucketEnt& bucket) { + buckets[bucket.bucket.name] = bucket; + } + + /** + * Remove a bucket from the user's list by name. + */ + void remove(const std::string& name) { + std::map::iterator iter; + iter = buckets.find(name); + if (iter != buckets.end()) { + buckets.erase(iter); + } + } + + /** + * Get the user's buckets as a map. + */ + std::map& get_buckets() { return buckets; } + + /** + * Cleanup data structure + */ + void clear() { buckets.clear(); } + + size_t count() { return buckets.size(); } +}; +WRITE_CLASS_ENCODER(RGWUserBuckets) + +class RGWBucketMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE { +public: + virtual ~RGWBucketMetadataHandlerBase() {} + virtual void init(RGWSI_Bucket *bucket_svc, + RGWBucketCtl *bucket_ctl) = 0; + +}; + +class RGWBucketInstanceMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE { +public: + virtual ~RGWBucketInstanceMetadataHandlerBase() {} + virtual void init(RGWSI_Zone *zone_svc, + RGWSI_Bucket *bucket_svc, + RGWSI_BucketIndex *bi_svc) = 0; +}; + +class RGWBucketMetaHandlerAllocator { +public: + static RGWBucketMetadataHandlerBase *alloc(); +}; + +class RGWBucketInstanceMetaHandlerAllocator { +public: + static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver); +}; + +class RGWArchiveBucketMetaHandlerAllocator { +public: + static RGWBucketMetadataHandlerBase *alloc(); +}; + +class RGWArchiveBucketInstanceMetaHandlerAllocator { +public: + static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver); +}; + +extern int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key); + +extern int rgw_object_get_attr(rgw::sal::Driver* driver, rgw::sal::Object* obj, + const char* attr_name, 
bufferlist& out_bl, + optional_yield y); + +extern void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User& user, bool fix, optional_yield y, const DoutPrefixProvider *dpp); + +struct RGWBucketAdminOpState { + rgw_user uid; + std::string display_name; + std::string bucket_name; + std::string bucket_id; + std::string object_name; + std::string new_bucket_name; + + bool list_buckets; + bool stat_buckets; + bool check_objects; + bool fix_index; + bool delete_child_objects; + bool bucket_stored; + bool sync_bucket; + bool dump_keys; + bool hide_progress; + int max_aio = 0; + ceph::timespan min_age = std::chrono::hours::zero(); + + std::unique_ptr bucket; + + RGWQuotaInfo quota; + RGWRateLimitInfo ratelimit_info; + + void set_fetch_stats(bool value) { stat_buckets = value; } + void set_check_objects(bool value) { check_objects = value; } + void set_fix_index(bool value) { fix_index = value; } + void set_delete_children(bool value) { delete_child_objects = value; } + void set_hide_progress(bool value) { hide_progress = value; } + void set_dump_keys(bool value) { dump_keys = value; } + + void set_max_aio(int value) { max_aio = value; } + void set_min_age(ceph::timespan value) { min_age = value; } + + void set_user_id(const rgw_user& user_id) { + if (!user_id.empty()) + uid = user_id; + } + void set_tenant(const std::string& tenant_str) { + uid.tenant = tenant_str; + } + void set_bucket_name(const std::string& bucket_str) { + bucket_name = bucket_str; + } + void set_object(std::string& object_str) { + object_name = object_str; + } + void set_new_bucket_name(std::string& new_bucket_str) { + new_bucket_name = new_bucket_str; + } + void set_quota(RGWQuotaInfo& value) { + quota = value; + } + void set_bucket_ratelimit(RGWRateLimitInfo& value) { + ratelimit_info = value; + } + + + void set_sync_bucket(bool value) { sync_bucket = value; } + + rgw_user& get_user_id() { return uid; } + std::string& get_user_display_name() { return display_name; } + std::string& 
get_bucket_name() { return bucket_name; } + std::string& get_object_name() { return object_name; } + std::string& get_tenant() { return uid.tenant; } + + rgw::sal::Bucket* get_bucket() { return bucket.get(); } + void set_bucket(std::unique_ptr _bucket) { + bucket = std::move(_bucket); + bucket_stored = true; + } + + void set_bucket_id(const std::string& bi) { + bucket_id = bi; + } + const std::string& get_bucket_id() { return bucket_id; } + + bool will_fetch_stats() { return stat_buckets; } + bool will_fix_index() { return fix_index; } + bool will_delete_children() { return delete_child_objects; } + bool will_check_objects() { return check_objects; } + bool is_user_op() { return !uid.empty(); } + bool is_system_op() { return uid.empty(); } + bool has_bucket_stored() { return bucket_stored; } + int get_max_aio() { return max_aio; } + bool will_sync_bucket() { return sync_bucket; } + + RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false), + fix_index(false), delete_child_objects(false), + bucket_stored(false), sync_bucket(true), + dump_keys(false), hide_progress(false) {} +}; + + +/* + * A simple wrapper class for administrative bucket operations + */ +class RGWBucket { + RGWUserBuckets buckets; + rgw::sal::Driver* driver; + RGWAccessHandle handle; + + std::unique_ptr bucket; + std::unique_ptr user; + + bool failure; + + RGWObjVersionTracker ep_objv; // entrypoint object version + +public: + RGWBucket() : driver(NULL), handle(NULL), failure(false) {} + int init(rgw::sal::Driver* storage, RGWBucketAdminOpState& op_state, optional_yield y, + const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + + int check_bad_index_multipart(RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + + int check_object_index(const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y, + std::string *err_msg = NULL); 
+ int check_index_olh(rgw::sal::RadosStore* rados_store, const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher); + int check_index_unlinked(rgw::sal::RadosStore* rados_store, const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher); + + int check_index(const DoutPrefixProvider *dpp, + RGWBucketAdminOpState& op_state, + std::map& existing_stats, + std::map& calculated_stats, + std::string *err_msg = NULL); + + int chown(RGWBucketAdminOpState& op_state, const std::string& marker, + optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + int set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + + int remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg = NULL); + int policy_bl_to_stream(bufferlist& bl, std::ostream& o); + int get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp); + int sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + + void clear_failure() { failure = false; } + + const RGWBucketInfo& get_bucket_info() const { return bucket->get_info(); } +}; + +class RGWBucketAdminOp { +public: + static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp); + static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp); + static int dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + std::ostream& os, const DoutPrefixProvider *dpp); + + static int unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp); + static int link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, 
std::string *err_msg = NULL); + static int chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const std::string& marker, const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + + static int check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp); + static int check_index_olh(rgw::sal::RadosStore* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp); + static int check_index_unlinked(rgw::sal::RadosStore* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp); + + static int remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, optional_yield y, + const DoutPrefixProvider *dpp, bool bypass_gc = false, bool keep_index_consistent = true); + static int remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp); + static int info(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp); + static int limit_check(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + const std::list& user_ids, + RGWFormatterFlusher& flusher, optional_yield y, + const DoutPrefixProvider *dpp, + bool warnings_only = false); + static int set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp); + + static int list_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp); + + static int clear_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp); + static int fix_lc_shards(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp); + static int 
fix_obj_expiry(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp, bool dry_run = false); + + static int sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL); +}; + +struct rgw_ep_info { + RGWBucketEntryPoint &ep; + std::map& attrs; + RGWObjVersionTracker ep_objv; + rgw_ep_info(RGWBucketEntryPoint &ep, std::map& attrs) + : ep(ep), attrs(attrs) {} +}; + +class RGWBucketCtl { + CephContext *cct; + + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_Bucket *bucket{nullptr}; + RGWSI_Bucket_Sync *bucket_sync{nullptr}; + RGWSI_BucketIndex *bi{nullptr}; + RGWSI_User* user = nullptr; + } svc; + + struct Ctl { + RGWUserCtl *user{nullptr}; + } ctl; + + RGWBucketMetadataHandler *bm_handler; + RGWBucketInstanceMetadataHandler *bmi_handler; + + RGWSI_Bucket_BE_Handler bucket_be_handler; /* bucket backend handler */ + RGWSI_BucketInstance_BE_Handler bi_be_handler; /* bucket instance backend handler */ + + int call(std::function f); + +public: + RGWBucketCtl(RGWSI_Zone *zone_svc, + RGWSI_Bucket *bucket_svc, + RGWSI_Bucket_Sync *bucket_sync_svc, + RGWSI_BucketIndex *bi_svc, + RGWSI_User* user_svc); + + void init(RGWUserCtl *user_ctl, + RGWBucketMetadataHandler *_bm_handler, + RGWBucketInstanceMetadataHandler *_bmi_handler, + RGWDataChangesLog *datalog, + const DoutPrefixProvider *dpp); + + struct Bucket { + struct GetParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + real_time *mtime{nullptr}; + std::map *attrs{nullptr}; + rgw_cache_entry_info *cache_info{nullptr}; + boost::optional refresh_version; + std::optional bectx_params; + + GetParams() {} + + GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + GetParams& set_mtime(ceph::real_time *_mtime) { + mtime = _mtime; + return *this; + } + + GetParams& set_attrs(std::map *_attrs) { + attrs = _attrs; + return 
*this; + } + + GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) { + cache_info = _cache_info; + return *this; + } + + GetParams& set_refresh_version(const obj_version& _refresh_version) { + refresh_version = _refresh_version; + return *this; + } + + GetParams& set_bectx_params(std::optional _bectx_params) { + bectx_params = _bectx_params; + return *this; + } + }; + + struct PutParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + ceph::real_time mtime; + bool exclusive{false}; + std::map *attrs{nullptr}; + + PutParams() {} + + PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + PutParams& set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + return *this; + } + + PutParams& set_exclusive(bool _exclusive) { + exclusive = _exclusive; + return *this; + } + + PutParams& set_attrs(std::map *_attrs) { + attrs = _attrs; + return *this; + } + }; + + struct RemoveParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + + RemoveParams() {} + + RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + }; + }; + + struct BucketInstance { + struct GetParams { + real_time *mtime{nullptr}; + std::map *attrs{nullptr}; + rgw_cache_entry_info *cache_info{nullptr}; + boost::optional refresh_version; + RGWObjVersionTracker *objv_tracker{nullptr}; + std::optional bectx_params; + + GetParams() {} + + GetParams& set_mtime(ceph::real_time *_mtime) { + mtime = _mtime; + return *this; + } + + GetParams& set_attrs(std::map *_attrs) { + attrs = _attrs; + return *this; + } + + GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) { + cache_info = _cache_info; + return *this; + } + + GetParams& set_refresh_version(const obj_version& _refresh_version) { + refresh_version = _refresh_version; + return *this; + } + + GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + 
} + + GetParams& set_bectx_params(std::optional _bectx_params) { + bectx_params = _bectx_params; + return *this; + } + }; + + struct PutParams { + std::optional orig_info; /* nullopt: orig_info was not fetched, + nullptr: orig_info was not found (new bucket instance */ + ceph::real_time mtime; + bool exclusive{false}; + std::map *attrs{nullptr}; + RGWObjVersionTracker *objv_tracker{nullptr}; + + PutParams() {} + + PutParams& set_orig_info(RGWBucketInfo *pinfo) { + orig_info = pinfo; + return *this; + } + + PutParams& set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + return *this; + } + + PutParams& set_exclusive(bool _exclusive) { + exclusive = _exclusive; + return *this; + } + + PutParams& set_attrs(std::map *_attrs) { + attrs = _attrs; + return *this; + } + + PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + }; + + struct RemoveParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + + RemoveParams() {} + + RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + }; + }; + + /* bucket entrypoint */ + int read_bucket_entrypoint_info(const rgw_bucket& bucket, + RGWBucketEntryPoint *info, + optional_yield y, + const DoutPrefixProvider *dpp, + const Bucket::GetParams& params = {}); + int store_bucket_entrypoint_info(const rgw_bucket& bucket, + RGWBucketEntryPoint& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const Bucket::PutParams& params = {}); + int remove_bucket_entrypoint_info(const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp, + const Bucket::RemoveParams& params = {}); + + /* bucket instance */ + int read_bucket_instance_info(const rgw_bucket& bucket, + RGWBucketInfo *info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::GetParams& params = {}); + int store_bucket_instance_info(const rgw_bucket& bucket, + RGWBucketInfo& info, + 
optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::PutParams& params = {}); + int remove_bucket_instance_info(const rgw_bucket& bucket, + RGWBucketInfo& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::RemoveParams& params = {}); + + /* + * bucket_id may or may not be provided + * + * ep_objv_tracker might not be populated even if provided. Will only be set if entrypoint is read + * (that is: if bucket_id is empty). + */ + int read_bucket_info(const rgw_bucket& bucket, + RGWBucketInfo *info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::GetParams& params = {}, + RGWObjVersionTracker *ep_objv_tracker = nullptr); + + + int set_bucket_instance_attrs(RGWBucketInfo& bucket_info, + std::map& attrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp); + + /* user/bucket */ + int link_bucket(const rgw_user& user_id, + const rgw_bucket& bucket, + ceph::real_time creation_time, + optional_yield y, + const DoutPrefixProvider *dpp, + bool update_entrypoint = true, + rgw_ep_info *pinfo = nullptr); + + int unlink_bucket(const rgw_user& user_id, + const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp, + bool update_entrypoint = true); + + int read_buckets_stats(std::map& m, + optional_yield y, + const DoutPrefixProvider *dpp); + + int read_bucket_stats(const rgw_bucket& bucket, + RGWBucketEnt *result, + optional_yield y, + const DoutPrefixProvider *dpp); + + /* quota related */ + int sync_user_stats(const DoutPrefixProvider *dpp, + const rgw_user& user_id, const RGWBucketInfo& bucket_info, + optional_yield y, + RGWBucketEnt* pent); + + /* bucket sync */ + int get_sync_policy_handler(std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *phandler, + optional_yield y, + const DoutPrefixProvider *dpp); + int bucket_exports_data(const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp); + int 
bucket_imports_data(const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp); + +private: + int convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx, + const rgw_bucket& bucket, + optional_yield y, + const DoutPrefixProvider *dpp); + + int do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const rgw_bucket& bucket, + RGWBucketInfo& info, + optional_yield y, + const DoutPrefixProvider *dpp, + const BucketInstance::PutParams& params); + + int do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx, + RGWBucketInfo& info, + RGWBucketInfo *orig_info, + bool exclusive, real_time mtime, + obj_version *pep_objv, + std::map *pattrs, + bool create_entry_point, + optional_yield, + const DoutPrefixProvider *dpp); + + int do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx, + const rgw_user& user, + const rgw_bucket& bucket, + ceph::real_time creation_time, + bool update_entrypoint, + rgw_ep_info *pinfo, + optional_yield y, + const DoutPrefixProvider *dpp); + + int do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx, + const rgw_user& user_id, + const rgw_bucket& bucket, + bool update_entrypoint, + optional_yield y, + const DoutPrefixProvider *dpp); + +}; + +bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver, const std::string& marker, + const std::string& bucket_id, rgw_bucket* bucket_out); diff --git a/src/rgw/driver/rados/rgw_bucket_sync.cc b/src/rgw/driver/rados/rgw_bucket_sync.cc new file mode 100644 index 000000000..6ff76c16a --- /dev/null +++ b/src/rgw/driver/rados/rgw_bucket_sync.cc @@ -0,0 +1,1018 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_common.h" +#include "rgw_bucket_sync.h" +#include "rgw_data_sync.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" +#include "services/svc_bucket_sync.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +ostream& operator<<(ostream& os, const rgw_sync_bucket_entity& e) { + os 
<< "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zone.value_or(rgw_zone_id()) << ",az=" << (int)e.all_zones << "}"; + return os; +} + +ostream& operator<<(ostream& os, const rgw_sync_bucket_pipe& pipe) { + os << "{s=" << pipe.source << ",d=" << pipe.dest << "}"; + return os; +} + +ostream& operator<<(ostream& os, const rgw_sync_bucket_entities& e) { + os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zones.value_or(std::set()) << "}"; + return os; +} + +ostream& operator<<(ostream& os, const rgw_sync_bucket_pipes& pipe) { + os << "{id=" << pipe.id << ",s=" << pipe.source << ",d=" << pipe.dest << "}"; + return os; +} + +static std::vector filter_relevant_pipes(const std::vector& pipes, + const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone) +{ + std::vector relevant_pipes; + for (auto& p : pipes) { + if (p.source.match_zone(source_zone) && + p.dest.match_zone(dest_zone)) { + for (auto pipe : p.expand()) { + pipe.source.apply_zone(source_zone); + pipe.dest.apply_zone(dest_zone); + relevant_pipes.push_back(pipe); + } + } + } + + return relevant_pipes; +} + +static bool is_wildcard_bucket(const rgw_bucket& bucket) +{ + return bucket.name.empty(); +} + +void rgw_sync_group_pipe_map::dump(ceph::Formatter *f) const +{ + encode_json("zone", zone.id, f); + encode_json("buckets", rgw_sync_bucket_entities::bucket_key(bucket), f); + encode_json("sources", sources, f); + encode_json("dests", dests, f); +} + + +template +void rgw_sync_group_pipe_map::try_add_to_pipe_map(const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone, + const std::vector& pipes, + zb_pipe_map_t *pipe_map, + CB1 filter_cb, + CB2 call_filter_cb) +{ + if (!filter_cb(source_zone, nullopt, dest_zone, nullopt)) { + return; + } + auto relevant_pipes = filter_relevant_pipes(pipes, source_zone, dest_zone); + + for (auto& pipe : relevant_pipes) { + rgw_sync_bucket_entity zb; + if (!call_filter_cb(pipe, &zb)) { + continue; + } + 
pipe_map->insert(make_pair(zb, pipe)); + } +} + +template +void rgw_sync_group_pipe_map::try_add_source(const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone, + const std::vector& pipes, + CB filter_cb) +{ + return try_add_to_pipe_map(source_zone, dest_zone, pipes, + &sources, + filter_cb, + [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) { + *zb = rgw_sync_bucket_entity{source_zone, pipe.source.get_bucket()}; + return filter_cb(source_zone, zb->bucket, dest_zone, pipe.dest.get_bucket()); + }); +} + +template +void rgw_sync_group_pipe_map::try_add_dest(const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone, + const std::vector& pipes, + CB filter_cb) +{ + return try_add_to_pipe_map(source_zone, dest_zone, pipes, + &dests, + filter_cb, + [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) { + *zb = rgw_sync_bucket_entity{dest_zone, pipe.dest.get_bucket()}; + return filter_cb(source_zone, pipe.source.get_bucket(), dest_zone, zb->bucket); + }); +} + +using zb_pipe_map_t = rgw_sync_group_pipe_map::zb_pipe_map_t; + +pair rgw_sync_group_pipe_map::find_pipes(const zb_pipe_map_t& m, + const rgw_zone_id& zone, + std::optional b) const +{ + if (!b) { + return m.equal_range(rgw_sync_bucket_entity{zone, rgw_bucket()}); + } + + auto zb = rgw_sync_bucket_entity{zone, *b}; + + auto range = m.equal_range(zb); + if (range.first == range.second && + !is_wildcard_bucket(*b)) { + /* couldn't find the specific bucket, try to find by wildcard */ + zb.bucket = rgw_bucket(); + range = m.equal_range(zb); + } + + return range; +} + + +template +void rgw_sync_group_pipe_map::init(const DoutPrefixProvider *dpp, + CephContext *cct, + const rgw_zone_id& _zone, + std::optional _bucket, + const rgw_sync_policy_group& group, + rgw_sync_data_flow_group *_default_flow, + std::set *_pall_zones, + CB filter_cb) { + zone = _zone; + bucket = _bucket; + default_flow = _default_flow; + pall_zones = _pall_zones; + + rgw_sync_bucket_entity zb(zone, bucket); + 
+ status = group.status; + + std::vector zone_pipes; + + string bucket_key = (bucket ? bucket->get_key() : "*"); + + /* only look at pipes that touch the specific zone and bucket */ + for (auto& pipe : group.pipes) { + if (pipe.contains_zone_bucket(zone, bucket)) { + ldpp_dout(dpp, 20) << __func__ << "(): pipe_map (zone=" << zone << " bucket=" << bucket_key << "): adding potential pipe: " << pipe << dendl; + zone_pipes.push_back(pipe); + } + } + + const rgw_sync_data_flow_group *pflow; + + if (!group.data_flow.empty()) { + pflow = &group.data_flow; + } else { + if (!default_flow) { + return; + } + pflow = default_flow; + } + + auto& flow = *pflow; + + pall_zones->insert(zone); + + /* symmetrical */ + for (auto& symmetrical_group : flow.symmetrical) { + if (symmetrical_group.zones.find(zone) != symmetrical_group.zones.end()) { + for (auto& z : symmetrical_group.zones) { + if (z != zone) { + pall_zones->insert(z); + try_add_source(z, zone, zone_pipes, filter_cb); + try_add_dest(zone, z, zone_pipes, filter_cb); + } + } + } + } + + /* directional */ + for (auto& rule : flow.directional) { + if (rule.source_zone == zone) { + pall_zones->insert(rule.dest_zone); + try_add_dest(zone, rule.dest_zone, zone_pipes, filter_cb); + } else if (rule.dest_zone == zone) { + pall_zones->insert(rule.source_zone); + try_add_source(rule.source_zone, zone, zone_pipes, filter_cb); + } + } +} + +/* + * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket} + */ +vector rgw_sync_group_pipe_map::find_source_pipes(const rgw_zone_id& source_zone, + std::optional source_bucket, + std::optional dest_bucket) const { + vector result; + + auto range = find_pipes(sources, source_zone, source_bucket); + + for (auto iter = range.first; iter != range.second; ++iter) { + auto pipe = iter->second; + if (pipe.dest.match_bucket(dest_bucket)) { + result.push_back(pipe); + } + } + return result; +} + +/* + * find all relevant pipes in other zones that pull from a specific 
+ * source bucket in out zone {source_bucket} -> {dest_zone, dest_bucket} + */ +vector rgw_sync_group_pipe_map::find_dest_pipes(std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket) const { + vector result; + + auto range = find_pipes(dests, dest_zone, dest_bucket); + + for (auto iter = range.first; iter != range.second; ++iter) { + auto pipe = iter->second; + if (pipe.source.match_bucket(source_bucket)) { + result.push_back(pipe); + } + } + + return result; +} + +/* + * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket} + */ +vector rgw_sync_group_pipe_map::find_pipes(const rgw_zone_id& source_zone, + std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket) const { + if (dest_zone == zone) { + return find_source_pipes(source_zone, source_bucket, dest_bucket); + } + + if (source_zone == zone) { + return find_dest_pipes(source_bucket, dest_zone, dest_bucket); + } + + return vector(); +} + +void RGWBucketSyncFlowManager::pipe_rules::insert(const rgw_sync_bucket_pipe& pipe) +{ + pipes.push_back(pipe); + + auto ppipe = &pipes.back(); + auto prefix = ppipe->params.source.filter.prefix.value_or(string()); + + prefix_refs.insert(make_pair(prefix, ppipe)); + + for (auto& t : ppipe->params.source.filter.tags) { + string tag = t.key + "=" + t.value; + auto titer = tag_refs.find(tag); + if (titer != tag_refs.end() && + ppipe->params.priority > titer->second->params.priority) { + titer->second = ppipe; + } else { + tag_refs[tag] = ppipe; + } + } +} + +bool RGWBucketSyncFlowManager::pipe_rules::find_basic_info_without_tags(const rgw_obj_key& key, + std::optional *user, + std::optional *acl_translation_owner, + std::optional *storage_class, + rgw_sync_pipe_params::Mode *mode, + bool *need_more_info) const +{ + std::optional owner; + + *need_more_info = false; + + if (prefix_refs.empty()) { + return false; + } + + auto end = prefix_refs.upper_bound(key.name); + auto iter = end; 
+ if (iter != prefix_refs.begin()) { + --iter; + } + if (iter == prefix_refs.end()) { + return false; + } + + if (iter != prefix_refs.begin()) { + iter = prefix_refs.find(iter->first); /* prefix_refs is multimap, find first element + holding that key */ + } + + std::vector iters; + + std::optional priority; + + for (; iter != end; ++iter) { + auto& prefix = iter->first; + if (!boost::starts_with(key.name, prefix)) { + continue; + } + + auto& rule_params = iter->second->params; + auto& filter = rule_params.source.filter; + + if (rule_params.priority > priority) { + priority = rule_params.priority; + + if (!filter.has_tags()) { + iters.clear(); + } + iters.push_back(iter); + + *need_more_info = filter.has_tags(); /* if highest priority filter has tags, then + we can't be sure if it would be used. + We need to first read the info from the source object */ + } + } + + if (iters.empty()) { + return false; + } + + std::optional _user; + std::optional _acl_translation; + std::optional _storage_class; + rgw_sync_pipe_params::Mode _mode{rgw_sync_pipe_params::Mode::MODE_SYSTEM}; + + // make sure all params are the same by saving the first one + // encountered and comparing all subsequent to it + bool first_iter = true; + for (auto& iter : iters) { + const rgw_sync_pipe_params& rule_params = iter->second->params; + if (first_iter) { + _user = rule_params.user; + _acl_translation = rule_params.dest.acl_translation; + _storage_class = rule_params.dest.storage_class; + _mode = rule_params.mode; + first_iter = false; + } else { + // note: three of these == operators are comparing std::optional + // against std::optional; as one would expect they are equal a) + // if both do not contain values or b) if both do and those + // contained values are the same + const bool conflict = + !(_user == rule_params.user && + _acl_translation == rule_params.dest.acl_translation && + _storage_class == rule_params.dest.storage_class && + _mode == rule_params.mode); + if (conflict) { + 
*need_more_info = true; + return false; + } + } + } + + *user = _user; + if (_acl_translation) { + *acl_translation_owner = _acl_translation->owner; + } + *storage_class = _storage_class; + *mode = _mode; + + return true; +} + +bool RGWBucketSyncFlowManager::pipe_rules::find_obj_params(const rgw_obj_key& key, + const RGWObjTags::tag_map_t& tags, + rgw_sync_pipe_params *params) const +{ + if (prefix_refs.empty()) { + return false; + } + + auto iter = prefix_refs.upper_bound(key.name); + if (iter != prefix_refs.begin()) { + --iter; + } + if (iter == prefix_refs.end()) { + return false; + } + + auto end = prefix_refs.upper_bound(key.name); + auto max = end; + + std::optional priority; + + for (; iter != end; ++iter) { + /* NOTE: this is not the most efficient way to do it, + * a trie data structure would be better + */ + auto& prefix = iter->first; + if (!boost::starts_with(key.name, prefix)) { + continue; + } + + auto& rule_params = iter->second->params; + auto& filter = rule_params.source.filter; + + if (!filter.check_tags(tags)) { + continue; + } + + if (rule_params.priority > priority) { + priority = rule_params.priority; + max = iter; + } + } + + if (max == end) { + return false; + } + + *params = max->second->params; + return true; +} + +/* + * return either the current prefix for s, or the next one if s is not within a prefix + */ + +RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator RGWBucketSyncFlowManager::pipe_rules::prefix_search(const std::string& s) const +{ + if (prefix_refs.empty()) { + return prefix_refs.end(); + } + auto next = prefix_refs.upper_bound(s); + auto iter = next; + if (iter != prefix_refs.begin()) { + --iter; + } + if (!boost::starts_with(s, iter->first)) { + return next; + } + + return iter; +} + +void RGWBucketSyncFlowManager::pipe_set::insert(const rgw_sync_bucket_pipe& pipe) { + /* Ensure this pipe doesn't match with any disabled pipes */ + for (auto p: disabled_pipe_map) { + if (p.second.source.match(pipe.source) && 
p.second.dest.match(pipe.dest)) { + return; + } + } + pipe_map.insert(make_pair(pipe.id, pipe)); + + auto& rules_ref = rules[endpoints_pair(pipe)]; + + if (!rules_ref) { + rules_ref = make_shared(); + } + + rules_ref->insert(pipe); + + pipe_handler h(rules_ref, pipe); + + handlers.insert(h); +} + +void RGWBucketSyncFlowManager::pipe_set::remove_all() { + pipe_map.clear(); + disabled_pipe_map.clear(); + rules.clear(); + handlers.clear(); +} + +void RGWBucketSyncFlowManager::pipe_set::disable(const rgw_sync_bucket_pipe& pipe) { + /* This pipe is disabled. Add it to disabled pipes & remove any + * matching pipes already inserted + */ + disabled_pipe_map.insert(make_pair(pipe.id, pipe)); + for (auto iter_p = pipe_map.begin(); iter_p != pipe_map.end(); ) { + auto p = iter_p++; + if (p->second.source.match(pipe.source) && p->second.dest.match(pipe.dest)) { + auto& rules_ref = rules[endpoints_pair(p->second)]; + if (rules_ref) { + pipe_handler h(rules_ref, p->second); + handlers.erase(h); + } + rules.erase(endpoints_pair(p->second)); + pipe_map.erase(p); + } + } +} + +void RGWBucketSyncFlowManager::pipe_set::dump(ceph::Formatter *f) const +{ + encode_json("pipes", pipe_map, f); +} + +bool RGWBucketSyncFlowManager::allowed_data_flow(const rgw_zone_id& source_zone, + std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket, + bool check_activated) const +{ + bool found = false; + bool found_activated = false; + + for (auto m : flow_groups) { + auto& fm = m.second; + auto pipes = fm.find_pipes(source_zone, source_bucket, + dest_zone, dest_bucket); + + bool is_found = !pipes.empty(); + + if (is_found) { + switch (fm.status) { + case rgw_sync_policy_group::Status::FORBIDDEN: + return false; + case rgw_sync_policy_group::Status::ENABLED: + found = true; + found_activated = true; + break; + case rgw_sync_policy_group::Status::ALLOWED: + found = true; + break; + default: + break; /* unknown -- ignore */ + } + } + } + + if (check_activated && 
found_activated) { + return true; + } + + return found; +} + +void RGWBucketSyncFlowManager::init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy) { + std::optional default_flow; + if (parent) { + default_flow.emplace(); + default_flow->init_default(parent->all_zones); + } + + for (auto& item : sync_policy.groups) { + auto& group = item.second; + auto& flow_group_map = flow_groups[group.id]; + + flow_group_map.init(dpp, cct, zone_id, bucket, group, + (default_flow ? &(*default_flow) : nullptr), + &all_zones, + [&](const rgw_zone_id& source_zone, + std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket) { + if (!parent) { + return true; + } + return parent->allowed_data_flow(source_zone, + source_bucket, + dest_zone, + dest_bucket, + false); /* just check that it's not disabled */ + }); + } +} + +/* +* These are the semantics to be followed while resolving the policy +* conflicts - +* +* ================================================== +* zonegroup bucket Result +* ================================================== +* enabled enabled enabled +* allowed enabled +* forbidden disabled +* allowed enabled enabled +* allowed disabled +* forbidden disabled +* forbidden enabled disabled +* allowed disabled +* forbidden disabled +* +* In case multiple group policies are set to reflect for any sync pair +* (, ), the following +* rules are applied in the order- +* 1) Even if one policy status is FORBIDDEN, the sync will be disabled +* 2) Atleast one policy should be ENABLED for the sync to be allowed. 
+* +*/ +void RGWBucketSyncFlowManager::reflect(const DoutPrefixProvider *dpp, + std::optional effective_bucket, + RGWBucketSyncFlowManager::pipe_set *source_pipes, + RGWBucketSyncFlowManager::pipe_set *dest_pipes, + bool only_enabled) const + +{ + string effective_bucket_key; + bool is_forbidden = false; + if (effective_bucket) { + effective_bucket_key = effective_bucket->get_key(); + } + if (parent) { + parent->reflect(dpp, effective_bucket, source_pipes, dest_pipes, only_enabled); + } + + for (auto& item : flow_groups) { + auto& flow_group_map = item.second; + is_forbidden = false; + + if (flow_group_map.status == rgw_sync_policy_group::Status::FORBIDDEN) { + /* FORBIDDEN takes precedence over all the other rules. + * Remove any other pipes which may allow access. + */ + is_forbidden = true; + } else if (flow_group_map.status != rgw_sync_policy_group::Status::ENABLED && + (only_enabled || flow_group_map.status != rgw_sync_policy_group::Status::ALLOWED)) { + /* only return enabled groups */ + continue; + } + + for (auto& entry : flow_group_map.sources) { + rgw_sync_bucket_pipe pipe = entry.second; + if (!pipe.dest.match_bucket(effective_bucket)) { + continue; + } + + pipe.source.apply_bucket(effective_bucket); + pipe.dest.apply_bucket(effective_bucket); + + if (is_forbidden) { + ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): removing source pipe: " << pipe << dendl; + source_pipes->disable(pipe); + } else { + ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding source pipe: " << pipe << dendl; + source_pipes->insert(pipe); + } + } + + for (auto& entry : flow_group_map.dests) { + rgw_sync_bucket_pipe pipe = entry.second; + + if (!pipe.source.match_bucket(effective_bucket)) { + continue; + } + + pipe.source.apply_bucket(effective_bucket); + pipe.dest.apply_bucket(effective_bucket); + + if (is_forbidden) { + ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << 
effective_bucket_key << "): removing dest pipe: " << pipe << dendl; + dest_pipes->disable(pipe); + } else { + ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding dest pipe: " << pipe << dendl; + dest_pipes->insert(pipe); + } + } + } +} + + +RGWBucketSyncFlowManager::RGWBucketSyncFlowManager(CephContext *_cct, + const rgw_zone_id& _zone_id, + std::optional _bucket, + const RGWBucketSyncFlowManager *_parent) : cct(_cct), + zone_id(_zone_id), + bucket(_bucket), + parent(_parent) {} + + +void RGWSyncPolicyCompat::convert_old_sync_config(RGWSI_Zone *zone_svc, + RGWSI_SyncModules *sync_modules_svc, + rgw_sync_policy_info *ppolicy) +{ + bool found = false; + + rgw_sync_policy_info policy; + + auto& group = policy.groups["default"]; + auto& zonegroup = zone_svc->get_zonegroup(); + + for (const auto& ziter1 : zonegroup.zones) { + auto& id1 = ziter1.first; + const RGWZone& z1 = ziter1.second; + + for (const auto& ziter2 : zonegroup.zones) { + auto& id2 = ziter2.first; + const RGWZone& z2 = ziter2.second; + + if (id1 == id2) { + continue; + } + + if (z1.syncs_from(z2.name)) { + found = true; + rgw_sync_directional_rule *rule; + group.data_flow.find_or_create_directional(id2, + id1, + &rule); + } + } + } + + if (!found) { /* nothing syncs */ + return; + } + + rgw_sync_bucket_pipes pipes; + pipes.id = "all"; + pipes.source.all_zones = true; + pipes.dest.all_zones = true; + + group.pipes.emplace_back(std::move(pipes)); + + + group.status = rgw_sync_policy_group::Status::ENABLED; + + *ppolicy = std::move(policy); +} + +RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc, + RGWSI_SyncModules *sync_modules_svc, + RGWSI_Bucket_Sync *_bucket_sync_svc, + std::optional effective_zone) : zone_svc(_zone_svc) , + bucket_sync_svc(_bucket_sync_svc) { + zone_id = effective_zone.value_or(zone_svc->zone_id()); + flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(), + zone_id, + nullopt, + nullptr)); + sync_policy = 
zone_svc->get_zonegroup().sync_policy; + + if (sync_policy.empty()) { + RGWSyncPolicyCompat::convert_old_sync_config(zone_svc, sync_modules_svc, &sync_policy); + legacy_config = true; + } +} + +RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent, + const RGWBucketInfo& _bucket_info, + map&& _bucket_attrs) : parent(_parent), + bucket_info(_bucket_info), + bucket_attrs(std::move(_bucket_attrs)) { + if (_bucket_info.sync_policy) { + sync_policy = *_bucket_info.sync_policy; + + for (auto& entry : sync_policy.groups) { + for (auto& pipe : entry.second.pipes) { + if (pipe.params.mode == rgw_sync_pipe_params::MODE_USER && + pipe.params.user.empty()) { + pipe.params.user = _bucket_info.owner; + } + } + } + } + legacy_config = parent->legacy_config; + bucket = _bucket_info.bucket; + zone_svc = parent->zone_svc; + bucket_sync_svc = parent->bucket_sync_svc; + flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(), + parent->zone_id, + _bucket_info.bucket, + parent->flow_mgr.get())); +} + +RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent, + const rgw_bucket& _bucket, + std::optional _sync_policy) : parent(_parent) { + if (_sync_policy) { + sync_policy = *_sync_policy; + } + legacy_config = parent->legacy_config; + bucket = _bucket; + zone_svc = parent->zone_svc; + bucket_sync_svc = parent->bucket_sync_svc; + flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(), + parent->zone_id, + _bucket, + parent->flow_mgr.get())); +} + +RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const RGWBucketInfo& bucket_info, + map&& bucket_attrs) const +{ + return new RGWBucketSyncPolicyHandler(this, bucket_info, std::move(bucket_attrs)); +} + +RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const rgw_bucket& bucket, + std::optional sync_policy) const +{ + return new RGWBucketSyncPolicyHandler(this, bucket, sync_policy); +} + +int 
RGWBucketSyncPolicyHandler::init(const DoutPrefixProvider *dpp, optional_yield y) +{ + int r = bucket_sync_svc->get_bucket_sync_hints(dpp, bucket.value_or(rgw_bucket()), + &source_hints, + &target_hints, + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize bucket sync policy handler: get_bucket_sync_hints() on bucket=" + << bucket << " returned r=" << r << dendl; + return r; + } + + flow_mgr->init(dpp, sync_policy); + + reflect(dpp, &source_pipes, + &target_pipes, + &sources, + &targets, + &source_zones, + &target_zones, + true); + + return 0; +} + +void RGWBucketSyncPolicyHandler::reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes, + RGWBucketSyncFlowManager::pipe_set *ptarget_pipes, + map *psources, + map *ptargets, + std::set *psource_zones, + std::set *ptarget_zones, + bool only_enabled) const +{ + RGWBucketSyncFlowManager::pipe_set _source_pipes; + RGWBucketSyncFlowManager::pipe_set _target_pipes; + map _sources; + map _targets; + std::set _source_zones; + std::set _target_zones; + + flow_mgr->reflect(dpp, bucket, &_source_pipes, &_target_pipes, only_enabled); + + for (auto& entry : _source_pipes.pipe_map) { + auto& pipe = entry.second; + if (!pipe.source.zone) { + continue; + } + _source_zones.insert(*pipe.source.zone); + _sources[*pipe.source.zone].insert(pipe); + } + + for (auto& entry : _target_pipes.pipe_map) { + auto& pipe = entry.second; + if (!pipe.dest.zone) { + continue; + } + _target_zones.insert(*pipe.dest.zone); + _targets[*pipe.dest.zone].insert(pipe); + } + + if (psource_pipes) { + *psource_pipes = std::move(_source_pipes); + } + if (ptarget_pipes) { + *ptarget_pipes = std::move(_target_pipes); + } + if (psources) { + *psources = std::move(_sources); + } + if (ptargets) { + *ptargets = std::move(_targets); + } + if (psource_zones) { + *psource_zones = std::move(_source_zones); + } + if (ptarget_zones) { + *ptarget_zones = std::move(_target_zones); + } +} + +multimap 
RGWBucketSyncPolicyHandler::get_all_sources() const +{ + multimap m; + + for (auto& source_entry : sources) { + auto& zone_id = source_entry.first; + + auto& pipes = source_entry.second.pipe_map; + + for (auto& entry : pipes) { + auto& pipe = entry.second; + m.insert(make_pair(zone_id, pipe)); + } + } + + for (auto& pipe : resolved_sources) { + if (!pipe.source.zone) { + continue; + } + + m.insert(make_pair(*pipe.source.zone, pipe)); + } + + return m; +} + +multimap RGWBucketSyncPolicyHandler::get_all_dests() const +{ + multimap m; + + for (auto& dest_entry : targets) { + auto& zone_id = dest_entry.first; + + auto& pipes = dest_entry.second.pipe_map; + + for (auto& entry : pipes) { + auto& pipe = entry.second; + m.insert(make_pair(zone_id, pipe)); + } + } + + for (auto& pipe : resolved_dests) { + if (!pipe.dest.zone) { + continue; + } + + m.insert(make_pair(*pipe.dest.zone, pipe)); + } + + return m; +} + +multimap RGWBucketSyncPolicyHandler::get_all_dests_in_zone(const rgw_zone_id& zone_id) const +{ + multimap m; + + auto iter = targets.find(zone_id); + if (iter != targets.end()) { + auto& pipes = iter->second.pipe_map; + + for (auto& entry : pipes) { + auto& pipe = entry.second; + m.insert(make_pair(zone_id, pipe)); + } + } + + for (auto& pipe : resolved_dests) { + if (!pipe.dest.zone || + *pipe.dest.zone != zone_id) { + continue; + } + + m.insert(make_pair(*pipe.dest.zone, pipe)); + } + + return m; +} + +void RGWBucketSyncPolicyHandler::get_pipes(std::set *_sources, std::set *_targets, + std::optional filter_peer) { /* return raw pipes */ + for (auto& entry : source_pipes.pipe_map) { + auto& source_pipe = entry.second; + if (!filter_peer || + source_pipe.source.match(*filter_peer)) { + _sources->insert(source_pipe); + } + } + + for (auto& entry : target_pipes.pipe_map) { + auto& target_pipe = entry.second; + if (!filter_peer || + target_pipe.dest.match(*filter_peer)) { + _targets->insert(target_pipe); + } + } +} + +bool 
RGWBucketSyncPolicyHandler::bucket_exports_data() const +{ + if (!bucket) { + return false; + } + + if (!zone_svc->sync_module_exports_data()) { + return false; + } + + if (bucket_is_sync_source()) { + return true; + } + + return (zone_svc->need_to_log_data() && + bucket_info && bucket_info->datasync_flag_enabled()); /* bucket_info is unset when the handler was built from a bare rgw_bucket */ +} + +bool RGWBucketSyncPolicyHandler::bucket_imports_data() const +{ + return bucket_is_sync_target(); +} + diff --git a/src/rgw/driver/rados/rgw_bucket_sync.h b/src/rgw/driver/rados/rgw_bucket_sync.h new file mode 100644 index 000000000..d425ecf17 --- /dev/null +++ b/src/rgw/driver/rados/rgw_bucket_sync.h @@ -0,0 +1,416 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_common.h" +#include "rgw_sync_policy.h" + +class RGWSI_Zone; +class RGWSI_SyncModules; +class RGWSI_Bucket_Sync; + +struct rgw_sync_group_pipe_map; +struct rgw_sync_bucket_pipes; +struct rgw_sync_policy_info; + +struct rgw_sync_group_pipe_map { + rgw_zone_id zone; + std::optional bucket; + + rgw_sync_policy_group::Status status{rgw_sync_policy_group::Status::UNKNOWN}; + + using zb_pipe_map_t = std::multimap; + + zb_pipe_map_t sources; /* all the pipes where zone is pulling from */ + zb_pipe_map_t dests; /* all the pipes that pull from zone */ + + std::set *pall_zones{nullptr}; + rgw_sync_data_flow_group *default_flow{nullptr}; /* flow to use if policy doesn't define it, + used in the case of bucket sync policy, not at the + zonegroup level */ + + void dump(ceph::Formatter *f) const; + + template + void try_add_to_pipe_map(const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone, + const std::vector& pipes, + zb_pipe_map_t *pipe_map, + CB1 filter_cb, + CB2 call_filter_cb); + + template + void try_add_source(const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone, + const std::vector& pipes, + CB filter_cb); + + template + void try_add_dest(const rgw_zone_id& source_zone, + const rgw_zone_id& dest_zone, + const std::vector& pipes, + CB filter_cb); + + std::pair find_pipes(const zb_pipe_map_t& m, + const rgw_zone_id& zone, + std::optional b) const; + + template + void init(const DoutPrefixProvider *dpp, CephContext *cct, + const rgw_zone_id& _zone, + std::optional _bucket, + const rgw_sync_policy_group& group, + rgw_sync_data_flow_group *_default_flow, + std::set *_pall_zones, + CB filter_cb); + + /* + * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket} + */ + std::vector find_source_pipes(const rgw_zone_id& source_zone, + std::optional source_bucket, + std::optional dest_bucket) const; + + /* + * find all relevant pipes in other zones that pull from a specific + * source 
bucket in our zone {source_bucket} -> {dest_zone, dest_bucket} + */ + std::vector find_dest_pipes(std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket) const; + + /* + * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket} + */ + std::vector find_pipes(const rgw_zone_id& source_zone, + std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket) const; +}; + +class RGWSyncPolicyCompat { +public: + static void convert_old_sync_config(RGWSI_Zone *zone_svc, + RGWSI_SyncModules *sync_modules_svc, + rgw_sync_policy_info *ppolicy); +}; + +class RGWBucketSyncFlowManager { + friend class RGWBucketSyncPolicyHandler; +public: + struct endpoints_pair { + rgw_sync_bucket_entity source; + rgw_sync_bucket_entity dest; + + endpoints_pair() {} + endpoints_pair(const rgw_sync_bucket_pipe& pipe) { + source = pipe.source; + dest = pipe.dest; + } + + bool operator<(const endpoints_pair& e) const { + if (source < e.source) { + return true; + } + if (e.source < source) { + return false; + } + return (dest < e.dest); + } + }; + + /* + * pipe_rules: deal with a set of pipes that have common endpoints_pair + */ + class pipe_rules { + std::list pipes; + + public: + using prefix_map_t = std::multimap; + + std::map tag_refs; + prefix_map_t prefix_refs; + + void insert(const rgw_sync_bucket_pipe& pipe); + + bool find_basic_info_without_tags(const rgw_obj_key& key, + std::optional *user, + std::optional *acl_translation, + std::optional *storage_class, + rgw_sync_pipe_params::Mode *mode, + bool *need_more_info) const; + bool find_obj_params(const rgw_obj_key& key, + const RGWObjTags::tag_map_t& tags, + rgw_sync_pipe_params *params) const; + + void scan_prefixes(std::vector *prefixes) const; + + prefix_map_t::const_iterator prefix_begin() const { + return prefix_refs.begin(); + } + prefix_map_t::const_iterator prefix_search(const std::string& s) const; + prefix_map_t::const_iterator prefix_end() 
const { + return prefix_refs.end(); + } + }; + + using pipe_rules_ref = std::shared_ptr; + + /* + * pipe_handler: extends endpoints_pair to point at the corresponding rules handler + */ + struct pipe_handler : public endpoints_pair { + pipe_rules_ref rules; + + pipe_handler() {} + pipe_handler(pipe_rules_ref& _rules, + const rgw_sync_bucket_pipe& _pipe) : endpoints_pair(_pipe), + rules(_rules) {} + bool specific() const { + return source.specific() && dest.specific(); + } + + bool find_basic_info_without_tags(const rgw_obj_key& key, + std::optional *user, + std::optional *acl_translation, + std::optional *storage_class, + rgw_sync_pipe_params::Mode *mode, + bool *need_more_info) const { + if (!rules) { + return false; + } + return rules->find_basic_info_without_tags(key, user, acl_translation, storage_class, mode, need_more_info); + } + + bool find_obj_params(const rgw_obj_key& key, + const RGWObjTags::tag_map_t& tags, + rgw_sync_pipe_params *params) const { + if (!rules) { + return false; + } + return rules->find_obj_params(key, tags, params); + } + }; + + struct pipe_set { + std::map rules; + std::multimap pipe_map; + std::multimap disabled_pipe_map; + + std::set handlers; + + using iterator = std::set::iterator; + + void clear() { + rules.clear(); + pipe_map.clear(); + disabled_pipe_map.clear(); + handlers.clear(); + } + + void insert(const rgw_sync_bucket_pipe& pipe); + void remove_all(); + void disable(const rgw_sync_bucket_pipe& pipe); + + iterator begin() const { + return handlers.begin(); + } + + iterator end() const { + return handlers.end(); + } + + void dump(ceph::Formatter *f) const; + }; + +private: + + CephContext *cct; + + rgw_zone_id zone_id; + std::optional bucket; + + const RGWBucketSyncFlowManager *parent{nullptr}; + + std::map flow_groups; + + std::set all_zones; + + bool allowed_data_flow(const rgw_zone_id& source_zone, + std::optional source_bucket, + const rgw_zone_id& dest_zone, + std::optional dest_bucket, + bool check_activated) const; + + 
/* + * find all the matching flows on a flow map for a specific bucket + */ + void update_flow_maps(const rgw_sync_bucket_pipes& pipe); + + void init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy); + +public: + + RGWBucketSyncFlowManager(CephContext *_cct, + const rgw_zone_id& _zone_id, + std::optional _bucket, + const RGWBucketSyncFlowManager *_parent); + + void reflect(const DoutPrefixProvider *dpp, std::optional effective_bucket, + pipe_set *flow_by_source, + pipe_set *flow_by_dest, + bool only_enabled) const; + +}; + +static inline std::ostream& operator<<(std::ostream& os, const RGWBucketSyncFlowManager::endpoints_pair& e) { + os << e.source << " -> " << e.dest; /* printed in data-flow direction: source -> dest */ + return os; +} + +class RGWBucketSyncPolicyHandler { + bool legacy_config{false}; + const RGWBucketSyncPolicyHandler *parent{nullptr}; + RGWSI_Zone *zone_svc; + RGWSI_Bucket_Sync *bucket_sync_svc; + rgw_zone_id zone_id; + std::optional bucket_info; + std::optional > bucket_attrs; + std::optional bucket; + std::unique_ptr flow_mgr; + rgw_sync_policy_info sync_policy; + + RGWBucketSyncFlowManager::pipe_set source_pipes; + RGWBucketSyncFlowManager::pipe_set target_pipes; + + std::map sources; /* source pipes by source zone id */ + std::map targets; /* target pipes by target zone id */ + + std::set source_zones; + std::set target_zones; + + std::set source_hints; + std::set target_hints; + std::set resolved_sources; + std::set resolved_dests; + + + bool bucket_is_sync_source() const { + return !targets.empty() || !resolved_dests.empty(); + } + + bool bucket_is_sync_target() const { + return !sources.empty() || !resolved_sources.empty(); + } + + RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent, + const RGWBucketInfo& _bucket_info, + std::map&& _bucket_attrs); + + RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent, + const rgw_bucket& _bucket, + std::optional _sync_policy); +public: + RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc, + 
RGWSI_SyncModules *sync_modules_svc, + RGWSI_Bucket_Sync *bucket_sync_svc, + std::optional effective_zone = std::nullopt); + + RGWBucketSyncPolicyHandler *alloc_child(const RGWBucketInfo& bucket_info, + std::map&& bucket_attrs) const; + RGWBucketSyncPolicyHandler *alloc_child(const rgw_bucket& bucket, + std::optional sync_policy) const; + + int init(const DoutPrefixProvider *dpp, optional_yield y); + + void reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes, + RGWBucketSyncFlowManager::pipe_set *ptarget_pipes, + std::map *psources, + std::map *ptargets, + std::set *psource_zones, + std::set *ptarget_zones, + bool only_enabled) const; + + void set_resolved_hints(std::set&& _resolved_sources, + std::set&& _resolved_dests) { + resolved_sources = std::move(_resolved_sources); + resolved_dests = std::move(_resolved_dests); + } + + const std::set& get_resolved_source_hints() { + return resolved_sources; + } + + const std::set& get_resolved_dest_hints() { + return resolved_dests; + } + + const std::set& get_source_zones() const { + return source_zones; + } + + const std::set& get_target_zones() const { + return target_zones; + } + + const std::map& get_sources() { + return sources; + } + + std::multimap get_all_sources() const; + std::multimap get_all_dests() const; + std::multimap get_all_dests_in_zone(const rgw_zone_id& zone_id) const; + + const std::map& get_targets() { + return targets; + } + + const std::optional& get_bucket_info() const { + return bucket_info; + } + + const std::optional >& get_bucket_attrs() const { + return bucket_attrs; + } + + void get_pipes(RGWBucketSyncFlowManager::pipe_set **_sources, RGWBucketSyncFlowManager::pipe_set **_targets) { /* return raw pipes (with zone name) */ + *_sources = &source_pipes; + *_targets = &target_pipes; + } + void get_pipes(std::set *sources, std::set *targets, + std::optional filter_peer); + + const std::set& get_source_hints() const { + return source_hints; + } + + const 
std::set& get_target_hints() const { + return target_hints; + } + + bool bucket_exports_data() const; + bool bucket_imports_data() const; + + const rgw_sync_policy_info& get_sync_policy() const { + return sync_policy; + } + + bool is_legacy_config() const { + return legacy_config; + } +}; + diff --git a/src/rgw/driver/rados/rgw_cr_rados.cc b/src/rgw/driver/rados/rgw_cr_rados.cc new file mode 100644 index 000000000..d8e0ecba6 --- /dev/null +++ b/src/rgw/driver/rados/rgw_cr_rados.cc @@ -0,0 +1,1165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/compat.h" +#include "rgw_sal.h" +#include "rgw_zone.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_sync_counters.h" +#include "rgw_bucket.h" +#include "rgw_datalog_notify.h" +#include "rgw_cr_rest.h" +#include "rgw_rest_conn.h" +#include "rgw_rados.h" + +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_sys_obj.h" +#include "services/svc_cls.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rgw/cls_rgw_client.h" + +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +bool RGWAsyncRadosProcessor::RGWWQ::_enqueue(RGWAsyncRadosRequest *req) { + if (processor->is_going_down()) { + return false; + } + req->get(); + processor->m_req_queue.push_back(req); + dout(20) << "enqueued request req=" << hex << req << dec << dendl; + _dump_queue(); + return true; +} + +bool RGWAsyncRadosProcessor::RGWWQ::_empty() { + return processor->m_req_queue.empty(); +} + +RGWAsyncRadosRequest *RGWAsyncRadosProcessor::RGWWQ::_dequeue() { + if (processor->m_req_queue.empty()) + return NULL; + RGWAsyncRadosRequest *req = processor->m_req_queue.front(); + processor->m_req_queue.pop_front(); + dout(20) << "dequeued request req=" << hex << req << dec << dendl; + _dump_queue(); + return req; +} + +void 
RGWAsyncRadosProcessor::RGWWQ::_process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) { + processor->handle_request(this, req); + processor->req_throttle.put(1); +} + +void RGWAsyncRadosProcessor::RGWWQ::_dump_queue() { + if (!g_conf()->subsys.should_gather()) { + return; + } + deque::iterator iter; + if (processor->m_req_queue.empty()) { + dout(20) << "RGWWQ: empty" << dendl; + return; + } + dout(20) << "RGWWQ:" << dendl; + for (iter = processor->m_req_queue.begin(); iter != processor->m_req_queue.end(); ++iter) { + dout(20) << "req: " << hex << *iter << dec << dendl; + } +} + +RGWAsyncRadosProcessor::RGWAsyncRadosProcessor(CephContext *_cct, int num_threads) + : cct(_cct), m_tp(cct, "RGWAsyncRadosProcessor::m_tp", "rados_async", num_threads), + req_throttle(_cct, "rgw_async_rados_ops", num_threads * 2), + req_wq(this, + ceph::make_timespan(g_conf()->rgw_op_thread_timeout), + ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout), + &m_tp) { +} + +void RGWAsyncRadosProcessor::start() { + m_tp.start(); +} + +void RGWAsyncRadosProcessor::stop() { + going_down = true; + m_tp.drain(&req_wq); + m_tp.stop(); + for (auto iter = m_req_queue.begin(); iter != m_req_queue.end(); ++iter) { + (*iter)->put(); + } +} + +void RGWAsyncRadosProcessor::handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req) { + req->send_request(dpp); + req->put(); +} + +void RGWAsyncRadosProcessor::queue(RGWAsyncRadosRequest *req) { + req_throttle.get(1); + req_wq.queue(req); +} + +int RGWAsyncGetSystemObj::_send_request(const DoutPrefixProvider *dpp) +{ + map *pattrs = want_attrs ? 
&attrs : nullptr; + + auto sysobj = svc_sysobj->get_obj(obj); + return sysobj.rop() + .set_objv_tracker(&objv_tracker) + .set_attrs(pattrs) + .set_raw_attrs(raw_attrs) + .read(dpp, &bl, null_yield); +} + +RGWAsyncGetSystemObj::RGWAsyncGetSystemObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + bool want_attrs, bool raw_attrs) + : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc_sysobj(_svc), + obj(_obj), want_attrs(want_attrs), raw_attrs(raw_attrs) +{ + if (_objv_tracker) { + objv_tracker = *_objv_tracker; + } +} + +int RGWSimpleRadosReadAttrsCR::send_request(const DoutPrefixProvider *dpp) +{ + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" + << r << dendl; + return r; + } + + set_status() << "sending request"; + + librados::ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + + if (raw_attrs && pattrs) { + op.getxattrs(pattrs, nullptr); + } else { + op.getxattrs(&unfiltered_attrs, nullptr); + } + + cn = stack->create_completion_notifier(); + return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op, + nullptr); +} + +int RGWSimpleRadosReadAttrsCR::request_complete() +{ + int ret = cn->completion()->get_return_value(); + set_status() << "request complete; ret=" << ret; + if (!raw_attrs && pattrs) { + rgw_filter_attrset(unfiltered_attrs, RGW_ATTR_PREFIX, pattrs); + } + return ret; +} + +int RGWAsyncPutSystemObj::_send_request(const DoutPrefixProvider *dpp) +{ + auto sysobj = svc->get_obj(obj); + return sysobj.wop() + .set_objv_tracker(&objv_tracker) + .set_exclusive(exclusive) + .write_data(dpp, bl, null_yield); +} + +RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(const DoutPrefixProvider *_dpp, + RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, + RGWSI_SysObj *_svc, + RGWObjVersionTracker 
*_objv_tracker, const rgw_raw_obj& _obj, + bool _exclusive, bufferlist _bl) + : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc), + obj(_obj), exclusive(_exclusive), bl(std::move(_bl)) +{ + if (_objv_tracker) { + objv_tracker = *_objv_tracker; + } +} + +int RGWAsyncPutSystemObjAttrs::_send_request(const DoutPrefixProvider *dpp) +{ + auto sysobj = svc->get_obj(obj); + return sysobj.wop() + .set_objv_tracker(&objv_tracker) + .set_exclusive(exclusive) + .set_attrs(attrs) + .write_attrs(dpp, null_yield); +} + +RGWAsyncPutSystemObjAttrs::RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, + RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + map _attrs, bool exclusive) + : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc), + obj(_obj), attrs(std::move(_attrs)), exclusive(exclusive) +{ + if (_objv_tracker) { + objv_tracker = *_objv_tracker; + } +} + + +RGWOmapAppend::RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj, + uint64_t _window_size) + : RGWConsumerCR(_store->ctx()), async_rados(_async_rados), + store(_store), obj(_obj), going_down(false), num_pending_entries(0), window_size(_window_size), total_entries(0) +{ +} + +int RGWAsyncLockSystemObj::_send_request(const DoutPrefixProvider *dpp) +{ + rgw_rados_ref ref; + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + rados::cls::lock::Lock l(lock_name); + utime_t duration(duration_secs, 0); + l.set_duration(duration); + l.set_cookie(cookie); + l.set_may_renew(true); + + return l.lock_exclusive(&ref.pool.ioctx(), ref.obj.oid); +} + +RGWAsyncLockSystemObj::RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + const 
string& _name, const string& _cookie, uint32_t _duration_secs) : RGWAsyncRadosRequest(caller, cn), store(_store), + obj(_obj), + lock_name(_name), + cookie(_cookie), + duration_secs(_duration_secs) +{ +} + +int RGWAsyncUnlockSystemObj::_send_request(const DoutPrefixProvider *dpp) +{ + rgw_rados_ref ref; + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + rados::cls::lock::Lock l(lock_name); + + l.set_cookie(cookie); + + return l.unlock(&ref.pool.ioctx(), ref.obj.oid); +} + +RGWAsyncUnlockSystemObj::RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + const string& _name, const string& _cookie) : RGWAsyncRadosRequest(caller, cn), store(_store), + obj(_obj), + lock_name(_name), cookie(_cookie) +{ +} + +RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + map& _entries) : RGWSimpleCoroutine(_store->ctx()), + store(_store), + entries(_entries), + obj(_obj), cn(NULL) +{ + stringstream& s = set_description(); + s << "set omap keys dest=" << obj << " keys=[" << s.str() << "]"; + for (auto i = entries.begin(); i != entries.end(); ++i) { + if (i != entries.begin()) { + s << ", "; + } + s << i->first; + } + s << "]"; +} + +int RGWRadosSetOmapKeysCR::send_request(const DoutPrefixProvider *dpp) +{ + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "sending request"; + + librados::ObjectWriteOperation op; + op.omap_set(entries); + + cn = stack->create_completion_notifier(); + return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op); +} + +int RGWRadosSetOmapKeysCR::request_complete() +{ + int r = 
cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const string& _marker, + int _max_entries, + ResultPtr _result) + : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj), + marker(_marker), max_entries(_max_entries), + result(std::move(_result)) +{ + ceph_assert(result); // must be allocated + set_description() << "get omap keys dest=" << obj << " marker=" << marker; +} + +int RGWRadosGetOmapKeysCR::send_request(const DoutPrefixProvider *dpp) { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "send request"; + + librados::ObjectReadOperation op; + op.omap_get_keys2(marker, max_entries, &result->entries, &result->more, nullptr); + + cn = stack->create_completion_notifier(result); + return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL); +} + +int RGWRadosGetOmapKeysCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosGetOmapValsCR::RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const string& _marker, + int _max_entries, + ResultPtr _result) + : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj), + marker(_marker), max_entries(_max_entries), + result(std::move(_result)) +{ + ceph_assert(result); // must be allocated + set_description() << "get omap keys dest=" << obj << " marker=" << marker; +} + +int RGWRadosGetOmapValsCR::send_request(const DoutPrefixProvider *dpp) { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + 
set_status() << "send request"; + + librados::ObjectReadOperation op; + op.omap_get_vals2(marker, max_entries, &result->entries, &result->more, nullptr); + + cn = stack->create_completion_notifier(result); + return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL); +} + +int RGWRadosGetOmapValsCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const set& _keys) : RGWSimpleCoroutine(_store->ctx()), + store(_store), + keys(_keys), + obj(_obj), cn(NULL) +{ + set_description() << "remove omap keys dest=" << obj << " keys=" << keys; +} + +int RGWRadosRemoveOmapKeysCR::send_request(const DoutPrefixProvider *dpp) { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "send request"; + + librados::ObjectWriteOperation op; + op.omap_rm_keys(keys); + + cn = stack->create_completion_notifier(); + return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op); +} + +int RGWRadosRemoveOmapKeysCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosRemoveCR::RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj, + RGWObjVersionTracker* objv_tracker) + : RGWSimpleCoroutine(store->ctx()), + store(store), obj(obj), objv_tracker(objv_tracker) +{ + set_description() << "remove dest=" << obj; +} + +int RGWRadosRemoveCR::send_request(const DoutPrefixProvider *dpp) +{ + auto rados = store->getRados()->get_rados_handle(); + int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx); + if (r < 0) { + lderr(cct) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << 
dendl; + return r; + } + ioctx.locator_set_key(obj.loc); + + set_status() << "send request"; + + librados::ObjectWriteOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + op.remove(); + + cn = stack->create_completion_notifier(); + return ioctx.aio_operate(obj.oid, cn->completion(), &op); +} + +int RGWRadosRemoveCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store, + librados::IoCtx&& ioctx, + std::string_view oid, + RGWObjVersionTracker* objv_tracker) + : RGWSimpleCoroutine(store->ctx()), ioctx(std::move(ioctx)), + oid(std::string(oid)), objv_tracker(objv_tracker) +{ + set_description() << "remove dest=" << oid; +} + +RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store, + RGWSI_RADOS::Obj& obj, + RGWObjVersionTracker* objv_tracker) + : RGWSimpleCoroutine(store->ctx()), + ioctx(librados::IoCtx(obj.get_ref().pool.ioctx())), + oid(obj.get_ref().obj.oid), + objv_tracker(objv_tracker) +{ + set_description() << "remove dest=" << oid; +} + +RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store, + RGWSI_RADOS::Obj&& obj, + RGWObjVersionTracker* objv_tracker) + : RGWSimpleCoroutine(store->ctx()), + ioctx(std::move(obj.get_ref().pool.ioctx())), + oid(std::move(obj.get_ref().obj.oid)), + objv_tracker(objv_tracker) +{ + set_description() << "remove dest=" << oid; +} + +int RGWRadosRemoveOidCR::send_request(const DoutPrefixProvider *dpp) +{ + librados::ObjectWriteOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + op.remove(); + + cn = stack->create_completion_notifier(); + return ioctx.aio_operate(oid, cn->completion(), &op); +} + +int RGWRadosRemoveOidCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + 
+RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const string& _lock_name, + const string& _cookie, + uint32_t _duration) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + lock_name(_lock_name), + cookie(_cookie), + duration(_duration), + obj(_obj), + req(NULL) +{ + set_description() << "rados lock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie << " duration=" << duration; +} + +void RGWSimpleRadosLockCR::request_cleanup() +{ + if (req) { + req->finish(); + req = NULL; + } +} + +int RGWSimpleRadosLockCR::send_request(const DoutPrefixProvider *dpp) +{ + set_status() << "sending request"; + req = new RGWAsyncLockSystemObj(this, stack->create_completion_notifier(), + store, NULL, obj, lock_name, cookie, duration); + async_rados->queue(req); + return 0; +} + +int RGWSimpleRadosLockCR::request_complete() +{ + set_status() << "request complete; ret=" << req->get_ret_status(); + return req->get_ret_status(); +} + +RGWSimpleRadosUnlockCR::RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const string& _lock_name, + const string& _cookie) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + lock_name(_lock_name), + cookie(_cookie), + obj(_obj), + req(NULL) +{ + set_description() << "rados unlock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie; +} + +void RGWSimpleRadosUnlockCR::request_cleanup() +{ + if (req) { + req->finish(); + req = NULL; + } +} + +int RGWSimpleRadosUnlockCR::send_request(const DoutPrefixProvider *dpp) +{ + set_status() << "sending request"; + + req = new RGWAsyncUnlockSystemObj(this, stack->create_completion_notifier(), + store, NULL, obj, lock_name, cookie); + async_rados->queue(req); + return 0; +} + +int RGWSimpleRadosUnlockCR::request_complete() +{ + set_status() << "request complete; ret=" << 
req->get_ret_status(); + return req->get_ret_status(); +} + +int RGWOmapAppend::operate(const DoutPrefixProvider *dpp) { + reenter(this) { + for (;;) { + if (!has_product() && going_down) { + set_status() << "going down"; + break; + } + set_status() << "waiting for product"; + yield wait_for_product(); + yield { + string entry; + while (consume(&entry)) { + set_status() << "adding entry: " << entry; + entries[entry] = bufferlist(); + if (entries.size() >= window_size) { + break; + } + } + if (entries.size() >= window_size || going_down) { + set_status() << "flushing to omap"; + call(new RGWRadosSetOmapKeysCR(store, obj, entries)); + entries.clear(); + } + } + if (get_ret_status() < 0) { + ldout(cct, 0) << "ERROR: failed to store entries in omap" << dendl; + return set_state(RGWCoroutine_Error); + } + } + /* done with coroutine */ + return set_state(RGWCoroutine_Done); + } + return 0; +} + +void RGWOmapAppend::flush_pending() { + receive(pending_entries); + num_pending_entries = 0; +} + +bool RGWOmapAppend::append(const string& s) { + if (is_done()) { + return false; + } + ++total_entries; + pending_entries.push_back(s); + if (++num_pending_entries >= (int)window_size) { + flush_pending(); + } + return true; +} + +bool RGWOmapAppend::finish() { + going_down = true; + flush_pending(); + set_sleeping(false); + return (!is_done()); +} + +int RGWAsyncGetBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp) +{ + int r; + if (!bucket.bucket_id.empty()) { + r = store->getRados()->get_bucket_instance_info(bucket, bucket_info, nullptr, &attrs, null_yield, dpp); + } else { + r = store->ctl()->bucket->read_bucket_info(bucket, &bucket_info, null_yield, dpp, + RGWBucketCtl::BucketInstance::GetParams().set_attrs(&attrs)); + } + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get bucket instance info for " + << bucket << dendl; + return r; + } + + return 0; +} + +int RGWAsyncPutBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp) +{ + auto r = 
store->getRados()->put_bucket_instance_info(bucket_info, exclusive, + mtime, attrs, dpp, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to put bucket instance info for " + << bucket_info.bucket << dendl; + return r; + } + + return 0; +} + +RGWRadosBILogTrimCR::RGWRadosBILogTrimCR( + const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + const RGWBucketInfo& bucket_info, + int shard_id, + const rgw::bucket_index_layout_generation& generation, + const std::string& start_marker, + const std::string& end_marker) + : RGWSimpleCoroutine(store->ctx()), bucket_info(bucket_info), + shard_id(shard_id), generation(generation), bs(store->getRados()), + start_marker(BucketIndexShardsManager::get_shard_marker(start_marker)), + end_marker(BucketIndexShardsManager::get_shard_marker(end_marker)) +{ +} + +int RGWRadosBILogTrimCR::send_request(const DoutPrefixProvider *dpp) +{ + int r = bs.init(dpp, bucket_info, generation, shard_id); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: bucket shard init failed ret=" << r << dendl; + return r; + } + + bufferlist in; + cls_rgw_bi_log_trim_op call; + call.start_marker = std::move(start_marker); + call.end_marker = std::move(end_marker); + encode(call, in); + + librados::ObjectWriteOperation op; + op.exec(RGW_CLASS, RGW_BI_LOG_TRIM, in); + + cn = stack->create_completion_notifier(); + return bs.bucket_obj.aio_operate(cn->completion(), &op); +} + +int RGWRadosBILogTrimCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + set_status() << "request complete; ret=" << r; + return r; +} + +int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp) +{ + RGWObjectCtx obj_ctx(store); + + char buf[16]; + snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id()); + rgw::sal::Attrs attrs; + + rgw_obj src_obj(src_bucket, key); + + rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info); + rgw::sal::RadosObject dest_obj(store, dest_key.value_or(key), &dest_bucket); + 
+ std::string etag; + + std::optional bytes_transferred; + int r = store->getRados()->fetch_remote_obj(obj_ctx, + user_id.value_or(rgw_user()), + NULL, /* req_info */ + source_zone, + dest_obj.get_obj(), + src_obj, + dest_bucket_info, /* dest */ + nullptr, /* source */ + dest_placement_rule, + nullptr, /* real_time* src_mtime, */ + NULL, /* real_time* mtime, */ + NULL, /* const real_time* mod_ptr, */ + NULL, /* const real_time* unmod_ptr, */ + false, /* high precision time */ + NULL, /* const char *if_match, */ + NULL, /* const char *if_nomatch, */ + RGWRados::ATTRSMOD_NONE, + copy_if_newer, + attrs, + RGWObjCategory::Main, + versioned_epoch, + real_time(), /* delete_at */ + NULL, /* string *ptag, */ + &etag, /* string *petag, */ + NULL, /* void (*progress_cb)(off_t, void *), */ + NULL, /* void *progress_data*); */ + dpp, + filter.get(), + source_trace_entry, + &zones_trace, + &bytes_transferred); + + if (r < 0) { + ldpp_dout(dpp, 0) << "store->fetch_remote_obj() returned r=" << r << dendl; + if (counters) { + counters->inc(sync_counters::l_fetch_err, 1); + } + } else { + // r >= 0 + if (bytes_transferred) { + // send notification that object was succesfully synced + std::string user_id = "rgw sync"; + std::string req_id = "0"; + + RGWObjTags obj_tags; + auto iter = attrs.find(RGW_ATTR_TAGS); + if (iter != attrs.end()) { + try { + auto it = iter->second.cbegin(); + obj_tags.decode(it); + } catch (buffer::error &err) { + ldpp_dout(dpp, 1) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl; + } + } + + // NOTE: we create a mutable copy of bucket.get_tenant as the get_notification function expects a std::string&, not const + std::string tenant(dest_bucket.get_tenant()); + + std::unique_ptr notify + = store->get_notification(dpp, &dest_obj, nullptr, rgw::notify::ObjectSyncedCreate, + &dest_bucket, user_id, + tenant, + req_id, null_yield); + + auto notify_res = static_cast(notify.get())->get_reservation(); + int ret = 
rgw::notify::publish_reserve(dpp, rgw::notify::ObjectSyncedCreate, notify_res, &obj_tags); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: reserving notification failed, with error: " << ret << dendl; + // no need to return, the sync already happened + } else { + ret = rgw::notify::publish_commit(&dest_obj, *bytes_transferred, ceph::real_clock::now(), etag, dest_obj.get_instance(), rgw::notify::ObjectSyncedCreate, notify_res, dpp); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + } + } + } + + if (counters) { + if (bytes_transferred) { + counters->inc(sync_counters::l_fetch, *bytes_transferred); + } else { + counters->inc(sync_counters::l_fetch_not_modified); + } + } + } + return r; +} + +int RGWAsyncStatRemoteObj::_send_request(const DoutPrefixProvider *dpp) +{ + RGWObjectCtx obj_ctx(store); + + string user_id; + char buf[16]; + snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id()); + + + rgw_obj src_obj(src_bucket, key); + + int r = store->getRados()->stat_remote_obj(dpp, + obj_ctx, + rgw_user(user_id), + nullptr, /* req_info */ + source_zone, + src_obj, + nullptr, /* source */ + pmtime, /* real_time* src_mtime, */ + psize, /* uint64_t * */ + nullptr, /* const real_time* mod_ptr, */ + nullptr, /* const real_time* unmod_ptr, */ + true, /* high precision time */ + nullptr, /* const char *if_match, */ + nullptr, /* const char *if_nomatch, */ + pattrs, + pheaders, + nullptr, + nullptr, /* string *ptag, */ + petag); /* string *petag, */ + + if (r < 0) { + ldpp_dout(dpp, 0) << "store->stat_remote_obj() returned r=" << r << dendl; + } + return r; +} + + +int RGWAsyncRemoveObj::_send_request(const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 0) << __func__ << "(): deleting obj=" << obj << dendl; + + obj->set_atomic(); + + RGWObjState *state; + + int ret = obj->get_obj_state(dpp, &state, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): get_obj_state() obj=" << 
obj << " returned ret=" << ret << dendl; + return ret; + } + + /* has there been any racing object write? */ + if (del_if_older && (state->mtime > timestamp)) { + ldpp_dout(dpp, 20) << __func__ << "(): skipping object removal obj=" << obj << " (obj mtime=" << state->mtime << ", request timestamp=" << timestamp << ")" << dendl; + return 0; + } + + RGWAccessControlPolicy policy; + + /* decode policy */ + map::iterator iter = state->attrset.find(RGW_ATTR_ACL); + if (iter != state->attrset.end()) { + auto bliter = iter->second.cbegin(); + try { + policy.decode(bliter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + } + + std::unique_ptr del_op = obj->get_delete_op(); + + del_op->params.bucket_owner = bucket->get_info().owner; + del_op->params.obj_owner = policy.get_owner(); + if (del_if_older) { + del_op->params.unmod_since = timestamp; + } + if (versioned) { + del_op->params.versioning_status = BUCKET_VERSIONED; + } + del_op->params.olh_epoch = versioned_epoch; + del_op->params.marker_version_id = marker_version_id; + del_op->params.obj_owner.set_id(rgw_user(owner)); + del_op->params.obj_owner.set_name(owner_display_name); + del_op->params.mtime = timestamp; + del_op->params.high_precision_time = true; + del_op->params.zones_trace = &zones_trace; + + ret = del_op->delete_obj(dpp, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): delete_obj() obj=" << obj << " returned ret=" << ret << dendl; + } + return ret; +} + +int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp) +{ + if (aborted) { + caller->set_sleeping(false); + return set_cr_done(); + } + reenter(this) { + last_renew_try_time = ceph::coarse_mono_clock::now(); + while (!going_down) { + current_time = ceph::coarse_mono_clock::now(); + yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval)); + if (latency) { + 
latency->add_latency(ceph::coarse_mono_clock::now() - current_time); + } + current_time = ceph::coarse_mono_clock::now(); + if (current_time - last_renew_try_time > interval_tolerance) { + // renewal should happen between 50%-90% of interval + ldout(store->ctx(), 1) << *this << ": WARNING: did not renew lock " << obj << ":" << lock_name << ": within 90\% of interval. " << + (current_time - last_renew_try_time) << " > " << interval_tolerance << dendl; + } + last_renew_try_time = current_time; + + caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */ + if (retcode < 0) { + set_locked(false); + ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << dendl; + return set_state(RGWCoroutine_Error, retcode); + } + ldout(store->ctx(), 20) << *this << ": successfully locked " << obj << ":" << lock_name << dendl; + set_locked(true); + yield wait(utime_t(interval / 2, 0)); + } + set_locked(false); /* moot at this point anyway */ + current_time = ceph::coarse_mono_clock::now(); + yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie)); + if (latency) { + latency->add_latency(ceph::coarse_mono_clock::now() - current_time); + } + return set_state(RGWCoroutine_Done); + } + return 0; +} + +RGWRadosTimelogAddCR::RGWRadosTimelogAddCR(const DoutPrefixProvider *_dpp, rgw::sal::RadosStore* _store, const string& _oid, + const cls_log_entry& entry) : RGWSimpleCoroutine(_store->ctx()), + dpp(_dpp), + store(_store), + oid(_oid), cn(NULL) +{ + stringstream& s = set_description(); + s << "timelog add entry oid=" << oid << "entry={id=" << entry.id << ", section=" << entry.section << ", name=" << entry.name << "}"; + entries.push_back(entry); +} + +int RGWRadosTimelogAddCR::send_request(const DoutPrefixProvider *dpp) +{ + set_status() << "sending request"; + + cn = stack->create_completion_notifier(); + return store->svc()->cls->timelog.add(dpp, oid, entries, 
cn->completion(), true, null_yield); +} + +int RGWRadosTimelogAddCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosTimelogTrimCR::RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + const std::string& oid, + const real_time& start_time, + const real_time& end_time, + const std::string& from_marker, + const std::string& to_marker) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), oid(oid), + start_time(start_time), end_time(end_time), + from_marker(from_marker), to_marker(to_marker) +{ + set_description() << "timelog trim oid=" << oid + << " start_time=" << start_time << " end_time=" << end_time + << " from_marker=" << from_marker << " to_marker=" << to_marker; +} + +int RGWRadosTimelogTrimCR::send_request(const DoutPrefixProvider *dpp) +{ + set_status() << "sending request"; + + cn = stack->create_completion_notifier(); + return store->svc()->cls->timelog.trim(dpp, oid, start_time, end_time, from_marker, + to_marker, cn->completion(), + null_yield); +} + +int RGWRadosTimelogTrimCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + + +RGWSyncLogTrimCR::RGWSyncLogTrimCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, const std::string& oid, + const std::string& to_marker, + std::string *last_trim_marker) + : RGWRadosTimelogTrimCR(dpp, store, oid, real_time{}, real_time{}, + std::string{}, to_marker), + cct(store->ctx()), last_trim_marker(last_trim_marker) +{ +} + +int RGWSyncLogTrimCR::request_complete() +{ + int r = RGWRadosTimelogTrimCR::request_complete(); + if (r != -ENODATA) { + return r; + } + // nothing left to trim, update last_trim_marker + if (*last_trim_marker < to_marker && to_marker != max_marker) { + *last_trim_marker = to_marker; + } + return 0; +} + + +int RGWAsyncStatObj::_send_request(const 
DoutPrefixProvider *dpp) +{ + rgw_raw_obj raw_obj; + store->getRados()->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj); + return store->getRados()->raw_obj_stat(dpp, raw_obj, psize, pmtime, pepoch, + nullptr, nullptr, objv_tracker, null_yield); +} + +RGWStatObjCR::RGWStatObjCR(const DoutPrefixProvider *dpp, + RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store, + const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize, + real_time* pmtime, uint64_t *pepoch, + RGWObjVersionTracker *objv_tracker) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), async_rados(async_rados), + bucket_info(_bucket_info), obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch), + objv_tracker(objv_tracker) +{ +} + +void RGWStatObjCR::request_cleanup() +{ + if (req) { + req->finish(); + req = NULL; + } +} + +int RGWStatObjCR::send_request(const DoutPrefixProvider *dpp) +{ + req = new RGWAsyncStatObj(dpp, this, stack->create_completion_notifier(), + store, bucket_info, obj, psize, pmtime, pepoch, objv_tracker); + async_rados->queue(req); + return 0; +} + +int RGWStatObjCR::request_complete() +{ + return req->get_ret_status(); +} + +RGWRadosNotifyCR::RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj, + bufferlist& request, uint64_t timeout_ms, + bufferlist *response) + : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj), + request(request), timeout_ms(timeout_ms), response(response) +{ + set_description() << "notify dest=" << obj; +} + +int RGWRadosNotifyCR::send_request(const DoutPrefixProvider *dpp) +{ + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "sending request"; + + cn = stack->create_completion_notifier(); + return ref.pool.ioctx().aio_notify(ref.obj.oid, cn->completion(), request, + timeout_ms, response); +} + +int RGWRadosNotifyCR::request_complete() +{ + 
int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + + +int RGWDataPostNotifyCR::operate(const DoutPrefixProvider* dpp) +{ + reenter(this) { + using PostNotify2 = RGWPostRESTResourceCR>, int>; + yield { + rgw_http_param_pair pairs[] = { { "type", "data" }, + { "notify2", NULL }, + { "source-zone", source_zone }, + { NULL, NULL } }; + call(new PostNotify2(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, nullptr)); + } + if (retcode == -ERR_METHOD_NOT_ALLOWED) { + using PostNotify1 = RGWPostRESTResourceCR; + yield { + rgw_http_param_pair pairs[] = { { "type", "data" }, + { "notify", NULL }, + { "source-zone", source_zone }, + { NULL, NULL } }; + auto encoder = rgw_data_notify_v1_encoder{shards}; + call(new PostNotify1(store->ctx(), conn, &http_manager, "/admin/log", pairs, encoder, nullptr)); + } + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h new file mode 100644 index 000000000..7bda18878 --- /dev/null +++ b/src/rgw/driver/rados/rgw_cr_rados.h @@ -0,0 +1,1647 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "include/ceph_assert.h" +#include "rgw_coroutine.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "common/WorkQueue.h" +#include "common/Throttle.h" + +#include +#include "common/ceph_time.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_bucket.h" + +struct rgw_http_param_pair; +class RGWRESTConn; + +class RGWAsyncRadosRequest : public RefCountedObject { + RGWCoroutine *caller; + RGWAioCompletionNotifier *notifier; + + int retcode; + + ceph::mutex lock = ceph::make_mutex("RGWAsyncRadosRequest::lock"); + +protected: + virtual int _send_request(const DoutPrefixProvider *dpp) = 0; +public: + 
RGWAsyncRadosRequest(RGWCoroutine *_caller, RGWAioCompletionNotifier *_cn) + : caller(_caller), notifier(_cn), retcode(0) { + } + ~RGWAsyncRadosRequest() override { + if (notifier) { + notifier->put(); + } + } + + void send_request(const DoutPrefixProvider *dpp) { + get(); + retcode = _send_request(dpp); + { + std::lock_guard l{lock}; + if (notifier) { + notifier->cb(); // drops its own ref + notifier = nullptr; + } + } + put(); + } + + int get_ret_status() { return retcode; } + + void finish() { + { + std::lock_guard l{lock}; + if (notifier) { + // we won't call notifier->cb() to drop its ref, so drop it here + notifier->put(); + notifier = nullptr; + } + } + put(); + } +}; + + +class RGWAsyncRadosProcessor { + std::deque m_req_queue; + std::atomic going_down = { false }; +protected: + CephContext *cct; + ThreadPool m_tp; + Throttle req_throttle; + + struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue { + RGWAsyncRadosProcessor *processor; + RGWWQ(RGWAsyncRadosProcessor *p, + ceph::timespan timeout, ceph::timespan suicide_timeout, + ThreadPool *tp) + : ThreadPool::WorkQueue("RGWWQ", timeout, suicide_timeout, tp), processor(p) {} + + bool _enqueue(RGWAsyncRadosRequest *req) override; + void _dequeue(RGWAsyncRadosRequest *req) override { + ceph_abort(); + } + bool _empty() override; + RGWAsyncRadosRequest *_dequeue() override; + using ThreadPool::WorkQueue::_process; + void _process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) override; + void _dump_queue(); + void _clear() override { + ceph_assert(processor->m_req_queue.empty()); + } + + CephContext *get_cct() const { return processor->cct; } + unsigned get_subsys() const { return ceph_subsys_rgw; } + std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw async rados processor: ";} + + } req_wq; + +public: + RGWAsyncRadosProcessor(CephContext *_cct, int num_threads); + ~RGWAsyncRadosProcessor() {} + void start(); + void stop(); + void handle_request(const 
DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req); + void queue(RGWAsyncRadosRequest *req); + + bool is_going_down() { + return going_down; + } + +}; + +template +class RGWSimpleWriteOnlyAsyncCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + + P params; + const DoutPrefixProvider *dpp; + + class Request : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + P params; + const DoutPrefixProvider *dpp; + protected: + int _send_request(const DoutPrefixProvider *dpp) override; + public: + Request(RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, + rgw::sal::RadosStore* store, + const P& _params, + const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), + store(store), + params(_params), + dpp(dpp) {} + } *req{nullptr}; + + public: + RGWSimpleWriteOnlyAsyncCR(RGWAsyncRadosProcessor *_async_rados, + rgw::sal::RadosStore* _store, + const P& _params, + const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + params(_params), + dpp(_dpp) {} + + ~RGWSimpleWriteOnlyAsyncCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new Request(this, + stack->create_completion_notifier(), + store, + params, + dpp); + + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } +}; + + +template +class RGWSimpleAsyncCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + + P params; + std::shared_ptr result; + const DoutPrefixProvider *dpp; + + class Request : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + P params; + std::shared_ptr result; + const DoutPrefixProvider *dpp; + protected: + int _send_request(const DoutPrefixProvider *dpp) override; + public: + Request(const 
DoutPrefixProvider *dpp, + RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, + rgw::sal::RadosStore* _store, + const P& _params, + std::shared_ptr& _result, + const DoutPrefixProvider *_dpp) : RGWAsyncRadosRequest(caller, cn), + store(_store), + params(_params), + result(_result), + dpp(_dpp) {} + } *req{nullptr}; + + public: + RGWSimpleAsyncCR(RGWAsyncRadosProcessor *_async_rados, + rgw::sal::RadosStore* _store, + const P& _params, + std::shared_ptr& _result, + const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + params(_params), + result(_result), + dpp(_dpp) {} + + ~RGWSimpleAsyncCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new Request(dpp, + this, + stack->create_completion_notifier(), + store, + params, + result, + dpp); + + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWGenericAsyncCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + + +public: + class Action { + public: + virtual ~Action() {} + virtual int operate() = 0; + }; + +private: + std::shared_ptr action; + + class Request : public RGWAsyncRadosRequest { + std::shared_ptr action; + protected: + int _send_request(const DoutPrefixProvider *dpp) override { + if (!action) { + return 0; + } + return action->operate(); + } + public: + Request(const DoutPrefixProvider *dpp, + RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, + std::shared_ptr& _action) : RGWAsyncRadosRequest(caller, cn), + action(_action) {} + } *req{nullptr}; + + public: + RGWGenericAsyncCR(CephContext *_cct, + RGWAsyncRadosProcessor *_async_rados, + std::shared_ptr& _action) : RGWSimpleCoroutine(_cct), + async_rados(_async_rados), + action(_action) {} + template + 
RGWGenericAsyncCR(CephContext *_cct, + RGWAsyncRadosProcessor *_async_rados, + std::shared_ptr& _action) : RGWSimpleCoroutine(_cct), + async_rados(_async_rados), + action(std::static_pointer_cast(_action)) {} + + ~RGWGenericAsyncCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new Request(dpp, this, + stack->create_completion_notifier(), + action); + + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } +}; + + +class RGWAsyncGetSystemObj : public RGWAsyncRadosRequest { + const DoutPrefixProvider *dpp; + RGWSI_SysObj* svc_sysobj; + rgw_raw_obj obj; + const bool want_attrs; + const bool raw_attrs; +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncGetSystemObj(const DoutPrefixProvider *dpp, + RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + bool want_attrs, bool raw_attrs); + + bufferlist bl; + std::map attrs; + RGWObjVersionTracker objv_tracker; +}; + +class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest { + const DoutPrefixProvider *dpp; + RGWSI_SysObj *svc; + rgw_raw_obj obj; + bool exclusive; + bufferlist bl; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncPutSystemObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + bool _exclusive, bufferlist _bl); + + RGWObjVersionTracker objv_tracker; +}; + +class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest { + const DoutPrefixProvider *dpp; + RGWSI_SysObj *svc; + rgw_raw_obj obj; + std::map attrs; + bool exclusive; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + 
RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + std::map _attrs, bool exclusive); + + RGWObjVersionTracker objv_tracker; +}; + +class RGWAsyncLockSystemObj : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + rgw_raw_obj obj; + std::string lock_name; + std::string cookie; + uint32_t duration_secs; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + const std::string& _name, const std::string& _cookie, uint32_t _duration_secs); +}; + +class RGWAsyncUnlockSystemObj : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + rgw_raw_obj obj; + std::string lock_name; + std::string cookie; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + const std::string& _name, const std::string& _cookie); +}; + +template +class RGWSimpleRadosReadCR : public RGWSimpleCoroutine { + const DoutPrefixProvider* dpp; + rgw::sal::RadosStore* store; + rgw_raw_obj obj; + T* result; + /// on ENOENT, call handle_data() with an empty object instead of failing + const bool empty_on_enoent; + RGWObjVersionTracker* objv_tracker; + + T val; + rgw_rados_ref ref; + ceph::buffer::list bl; + boost::intrusive_ptr cn; + +public: + RGWSimpleRadosReadCR(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* store, + const rgw_raw_obj& obj, + T* result, bool empty_on_enoent = true, + RGWObjVersionTracker* objv_tracker = nullptr) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), + obj(obj), result(result), 
empty_on_enoent(empty_on_enoent), + objv_tracker(objv_tracker) { + if (!result) { + result = &val; + } + } + + int send_request(const DoutPrefixProvider *dpp) { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" + << r << dendl; + return r; + } + + set_status() << "sending request"; + + librados::ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + + op.read(0, -1, &bl, nullptr); + + cn = stack->create_completion_notifier(); + return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op, + nullptr); + } + + int request_complete() { + int ret = cn->completion()->get_return_value(); + set_status() << "request complete; ret=" << ret; + + if (ret == -ENOENT && empty_on_enoent) { + *result = T(); + } else { + if (ret < 0) { + return ret; + } + try { + auto iter = bl.cbegin(); + if (iter.end()) { + // allow successful reads with empty buffers. 
ReadSyncStatus coroutines + // depend on this to be able to read without locking, because the + // cls lock from InitSyncStatus will create an empty object if it didn't + // exist + *result = T(); + } else { + decode(*result, iter); + } + } catch (buffer::error& err) { + return -EIO; + } + } + + return handle_data(*result); + } + + virtual int handle_data(T& data) { + return 0; + } +}; + +class RGWSimpleRadosReadAttrsCR : public RGWSimpleCoroutine { + const DoutPrefixProvider* dpp; + rgw::sal::RadosStore* const store; + + const rgw_raw_obj obj; + std::map* const pattrs; + const bool raw_attrs; + RGWObjVersionTracker* const objv_tracker; + + rgw_rados_ref ref; + std::map unfiltered_attrs; + boost::intrusive_ptr cn; + +public: + RGWSimpleRadosReadAttrsCR(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* store, + rgw_raw_obj obj, + std::map* pattrs, + bool raw_attrs, + RGWObjVersionTracker* objv_tracker = nullptr) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), + obj(std::move(obj)), pattrs(pattrs), raw_attrs(raw_attrs), + objv_tracker(objv_tracker) {} + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +template +class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine { + const DoutPrefixProvider* dpp; + rgw::sal::RadosStore* const store; + rgw_raw_obj obj; + RGWObjVersionTracker* objv_tracker; + bool exclusive; + + bufferlist bl; + rgw_rados_ref ref; + std::map unfiltered_attrs; + boost::intrusive_ptr cn; + + +public: + RGWSimpleRadosWriteCR(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* const store, + rgw_raw_obj obj, const T& data, + RGWObjVersionTracker* objv_tracker = nullptr, + bool exclusive = false) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), + obj(std::move(obj)), objv_tracker(objv_tracker), exclusive(exclusive) { + encode(data, bl); + } + + int send_request(const DoutPrefixProvider *dpp) override { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + 
if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" + << r << dendl; + return r; + } + + set_status() << "sending request"; + + librados::ObjectWriteOperation op; + if (exclusive) { + op.create(true); + } + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + op.write_full(bl); + + cn = stack->create_completion_notifier(); + return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op); + } + + int request_complete() override { + int ret = cn->completion()->get_return_value(); + set_status() << "request complete; ret=" << ret; + if (ret >= 0 && objv_tracker) { + objv_tracker->apply_write(); + } + return ret; + } +}; + +class RGWSimpleRadosWriteAttrsCR : public RGWSimpleCoroutine { + const DoutPrefixProvider* dpp; + rgw::sal::RadosStore* const store; + RGWObjVersionTracker* objv_tracker; + rgw_raw_obj obj; + std::map attrs; + bool exclusive; + + rgw_rados_ref ref; + boost::intrusive_ptr cn; + + +public: + RGWSimpleRadosWriteAttrsCR(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* const store, + rgw_raw_obj obj, + std::map attrs, + RGWObjVersionTracker* objv_tracker = nullptr, + bool exclusive = false) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), + store(store), objv_tracker(objv_tracker), + obj(std::move(obj)), attrs(std::move(attrs)), + exclusive(exclusive) {} + + int send_request(const DoutPrefixProvider *dpp) override { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" + << r << dendl; + return r; + } + + set_status() << "sending request"; + + librados::ObjectWriteOperation op; + if (exclusive) { + op.create(true); + } + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + + for (const auto& [name, bl] : attrs) { + if (!bl.length()) + continue; + op.setxattr(name.c_str(), bl); + } + + cn = stack->create_completion_notifier(); + if (!op.size()) { + cn->cb(); + return 0; + } + 
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op); + } + + int request_complete() override { + int ret = cn->completion()->get_return_value(); + set_status() << "request complete; ret=" << ret; + if (ret >= 0 && objv_tracker) { + objv_tracker->apply_write(); + } + return ret; + } +}; + +class RGWRadosSetOmapKeysCR : public RGWSimpleCoroutine { + rgw::sal::RadosStore* store; + std::map entries; + + rgw_rados_ref ref; + + rgw_raw_obj obj; + + boost::intrusive_ptr cn; + +public: + RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + std::map& _entries); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine { + public: + struct Result { + rgw_rados_ref ref; + std::set entries; + bool more = false; + }; + using ResultPtr = std::shared_ptr; + + RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj, + const std::string& _marker, int _max_entries, + ResultPtr result); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; + + private: + rgw::sal::RadosStore* store; + rgw_raw_obj obj; + std::string marker; + int max_entries; + ResultPtr result; + boost::intrusive_ptr cn; +}; + +class RGWRadosGetOmapValsCR : public RGWSimpleCoroutine { + public: + struct Result { + rgw_rados_ref ref; + std::map entries; + bool more = false; + }; + using ResultPtr = std::shared_ptr; + + RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj, + const std::string& _marker, int _max_entries, + ResultPtr result); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; + + private: + rgw::sal::RadosStore* store; + rgw_raw_obj obj; + std::string marker; + int max_entries; + ResultPtr result; + boost::intrusive_ptr cn; +}; + +class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine { + rgw::sal::RadosStore* store; + + 
rgw_rados_ref ref; + + std::set keys; + + rgw_raw_obj obj; + + boost::intrusive_ptr cn; + +public: + RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const std::set& _keys); + + int send_request(const DoutPrefixProvider *dpp) override; + + int request_complete() override; +}; + +class RGWRadosRemoveCR : public RGWSimpleCoroutine { + rgw::sal::RadosStore* store; + librados::IoCtx ioctx; + const rgw_raw_obj obj; + RGWObjVersionTracker* objv_tracker; + boost::intrusive_ptr cn; + +public: + RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj, + RGWObjVersionTracker* objv_tracker = nullptr); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +class RGWRadosRemoveOidCR : public RGWSimpleCoroutine { + librados::IoCtx ioctx; + const std::string oid; + RGWObjVersionTracker* objv_tracker; + boost::intrusive_ptr cn; + +public: + RGWRadosRemoveOidCR(rgw::sal::RadosStore* store, + librados::IoCtx&& ioctx, std::string_view oid, + RGWObjVersionTracker* objv_tracker = nullptr); + + RGWRadosRemoveOidCR(rgw::sal::RadosStore* store, + RGWSI_RADOS::Obj& obj, + RGWObjVersionTracker* objv_tracker = nullptr); + + RGWRadosRemoveOidCR(rgw::sal::RadosStore* store, + RGWSI_RADOS::Obj&& obj, + RGWObjVersionTracker* objv_tracker = nullptr); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +class RGWSimpleRadosLockCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + std::string lock_name; + std::string cookie; + uint32_t duration; + + rgw_raw_obj obj; + + RGWAsyncLockSystemObj *req; + +public: + RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const std::string& _lock_name, + const std::string& _cookie, + uint32_t _duration); + ~RGWSimpleRadosLockCR() override { + request_cleanup(); + } + void request_cleanup() override; 
+ + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; + + static std::string gen_random_cookie(CephContext* cct) { + static constexpr std::size_t COOKIE_LEN = 16; + char buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1); + return buf; + } +}; + +class RGWSimpleRadosUnlockCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + std::string lock_name; + std::string cookie; + + rgw_raw_obj obj; + + RGWAsyncUnlockSystemObj *req; + +public: + RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + const std::string& _lock_name, + const std::string& _cookie); + ~RGWSimpleRadosUnlockCR() override { + request_cleanup(); + } + void request_cleanup() override; + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +#define OMAP_APPEND_MAX_ENTRIES_DEFAULT 100 + +class RGWOmapAppend : public RGWConsumerCR { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + + rgw_raw_obj obj; + + bool going_down; + + int num_pending_entries; + std::list pending_entries; + + std::map entries; + + uint64_t window_size; + uint64_t total_entries; +public: + RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_raw_obj& _obj, + uint64_t _window_size = OMAP_APPEND_MAX_ENTRIES_DEFAULT); + int operate(const DoutPrefixProvider *dpp) override; + void flush_pending(); + bool append(const std::string& s); + bool finish(); + + uint64_t get_total_entries() { + return total_entries; + } + + const rgw_raw_obj& get_obj() { + return obj; + } +}; + +class RGWShardedOmapCRManager { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + RGWCoroutine *op; + + int num_shards; + + std::vector shards; +public: + RGWShardedOmapCRManager(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, RGWCoroutine *_op, 
int _num_shards, const rgw_pool& pool, const std::string& oid_prefix) + : async_rados(_async_rados), + store(_store), op(_op), num_shards(_num_shards) { + shards.reserve(num_shards); + for (int i = 0; i < num_shards; ++i) { + char buf[oid_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), i); + RGWOmapAppend *shard = new RGWOmapAppend(async_rados, store, rgw_raw_obj(pool, buf)); + shard->get(); + shards.push_back(shard); + op->spawn(shard, false); + } + } + + ~RGWShardedOmapCRManager() { + for (auto shard : shards) { + shard->put(); + } + } + + bool append(const std::string& entry, int shard_id) { + return shards[shard_id]->append(entry); + } + bool finish() { + bool success = true; + for (auto& append_op : shards) { + success &= (append_op->finish() && (!append_op->is_error())); + } + return success; + } + + uint64_t get_total_entries(int shard_id) { + return shards[shard_id]->get_total_entries(); + } +}; + +class RGWAsyncGetBucketInstanceInfo : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + rgw_bucket bucket; + const DoutPrefixProvider *dpp; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncGetBucketInstanceInfo(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, + rgw::sal::RadosStore* _store, const rgw_bucket& bucket, + const DoutPrefixProvider *dpp) + : RGWAsyncRadosRequest(caller, cn), store(_store), bucket(bucket), dpp(dpp) {} + + RGWBucketInfo bucket_info; + std::map attrs; +}; + +class RGWAsyncPutBucketInstanceInfo : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + RGWBucketInfo& bucket_info; + bool exclusive; + real_time mtime; + std::map* attrs; + const DoutPrefixProvider *dpp; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncPutBucketInstanceInfo(RGWCoroutine* caller, + RGWAioCompletionNotifier* cn, + rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + bool exclusive, + real_time mtime, + std::map* 
attrs, + const DoutPrefixProvider* dpp) + : RGWAsyncRadosRequest(caller, cn), store(store), bucket_info(bucket_info), + exclusive(exclusive), mtime(mtime), attrs(attrs), dpp(dpp) {} +}; + +class RGWGetBucketInstanceInfoCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + rgw_bucket bucket; + RGWBucketInfo *bucket_info; + std::map *pattrs; + const DoutPrefixProvider *dpp; + + RGWAsyncGetBucketInstanceInfo *req{nullptr}; + +public: + // rgw_bucket constructor + RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_bucket& _bucket, RGWBucketInfo *_bucket_info, + std::map *_pattrs, const DoutPrefixProvider *dpp) + : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store), + bucket(_bucket), bucket_info(_bucket_info), pattrs(_pattrs), dpp(dpp) {} + ~RGWGetBucketInstanceInfoCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncGetBucketInstanceInfo(this, stack->create_completion_notifier(), store, bucket, dpp); + async_rados->queue(req); + return 0; + } + int request_complete() override { + if (bucket_info) { + *bucket_info = std::move(req->bucket_info); + } + if (pattrs) { + *pattrs = std::move(req->attrs); + } + return req->get_ret_status(); + } +}; + +class RGWPutBucketInstanceInfoCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + RGWBucketInfo& bucket_info; + bool exclusive; + real_time mtime; + std::map* attrs; + const DoutPrefixProvider *dpp; + + RGWAsyncPutBucketInstanceInfo* req = nullptr; + +public: + // rgw_bucket constructor + RGWPutBucketInstanceInfoCR(RGWAsyncRadosProcessor *async_rados, + rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + bool exclusive, + real_time mtime, + std::map* attrs, + const 
DoutPrefixProvider *dpp) + : RGWSimpleCoroutine(store->ctx()), async_rados(async_rados), store(store), + bucket_info(bucket_info), exclusive(exclusive), + mtime(mtime), attrs(attrs), dpp(dpp) {} + ~RGWPutBucketInstanceInfoCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = nullptr; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncPutBucketInstanceInfo(this, + stack->create_completion_notifier(), + store, bucket_info, exclusive, + mtime, attrs, dpp); + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWRadosBILogTrimCR : public RGWSimpleCoroutine { + const RGWBucketInfo& bucket_info; + int shard_id; + const rgw::bucket_index_layout_generation generation; + RGWRados::BucketShard bs; + std::string start_marker; + std::string end_marker; + boost::intrusive_ptr cn; + public: + RGWRadosBILogTrimCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info, + int shard_id, + const rgw::bucket_index_layout_generation& generation, + const std::string& start_marker, + const std::string& end_marker); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +class RGWAsyncFetchRemoteObj : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + rgw_zone_id source_zone; + + std::optional user_id; + + rgw_bucket src_bucket; + std::optional dest_placement_rule; + RGWBucketInfo dest_bucket_info; + + rgw_obj_key key; + std::optional dest_key; + std::optional versioned_epoch; + + real_time src_mtime; + + bool copy_if_newer; + std::shared_ptr filter; + rgw_zone_set_entry source_trace_entry; + rgw_zone_set zones_trace; + PerfCounters* counters; + const DoutPrefixProvider *dpp; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncFetchRemoteObj(RGWCoroutine *caller, 
RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + const rgw_zone_id& _source_zone, + std::optional& _user_id, + const rgw_bucket& _src_bucket, + std::optional _dest_placement_rule, + const RGWBucketInfo& _dest_bucket_info, + const rgw_obj_key& _key, + const std::optional& _dest_key, + std::optional _versioned_epoch, + bool _if_newer, + std::shared_ptr _filter, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *_zones_trace, + PerfCounters* counters, const DoutPrefixProvider *dpp) + : RGWAsyncRadosRequest(caller, cn), store(_store), + source_zone(_source_zone), + user_id(_user_id), + src_bucket(_src_bucket), + dest_placement_rule(_dest_placement_rule), + dest_bucket_info(_dest_bucket_info), + key(_key), + dest_key(_dest_key), + versioned_epoch(_versioned_epoch), + copy_if_newer(_if_newer), + filter(_filter), + source_trace_entry(source_trace_entry), + counters(counters), + dpp(dpp) + { + if (_zones_trace) { + zones_trace = *_zones_trace; + } + } +}; + +class RGWFetchRemoteObjCR : public RGWSimpleCoroutine { + CephContext *cct; + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + rgw_zone_id source_zone; + + std::optional user_id; + + rgw_bucket src_bucket; + std::optional dest_placement_rule; + RGWBucketInfo dest_bucket_info; + + rgw_obj_key key; + std::optional dest_key; + std::optional versioned_epoch; + + real_time src_mtime; + + bool copy_if_newer; + + std::shared_ptr filter; + + RGWAsyncFetchRemoteObj *req; + const rgw_zone_set_entry& source_trace_entry; + rgw_zone_set *zones_trace; + PerfCounters* counters; + const DoutPrefixProvider *dpp; + +public: + RGWFetchRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_zone_id& _source_zone, + std::optional _user_id, + const rgw_bucket& _src_bucket, + std::optional _dest_placement_rule, + const RGWBucketInfo& _dest_bucket_info, + const rgw_obj_key& _key, + const std::optional& _dest_key, + std::optional _versioned_epoch, + bool 
_if_newer, + std::shared_ptr _filter, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *_zones_trace, + PerfCounters* counters, const DoutPrefixProvider *dpp) + : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()), + async_rados(_async_rados), store(_store), + source_zone(_source_zone), + user_id(_user_id), + src_bucket(_src_bucket), + dest_placement_rule(_dest_placement_rule), + dest_bucket_info(_dest_bucket_info), + key(_key), + dest_key(_dest_key), + versioned_epoch(_versioned_epoch), + copy_if_newer(_if_newer), + filter(_filter), + req(NULL), + source_trace_entry(source_trace_entry), + zones_trace(_zones_trace), counters(counters), dpp(dpp) {} + + + ~RGWFetchRemoteObjCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncFetchRemoteObj(this, stack->create_completion_notifier(), store, + source_zone, user_id, src_bucket, dest_placement_rule, dest_bucket_info, + key, dest_key, versioned_epoch, copy_if_newer, filter, + source_trace_entry, zones_trace, counters, dpp); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWAsyncStatRemoteObj : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + rgw_zone_id source_zone; + + rgw_bucket src_bucket; + rgw_obj_key key; + + ceph::real_time *pmtime; + uint64_t *psize; + std::string *petag; + std::map *pattrs; + std::map *pheaders; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncStatRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + const rgw_zone_id& _source_zone, + rgw_bucket& _src_bucket, + const rgw_obj_key& _key, + ceph::real_time *_pmtime, + uint64_t *_psize, + std::string *_petag, + std::map *_pattrs, + std::map *_pheaders) : RGWAsyncRadosRequest(caller, cn), 
store(_store), + source_zone(_source_zone), + src_bucket(_src_bucket), + key(_key), + pmtime(_pmtime), + psize(_psize), + petag(_petag), + pattrs(_pattrs), + pheaders(_pheaders) {} +}; + +class RGWStatRemoteObjCR : public RGWSimpleCoroutine { + CephContext *cct; + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + rgw_zone_id source_zone; + + rgw_bucket src_bucket; + rgw_obj_key key; + + ceph::real_time *pmtime; + uint64_t *psize; + std::string *petag; + std::map *pattrs; + std::map *pheaders; + + RGWAsyncStatRemoteObj *req; + +public: + RGWStatRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_zone_id& _source_zone, + rgw_bucket& _src_bucket, + const rgw_obj_key& _key, + ceph::real_time *_pmtime, + uint64_t *_psize, + std::string *_petag, + std::map *_pattrs, + std::map *_pheaders) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()), + async_rados(_async_rados), store(_store), + source_zone(_source_zone), + src_bucket(_src_bucket), + key(_key), + pmtime(_pmtime), + psize(_psize), + petag(_petag), + pattrs(_pattrs), + pheaders(_pheaders), + req(NULL) {} + + + ~RGWStatRemoteObjCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncStatRemoteObj(this, stack->create_completion_notifier(), store, source_zone, + src_bucket, key, pmtime, psize, petag, pattrs, pheaders); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWAsyncRemoveObj : public RGWAsyncRadosRequest { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + rgw_zone_id source_zone; + + std::unique_ptr bucket; + std::unique_ptr obj; + + std::string owner; + std::string owner_display_name; + bool versioned; + uint64_t versioned_epoch; + std::string marker_version_id; + + bool del_if_older; + 
ceph::real_time timestamp; + rgw_zone_set zones_trace; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncRemoveObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, + rgw::sal::RadosStore* _store, + const rgw_zone_id& _source_zone, + RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, + const std::string& _owner, + const std::string& _owner_display_name, + bool _versioned, + uint64_t _versioned_epoch, + bool _delete_marker, + bool _if_older, + real_time& _timestamp, + rgw_zone_set* _zones_trace) : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), store(_store), + source_zone(_source_zone), + owner(_owner), + owner_display_name(_owner_display_name), + versioned(_versioned), + versioned_epoch(_versioned_epoch), + del_if_older(_if_older), + timestamp(_timestamp) { + if (_delete_marker) { + marker_version_id = _key.instance; + } + + if (_zones_trace) { + zones_trace = *_zones_trace; + } + store->get_bucket(nullptr, _bucket_info, &bucket); + obj = bucket->get_object(_key); + } +}; + +class RGWRemoveObjCR : public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + CephContext *cct; + RGWAsyncRadosProcessor *async_rados; + rgw::sal::RadosStore* store; + rgw_zone_id source_zone; + + RGWBucketInfo bucket_info; + + rgw_obj_key key; + bool versioned; + uint64_t versioned_epoch; + bool delete_marker; + std::string owner; + std::string owner_display_name; + + bool del_if_older; + real_time timestamp; + + RGWAsyncRemoveObj *req; + + rgw_zone_set *zones_trace; + +public: + RGWRemoveObjCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, + const rgw_zone_id& _source_zone, + RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, + bool _versioned, + uint64_t _versioned_epoch, + std::string *_owner, + std::string *_owner_display_name, + bool _delete_marker, + real_time *_timestamp, + rgw_zone_set *_zones_trace) : RGWSimpleCoroutine(_store->ctx()), 
dpp(_dpp), cct(_store->ctx()), + async_rados(_async_rados), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + key(_key), + versioned(_versioned), + versioned_epoch(_versioned_epoch), + delete_marker(_delete_marker), req(NULL), zones_trace(_zones_trace) { + del_if_older = (_timestamp != NULL); + if (_timestamp) { + timestamp = *_timestamp; + } + + if (_owner) { + owner = *_owner; + } + + if (_owner_display_name) { + owner_display_name = *_owner_display_name; + } + } + ~RGWRemoveObjCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncRemoveObj(dpp, this, stack->create_completion_notifier(), store, source_zone, bucket_info, + key, owner, owner_display_name, versioned, versioned_epoch, + delete_marker, del_if_older, timestamp, zones_trace); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +/// \brief Collect average latency +/// +/// Used in data sync to back off on concurrency when latency of lock +/// operations rises. +/// +/// \warning This class is not thread safe. We do not use a mutex +/// because all coroutines spawned by RGWDataSyncCR share a single thread. +class LatencyMonitor { + ceph::timespan total; + std::uint64_t count = 0; + +public: + + LatencyMonitor() = default; + void add_latency(ceph::timespan latency) { + total += latency; + ++count; + } + + ceph::timespan avg_latency() { + using namespace std::literals; + return count == 0 ? 
0s : total / count; + } +}; + +class RGWContinuousLeaseCR : public RGWCoroutine { + RGWAsyncRadosProcessor* async_rados; + rgw::sal::RadosStore* store; + + const rgw_raw_obj obj; + + const std::string lock_name; + const std::string cookie{RGWSimpleRadosLockCR::gen_random_cookie(cct)}; + + int interval; + bool going_down{false}; + bool locked{false}; + + const ceph::timespan interval_tolerance; + const ceph::timespan ts_interval; + + RGWCoroutine* caller; + + bool aborted{false}; + + ceph::coarse_mono_time last_renew_try_time; + ceph::coarse_mono_time current_time; + + LatencyMonitor* latency; + +public: + RGWContinuousLeaseCR(RGWAsyncRadosProcessor* async_rados, + rgw::sal::RadosStore* _store, + rgw_raw_obj obj, std::string lock_name, + int interval, RGWCoroutine* caller, + LatencyMonitor* const latency) + : RGWCoroutine(_store->ctx()), async_rados(async_rados), store(_store), + obj(std::move(obj)), lock_name(std::move(lock_name)), + interval(interval), interval_tolerance(ceph::make_timespan(9*interval/10)), + ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency) + {} + + virtual ~RGWContinuousLeaseCR() override; + + int operate(const DoutPrefixProvider *dpp) override; + + bool is_locked() const { + if (ceph::coarse_mono_clock::now() - last_renew_try_time > ts_interval) { + return false; + } + return locked; + } + + void set_locked(bool status) { + locked = status; + } + + void go_down() { + going_down = true; + wakeup(); + } + + void abort() { + aborted = true; + } +}; + +class RGWRadosTimelogAddCR : public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + std::list entries; + + std::string oid; + + boost::intrusive_ptr cn; + +public: + RGWRadosTimelogAddCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store, const std::string& _oid, + const cls_log_entry& entry); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +class RGWRadosTimelogTrimCR : 
public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + boost::intrusive_ptr cn; + protected: + std::string oid; + real_time start_time; + real_time end_time; + std::string from_marker; + std::string to_marker; + + public: + RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, const std::string& oid, + const real_time& start_time, const real_time& end_time, + const std::string& from_marker, + const std::string& to_marker); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +// wrapper to update last_trim_marker on success +class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR { + CephContext *cct; + std::string *last_trim_marker; + public: + static constexpr const char* max_marker = "99999999"; + + RGWSyncLogTrimCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, const std::string& oid, + const std::string& to_marker, std::string *last_trim_marker); + int request_complete() override; +}; + +class RGWAsyncStatObj : public RGWAsyncRadosRequest { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + RGWBucketInfo bucket_info; + rgw_obj obj; + uint64_t *psize; + real_time *pmtime; + uint64_t *pepoch; + RGWObjVersionTracker *objv_tracker; +protected: + int _send_request(const DoutPrefixProvider *dpp) override; +public: + RGWAsyncStatObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* store, + const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr, + real_time *pmtime = nullptr, uint64_t *pepoch = nullptr, + RGWObjVersionTracker *objv_tracker = nullptr) + : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(store), obj(obj), psize(psize), + pmtime(pmtime), pepoch(pepoch), objv_tracker(objv_tracker) {} +}; + +class RGWStatObjCR : public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + RGWAsyncRadosProcessor 
*async_rados; + RGWBucketInfo bucket_info; + rgw_obj obj; + uint64_t *psize; + real_time *pmtime; + uint64_t *pepoch; + RGWObjVersionTracker *objv_tracker; + RGWAsyncStatObj *req = nullptr; + public: + RGWStatObjCR(const DoutPrefixProvider *dpp, RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store, + const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr, + real_time* pmtime = nullptr, uint64_t *pepoch = nullptr, + RGWObjVersionTracker *objv_tracker = nullptr); + ~RGWStatObjCR() override { + request_cleanup(); + } + void request_cleanup() override; + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +/// coroutine wrapper for IoCtx::aio_notify() +class RGWRadosNotifyCR : public RGWSimpleCoroutine { + rgw::sal::RadosStore* const store; + const rgw_raw_obj obj; + bufferlist request; + const uint64_t timeout_ms; + bufferlist *response; + rgw_rados_ref ref; + boost::intrusive_ptr cn; + +public: + RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj, + bufferlist& request, uint64_t timeout_ms, + bufferlist *response); + + int send_request(const DoutPrefixProvider *dpp) override; + int request_complete() override; +}; + +class RGWDataPostNotifyCR : public RGWCoroutine { + RGWRados *store; + RGWHTTPManager& http_manager; + bc::flat_map >& shards; + const char *source_zone; + RGWRESTConn *conn; + +public: + RGWDataPostNotifyCR(RGWRados *_store, RGWHTTPManager& _http_manager, bc::flat_map >& _shards, const char *_zone, RGWRESTConn *_conn) + : RGWCoroutine(_store->ctx()), store(_store), http_manager(_http_manager), + shards(_shards), source_zone(_zone), conn(_conn) {} + + int operate(const DoutPrefixProvider* dpp) override; +}; + diff --git a/src/rgw/driver/rados/rgw_cr_tools.cc b/src/rgw/driver/rados/rgw_cr_tools.cc new file mode 100644 index 000000000..94665a35a --- /dev/null +++ b/src/rgw/driver/rados/rgw_cr_tools.cc @@ -0,0 +1,292 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" + +#include "rgw_cr_tools.h" +#include "rgw_bucket.h" +#include "rgw_user.h" +#include "rgw_op.h" +#include "rgw_acl_s3.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +template<> +int RGWUserCreateCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + CephContext *cct = store->ctx(); + + const int32_t default_max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + + RGWUserAdminOpState op_state(store); + + auto& user = params.user; + + op_state.set_user_id(user); + op_state.set_display_name(params.display_name); + op_state.set_user_email(params.email); + op_state.set_caps(params.caps); + op_state.set_access_key(params.access_key); + op_state.set_secret_key(params.secret_key); + + if (!params.key_type.empty()) { + int32_t key_type = KEY_TYPE_S3; + if (params.key_type == "swift") { + key_type = KEY_TYPE_SWIFT; + } + + op_state.set_key_type(key_type); + } + + op_state.set_max_buckets(params.max_buckets.value_or(default_max_buckets)); + op_state.set_suspension(params.suspended); + op_state.set_system(params.system); + op_state.set_exclusive(params.exclusive); + + if (params.generate_key) { + op_state.set_generate_key(); + } + + + if (params.apply_quota) { + RGWQuota quota; + + if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) { + quota.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects; + quota.bucket_quota.enabled = true; + } + + if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) { + quota.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size; + quota.bucket_quota.enabled = true; + } + + if (cct->_conf->rgw_user_default_quota_max_objects >= 0) { + quota.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects; + quota.user_quota.enabled = true; + } + + if 
(cct->_conf->rgw_user_default_quota_max_size >= 0) { + quota.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size; + quota.user_quota.enabled = true; + } + + if (quota.bucket_quota.enabled) { + op_state.set_bucket_quota(quota.bucket_quota); + } + + if (quota.user_quota.enabled) { + op_state.set_user_quota(quota.user_quota); + } + } + + RGWNullFlusher flusher; + return RGWUserAdminOp_User::create(dpp, store, op_state, flusher, null_yield); +} + +template<> +int RGWGetUserInfoCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + return store->ctl()->user->get_info_by_uid(dpp, params.user, result.get(), null_yield); +} + +template<> +int RGWGetBucketInfoCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + return store->get_bucket(dpp, nullptr, params.tenant, params.bucket_name, &result->bucket, null_yield); +} + +template<> +int RGWBucketCreateLocalCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + CephContext *cct = store->ctx(); + auto& zone_svc = store->svc()->zone; + + const auto& user_info = params.user_info.get(); + const auto& user = user_info->user_id; + const auto& bucket_name = params.bucket_name; + auto& placement_rule = params.placement_rule; + + if (!placement_rule.empty() && + !zone_svc->get_zone_params().valid_placement(placement_rule)) { + ldpp_dout(dpp, 0) << "placement target (" << placement_rule << ")" + << " doesn't exist in the placement targets of zonegroup" + << " (" << zone_svc->get_zonegroup().api_name << ")" << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + + /* we need to make sure we read bucket info, it's not read before for this + * specific request */ + RGWBucketInfo bucket_info; + map bucket_attrs; + + int ret = store->getRados()->get_bucket_info(store->svc(), user.tenant, bucket_name, + bucket_info, nullptr, null_yield, dpp, &bucket_attrs); + if (ret < 0 && ret != -ENOENT) + return ret; + bool bucket_exists = (ret != -ENOENT); + + RGWAccessControlPolicy old_policy(cct); + ACLOwner 
bucket_owner; + bucket_owner.set_id(user); + bucket_owner.set_name(user_info->display_name); + if (bucket_exists) { + ret = rgw_op_get_bucket_policy_from_attr(dpp, cct, store, bucket_info, + bucket_attrs, &old_policy, null_yield); + if (ret >= 0) { + if (old_policy.get_owner().get_id().compare(user) != 0) { + return -EEXIST; + } + } + } + + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket = nullptr; + uint32_t *pmaster_num_shards = nullptr; + real_time creation_time; + + string zonegroup_id = zone_svc->get_zonegroup().get_id(); + + if (bucket_exists) { + rgw_placement_rule selected_placement_rule; + rgw_bucket bucket; + bucket.tenant = user.tenant; + bucket.name = bucket_name; + ret = zone_svc->select_bucket_placement(dpp, *user_info, zonegroup_id, + placement_rule, + &selected_placement_rule, nullptr, null_yield); + if (selected_placement_rule != bucket_info.placement_rule) { + ldpp_dout(dpp, 0) << "bucket already exists on a different placement rule: " + << " selected_rule= " << selected_placement_rule + << " existing_rule= " << bucket_info.placement_rule << dendl; + return -EEXIST; + } + } + + /* Encode special metadata first as we're using std::map::emplace under + * the hood. This method will add the new items only if the map doesn't + * contain such keys yet. 
*/ + RGWAccessControlPolicy_S3 policy(cct); + policy.create_canned(bucket_owner, bucket_owner, string()); /* default private policy */ + bufferlist aclbl; + policy.encode(aclbl); + map attrs; + attrs.emplace(std::move(RGW_ATTR_ACL), std::move(aclbl)); + + RGWQuotaInfo quota_info; + const RGWQuotaInfo * pquota_info = nullptr; + + rgw_bucket bucket; + bucket.tenant = user.tenant; + bucket.name = bucket_name; + + RGWBucketInfo info; + obj_version ep_objv; + + ret = store->getRados()->create_bucket(*user_info, bucket, zonegroup_id, + placement_rule, bucket_info.swift_ver_location, + pquota_info, attrs, + info, nullptr, &ep_objv, creation_time, + pmaster_bucket, pmaster_num_shards, null_yield, dpp, true); + + + if (ret && ret != -EEXIST) + return ret; + + bool existed = (ret == -EEXIST); + + if (existed) { + if (info.owner != user) { + ldpp_dout(dpp, 20) << "NOTICE: bucket already exists under a different user (bucket=" << bucket << " user=" << user << " bucket_owner=" << info.owner << dendl; + return -EEXIST; + } + bucket = info.bucket; + } + + ret = store->ctl()->bucket->link_bucket(user, bucket, info.creation_time, null_yield, dpp, false); + if (ret && !existed && ret != -EEXIST) { + /* if it exists (or previously existed), don't remove it! 
*/ + int r = store->ctl()->bucket->unlink_bucket(user, bucket, null_yield, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << r << dendl; + } + } else if (ret == -EEXIST || (ret == 0 && existed)) { + ret = -ERR_BUCKET_EXISTS; + } + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: bucket creation (bucket=" << bucket << ") return ret=" << ret << dendl; + } + + return ret; +} + +template<> +int RGWObjectSimplePutCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + RGWDataAccess::ObjectRef obj; + + CephContext *cct = store->ctx(); + + int ret = params.bucket->get_object(params.key, &obj); + if (ret < 0) { + lderr(cct) << "ERROR: failed to get object: " << cpp_strerror(-ret) << dendl; + return -ret; + } + + if (params.user_data) { + obj->set_user_data(*params.user_data); + } + + ret = obj->put(params.data, params.attrs, dpp, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: put object returned error: " << cpp_strerror(-ret) << dendl; + } + + return 0; +} + +template<> +int RGWBucketLifecycleConfigCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + CephContext *cct = store->ctx(); + + RGWLC *lc = store->getRados()->get_lc(); + if (!lc) { + lderr(cct) << "ERROR: lifecycle object is not initialized!" 
<< dendl; + return -EIO; + } + + int ret = lc->set_bucket_config(params.bucket, + params.bucket_attrs, + ¶ms.config); + if (ret < 0) { + lderr(cct) << "ERROR: failed to set lifecycle on bucke: " << cpp_strerror(-ret) << dendl; + return -ret; + } + + return 0; +} + +template<> +int RGWBucketGetSyncPolicyHandlerCR::Request::_send_request(const DoutPrefixProvider *dpp) +{ + int r = store->ctl()->bucket->get_sync_policy_handler(params.zone, + params.bucket, + &result->policy_handler, + null_yield, + dpp); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(): get_sync_policy_handler() returned " << r << dendl; + return r; + } + + return 0; +} diff --git a/src/rgw/driver/rados/rgw_cr_tools.h b/src/rgw/driver/rados/rgw_cr_tools.h new file mode 100644 index 000000000..4cd97aa82 --- /dev/null +++ b/src/rgw/driver/rados/rgw_cr_tools.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_cr_rados.h" +#include "rgw_tools.h" +#include "rgw_lc.h" + +#include "services/svc_bucket_sync.h" + +struct rgw_user_create_params { + rgw_user user; + std::string display_name; + std::string email; + std::string access_key; + std::string secret_key; + std::string key_type; /* "swift" or "s3" */ + std::string caps; + + bool generate_key{true}; + bool suspended{false}; + std::optional max_buckets; + bool system{false}; + bool exclusive{false}; + bool apply_quota{true}; +}; + +using RGWUserCreateCR = RGWSimpleWriteOnlyAsyncCR; + +struct rgw_get_user_info_params { + rgw_user user; +}; + +using RGWGetUserInfoCR = RGWSimpleAsyncCR; + +struct rgw_get_bucket_info_params { + std::string tenant; + std::string bucket_name; +}; + +struct rgw_get_bucket_info_result { + std::unique_ptr bucket; +}; + +using RGWGetBucketInfoCR = RGWSimpleAsyncCR; + +struct rgw_bucket_create_local_params { + std::shared_ptr user_info; + std::string bucket_name; + rgw_placement_rule placement_rule; +}; + 
+using RGWBucketCreateLocalCR = RGWSimpleWriteOnlyAsyncCR; + +struct rgw_object_simple_put_params { + RGWDataAccess::BucketRef bucket; + rgw_obj_key key; + bufferlist data; + std::map attrs; + std::optional user_data; +}; + +using RGWObjectSimplePutCR = RGWSimpleWriteOnlyAsyncCR; + + +struct rgw_bucket_lifecycle_config_params { + rgw::sal::Bucket* bucket; + rgw::sal::Attrs bucket_attrs; + RGWLifecycleConfiguration config; +}; + +using RGWBucketLifecycleConfigCR = RGWSimpleWriteOnlyAsyncCR; + +struct rgw_bucket_get_sync_policy_params { + std::optional zone; + std::optional bucket; +}; + +struct rgw_bucket_get_sync_policy_result { + RGWBucketSyncPolicyHandlerRef policy_handler; +}; + +using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR; + diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.cc b/src/rgw/driver/rados/rgw_d3n_datacache.cc new file mode 100644 index 000000000..f1bf731ae --- /dev/null +++ b/src/rgw/driver/rados/rgw_d3n_datacache.cc @@ -0,0 +1,369 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_d3n_datacache.h" +#include "rgw_rest_client.h" +#include "rgw_auth_s3.h" +#include "rgw_op.h" +#include "rgw_common.h" +#include "rgw_auth_s3.h" +#include "rgw_op.h" +#include "rgw_crypt_sanitize.h" +#if defined(__linux__) +#include +#endif + +#if __has_include() +#include +namespace efs = std::filesystem; +#else +#include +namespace efs = std::experimental::filesystem; +#endif + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int D3nCacheAioWriteRequest::d3n_libaio_prepare_write_op(bufferlist& bl, unsigned int len, string oid, string cache_location) +{ + std::string location = cache_location + url_encode(oid, true); + int r = 0; + + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): Write To Cache, location=" << location << dendl; + cb = new struct aiocb; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + memset(cb, 0, 
sizeof(struct aiocb)); + r = fd = ::open(location.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode); + if (fd < 0) { + ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: open file failed, errno=" << errno << ", location='" << location.c_str() << "'" << dendl; + goto done; + } + if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL) + posix_fadvise(fd, 0, 0, g_conf()->rgw_d3n_l1_fadvise); + cb->aio_fildes = fd; + + data = malloc(len); + if (!data) { + ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: memory allocation failed" << dendl; + goto close_file; + } + cb->aio_buf = data; + memcpy((void*)data, bl.c_str(), len); + cb->aio_nbytes = len; + goto done; + +close_file: + ::close(fd); +done: + return r; +} + +D3nDataCache::D3nDataCache() + : cct(nullptr), io_type(_io_type::ASYNC_IO), free_data_cache_size(0), outstanding_write_size(0) +{ + lsubdout(g_ceph_context, rgw_datacache, 5) << "D3nDataCache: " << __func__ << "()" << dendl; +} + +void D3nDataCache::init(CephContext *_cct) { + cct = _cct; + free_data_cache_size = cct->_conf->rgw_d3n_l1_datacache_size; + head = nullptr; + tail = nullptr; + cache_location = cct->_conf->rgw_d3n_l1_datacache_persistent_path; + if(cache_location.back() != '/') { + cache_location += "/"; + } + try { + if (efs::exists(cache_location)) { + // d3n: evict the cache storage directory + if (g_conf()->rgw_d3n_l1_evict_cache_on_start) { + lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: evicting the persistent storage directory on start" << dendl; + for (auto& p : efs::directory_iterator(cache_location)) { + efs::remove_all(p.path()); + } + } + } else { + // create the cache storage directory + lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: creating the persistent storage directory on start" << dendl; + efs::create_directories(cache_location); + } + } catch (const efs::filesystem_error& e) { + lderr(g_ceph_context) << "D3nDataCache: init: ERROR initializing the cache storage directory '" << cache_location << 
+ "' : " << e.what() << dendl; + } + + auto conf_eviction_policy = cct->_conf.get_val("rgw_d3n_l1_eviction_policy"); + ceph_assert(conf_eviction_policy == "lru" || conf_eviction_policy == "random"); + if (conf_eviction_policy == "lru") + eviction_policy = _eviction_policy::LRU; + if (conf_eviction_policy == "random") + eviction_policy = _eviction_policy::RANDOM; + +#if defined(HAVE_LIBAIO) && defined(__GLIBC__) + // libaio setup + struct aioinit ainit{0}; + ainit.aio_threads = cct->_conf.get_val("rgw_d3n_libaio_aio_threads"); + ainit.aio_num = cct->_conf.get_val("rgw_d3n_libaio_aio_num"); + ainit.aio_idle_time = 10; + aio_init(&ainit); +#endif +} + +int D3nDataCache::d3n_io_write(bufferlist& bl, unsigned int len, std::string oid) +{ + D3nChunkDataInfo* chunk_info = new D3nChunkDataInfo; + std::string location = cache_location + url_encode(oid, true); + + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl; + FILE *cache_file = nullptr; + int r = 0; + size_t nbytes = 0; + + cache_file = fopen(location.c_str(), "w+"); + if (cache_file == nullptr) { + ldout(cct, 0) << "ERROR: D3nDataCache::fopen file has return error, errno=" << errno << dendl; + return -errno; + } + + nbytes = fwrite(bl.c_str(), 1, len, cache_file); + if (nbytes != len) { + ldout(cct, 0) << "ERROR: D3nDataCache::io_write: fwrite has returned error: nbytes!=len, nbytes=" << nbytes << ", len=" << len << dendl; + return -EIO; + } + + r = fclose(cache_file); + if (r != 0) { + ldout(cct, 0) << "ERROR: D3nDataCache::fclsoe file has return error, errno=" << errno << dendl; + return -errno; + } + + { // update cahce_map entries for new chunk in cache + const std::lock_guard l(d3n_cache_lock); + chunk_info->oid = oid; + chunk_info->set_ctx(cct); + chunk_info->size = len; + d3n_cache_map.insert(pair(oid, chunk_info)); + } + + return r; +} + +void d3n_libaio_write_cb(sigval sigval) +{ + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: 
" << __func__ << "()" << dendl; + D3nCacheAioWriteRequest* c = static_cast(sigval.sival_ptr); + c->priv_data->d3n_libaio_write_completion_cb(c); +} + + +void D3nDataCache::d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c) +{ + D3nChunkDataInfo* chunk_info{nullptr}; + + ldout(cct, 5) << "D3nDataCache: " << __func__ << "(): oid=" << c->oid << dendl; + + { // update cache_map entries for new chunk in cache + const std::lock_guard l(d3n_cache_lock); + d3n_outstanding_write_list.erase(c->oid); + chunk_info = new D3nChunkDataInfo; + chunk_info->oid = c->oid; + chunk_info->set_ctx(cct); + chunk_info->size = c->cb->aio_nbytes; + d3n_cache_map.insert(pair(c->oid, chunk_info)); + } + + { // update free size + const std::lock_guard l(d3n_eviction_lock); + free_data_cache_size -= c->cb->aio_nbytes; + outstanding_write_size -= c->cb->aio_nbytes; + lru_insert_head(chunk_info); + } + delete c; + c = nullptr; +} + +int D3nDataCache::d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid) +{ + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Write To Cache, oid=" << oid << ", len=" << len << dendl; + struct D3nCacheAioWriteRequest* wr = new struct D3nCacheAioWriteRequest(cct); + int r=0; + if ((r = wr->d3n_libaio_prepare_write_op(bl, len, oid, cache_location)) < 0) { + ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() prepare libaio write op r=" << r << dendl; + goto done; + } + wr->cb->aio_sigevent.sigev_notify = SIGEV_THREAD; + wr->cb->aio_sigevent.sigev_notify_function = d3n_libaio_write_cb; + wr->cb->aio_sigevent.sigev_notify_attributes = nullptr; + wr->cb->aio_sigevent.sigev_value.sival_ptr = (void*)wr; + wr->oid = oid; + wr->priv_data = this; + + if ((r = ::aio_write(wr->cb)) != 0) { + ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() aio_write r=" << r << dendl; + goto error; + } + return 0; + +error: + delete wr; +done: + return r; +} + +void D3nDataCache::put(bufferlist& bl, unsigned 
int len, std::string& oid) +{ + size_t sr = 0; + uint64_t freed_size = 0, _free_data_cache_size = 0, _outstanding_write_size = 0; + + ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): oid=" << oid << ", len=" << len << dendl; + { + const std::lock_guard l(d3n_cache_lock); + std::unordered_map::iterator iter = d3n_cache_map.find(oid); + if (iter != d3n_cache_map.end()) { + ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): data already cached, no rewrite" << dendl; + return; + } + auto it = d3n_outstanding_write_list.find(oid); + if (it != d3n_outstanding_write_list.end()) { + ldout(cct, 10) << "D3nDataCache: NOTE: data put in cache already issued, no rewrite" << dendl; + return; + } + d3n_outstanding_write_list.insert(oid); + } + { + const std::lock_guard l(d3n_eviction_lock); + _free_data_cache_size = free_data_cache_size; + _outstanding_write_size = outstanding_write_size; + } + ldout(cct, 20) << "D3nDataCache: Before eviction _free_data_cache_size:" << _free_data_cache_size << ", _outstanding_write_size:" << _outstanding_write_size << ", freed_size:" << freed_size << dendl; + while (len > (_free_data_cache_size - _outstanding_write_size + freed_size)) { + ldout(cct, 20) << "D3nDataCache: enter eviction" << dendl; + if (eviction_policy == _eviction_policy::LRU) { + sr = lru_eviction(); + } else if (eviction_policy == _eviction_policy::RANDOM) { + sr = random_eviction(); + } else { + ldout(cct, 0) << "D3nDataCache: Warning: unknown cache eviction policy, defaulting to lru eviction" << dendl; + sr = lru_eviction(); + } + if (sr == 0) { + ldout(cct, 2) << "D3nDataCache: Warning: eviction was not able to free disk space, not writing to cache" << dendl; + d3n_outstanding_write_list.erase(oid); + return; + } + ldout(cct, 20) << "D3nDataCache: completed eviction of " << sr << " bytes" << dendl; + freed_size += sr; + } + int r = 0; + r = d3n_libaio_create_write_request(bl, len, oid); + if (r < 0) { + const std::lock_guard l(d3n_cache_lock); + 
d3n_outstanding_write_list.erase(oid); + ldout(cct, 1) << "D3nDataCache: create_aio_write_request fail, r=" << r << dendl; + return; + } + + const std::lock_guard l(d3n_eviction_lock); + free_data_cache_size += freed_size; + outstanding_write_size += len; +} + +bool D3nDataCache::get(const string& oid, const off_t len) +{ + const std::lock_guard l(d3n_cache_lock); + bool exist = false; + string location = cache_location + url_encode(oid, true); + + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl; + std::unordered_map::iterator iter = d3n_cache_map.find(oid); + if (!(iter == d3n_cache_map.end())) { + // check inside cache whether file exists or not!!!! then make exist true; + struct D3nChunkDataInfo* chdo = iter->second; + struct stat st; + int r = stat(location.c_str(), &st); + if ( r != -1 && st.st_size == len) { // file exists and containes required data range length + exist = true; + /*LRU*/ + /*get D3nChunkDataInfo*/ + const std::lock_guard l(d3n_eviction_lock); + lru_remove(chdo); + lru_insert_head(chdo); + } else { + d3n_cache_map.erase(oid); + const std::lock_guard l(d3n_eviction_lock); + lru_remove(chdo); + delete chdo; + exist = false; + } + } + return exist; +} + +size_t D3nDataCache::random_eviction() +{ + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl; + int n_entries = 0; + int random_index = 0; + size_t freed_size = 0; + D3nChunkDataInfo* del_entry; + string del_oid, location; + { + const std::lock_guard l(d3n_cache_lock); + n_entries = d3n_cache_map.size(); + if (n_entries <= 0) { + return -1; + } + srand (time(NULL)); + random_index = ceph::util::generate_random_number(0, n_entries-1); + std::unordered_map::iterator iter = d3n_cache_map.begin(); + std::advance(iter, random_index); + del_oid = iter->first; + del_entry = iter->second; + ldout(cct, 20) << "D3nDataCache: random_eviction: index:" << random_index << ", free size: " << 
del_entry->size << dendl; + freed_size = del_entry->size; + delete del_entry; + del_entry = nullptr; + d3n_cache_map.erase(del_oid); // oid + } + + location = cache_location + url_encode(del_oid, true); + ::remove(location.c_str()); + return freed_size; +} + +size_t D3nDataCache::lru_eviction() +{ + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl; + int n_entries = 0; + size_t freed_size = 0; + D3nChunkDataInfo* del_entry; + string del_oid, location; + + { + const std::lock_guard l(d3n_eviction_lock); + del_entry = tail; + if (del_entry == nullptr) { + ldout(cct, 2) << "D3nDataCache: lru_eviction: del_entry=null_ptr" << dendl; + return 0; + } + lru_remove(del_entry); + } + + { + const std::lock_guard l(d3n_cache_lock); + n_entries = d3n_cache_map.size(); + if (n_entries <= 0) { + ldout(cct, 2) << "D3nDataCache: lru_eviction: cache_map.size<=0" << dendl; + return -1; + } + del_oid = del_entry->oid; + ldout(cct, 20) << "D3nDataCache: lru_eviction: oid to remove: " << del_oid << dendl; + d3n_cache_map.erase(del_oid); // oid + } + freed_size = del_entry->size; + delete del_entry; + location = cache_location + url_encode(del_oid, true); + ::remove(location.c_str()); + return freed_size; +} diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.h b/src/rgw/driver/rados/rgw_d3n_datacache.h new file mode 100644 index 000000000..feaa3f2b7 --- /dev/null +++ b/src/rgw/driver/rados/rgw_d3n_datacache.h @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rados.h" +#include + +#include "rgw_common.h" + +#include +#include +#include "include/Context.h" +#include "include/lru.h" +#include "rgw_d3n_cacherequest.h" + + +/*D3nDataCache*/ +struct D3nDataCache; + + +struct D3nChunkDataInfo : public LRUObject { + CephContext *cct; + uint64_t size; + time_t access_time; + std::string address; + std::string oid; + bool complete; + struct 
D3nChunkDataInfo* lru_prev; + struct D3nChunkDataInfo* lru_next; + + D3nChunkDataInfo(): size(0) {} + + void set_ctx(CephContext *_cct) { + cct = _cct; + } + + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; + +struct D3nCacheAioWriteRequest { + std::string oid; + void *data; + int fd; + struct aiocb *cb; + D3nDataCache *priv_data; + CephContext *cct; + + D3nCacheAioWriteRequest(CephContext *_cct) : cct(_cct) {} + int d3n_libaio_prepare_write_op(bufferlist& bl, unsigned int len, std::string oid, std::string cache_location); + + ~D3nCacheAioWriteRequest() { + ::close(fd); + cb->aio_buf = nullptr; + free(data); + data = nullptr; + delete(cb); + } +}; + +struct D3nDataCache { + +private: + std::unordered_map d3n_cache_map; + std::set d3n_outstanding_write_list; + std::mutex d3n_cache_lock; + std::mutex d3n_eviction_lock; + + CephContext *cct; + enum class _io_type { + SYNC_IO = 1, + ASYNC_IO = 2, + SEND_FILE = 3 + } io_type; + enum class _eviction_policy { + LRU=0, RANDOM=1 + } eviction_policy; + + struct sigaction action; + uint64_t free_data_cache_size = 0; + uint64_t outstanding_write_size = 0; + struct D3nChunkDataInfo* head; + struct D3nChunkDataInfo* tail; + +private: + void add_io(); + +public: + D3nDataCache(); + ~D3nDataCache() { + while (lru_eviction() > 0); + } + + std::string cache_location; + + bool get(const std::string& oid, const off_t len); + void put(bufferlist& bl, unsigned int len, std::string& obj_key); + int d3n_io_write(bufferlist& bl, unsigned int len, std::string oid); + int d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid); + void d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c); + size_t random_eviction(); + size_t lru_eviction(); + + void init(CephContext *_cct); + + void lru_insert_head(struct D3nChunkDataInfo* o) { + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl; + o->lru_next = head; + o->lru_prev = nullptr; + 
if (head) { + head->lru_prev = o; + } else { + tail = o; + } + head = o; + } + + void lru_insert_tail(struct D3nChunkDataInfo* o) { + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl; + o->lru_next = nullptr; + o->lru_prev = tail; + if (tail) { + tail->lru_next = o; + } else { + head = o; + } + tail = o; + } + + void lru_remove(struct D3nChunkDataInfo* o) { + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl; + if (o->lru_next) + o->lru_next->lru_prev = o->lru_prev; + else + tail = o->lru_prev; + if (o->lru_prev) + o->lru_prev->lru_next = o->lru_next; + else + head = o->lru_next; + o->lru_next = o->lru_prev = nullptr; + } +}; + + +template +class D3nRGWDataCache : public T { + +public: + D3nRGWDataCache() {} + + int init_rados() override { + int ret; + ret = T::init_rados(); + if (ret < 0) + return ret; + + return 0; + } + + int get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) override; +}; + +template +int D3nRGWDataCache::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) { + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache::" << __func__ << "(): is head object : " << is_head_obj << dendl; + librados::ObjectReadOperation op; + struct get_obj_data* d = static_cast(arg); + std::string oid, key; + + if (is_head_obj) { + // only when reading from the head object do we need to do the atomic test + int r = T::append_atomic_test(dpp, astate, op); + if (r < 0) + return r; + + if (astate && + obj_ofs < astate->data.length()) { + unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len); + + r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len); + if (r < 0) + return r; + + len -= chunk_len; 
+ d->offset += chunk_len; + read_ofs += chunk_len; + obj_ofs += chunk_len; + if (!len) + return 0; + } + + auto obj = d->rgwrados->svc.rados->obj(read_obj); + r = obj.open(dpp); + if (r < 0) { + lsubdout(g_ceph_context, rgw, 4) << "failed to open rados context for " << read_obj << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl; + op.read(read_ofs, len, nullptr, nullptr); + + const uint64_t cost = len; + const uint64_t id = obj_ofs; // use logical object offset for sorting replies + + auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id); + return d->flush(std::move(completed)); + } else { + ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << ", is_head_obj=" << is_head_obj << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl; + int r; + + op.read(read_ofs, len, nullptr, nullptr); + + const uint64_t cost = len; + const uint64_t id = obj_ofs; // use logical object offset for sorting replies + oid = read_obj.oid; + + auto obj = d->rgwrados->svc.rados->obj(read_obj); + r = obj.open(dpp); + if (r < 0) { + lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: Error: failed to open rados context for " << read_obj << ", r=" << r << dendl; + return r; + } + + const bool is_compressed = (astate->attrset.find(RGW_ATTR_COMPRESSION) != astate->attrset.end()); + const bool is_encrypted = (astate->attrset.find(RGW_ATTR_CRYPT_MODE) != astate->attrset.end()); + if (read_ofs != 0 || astate->size != astate->accounted_size || is_compressed || is_encrypted) { + d->d3n_bypass_cache_write = true; + lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: " << __func__ << "(): Note - bypassing datacache: oid=" << read_obj.oid << ", read_ofs!=0 = " << read_ofs << ", size=" << astate->size << " != accounted_size=" << astate->accounted_size << ", is_compressed=" 
<< is_compressed << ", is_encrypted=" << is_encrypted << dendl; + auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id); + r = d->flush(std::move(completed)); + return r; + } + + if (d->rgwrados->d3n_data_cache->get(oid, len)) { + // Read From Cache + ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): READ FROM CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl; + auto completed = d->aio->get(obj, rgw::Aio::d3n_cache_op(dpp, d->yield, read_ofs, len, d->rgwrados->d3n_data_cache->cache_location), cost, id); + r = d->flush(std::move(completed)); + if (r < 0) { + lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: " << __func__ << "(): Error: failed to drain/flush, r= " << r << dendl; + } + return r; + } else { + // Write To Cache + ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): WRITE TO CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << " len=" << len << dendl; + auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id); + return d->flush(std::move(completed)); + } + } + lsubdout(g_ceph_context, rgw, 1) << "D3nDataCache: " << __func__ << "(): Warning: Check head object cache handling flow, oid=" << read_obj.oid << dendl; + + return 0; +} + diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc new file mode 100644 index 000000000..a5730e51d --- /dev/null +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -0,0 +1,6762 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/ceph_json.h" +#include "common/RefCountedObj.h" +#include "common/WorkQueue.h" +#include "common/Throttle.h" +#include "common/errno.h" + +#include "rgw_common.h" +#include "rgw_zone.h" +#include "rgw_sync.h" +#include "rgw_data_sync.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rados.h" +#include 
"rgw_cr_rest.h" +#include "rgw_cr_tools.h" +#include "rgw_http_client.h" +#include "rgw_bucket.h" +#include "rgw_bucket_sync.h" +#include "rgw_bucket_sync_cache.h" +#include "rgw_datalog.h" +#include "rgw_metadata.h" +#include "rgw_sync_counters.h" +#include "rgw_sync_error_repo.h" +#include "rgw_sync_module.h" +#include "rgw_sal.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rgw/cls_rgw_client.h" + +#include "services/svc_zone.h" +#include "services/svc_sync_modules.h" + +#include "include/common_fwd.h" +#include "include/random.h" + +#include +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "data sync: ") + +using namespace std; + +static const string datalog_sync_status_oid_prefix = "datalog.sync-status"; +static const string datalog_sync_status_shard_prefix = "datalog.sync-status.shard"; +static const string datalog_sync_full_sync_index_prefix = "data.full-sync.index"; +static const string bucket_full_status_oid_prefix = "bucket.full-sync-status"; +static const string bucket_status_oid_prefix = "bucket.sync-status"; +static const string object_status_oid_prefix = "bucket.sync-status"; + +void rgw_datalog_info::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("num_objects", num_shards, obj); +} + +void rgw_datalog_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key", key, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); +} + +void rgw_datalog_shard_data::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("entries", entries, obj); +}; + +// print a bucket shard with [gen] +std::string to_string(const rgw_bucket_shard& bs, std::optional gen) +{ + constexpr auto digits10 = std::numeric_limits::digits10; + constexpr auto reserve = 2 + digits10; // [value] + auto str = bs.get_key('/', ':', ':', reserve); + 
str.append(1, '['); + str.append(std::to_string(gen.value_or(0))); + str.append(1, ']'); + return str; +} + +class RGWReadDataSyncStatusMarkersCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + RGWDataSyncCtx *sc; + RGWDataSyncEnv *env; + const int num_shards; + int shard_id{0};; + + map& markers; + std::vector& objvs; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to read data sync status: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + RGWReadDataSyncStatusMarkersCR(RGWDataSyncCtx *sc, int num_shards, + map& markers, + std::vector& objvs) + : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS), + sc(sc), env(sc->env), num_shards(num_shards), markers(markers), objvs(objvs) + {} + bool spawn_next() override; +}; + +bool RGWReadDataSyncStatusMarkersCR::spawn_next() +{ + if (shard_id >= num_shards) { + return false; + } + using CR = RGWSimpleRadosReadCR; + spawn(new CR(env->dpp, env->driver, + rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)), + &markers[shard_id], true, &objvs[shard_id]), + false); + shard_id++; + return true; +} + +class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + RGWDataSyncCtx *sc; + RGWDataSyncEnv *env; + + uint64_t max_entries; + int num_shards; + int shard_id{0}; + + string marker; + std::vector& omapkeys; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to list recovering data sync: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + RGWReadDataSyncRecoveringShardsCR(RGWDataSyncCtx *sc, uint64_t _max_entries, int _num_shards, + std::vector& omapkeys) + : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS), sc(sc), env(sc->env), + 
max_entries(_max_entries), num_shards(_num_shards), omapkeys(omapkeys) + {} + bool spawn_next() override; +}; + +bool RGWReadDataSyncRecoveringShardsCR::spawn_next() +{ + if (shard_id >= num_shards) + return false; + + string error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry"; + auto& shard_keys = omapkeys[shard_id]; + shard_keys = std::make_shared(); + spawn(new RGWRadosGetOmapKeysCR(env->driver, rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, error_oid), + marker, max_entries, shard_keys), false); + + ++shard_id; + return true; +} + +class RGWReadDataSyncStatusCoroutine : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_data_sync_status *sync_status; + RGWObjVersionTracker* objv_tracker; + std::vector& objvs; + +public: + RGWReadDataSyncStatusCoroutine(RGWDataSyncCtx *_sc, + rgw_data_sync_status *_status, + RGWObjVersionTracker* objv_tracker, + std::vector& objvs) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(sc->env), sync_status(_status), + objv_tracker(objv_tracker), objvs(objvs) + {} + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWReadDataSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // read sync info + using ReadInfoCR = RGWSimpleRadosReadCR; + yield { + bool empty_on_enoent = false; // fail on ENOENT + call(new ReadInfoCR(dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)), + &sync_status->sync_info, empty_on_enoent, objv_tracker)); + } + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to read sync status info with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + // read shard markers + objvs.resize(sync_status->sync_info.num_shards); + using ReadMarkersCR = RGWReadDataSyncStatusMarkersCR; + yield call(new ReadMarkersCR(sc, sync_status->sync_info.num_shards, + sync_status->sync_markers, objvs)); + if 
(retcode < 0) { + ldpp_dout(dpp, 4) << "failed to read sync status markers with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + RGWRESTReadResource *http_op; + + int shard_id; + RGWDataChangesLogInfo *shard_info; + +public: + RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc, + int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct), + sc(_sc), + sync_env(_sc->env), + http_op(NULL), + shard_id(_shard_id), + shard_info(_shard_info) { + } + + ~RGWReadRemoteDataLogShardInfoCR() override { + if (http_op) { + http_op->put(); + } + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "data" }, + { "id", buf }, + { "info" , NULL }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + int ret = http_op->wait(shard_info, null_yield); + if (ret < 0) { + return set_cr_error(ret); + } + return set_cr_done(); + } + } + return 0; + } +}; + +struct read_remote_data_log_response { + string marker; + bool truncated; + vector entries; + + read_remote_data_log_response() : truncated(false) {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("entries", entries, obj); + }; +}; + +class RGWReadRemoteDataLogShardCR : 
public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + RGWRESTReadResource *http_op = nullptr; + + int shard_id; + const std::string& marker; + string *pnext_marker; + vector *entries; + bool *truncated; + + read_remote_data_log_response response; + std::optional timer; + +public: + RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id, + const std::string& marker, string *pnext_marker, + vector *_entries, + bool *_truncated) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker), + entries(_entries), truncated(_truncated) { + } + ~RGWReadRemoteDataLogShardCR() override { + if (http_op) { + http_op->put(); + } + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "data" }, + { "id", buf }, + { "marker", marker.c_str() }, + { "extra-info", "true" }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + if (sync_env->counters) { + timer.emplace(sync_env->counters, sync_counters::l_poll); + } + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + if (sync_env->counters) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + timer.reset(); + int ret = http_op->wait(&response, null_yield); + if (ret < 0) { + if (sync_env->counters && ret != -ENOENT) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + return set_cr_error(ret); + } + entries->clear(); + entries->swap(response.entries); + *pnext_marker = response.marker; + *truncated = response.truncated; + return 
set_cr_done(); + } + } + return 0; + } +}; + +class RGWReadRemoteDataLogInfoCR : public RGWShardCollectCR { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + int num_shards; + map *datalog_info; + + int shard_id; +#define READ_DATALOG_MAX_CONCURRENT 10 + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to fetch remote datalog info: " + << cpp_strerror(r) << dendl; + } + return r; + } +public: + RGWReadRemoteDataLogInfoCR(RGWDataSyncCtx *_sc, + int _num_shards, + map *_datalog_info) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT), + sc(_sc), sync_env(_sc->env), num_shards(_num_shards), + datalog_info(_datalog_info), shard_id(0) {} + bool spawn_next() override; +}; + +bool RGWReadRemoteDataLogInfoCR::spawn_next() { + if (shard_id >= num_shards) { + return false; + } + spawn(new RGWReadRemoteDataLogShardInfoCR(sc, shard_id, &(*datalog_info)[shard_id]), false); + shard_id++; + return true; +} + +class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + RGWRESTReadResource *http_op; + + int shard_id; + string marker; + uint32_t max_entries; + rgw_datalog_shard_data *result; + +public: + RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id, + const string& _marker, uint32_t _max_entries, + rgw_datalog_shard_data *_result) + : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL), + shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {} + + int send_request(const DoutPrefixProvider *dpp) override { + RGWRESTConn *conn = sc->conn; + + char buf[32]; + snprintf(buf, sizeof(buf), "%d", shard_id); + + char max_entries_buf[32]; + snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries); + + const char *marker_key = (marker.empty() ? 
"" : "marker"); + + rgw_http_param_pair pairs[] = { { "type", "data" }, + { "id", buf }, + { "max-entries", max_entries_buf }, + { marker_key, marker.c_str() }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager); + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return ret; + } + + return 0; + } + + int request_complete() override { + int ret = http_op->wait(result, null_yield); + http_op->put(); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl; + return ret; + } + return 0; + } +}; + +class RGWListRemoteDataLogCR : public RGWShardCollectCR { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + map shards; + int max_entries_per_shard; + map *result; + + map::iterator iter; +#define READ_DATALOG_MAX_CONCURRENT 10 + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to list remote datalog: " + << cpp_strerror(r) << dendl; + } + return r; + } +public: + RGWListRemoteDataLogCR(RGWDataSyncCtx *_sc, + map& _shards, + int _max_entries_per_shard, + map *_result) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT), + sc(_sc), sync_env(_sc->env), max_entries_per_shard(_max_entries_per_shard), + result(_result) { + shards.swap(_shards); + iter = shards.begin(); + } + bool spawn_next() override; +}; + +bool RGWListRemoteDataLogCR::spawn_next() { + if (iter == shards.end()) { + return false; + } + + spawn(new RGWListRemoteDataLogShardCR(sc, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false); + ++iter; + return true; +} + +class RGWInitDataSyncStatusCoroutine : 
public RGWCoroutine { + static constexpr auto lock_name{ "sync_lock"sv }; + RGWDataSyncCtx* const sc; + RGWDataSyncEnv* const sync_env{ sc->env }; + const uint32_t num_shards; + rgw_data_sync_status* const status; + RGWSyncTraceNodeRef tn; + boost::intrusive_ptr lease_cr; + RGWObjVersionTracker& objv_tracker; + std::vector& objvs; + + const rgw_pool& pool{ sync_env->svc->zone->get_zone_params().log_pool }; + const string sync_status_oid{ + RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) }; + + map shards_info; + + +public: + RGWInitDataSyncStatusCoroutine( + RGWDataSyncCtx* _sc, uint32_t num_shards, uint64_t instance_id, + const RGWSyncTraceNodeRef& tn_parent, rgw_data_sync_status* status, + boost::intrusive_ptr lease_cr, + RGWObjVersionTracker& objv_tracker, + std::vector& objvs) + : RGWCoroutine(_sc->cct), sc(_sc), num_shards(num_shards), status(status), + tn(sync_env->sync_tracer->add_node(tn_parent, "init_data_sync_status")), + lease_cr(std::move(lease_cr)), objv_tracker(objv_tracker), objvs(objvs) { + status->sync_info.instance_id = instance_id; + } + + static auto continuous_lease_cr(RGWDataSyncCtx* const sc, + RGWCoroutine* const caller) { + auto lock_duration = sc->cct->_conf->rgw_sync_lease_period; + return new RGWContinuousLeaseCR( + sc->env->async_rados, sc->env->driver, + { sc->env->svc->zone->get_zone_params().log_pool, + RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) }, + string(lock_name), lock_duration, caller, &sc->lcc); + } + + int operate(const DoutPrefixProvider *dpp) override { + int ret; + reenter(this) { + if (!lease_cr->is_locked()) { + drain_all(); + return set_cr_error(-ECANCELED); + } + + using WriteInfoCR = RGWSimpleRadosWriteCR; + yield call(new WriteInfoCR(dpp, sync_env->driver, + rgw_raw_obj{pool, sync_status_oid}, + status->sync_info, &objv_tracker)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode)); + return set_cr_error(retcode); + } + + // In the original 
code we reacquired the lock. Since + // RGWSimpleRadosWriteCR doesn't appear to touch the attributes + // and cls_version works across it, this should be unnecessary. + // Putting a note here just in case. If we see ECANCELED where + // we expect EBUSY, we can revisit this. + + /* fetch current position in logs */ + yield { + RGWRESTConn *conn = sync_env->svc->zone->get_zone_conn(sc->source_zone); + if (!conn) { + tn->log(0, SSTR("ERROR: connection to zone " << sc->source_zone << " does not exist!")); + return set_cr_error(-EIO); + } + for (uint32_t i = 0; i < num_shards; i++) { + spawn(new RGWReadRemoteDataLogShardInfoCR(sc, i, &shards_info[i]), true); + } + } + while (collect(&ret, NULL)) { + if (ret < 0) { + tn->log(0, SSTR("ERROR: failed to read remote data log shards")); + return set_state(RGWCoroutine_Error); + } + yield; + } + yield { + objvs.resize(num_shards); + for (uint32_t i = 0; i < num_shards; i++) { + RGWDataChangesLogInfo& info = shards_info[i]; + auto& marker = status->sync_markers[i]; + marker.next_step_marker = info.marker; + marker.timestamp = info.last_update; + const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, i); + auto& objv = objvs[i]; + objv.generate_new_write_ver(cct); + using WriteMarkerCR = RGWSimpleRadosWriteCR; + spawn(new WriteMarkerCR(dpp, sync_env->driver, + rgw_raw_obj{pool, oid}, marker, &objv), true); + } + } + while (collect(&ret, NULL)) { + if (ret < 0) { + tn->log(0, SSTR("ERROR: failed to write data sync status markers")); + return set_state(RGWCoroutine_Error); + } + yield; + } + + status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps; + yield call(new WriteInfoCR(dpp, sync_env->driver, + rgw_raw_obj{pool, sync_status_oid}, + status->sync_info, &objv_tracker)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode)); + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + 
+RGWRemoteDataLog::RGWRemoteDataLog(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* driver, + RGWAsyncRadosProcessor *async_rados) + : RGWCoroutinesManager(driver->ctx(), driver->getRados()->get_cr_registry()), + dpp(dpp), driver(driver), + cct(driver->ctx()), cr_registry(driver->getRados()->get_cr_registry()), + async_rados(async_rados), + http_manager(driver->ctx(), completion_mgr), + data_sync_cr(NULL), + initialized(false) +{ +} + +int RGWRemoteDataLog::read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info) +{ + rgw_http_param_pair pairs[] = { { "type", "data" }, + { NULL, NULL } }; + + int ret = sc.conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch datalog info" << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << "remote datalog, num_shards=" << log_info->num_shards << dendl; + + return 0; +} + +int RGWRemoteDataLog::read_source_log_shards_info(const DoutPrefixProvider *dpp, map *shards_info) +{ + rgw_datalog_info log_info; + int ret = read_log_info(dpp, &log_info); + if (ret < 0) { + return ret; + } + + return run(dpp, new RGWReadRemoteDataLogInfoCR(&sc, log_info.num_shards, shards_info)); +} + +int RGWRemoteDataLog::read_source_log_shards_next(const DoutPrefixProvider *dpp, map shard_markers, map *result) +{ + return run(dpp, new RGWListRemoteDataLogCR(&sc, shard_markers, 1, result)); +} + +int RGWRemoteDataLog::init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger, + RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& _sync_module, + PerfCounters* counters) +{ + sync_env.init(dpp, cct, driver, driver->svc(), async_rados, &http_manager, _error_logger, + _sync_tracer, _sync_module, counters); + sc.init(&sync_env, _conn, _source_zone); + + if (initialized) { + return 0; + } + + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; 
+ return ret; + } + + tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "data"); + + initialized = true; + + return 0; +} + +void RGWRemoteDataLog::finish() +{ + stop(); +} + +int RGWRemoteDataLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status) +{ + // cannot run concurrently with run_sync(), so run in a separate manager + RGWObjVersionTracker objv; + std::vector shard_objvs; + RGWCoroutinesManager crs(cct, cr_registry); + RGWHTTPManager http_manager(cct, crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + + RGWDataSyncCtx sc_local = sc; + sc_local.env = &sync_env_local; + + ret = crs.run(dpp, new RGWReadDataSyncStatusCoroutine(&sc_local, sync_status, + &objv, shard_objvs)); + http_manager.stop(); + return ret; +} + +int RGWRemoteDataLog::read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, set& recovering_shards) +{ + // cannot run concurrently with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(cct, cr_registry); + RGWHTTPManager http_manager(cct, crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + + RGWDataSyncCtx sc_local = sc; + sc_local.env = &sync_env_local; + + std::vector omapkeys; + omapkeys.resize(num_shards); + uint64_t max_entries{1}; + + ret = crs.run(dpp, new RGWReadDataSyncRecoveringShardsCR(&sc_local, max_entries, num_shards, omapkeys)); + http_manager.stop(); + + if (ret == 0) { + for (int i = 0; i < num_shards; i++) { + if (omapkeys[i]->entries.size() != 0) { + recovering_shards.insert(i); + } + } + } + + return ret; +} + 
+namespace RGWRDL { +class DataSyncInitCR : public RGWCoroutine { + RGWDataSyncCtx* const sc; + const uint32_t num_shards; + uint64_t instance_id; + const RGWSyncTraceNodeRef& tn; + rgw_data_sync_status* const sync_status; + std::vector& objvs; + + boost::intrusive_ptr lease_cr; + + RGWObjVersionTracker objv_tracker; + +public: + + DataSyncInitCR(RGWDataSyncCtx* sc, uint32_t num_shards, uint64_t instance_id, + const RGWSyncTraceNodeRef& tn, + rgw_data_sync_status* sync_status, + std::vector& objvs) + : RGWCoroutine(sc->cct), sc(sc), num_shards(num_shards), + instance_id(instance_id), tn(tn), + sync_status(sync_status), objvs(objvs) {} + + ~DataSyncInitCR() override { + if (lease_cr) { + lease_cr->abort(); + } + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + lease_cr.reset( + RGWInitDataSyncStatusCoroutine::continuous_lease_cr(sc, this)); + + yield spawn(lease_cr.get(), false); + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + tn->log(5, "ERROR: failed to take data sync status lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(lease_cr->get_ret_status()); + } + tn->log(5, "waiting on data sync status lease"); + yield set_sleeping(true); + } + tn->log(5, "acquired data sync status lease"); + objv_tracker.generate_new_write_ver(sc->cct); + yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id, + tn, sync_status, lease_cr, + objv_tracker, objvs)); + lease_cr->go_down(); + lease_cr.reset(); + drain_all(); + if (retcode < 0) { + set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; +} + +int RGWRemoteDataLog::init_sync_status(const DoutPrefixProvider *dpp, int num_shards) +{ + rgw_data_sync_status sync_status; + std::vector objvs; + sync_status.sync_info.num_shards = num_shards; + + RGWCoroutinesManager crs(cct, cr_registry); + RGWHTTPManager http_manager(cct, crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + 
ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + auto instance_id = ceph::util::generate_random_number(); + RGWDataSyncCtx sc_local = sc; + sc_local.env = &sync_env_local; + ret = crs.run(dpp, new RGWRDL::DataSyncInitCR(&sc_local, num_shards, + instance_id, tn, &sync_status, objvs)); + http_manager.stop(); + return ret; +} + +static string full_data_sync_index_shard_oid(const rgw_zone_id& source_zone, int shard_id) +{ + char buf[datalog_sync_full_sync_index_prefix.size() + 1 + source_zone.id.size() + 1 + 16]; + snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_full_sync_index_prefix.c_str(), source_zone.id.c_str(), shard_id); + return string(buf); +} + +struct read_metadata_list { + string marker; + bool truncated; + list keys; + int count; + + read_metadata_list() : truncated(false), count(0) {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("keys", keys, obj); + JSONDecoder::decode_json("count", count, obj); + } +}; + +struct bucket_instance_meta_info { + string key; + obj_version ver; + utime_t mtime; + RGWBucketInstanceMetadataObject data; + + bucket_instance_meta_info() {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("ver", ver, obj); + JSONDecoder::decode_json("mtime", mtime, obj); + JSONDecoder::decode_json("data", data, obj); + } +}; + +class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + const string instance_key; + + rgw_bucket_index_marker_info *info; + +public: + RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncCtx *_sc, + const rgw_bucket& bucket, + rgw_bucket_index_marker_info *_info) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + 
instance_key(bucket.get_key()), info(_info) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield { + rgw_http_param_pair pairs[] = { { "type" , "bucket-index" }, + { "bucket-instance", instance_key.c_str() }, + { "info" , NULL }, + { NULL, NULL } }; + + string p = "/admin/log/"; + call(new RGWReadRESTResourceCR(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, info)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + + return set_cr_done(); + } + return 0; + } +}; + + +class RGWListBucketIndexesCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env = sc->env; + + rgw::sal::RadosStore* driver = sync_env->driver; + + rgw_data_sync_status *sync_status; + std::vector& objvs; + + int req_ret = 0; + int ret = 0; + + list::iterator iter; + + unique_ptr entries_index; + string oid_prefix = + datalog_sync_full_sync_index_prefix + "." + sc->source_zone.id; + + string path = "/admin/metadata/bucket.instance"; + bucket_instance_meta_info meta_info; + string key; + + bool failed = false; + bool truncated = false; + read_metadata_list result; + +public: + RGWListBucketIndexesCR(RGWDataSyncCtx* sc, + rgw_data_sync_status* sync_status, std::vector& objvs) + : RGWCoroutine(sc->cct), sc(sc), sync_status(sync_status), objvs(objvs) {} + ~RGWListBucketIndexesCR() override { } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + entries_index = std::make_unique( + sync_env->async_rados, driver, this, + cct->_conf->rgw_data_log_num_shards, + sync_env->svc->zone->get_zone_params().log_pool, + oid_prefix); + yield; // yield so OmapAppendCRs can start + + do { + yield { + string entrypoint = "/admin/metadata/bucket.instance"s; + + rgw_http_param_pair pairs[] = {{"max-entries", "1000"}, + {"marker", result.marker.c_str()}, + {NULL, NULL}}; + + call(new RGWReadRESTResourceCR( + sync_env->cct, sc->conn, sync_env->http_manager, + entrypoint, pairs, &result)); + } + if (retcode < 0) { + 
ldpp_dout(dpp, 0) + << "ERROR: failed to fetch metadata for section bucket.instance" + << dendl; + return set_cr_error(retcode); + } + + for (iter = result.keys.begin(); iter != result.keys.end(); ++iter) { + ldpp_dout(dpp, 20) << "list metadata: section=bucket.instance key=" + << *iter << dendl; + key = *iter; + + yield { + rgw_http_param_pair pairs[] = {{"key", key.c_str()}, + {NULL, NULL}}; + + call(new RGWReadRESTResourceCR( + sync_env->cct, sc->conn, sync_env->http_manager, path, pairs, + &meta_info)); + } + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata for key: " + << key << dendl; + return set_cr_error(retcode); + } + // Now that bucket full sync is bucket-wide instead of + // per-shard, we only need to register a single shard of + // each bucket to guarantee that sync will see everything + // that happened before data full sync starts. This also + // means we don't have to care about the bucket's current + // shard count. + yield entries_index->append( + fmt::format("{}:{}", key, 0), + sync_env->svc->datalog_rados->get_log_shard_id( + meta_info.data.get_bucket_info().bucket, 0)); + } + truncated = result.truncated; + } while (truncated); + + yield { + if (!entries_index->finish()) { + failed = true; + } + } + if (!failed) { + for (auto iter = sync_status->sync_markers.begin(); + iter != sync_status->sync_markers.end(); + ++iter) { + int shard_id = (int)iter->first; + rgw_data_sync_marker& marker = iter->second; + marker.total_entries = entries_index->get_total_entries(shard_id); + spawn(new RGWSimpleRadosWriteCR( + dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, + RGWDataSyncStatusManager::shard_obj_name( + sc->source_zone, shard_id)), + marker, &objvs[shard_id]), + true); + } + } else { + yield call(sync_env->error_logger->log_error_cr( + dpp, sc->conn->get_remote_id(), "data.init", "", + EIO, string("failed to build bucket instances map"))); + } + while (collect(&ret, NULL)) { + if (ret < 
0) { + yield call(sync_env->error_logger->log_error_cr( + dpp, sc->conn->get_remote_id(), "data.init", "", + -ret, string("failed to driver sync status: ") + + cpp_strerror(-ret))); + req_ret = ret; + } + yield; + } + drain_all(); + if (req_ret < 0) { + yield return set_cr_error(req_ret); + } + yield return set_cr_done(); + } + return 0; + } +}; + +#define DATA_SYNC_UPDATE_MARKER_WINDOW 1 + +class RGWDataSyncShardMarkerTrack : public RGWSyncShardMarkerTrack { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + string marker_oid; + rgw_data_sync_marker sync_marker; + RGWSyncTraceNodeRef tn; + RGWObjVersionTracker& objv; + +public: + RGWDataSyncShardMarkerTrack(RGWDataSyncCtx *_sc, + const string& _marker_oid, + const rgw_data_sync_marker& _marker, + RGWSyncTraceNodeRef& _tn, RGWObjVersionTracker& objv) : RGWSyncShardMarkerTrack(DATA_SYNC_UPDATE_MARKER_WINDOW), + sc(_sc), sync_env(_sc->env), + marker_oid(_marker_oid), + sync_marker(_marker), + tn(_tn), objv(objv) {} + + RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override { + sync_marker.marker = new_marker; + sync_marker.pos = index_pos; + sync_marker.timestamp = timestamp; + + tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker)); + + return new RGWSimpleRadosWriteCR(sync_env->dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, marker_oid), + sync_marker, &objv); + } + + RGWOrderCallCR *allocate_order_control_cr() override { + return new RGWLastCallerWinsCR(sync_env->cct); + } +}; + +// ostream wrappers to print buckets without copying strings +struct bucket_str { + const rgw_bucket& b; + explicit bucket_str(const rgw_bucket& b) : b(b) {} +}; +std::ostream& operator<<(std::ostream& out, const bucket_str& rhs) { + auto& b = rhs.b; + if (!b.tenant.empty()) { + out << b.tenant << '/'; + } + out << b.name; + if (!b.bucket_id.empty()) { + out << ':' << b.bucket_id; + } + return out; +} + 
+struct bucket_str_noinstance { + const rgw_bucket& b; + explicit bucket_str_noinstance(const rgw_bucket& b) : b(b) {} +}; +std::ostream& operator<<(std::ostream& out, const bucket_str_noinstance& rhs) { + auto& b = rhs.b; + if (!b.tenant.empty()) { + out << b.tenant << '/'; + } + out << b.name; + return out; +} + +struct bucket_shard_str { + const rgw_bucket_shard& bs; + explicit bucket_shard_str(const rgw_bucket_shard& bs) : bs(bs) {} +}; +std::ostream& operator<<(std::ostream& out, const bucket_shard_str& rhs) { + auto& bs = rhs.bs; + out << bucket_str{bs.bucket}; + if (bs.shard_id >= 0) { + out << ':' << bs.shard_id; + } + return out; +} +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter : fmt::ostream_formatter {}; +#endif + +struct all_bucket_info { + RGWBucketInfo bucket_info; + map attrs; +}; + +struct rgw_sync_pipe_info_entity +{ +private: + RGWBucketInfo bucket_info; + map bucket_attrs; + bool _has_bucket_info{false}; + +public: + rgw_zone_id zone; + + rgw_sync_pipe_info_entity() {} + rgw_sync_pipe_info_entity(const rgw_sync_bucket_entity& e, + std::optional& binfo) { + if (e.zone) { + zone = *e.zone; + } + if (!e.bucket) { + return; + } + if (!binfo || + binfo->bucket_info.bucket != *e.bucket) { + bucket_info.bucket = *e.bucket; + } else { + set_bucket_info(*binfo); + } + } + + void update_empty_bucket_info(const std::map& buckets_info) { + if (_has_bucket_info) { + return; + } + if (bucket_info.bucket.name.empty()) { + return; + } + + auto iter = buckets_info.find(bucket_info.bucket); + if (iter == buckets_info.end()) { + return; + } + + set_bucket_info(iter->second); + } + + bool has_bucket_info() const { + return _has_bucket_info; + } + + void set_bucket_info(const all_bucket_info& all_info) { + bucket_info = all_info.bucket_info; + bucket_attrs = all_info.attrs; + _has_bucket_info = true; + } + + const RGWBucketInfo& get_bucket_info() const { + return bucket_info; + } + + const rgw_bucket& get_bucket() const { + return bucket_info.bucket; + 
} + + bool operator<(const rgw_sync_pipe_info_entity& e) const { + if (zone < e.zone) { + return false; + } + if (zone > e.zone) { + return true; + } + return (bucket_info.bucket < e.bucket_info.bucket); + } +}; + +std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_info_entity& e) { + auto& bucket = e.get_bucket_info().bucket; + + out << e.zone << ":" << bucket.get_key(); + return out; +} + +struct rgw_sync_pipe_handler_info { + RGWBucketSyncFlowManager::pipe_handler handler; + rgw_sync_pipe_info_entity source; + rgw_sync_pipe_info_entity target; + + rgw_sync_pipe_handler_info() {} + rgw_sync_pipe_handler_info(const RGWBucketSyncFlowManager::pipe_handler& _handler, + std::optional source_bucket_info, + std::optional target_bucket_info) : handler(_handler), + source(handler.source, source_bucket_info), + target(handler.dest, target_bucket_info) { + } + + bool operator<(const rgw_sync_pipe_handler_info& p) const { + if (source < p.source) { + return true; + } + if (p.source < source) { + return false; + } + return (target < p.target); + } + + void update_empty_bucket_info(const std::map& buckets_info) { + source.update_empty_bucket_info(buckets_info); + target.update_empty_bucket_info(buckets_info); + } +}; + +std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_handler_info& p) { + out << p.source << ">" << p.target; + return out; +} + +struct rgw_sync_pipe_info_set { + std::set handlers; + + using iterator = std::set::iterator; + + void clear() { + handlers.clear(); + } + + void insert(const RGWBucketSyncFlowManager::pipe_handler& handler, + std::optional& source_bucket_info, + std::optional& target_bucket_info) { + rgw_sync_pipe_handler_info p(handler, source_bucket_info, target_bucket_info); + handlers.insert(p); + } + + iterator begin() { + return handlers.begin(); + } + + iterator end() { + return handlers.end(); + } + + size_t size() const { + return handlers.size(); + } + + bool empty() const { + return handlers.empty(); + } + + void 
update_empty_bucket_info(const std::map& buckets_info) { + if (buckets_info.empty()) { + return; + } + + std::set p; + + for (auto pipe : handlers) { + pipe.update_empty_bucket_info(buckets_info); + p.insert(pipe); + } + + handlers = std::move(p); + } +}; + +class RGWRunBucketSourcesSyncCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + boost::intrusive_ptr lease_cr; + + rgw_sync_pipe_info_set pipes; + rgw_sync_pipe_info_set::iterator siter; + + rgw_bucket_sync_pair_info sync_pair; + + RGWSyncTraceNodeRef tn; + ceph::real_time* progress; + std::vector shard_progress; + std::vector::iterator cur_shard_progress; + + RGWRESTConn *conn{nullptr}; + rgw_zone_id last_zone; + + std::optional gen; + rgw_bucket_index_marker_info marker_info; + BucketIndexShardsManager marker_mgr; + +public: + RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc, + boost::intrusive_ptr lease_cr, + const rgw_bucket_shard& source_bs, + const RGWSyncTraceNodeRef& _tn_parent, + std::optional gen, + ceph::real_time* progress); + + int operate(const DoutPrefixProvider *dpp) override; +}; + +class RGWDataSyncSingleEntryCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw::bucket_sync::Handle state; // cached bucket-shard state + rgw_data_sync_obligation obligation; // input obligation + std::optional complete; // obligation to complete + uint32_t obligation_counter = 0; + RGWDataSyncShardMarkerTrack *marker_tracker; + rgw_raw_obj error_repo; + boost::intrusive_ptr lease_cr; + RGWSyncTraceNodeRef tn; + + ceph::real_time progress; + int sync_status = 0; +public: + RGWDataSyncSingleEntryCR(RGWDataSyncCtx *_sc, rgw::bucket_sync::Handle state, + rgw_data_sync_obligation _obligation, + RGWDataSyncShardMarkerTrack *_marker_tracker, + const rgw_raw_obj& error_repo, + boost::intrusive_ptr lease_cr, + const RGWSyncTraceNodeRef& _tn_parent) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + state(std::move(state)), obligation(std::move(_obligation)), + 
marker_tracker(_marker_tracker), error_repo(error_repo), + lease_cr(std::move(lease_cr)) { + set_description() << "data sync single entry (source_zone=" << sc->source_zone << ") " << obligation; + tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", to_string(obligation.bs, obligation.gen)); + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + if (state->obligation) { + // this is already syncing in another DataSyncSingleEntryCR + if (state->obligation->timestamp < obligation.timestamp) { + // cancel existing obligation and overwrite it + tn->log(10, SSTR("canceling existing obligation " << *state->obligation)); + complete = std::move(*state->obligation); + *state->obligation = std::move(obligation); + state->counter++; + } else { + // cancel new obligation + tn->log(10, SSTR("canceling new obligation " << obligation)); + complete = std::move(obligation); + } + } else { + // start syncing a new obligation + state->obligation = obligation; + obligation_counter = state->counter; + state->counter++; + + // loop until the latest obligation is satisfied, because other callers + // may update the obligation while we're syncing + while ((state->obligation->timestamp == ceph::real_time() || + state->progress_timestamp < state->obligation->timestamp) && + obligation_counter != state->counter) { + obligation_counter = state->counter; + progress = ceph::real_time{}; + + ldout(cct, 4) << "starting sync on " << bucket_shard_str{state->key.first} + << ' ' << *state->obligation << " progress timestamp " << state->progress_timestamp + << " progress " << progress << dendl; + yield call(new RGWRunBucketSourcesSyncCR(sc, lease_cr, + state->key.first, tn, + state->obligation->gen, + &progress)); + if (retcode < 0) { + break; + } + state->progress_timestamp = std::max(progress, state->progress_timestamp); + } + // any new obligations will process themselves + complete = std::move(*state->obligation); + state->obligation.reset(); + + tn->log(10, 
SSTR("sync finished on " << bucket_shard_str{state->key.first} + << " progress=" << progress << ' ' << complete << " r=" << retcode)); + } + sync_status = retcode; + + if (sync_status == -ENOENT) { + // this was added when 'tenant/' was added to datalog entries, because + // preexisting tenant buckets could never sync and would stay in the + // error_repo forever + tn->log(0, SSTR("WARNING: skipping data log entry for missing bucket " << complete->bs)); + sync_status = 0; + } + + if (sync_status < 0) { + // write actual sync failures for 'radosgw-admin sync error list' + if (sync_status != -EBUSY && sync_status != -EAGAIN) { + yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data", + to_string(complete->bs, complete->gen), + -sync_status, string("failed to sync bucket instance: ") + cpp_strerror(-sync_status))); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to log sync failure: retcode=" << retcode)); + } + } + if (complete->timestamp != ceph::real_time{}) { + tn->log(10, SSTR("writing " << *complete << " to error repo for retry")); + yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo, + rgw::error_repo::encode_key(complete->bs, complete->gen), + complete->timestamp)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to log sync failure in error repo: retcode=" << retcode)); + } + } + } else if (complete->retry) { + yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo, + rgw::error_repo::encode_key(complete->bs, complete->gen), + complete->timestamp)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to remove omap key from error repo (" + << error_repo << " retcode=" << retcode)); + } + } + /* FIXME: what do do in case of error */ + if (marker_tracker && !complete->marker.empty()) { + /* update marker */ + yield call(marker_tracker->finish(complete->marker)); + if (retcode < 0) { + return set_cr_error(retcode); + } + } + if (sync_status == 0) { + sync_status 
= retcode; + } + if (sync_status < 0) { + return set_cr_error(sync_status); + } + return set_cr_done(); + } + return 0; + } +}; + +rgw_raw_obj datalog_oid_for_error_repo(RGWDataSyncCtx *sc, rgw::sal::RadosStore* driver, + rgw_pool& pool, rgw_bucket_shard& bs) { + int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs); + string oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, datalog_shard); + return rgw_raw_obj(pool, oid + ".retry"); + } + +class RGWDataIncrementalSyncFullObligationCR: public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_bucket_shard source_bs; + rgw_raw_obj error_repo; + std::string error_marker; + ceph::real_time timestamp; + RGWSyncTraceNodeRef tn; + rgw_bucket_index_marker_info remote_info; + rgw_pool pool; + uint32_t sid; + rgw_bucket_shard bs; + std::vector::const_iterator each; + +public: + RGWDataIncrementalSyncFullObligationCR(RGWDataSyncCtx *_sc, rgw_bucket_shard& _source_bs, + const rgw_raw_obj& error_repo, const std::string& _error_marker, + ceph::real_time& _timestamp, RGWSyncTraceNodeRef& _tn) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), source_bs(_source_bs), + error_repo(error_repo), error_marker(_error_marker), timestamp(_timestamp), + tn(sync_env->sync_tracer->add_node(_tn, "error_repo", SSTR(bucket_shard_str(source_bs)))) + {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + each = remote_info.generations.cbegin(); + for (; each != remote_info.generations.cend(); each++) { + for (sid = 0; sid < each->num_shards; sid++) { + bs.bucket = source_bs.bucket; + bs.shard_id = sid; + pool = sync_env->svc->zone->get_zone_params().log_pool; + error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs); + tn->log(10, SSTR("writing shard_id " << sid << " of gen " << each->gen << " to error 
repo for retry")); + yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo, + rgw::error_repo::encode_key(bs, each->gen), + timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), + [&](uint64_t stack_id, int ret) { + if (ret < 0) { + retcode = ret; + } + return 0; + }); + } + } + drain_all_cb([&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, SSTR("writing to error repo returned error: " << ret)); + } + return ret; + }); + + // once everything succeeds, remove the full sync obligation from the error repo + yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo, + error_marker, timestamp)); + return set_cr_done(); + } + return 0; + } +}; + +RGWCoroutine* data_sync_single_entry(RGWDataSyncCtx *sc, const rgw_bucket_shard& src, + std::optional gen, + const std::string marker, + ceph::real_time timestamp, + boost::intrusive_ptr lease_cr, + boost::intrusive_ptr bucket_shard_cache, + RGWDataSyncShardMarkerTrack* marker_tracker, + rgw_raw_obj error_repo, + RGWSyncTraceNodeRef& tn, + bool retry) { + auto state = bucket_shard_cache->get(src, gen); + auto obligation = rgw_data_sync_obligation{src, gen, marker, timestamp, retry}; + return new RGWDataSyncSingleEntryCR(sc, std::move(state), std::move(obligation), + &*marker_tracker, error_repo, + lease_cr.get(), tn); +} + +static ceph::real_time timestamp_for_bucket_shard(rgw::sal::RadosStore* driver, + const rgw_data_sync_status& sync_status, + const rgw_bucket_shard& bs) { + int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs); + auto status = sync_status.sync_markers.find(datalog_shard); + if (status == sync_status.sync_markers.end()) { + return ceph::real_clock::zero(); + } + return status->second.timestamp; +} + +class RGWDataFullSyncSingleEntryCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_pool pool; + rgw_bucket_shard source_bs; + const std::string key; + rgw_data_sync_status 
sync_status; + rgw_raw_obj error_repo; + ceph::real_time timestamp; + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr bucket_shard_cache; + RGWDataSyncShardMarkerTrack* marker_tracker; + RGWSyncTraceNodeRef tn; + rgw_bucket_index_marker_info remote_info; + uint32_t sid; + std::vector::iterator each; + uint64_t i{0}; + RGWCoroutine* shard_cr = nullptr; + bool first_shard = true; + bool error_inject; + +public: + RGWDataFullSyncSingleEntryCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool, const rgw_bucket_shard& _source_bs, + const std::string& _key, const rgw_data_sync_status& sync_status, const rgw_raw_obj& _error_repo, + ceph::real_time _timestamp, boost::intrusive_ptr _lease_cr, + boost::intrusive_ptr _bucket_shard_cache, + RGWDataSyncShardMarkerTrack* _marker_tracker, + RGWSyncTraceNodeRef& _tn) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), pool(_pool), source_bs(_source_bs), key(_key), + error_repo(_error_repo), timestamp(_timestamp), lease_cr(std::move(_lease_cr)), + bucket_shard_cache(_bucket_shard_cache), marker_tracker(_marker_tracker), tn(_tn) { + error_inject = (sync_env->cct->_conf->rgw_sync_data_full_inject_err_probability > 0); + } + + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + if (error_inject && + rand() % 10000 < cct->_conf->rgw_sync_data_full_inject_err_probability * 10000.0) { + tn->log(0, SSTR("injecting read bilog info error on key=" << key)); + retcode = -ENOENT; + } else { + tn->log(0, SSTR("read bilog info key=" << key)); + yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info)); + } + + if (retcode < 0) { + tn->log(10, SSTR("full sync: failed to read remote bucket info. 
Writing " + << source_bs.shard_id << " to error repo for retry")); + yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo, + rgw::error_repo::encode_key(source_bs, std::nullopt), + timestamp)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to log " << source_bs.shard_id << " in error repo: retcode=" << retcode)); + } + yield call(marker_tracker->finish(key)); + return set_cr_error(retcode); + } + + //wait to sync the first shard of the oldest generation and then sync all other shards. + //if any of the operations fail at any time, write them into error repo for later retry. + + each = remote_info.generations.begin(); + for (; each != remote_info.generations.end(); each++) { + for (sid = 0; sid < each->num_shards; sid++) { + source_bs.shard_id = sid; + // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs + error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs); + timestamp = timestamp_for_bucket_shard(sync_env->driver, sync_status, source_bs); + if (retcode < 0) { + tn->log(10, SSTR("Write " << source_bs.shard_id << " to error repo for retry")); + yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo, + rgw::error_repo::encode_key(source_bs, each->gen), + timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), std::nullopt); + } else { + shard_cr = data_sync_single_entry(sc, source_bs, each->gen, key, timestamp, + lease_cr, bucket_shard_cache, nullptr, error_repo, tn, false); + tn->log(10, SSTR("full sync: syncing shard_id " << sid << " of gen " << each->gen)); + if (first_shard) { + yield call(shard_cr); + first_shard = false; + } else { + yield_spawn_window(shard_cr, sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), + [&](uint64_t stack_id, int ret) { + if (ret < 0) { + retcode = ret; + } + return retcode; + }); + } + } + } + drain_all_cb([&](uint64_t stack_id, int ret) { + if (ret < 0) { + 
retcode = ret; + } + return retcode; + }); + } + + yield call(marker_tracker->finish(key)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + return set_cr_done(); + } + return 0; + } +}; + +class RGWDataBaseSyncShardCR : public RGWCoroutine { +protected: + RGWDataSyncCtx *const sc; + const rgw_pool& pool; + const uint32_t shard_id; + rgw_data_sync_marker& sync_marker; + RGWSyncTraceNodeRef tn; + const string& status_oid; + const rgw_raw_obj& error_repo; + boost::intrusive_ptr lease_cr; + const rgw_data_sync_status& sync_status; + RGWObjVersionTracker& objv; + boost::intrusive_ptr bucket_shard_cache; + + std::optional marker_tracker; + RGWRadosGetOmapValsCR::ResultPtr omapvals; + rgw_bucket_shard source_bs; + + int parse_bucket_key(const std::string& key, rgw_bucket_shard& bs) const { + int ret = rgw_bucket_parse_bucket_key(sc->env->cct, key, + &bs.bucket, &bs.shard_id); + //for the case of num_shards 0, shard_id gets a value of -1 + //because of the way bucket instance gets parsed in the absence of shard_id delimiter. + //interpret it as a non-negative value. 
+ if (ret == 0) { + if (bs.shard_id < 0) { + bs.shard_id = 0; + } + } + return ret; + } + + RGWDataBaseSyncShardCR( + RGWDataSyncCtx *const _sc, const rgw_pool& pool, const uint32_t shard_id, + rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn, + const string& status_oid, const rgw_raw_obj& error_repo, + boost::intrusive_ptr lease_cr, + const rgw_data_sync_status& sync_status, + RGWObjVersionTracker& objv, + const boost::intrusive_ptr& bucket_shard_cache) + : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id), + sync_marker(sync_marker), tn(tn), status_oid(status_oid), + error_repo(error_repo), lease_cr(std::move(lease_cr)), + sync_status(sync_status), objv(objv), + bucket_shard_cache(bucket_shard_cache) {} +}; + +class RGWDataFullSyncShardCR : public RGWDataBaseSyncShardCR { + static constexpr auto OMAP_GET_MAX_ENTRIES = 100; + + string oid; + uint64_t total_entries = 0; + ceph::real_time entry_timestamp; + std::map entries; + std::map::iterator iter; + string error_marker; + +public: + + RGWDataFullSyncShardCR( + RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id, + rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn, + const string& status_oid, const rgw_raw_obj& error_repo, + boost::intrusive_ptr lease_cr, + const rgw_data_sync_status& sync_status, RGWObjVersionTracker& objv, + const boost::intrusive_ptr& bucket_shard_cache) + : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn, + status_oid, error_repo, std::move(lease_cr), + sync_status, objv, bucket_shard_cache) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + tn->log(10, "start full sync"); + oid = full_data_sync_index_shard_oid(sc->source_zone, shard_id); + marker_tracker.emplace(sc, status_oid, sync_marker, tn, objv); + total_entries = sync_marker.pos; + entry_timestamp = sync_marker.timestamp; // time when full sync started + do { + if (!lease_cr->is_locked()) { + drain_all(); + tn->log(1, "lease is lost, abort"); + 
return set_cr_error(-ECANCELED); + } + omapvals = std::make_shared(); + yield call(new RGWRadosGetOmapValsCR(sc->env->driver, + rgw_raw_obj(pool, oid), + sync_marker.marker, + OMAP_GET_MAX_ENTRIES, omapvals)); + if (retcode < 0) { + drain_all(); + return set_cr_error(retcode); + } + entries = std::move(omapvals->entries); + if (entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync")); + iter = entries.begin(); + for (; iter != entries.end(); ++iter) { + retcode = parse_bucket_key(iter->first, source_bs); + if (retcode < 0) { + tn->log(1, SSTR("failed to parse bucket shard: " << iter->first)); + marker_tracker->try_update_high_marker(iter->first, 0, + entry_timestamp); + continue; + } + tn->log(20, SSTR("full sync: " << iter->first)); + total_entries++; + if (!marker_tracker->start(iter->first, total_entries, + entry_timestamp)) { + tn->log(0, SSTR("ERROR: cannot start syncing " << iter->first + << ". 
Duplicate entry?")); + } else { + tn->log(10, SSTR("timestamp for " << iter->first << " is :" << entry_timestamp)); + yield_spawn_window(new RGWDataFullSyncSingleEntryCR( + sc, pool, source_bs, iter->first, sync_status, + error_repo, entry_timestamp, lease_cr, + bucket_shard_cache, &*marker_tracker, tn), + sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), + std::nullopt); + } + sync_marker.marker = iter->first; + } + } while (omapvals->more); + omapvals.reset(); + + drain_all(); + + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + + /* update marker to reflect we're done with full sync */ + sync_marker.state = rgw_data_sync_marker::IncrementalSync; + sync_marker.marker = sync_marker.next_step_marker; + sync_marker.next_step_marker.clear(); + yield call(new RGWSimpleRadosWriteCR( + sc->env->dpp, sc->env->driver, + rgw_raw_obj(pool, status_oid), sync_marker, &objv)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to set sync marker: retcode=" << retcode)); + return set_cr_error(retcode); + } + + // clean up full sync index, ignoring errors + yield call(new RGWRadosRemoveCR(sc->env->driver, {pool, oid})); + + // transition to incremental sync + return set_cr_done(); + } + return 0; + } +}; + +class RGWDataIncSyncShardCR : public RGWDataBaseSyncShardCR { + static constexpr int max_error_entries = 10; + static constexpr uint32_t retry_backoff_secs = 60; + + ceph::mutex& inc_lock; + bc::flat_set& modified_shards; + + bc::flat_set current_modified; + decltype(current_modified)::iterator modified_iter; + + ceph::coarse_real_time error_retry_time; + string error_marker; + std::map error_entries; + decltype(error_entries)::iterator iter; + ceph::real_time entry_timestamp; + std::optional gen; + + string next_marker; + vector log_entries; + decltype(log_entries)::iterator log_iter; + bool truncated = false; + int cbret = 0; + + utime_t get_idle_interval() const { + ceph::timespan interval = std::chrono::seconds(cct->_conf->rgw_data_sync_poll_interval); + if 
(!ceph::coarse_real_clock::is_zero(error_retry_time)) { + auto now = ceph::coarse_real_clock::now(); + if (error_retry_time > now) { + auto d = error_retry_time - now; + if (interval > d) { + interval = d; + } + } + } + // convert timespan -> time_point -> utime_t + return utime_t(ceph::coarse_real_clock::zero() + interval); + } + + +public: + + RGWDataIncSyncShardCR( + RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id, + rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn, + const string& status_oid, const rgw_raw_obj& error_repo, + boost::intrusive_ptr lease_cr, + const rgw_data_sync_status& sync_status, RGWObjVersionTracker& objv, + const boost::intrusive_ptr& bucket_shard_cache, + ceph::mutex& inc_lock, + bc::flat_set& modified_shards) + : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn, + status_oid, error_repo, std::move(lease_cr), + sync_status, objv, bucket_shard_cache), + inc_lock(inc_lock), modified_shards(modified_shards) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + tn->log(10, "start incremental sync"); + marker_tracker.emplace(sc, status_oid, sync_marker, tn, objv); + do { + if (!lease_cr->is_locked()) { + drain_all(); + tn->log(1, "lease is lost, abort"); + return set_cr_error(-ECANCELED); + } + { + current_modified.clear(); + std::unique_lock il(inc_lock); + current_modified.swap(modified_shards); + il.unlock(); + } + + if (current_modified.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + /* process out of band updates */ + for (modified_iter = current_modified.begin(); + modified_iter != current_modified.end(); + ++modified_iter) { + if (!lease_cr->is_locked()) { + drain_all(); + yield call(marker_tracker->flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + retcode = 
parse_bucket_key(modified_iter->key, source_bs); + if (retcode < 0) { + tn->log(1, SSTR("failed to parse bucket shard: " + << modified_iter->key)); + continue; + } + tn->log(20, SSTR("received async update notification: " + << modified_iter->key)); + spawn(data_sync_single_entry(sc, source_bs, modified_iter->gen, {}, + ceph::real_time{}, lease_cr, + bucket_shard_cache, &*marker_tracker, + error_repo, tn, false), false); + } + + if (error_retry_time <= ceph::coarse_real_clock::now()) { + /* process bucket shards that previously failed */ + omapvals = std::make_shared(); + yield call(new RGWRadosGetOmapValsCR(sc->env->driver, error_repo, + error_marker, max_error_entries, + omapvals)); + error_entries = std::move(omapvals->entries); + tn->log(20, SSTR("read error repo, got " << error_entries.size() + << " entries")); + iter = error_entries.begin(); + for (; iter != error_entries.end(); ++iter) { + if (!lease_cr->is_locked()) { + drain_all(); + yield call(marker_tracker->flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + error_marker = iter->first; + entry_timestamp = rgw::error_repo::decode_value(iter->second); + retcode = rgw::error_repo::decode_key(iter->first, source_bs, gen); + if (retcode == -EINVAL) { + // backward compatibility for string keys that don't encode a gen + retcode = parse_bucket_key(error_marker, source_bs); + } + if (retcode < 0) { + tn->log(1, SSTR("failed to parse bucket shard: " << error_marker)); + spawn(rgw::error_repo::remove_cr(sc->env->driver->svc()->rados, + error_repo, error_marker, + entry_timestamp), + false); + continue; + } + tn->log(10, SSTR("gen is " << gen)); + if (!gen) { + // write all full sync obligations for the bucket to error repo + spawn(new RGWDataIncrementalSyncFullObligationCR(sc, source_bs, + error_repo, error_marker, entry_timestamp, tn), false); + } else { + tn->log(20, 
SSTR("handle error entry key=" + << to_string(source_bs, gen) + << " timestamp=" << entry_timestamp)); + spawn(data_sync_single_entry(sc, source_bs, gen, "", + entry_timestamp, lease_cr, + bucket_shard_cache, &*marker_tracker, + error_repo, tn, true), false); + } + } + if (!omapvals->more) { + error_retry_time = ceph::coarse_real_clock::now() + + make_timespan(retry_backoff_secs); + error_marker.clear(); + } + } + omapvals.reset(); + + tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker=" + << sync_marker.marker)); + yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id, + sync_marker.marker, + &next_marker, &log_entries, + &truncated)); + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to read remote data log info: ret=" + << retcode)); + drain_all(); + return set_cr_error(retcode); + } + + if (log_entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + + for (log_iter = log_entries.begin(); + log_iter != log_entries.end(); + ++log_iter) { + if (!lease_cr->is_locked()) { + drain_all(); + yield call(marker_tracker->flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + + tn->log(20, SSTR("shard_id=" << shard_id << " log_entry: " << log_iter->log_id << ":" << log_iter->log_timestamp << ":" << log_iter->entry.key)); + retcode = parse_bucket_key(log_iter->entry.key, source_bs); + if (retcode < 0) { + tn->log(1, SSTR("failed to parse bucket shard: " + << log_iter->entry.key)); + marker_tracker->try_update_high_marker(log_iter->log_id, 0, + log_iter->log_timestamp); + continue; + } + if (!marker_tracker->start(log_iter->log_id, 0, + log_iter->log_timestamp)) { + tn->log(0, SSTR("ERROR: cannot start syncing " << log_iter->log_id + << ". 
Duplicate entry?")); + } else { + tn->log(1, SSTR("incremental sync on " << log_iter->entry.key << "shard: " << shard_id << "on gen " << log_iter->entry.gen)); + yield_spawn_window(data_sync_single_entry(sc, source_bs, log_iter->entry.gen, log_iter->log_id, + log_iter->log_timestamp, lease_cr,bucket_shard_cache, + &*marker_tracker, error_repo, tn, false), + sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), + [&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, SSTR("data_sync_single_entry returned error: " << ret)); + cbret = ret; + } + return 0; + }); + } + } + if (cbret < 0 ) { + retcode = cbret; + drain_all(); + return set_cr_error(retcode); + } + + tn->log(20, SSTR("shard_id=" << shard_id << + " sync_marker="<< sync_marker.marker + << " next_marker=" << next_marker + << " truncated=" << truncated)); + if (!next_marker.empty()) { + sync_marker.marker = next_marker; + } else if (!log_entries.empty()) { + sync_marker.marker = log_entries.back().log_id; + } + if (!truncated) { + // we reached the end, wait a while before checking for more + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + yield wait(get_idle_interval()); + } + } while (true); + } + return 0; + } +}; + +class RGWDataSyncShardCR : public RGWCoroutine { + RGWDataSyncCtx *const sc; + const rgw_pool pool; + const uint32_t shard_id; + rgw_data_sync_marker& sync_marker; + rgw_data_sync_status sync_status; + const RGWSyncTraceNodeRef tn; + RGWObjVersionTracker& objv; + bool *reset_backoff; + + ceph::mutex inc_lock = ceph::make_mutex("RGWDataSyncShardCR::inc_lock"); + ceph::condition_variable inc_cond; + + RGWDataSyncEnv *const sync_env{ sc->env }; + + const string status_oid{ RGWDataSyncStatusManager::shard_obj_name( + sc->source_zone, shard_id) }; + const rgw_raw_obj error_repo{ pool, status_oid + ".retry" }; + + // target number of entries to cache before recycling idle ones + static constexpr size_t target_cache_size = 256; + boost::intrusive_ptr bucket_shard_cache { + 
rgw::bucket_sync::Cache::create(target_cache_size) }; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + + bc::flat_set modified_shards; + +public: + RGWDataSyncShardCR(RGWDataSyncCtx* const _sc, const rgw_pool& pool, + const uint32_t shard_id, rgw_data_sync_marker& marker, + const rgw_data_sync_status& sync_status, + RGWSyncTraceNodeRef& tn, RGWObjVersionTracker& objv, bool *reset_backoff) + : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id), + sync_marker(marker), sync_status(sync_status), tn(tn), + objv(objv), reset_backoff(reset_backoff) { + set_description() << "data sync shard source_zone=" << sc->source_zone + << " shard_id=" << shard_id; + } + + ~RGWDataSyncShardCR() override { + if (lease_cr) { + lease_cr->abort(); + } + } + + void append_modified_shards(bc::flat_set& entries) { + std::lock_guard l{inc_lock}; + modified_shards.insert(entries.begin(), entries.end()); + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield init_lease_cr(); + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + tn->log(5, "failed to take lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + *reset_backoff = true; + tn->log(10, "took lease"); + /* Reread data sync status to fech latest marker and objv */ + objv.clear(); + yield call(new RGWSimpleRadosReadCR(sync_env->dpp, sync_env->driver, + rgw_raw_obj(pool, status_oid), + &sync_marker, true, &objv)); + if (retcode < 0) { + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + + while (true) { + if (sync_marker.state == rgw_data_sync_marker::FullSync) { + yield call(new RGWDataFullSyncShardCR(sc, pool, shard_id, + sync_marker, tn, + status_oid, error_repo, + lease_cr, sync_status, + objv, bucket_shard_cache)); + if (retcode < 0) { + if (retcode != -EBUSY) { + tn->log(10, SSTR("full sync failed (retcode=" << retcode << 
")")); + } + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + } else if (sync_marker.state == rgw_data_sync_marker::IncrementalSync) { + yield call(new RGWDataIncSyncShardCR(sc, pool, shard_id, + sync_marker, tn, + status_oid, error_repo, + lease_cr, sync_status, + objv, bucket_shard_cache, + inc_lock, modified_shards)); + if (retcode < 0) { + if (retcode != -EBUSY) { + tn->log(10, SSTR("incremental sync failed (retcode=" << retcode + << ")")); + } + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + } else { + lease_cr->go_down(); + drain_all(); + return set_cr_error(-EIO); + } + } + } + return 0; + } + + void init_lease_cr() { + set_status("acquiring sync lock"); + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + if (lease_cr) { + lease_cr->abort(); + } + auto driver = sync_env->driver; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver, + rgw_raw_obj(pool, status_oid), + lock_name, lock_duration, this, + &sc->lcc)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } +}; + +class RGWDataSyncShardControlCR : public RGWBackoffControlCR { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + rgw_pool pool; + + uint32_t shard_id; + rgw_data_sync_marker sync_marker; + rgw_data_sync_status sync_status; + + RGWSyncTraceNodeRef tn; + RGWObjVersionTracker& objv; +public: + RGWDataSyncShardControlCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool, + uint32_t _shard_id, rgw_data_sync_marker& _marker, + const rgw_data_sync_status& sync_status, + RGWObjVersionTracker& objv, + RGWSyncTraceNodeRef& _tn_parent) + : RGWBackoffControlCR(_sc->cct, false), + sc(_sc), sync_env(_sc->env), + pool(_pool), + shard_id(_shard_id), + sync_marker(_marker), objv(objv) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", std::to_string(shard_id)); + } + + RGWCoroutine *alloc_cr() override { + return new RGWDataSyncShardCR(sc, pool, shard_id, sync_marker, sync_status, tn, 
objv, backoff_ptr()); + } + + RGWCoroutine *alloc_finisher_cr() override { + return new RGWSimpleRadosReadCR(sync_env->dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)), + &sync_marker, true, &objv); + } + + void append_modified_shards(bc::flat_set& keys) { + std::lock_guard l{cr_lock()}; + + RGWDataSyncShardCR *cr = static_cast(get_cr()); + if (!cr) { + return; + } + + cr->append_modified_shards(keys); + } +}; + +class RGWDataSyncCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + uint32_t num_shards; + + rgw_data_sync_status sync_status; + std::vector objvs; + + ceph::mutex shard_crs_lock = + ceph::make_mutex("RGWDataSyncCR::shard_crs_lock"); + map shard_crs; + + bool *reset_backoff; + + RGWSyncTraceNodeRef tn; + + RGWDataSyncModule *data_sync_module{nullptr}; + + boost::intrusive_ptr init_lease; + boost::intrusive_ptr lease_stack; + + RGWObjVersionTracker obj_version; +public: + RGWDataSyncCR(RGWDataSyncCtx *_sc, uint32_t _num_shards, RGWSyncTraceNodeRef& _tn, bool *_reset_backoff) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + num_shards(_num_shards), + reset_backoff(_reset_backoff), tn(_tn) { + + } + + ~RGWDataSyncCR() override { + for (auto iter : shard_crs) { + iter.second->put(); + } + if (init_lease) { + init_lease->abort(); + } + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + + /* read sync status */ + yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status, + &obj_version, objvs)); + + data_sync_module = sync_env->sync_module->get_data_handler(); + + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode)); + return set_cr_error(retcode); + } + + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state != + rgw_data_sync_info::StateSync) { + init_lease.reset( + 
RGWInitDataSyncStatusCoroutine::continuous_lease_cr(sc, this)); + yield lease_stack.reset(spawn(init_lease.get(), false)); + + while (!init_lease->is_locked()) { + if (init_lease->is_done()) { + tn->log(5, "ERROR: failed to take data sync status lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(init_lease->get_ret_status()); + } + tn->log(5, "waiting on data sync status lease"); + yield set_sleeping(true); + } + tn->log(5, "acquired data sync status lease"); + + // Reread sync status now that we've acquired the lock! + obj_version.clear(); + yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status, &obj_version, objvs)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode)); + return set_cr_error(retcode); + } + } + + /* state: init status */ + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) { + tn->log(20, SSTR("init")); + sync_status.sync_info.num_shards = num_shards; + uint64_t instance_id; + instance_id = ceph::util::generate_random_number(); + yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id, tn, + &sync_status, init_lease, obj_version, objvs)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to init sync, retcode=" << retcode)); + init_lease->go_down(); + drain_all(); + return set_cr_error(retcode); + } + // sets state = StateBuildingFullSyncMaps + + *reset_backoff = true; + } + + data_sync_module->init(sc, sync_status.sync_info.instance_id); + + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateBuildingFullSyncMaps) { + tn->log(10, SSTR("building full sync maps")); + /* call sync module init here */ + sync_status.sync_info.num_shards = num_shards; + yield call(data_sync_module->init_sync(dpp, sc)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: sync module init_sync() failed, retcode=" << retcode)); + return set_cr_error(retcode); + } + + if 
(!init_lease->is_locked()) { + init_lease->go_down(); + drain_all(); + return set_cr_error(-ECANCELED); + } + /* state: building full sync maps */ + yield call(new RGWListBucketIndexesCR(sc, &sync_status, objvs)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to build full sync maps, retcode=" << retcode)); + return set_cr_error(retcode); + } + sync_status.sync_info.state = rgw_data_sync_info::StateSync; + + if (!init_lease->is_locked()) { + init_lease->go_down(); + drain_all(); + return set_cr_error(-ECANCELED); + } + /* update new state */ + yield call(set_sync_info_cr()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to write sync status, retcode=" << retcode)); + return set_cr_error(retcode); + } + + *reset_backoff = true; + } + + yield call(data_sync_module->start_sync(dpp, sc)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to start sync, retcode=" << retcode)); + return set_cr_error(retcode); + } + + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) { + if (init_lease) { + init_lease->go_down(); + drain_all(); + init_lease.reset(); + lease_stack.reset(); + } + yield { + tn->log(10, SSTR("spawning " << num_shards << " shards sync")); + for (map::iterator iter = sync_status.sync_markers.begin(); + iter != sync_status.sync_markers.end(); ++iter) { + RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sc, sync_env->svc->zone->get_zone_params().log_pool, + iter->first, iter->second, sync_status, objvs[iter->first], tn); + cr->get(); + shard_crs_lock.lock(); + shard_crs[iter->first] = cr; + shard_crs_lock.unlock(); + spawn(cr, true); + } + } + } + + return set_cr_done(); + } + return 0; + } + + RGWCoroutine *set_sync_info_cr() { + return new RGWSimpleRadosWriteCR(sync_env->dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)), + sync_status.sync_info, &obj_version); + } + + void wakeup(int 
shard_id, bc::flat_set& entries) { + std::lock_guard l{shard_crs_lock}; + map::iterator iter = shard_crs.find(shard_id); + if (iter == shard_crs.end()) { + return; + } + iter->second->append_modified_shards(entries); + iter->second->wakeup(); + } +}; + +class RGWDefaultDataSyncModule : public RGWDataSyncModule { +public: + RGWDefaultDataSyncModule() {} + + RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, + rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + std::optional versioned_epoch, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace) override; + RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; + RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; +}; + +class RGWDefaultSyncModuleInstance : public RGWSyncModuleInstance { + RGWDefaultDataSyncModule data_handler; +public: + RGWDefaultSyncModuleInstance() {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } + bool supports_user_writes() override { + return true; + } +}; + +int RGWDefaultSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) +{ + instance->reset(new RGWDefaultSyncModuleInstance()); + return 0; +} + +class RGWUserPermHandler { + friend struct Init; + friend class Bucket; + + RGWDataSyncEnv *sync_env; + rgw_user uid; + + struct _info { + RGWUserInfo user_info; + rgw::IAM::Environment env; + std::unique_ptr identity; + RGWAccessControlPolicy user_acl; + }; + + std::shared_ptr<_info> info; + + struct Init; + + std::shared_ptr init_action; + + 
struct Init : public RGWGenericAsyncCR::Action { + RGWDataSyncEnv *sync_env; + + rgw_user uid; + std::shared_ptr info; + + int ret{0}; + + Init(RGWUserPermHandler *handler) : sync_env(handler->sync_env), + uid(handler->uid), + info(handler->info) {} + int operate() override { + auto user_ctl = sync_env->driver->getRados()->ctl.user; + + ret = user_ctl->get_info_by_uid(sync_env->dpp, uid, &info->user_info, null_yield); + if (ret < 0) { + return ret; + } + + info->identity = rgw::auth::transform_old_authinfo(sync_env->cct, + uid, + RGW_PERM_FULL_CONTROL, + false, /* system_request? */ + TYPE_RGW); + + map uattrs; + + ret = user_ctl->get_attrs_by_uid(sync_env->dpp, uid, &uattrs, null_yield); + if (ret == 0) { + ret = RGWUserPermHandler::policy_from_attrs(sync_env->cct, uattrs, &info->user_acl); + } + if (ret == -ENOENT) { + info->user_acl.create_default(uid, info->user_info.display_name); + } + + return 0; + } + }; + +public: + RGWUserPermHandler(RGWDataSyncEnv *_sync_env, + const rgw_user& _uid) : sync_env(_sync_env), + uid(_uid) {} + + RGWCoroutine *init_cr() { + info = make_shared<_info>(); + init_action = make_shared(this); + + return new RGWGenericAsyncCR(sync_env->cct, + sync_env->async_rados, + init_action); + } + + class Bucket { + RGWDataSyncEnv *sync_env; + std::shared_ptr<_info> info; + RGWAccessControlPolicy bucket_acl; + std::optional ps; + public: + Bucket() {} + + int init(RGWUserPermHandler *handler, + const RGWBucketInfo& bucket_info, + const map& bucket_attrs); + + bool verify_bucket_permission(int perm); + bool verify_object_permission(const map& obj_attrs, + int perm); + }; + + static int policy_from_attrs(CephContext *cct, + const map& attrs, + RGWAccessControlPolicy *acl) { + acl->set_ctx(cct); + + auto aiter = attrs.find(RGW_ATTR_ACL); + if (aiter == attrs.end()) { + return -ENOENT; + } + auto iter = aiter->second.begin(); + try { + acl->decode(iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): could not 
decode policy, caught buffer::error" << dendl; + return -EIO; + } + + return 0; + } + + int init_bucket(const RGWBucketInfo& bucket_info, + const map& bucket_attrs, + Bucket *bs) { + return bs->init(this, bucket_info, bucket_attrs); + } +}; + +int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler, + const RGWBucketInfo& bucket_info, + const map& bucket_attrs) +{ + sync_env = handler->sync_env; + info = handler->info; + + int r = RGWUserPermHandler::policy_from_attrs(sync_env->cct, bucket_attrs, &bucket_acl); + if (r < 0) { + return r; + } + + ps.emplace(sync_env->cct, + info->env, + info->identity.get(), + bucket_info, + info->identity->get_perm_mask(), + false, /* defer to bucket acls */ + nullptr, /* referer */ + false); /* request_payer */ + + return 0; +} + +bool RGWUserPermHandler::Bucket::verify_bucket_permission(int perm) +{ + return verify_bucket_permission_no_policy(sync_env->dpp, + &(*ps), + &info->user_acl, + &bucket_acl, + perm); +} + +bool RGWUserPermHandler::Bucket::verify_object_permission(const map& obj_attrs, + int perm) +{ + RGWAccessControlPolicy obj_acl; + + int r = policy_from_attrs(sync_env->cct, obj_attrs, &obj_acl); + if (r < 0) { + return r; + } + + return verify_bucket_permission_no_policy(sync_env->dpp, + &(*ps), + &bucket_acl, + &obj_acl, + perm); +} + +class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default { + rgw_bucket_sync_pipe sync_pipe; + + std::shared_ptr bucket_perms; + std::optional verify_dest_params; + + std::optional mtime; + std::optional etag; + std::optional obj_size; + + std::unique_ptr identity; + + std::shared_ptr need_retry; + +public: + RGWFetchObjFilter_Sync(rgw_bucket_sync_pipe& _sync_pipe, + std::shared_ptr& _bucket_perms, + std::optional&& _verify_dest_params, + std::shared_ptr& _need_retry) : sync_pipe(_sync_pipe), + bucket_perms(_bucket_perms), + verify_dest_params(std::move(_verify_dest_params)), + need_retry(_need_retry) { + *need_retry = false; + } + + int filter(CephContext *cct, + const 
rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) override; +}; + +int RGWFetchObjFilter_Sync::filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) +{ + int abort_err = -ERR_PRECONDITION_FAILED; + + rgw_sync_pipe_params params; + + RGWObjTags obj_tags; + + auto iter = obj_attrs.find(RGW_ATTR_TAGS); + if (iter != obj_attrs.end()) { + try { + auto it = iter->second.cbegin(); + obj_tags.decode(it); + } catch (buffer::error &err) { + ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl; + } + } + + if (!sync_pipe.info.handler.find_obj_params(source_key, + obj_tags.get_tags(), + ¶ms)) { + return abort_err; + } + + if (verify_dest_params && + !(*verify_dest_params == params.dest)) { + /* raced! 
original dest params were different, will need to retry */ + ldout(cct, 0) << "WARNING: " << __func__ << ": pipe dest params are different than original params, must have raced with object rewrite, retrying" << dendl; + *need_retry = true; + return -ECANCELED; + } + + std::optional > new_attrs; + + if (params.dest.acl_translation) { + rgw_user& acl_translation_owner = params.dest.acl_translation->owner; + if (!acl_translation_owner.empty()) { + if (params.mode == rgw_sync_pipe_params::MODE_USER && + acl_translation_owner != dest_bucket_info.owner) { + ldout(cct, 0) << "ERROR: " << __func__ << ": acl translation was requested, but user (" << acl_translation_owner + << ") is not dest bucket owner (" << dest_bucket_info.owner << ")" << dendl; + return -EPERM; + } + *poverride_owner = acl_translation_owner; + } + } + if (params.mode == rgw_sync_pipe_params::MODE_USER) { + if (!bucket_perms->verify_object_permission(obj_attrs, RGW_PERM_READ)) { + ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to fetch object" << dendl; + return -EPERM; + } + } + + if (!dest_placement_rule && + params.dest.storage_class) { + dest_rule.storage_class = *params.dest.storage_class; + dest_rule.inherit_from(dest_bucket_info.placement_rule); + dest_placement_rule = dest_rule; + *prule = &dest_rule; + } + + return RGWFetchObjFilter_Default::filter(cct, + source_key, + dest_bucket_info, + dest_placement_rule, + obj_attrs, + poverride_owner, + prule); +} + +class RGWObjFetchCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_bucket_sync_pipe& sync_pipe; + rgw_obj_key& key; + std::optional dest_key; + std::optional versioned_epoch; + const rgw_zone_set_entry& source_trace_entry; + rgw_zone_set *zones_trace; + + bool need_more_info{false}; + bool check_change{false}; + + ceph::real_time src_mtime; + uint64_t src_size; + string src_etag; + map src_attrs; + map src_headers; + + std::optional param_user; + 
rgw_sync_pipe_params::Mode param_mode; + + std::optional user_perms; + std::shared_ptr source_bucket_perms; + RGWUserPermHandler::Bucket dest_bucket_perms; + + std::optional dest_params; + + int try_num{0}; + std::shared_ptr need_retry; +public: + RGWObjFetchCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, + rgw_obj_key& _key, + std::optional _dest_key, + std::optional _versioned_epoch, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *_zones_trace) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + sync_pipe(_sync_pipe), + key(_key), + dest_key(_dest_key), + versioned_epoch(_versioned_epoch), + source_trace_entry(source_trace_entry), + zones_trace(_zones_trace) { + } + + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + +#define MAX_RACE_RETRIES_OBJ_FETCH 10 + for (try_num = 0; try_num < MAX_RACE_RETRIES_OBJ_FETCH; ++try_num) { + + { + std::optional param_acl_translation; + std::optional param_storage_class; + + if (!sync_pipe.info.handler.find_basic_info_without_tags(key, + ¶m_user, + ¶m_acl_translation, + ¶m_storage_class, + ¶m_mode, + &need_more_info)) { + if (!need_more_info) { + return set_cr_error(-ERR_PRECONDITION_FAILED); + } + } + } + + if (need_more_info) { + ldout(cct, 20) << "Could not determine exact policy rule for obj=" << key << ", will read source object attributes" << dendl; + /* + * we need to fetch info about source object, so that we can determine + * the correct policy configuration. 
This can happen if there are multiple + * policy rules, and some depend on the object tagging */ + yield call(new RGWStatRemoteObjCR(sync_env->async_rados, + sync_env->driver, + sc->source_zone, + sync_pipe.info.source_bs.bucket, + key, + &src_mtime, + &src_size, + &src_etag, + &src_attrs, + &src_headers)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + RGWObjTags obj_tags; + + auto iter = src_attrs.find(RGW_ATTR_TAGS); + if (iter != src_attrs.end()) { + try { + auto it = iter->second.cbegin(); + obj_tags.decode(it); + } catch (buffer::error &err) { + ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl; + } + } + + rgw_sync_pipe_params params; + if (!sync_pipe.info.handler.find_obj_params(key, + obj_tags.get_tags(), + ¶ms)) { + return set_cr_error(-ERR_PRECONDITION_FAILED); + } + + param_user = params.user; + param_mode = params.mode; + + dest_params = params.dest; + } + + if (param_mode == rgw_sync_pipe_params::MODE_USER) { + if (!param_user) { + ldout(cct, 20) << "ERROR: " << __func__ << ": user level sync but user param not set" << dendl; + return set_cr_error(-EPERM); + } + user_perms.emplace(sync_env, *param_user); + + yield call(user_perms->init_cr()); + if (retcode < 0) { + ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init user perms manager for uid=" << *param_user << dendl; + return set_cr_error(retcode); + } + + /* verify that user is allowed to write at the target bucket */ + int r = user_perms->init_bucket(sync_pipe.dest_bucket_info, + sync_pipe.dest_bucket_attrs, + &dest_bucket_perms); + if (r < 0) { + ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl; + return set_cr_error(retcode); + } + + if (!dest_bucket_perms.verify_bucket_permission(RGW_PERM_WRITE)) { + ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write 
into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl; + return -EPERM; + } + + /* init source bucket permission structure */ + source_bucket_perms = make_shared(); + r = user_perms->init_bucket(sync_pipe.source_bucket_info, + sync_pipe.source_bucket_attrs, + source_bucket_perms.get()); + if (r < 0) { + ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl; + return set_cr_error(retcode); + } + } + + yield { + if (!need_retry) { + need_retry = make_shared(); + } + auto filter = make_shared(sync_pipe, + source_bucket_perms, + std::move(dest_params), + need_retry); + + call(new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->driver, sc->source_zone, + nullopt, + sync_pipe.info.source_bs.bucket, + std::nullopt, sync_pipe.dest_bucket_info, + key, dest_key, versioned_epoch, + true, + std::static_pointer_cast(filter), + source_trace_entry, zones_trace, + sync_env->counters, dpp)); + } + if (retcode < 0) { + if (*need_retry) { + continue; + } + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + ldout(cct, 0) << "ERROR: " << __func__ << ": Too many retries trying to fetch object, possibly a bug: bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << " key=" << key << dendl; + + return set_cr_error(-EIO); + } + return 0; + } +}; + +RGWCoroutine *RGWDefaultDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, + rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + std::optional versioned_epoch, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace) +{ + return new RGWObjFetchCR(sc, sync_pipe, key, std::nullopt, versioned_epoch, + source_trace_entry, zones_trace); +} + +RGWCoroutine *RGWDefaultDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + real_time& mtime, bool 
versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) +{ + auto sync_env = sc->env; + return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone, + sync_pipe.dest_bucket_info, key, versioned, versioned_epoch, + NULL, NULL, false, &mtime, zones_trace); +} + +RGWCoroutine *RGWDefaultDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) +{ + auto sync_env = sc->env; + return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone, + sync_pipe.dest_bucket_info, key, versioned, versioned_epoch, + &owner.id, &owner.display_name, true, &mtime, zones_trace); +} + +class RGWArchiveDataSyncModule : public RGWDefaultDataSyncModule { +public: + RGWArchiveDataSyncModule() {} + + RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, + rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + std::optional versioned_epoch, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace) override; + RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; + RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; +}; + +class RGWArchiveSyncModuleInstance : public RGWDefaultSyncModuleInstance { + RGWArchiveDataSyncModule data_handler; +public: + RGWArchiveSyncModuleInstance() {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } + RGWMetadataHandler 
*alloc_bucket_meta_handler() override { + return RGWArchiveBucketMetaHandlerAllocator::alloc(); + } + RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver) override { + return RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(driver); + } +}; + +int RGWArchiveSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) +{ + instance->reset(new RGWArchiveSyncModuleInstance()); + return 0; +} + +RGWCoroutine *RGWArchiveDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, + rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + std::optional versioned_epoch, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace) +{ + auto sync_env = sc->env; + ldout(sc->cct, 5) << "SYNC_ARCHIVE: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + if (!sync_pipe.dest_bucket_info.versioned() || + (sync_pipe.dest_bucket_info.flags & BUCKET_VERSIONS_SUSPENDED)) { + ldout(sc->cct, 0) << "SYNC_ARCHIVE: sync_object: enabling object versioning for archive bucket" << dendl; + sync_pipe.dest_bucket_info.flags = (sync_pipe.dest_bucket_info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED; + int op_ret = sync_env->driver->getRados()->put_bucket_instance_info(sync_pipe.dest_bucket_info, false, real_time(), NULL, sync_env->dpp, null_yield); + if (op_ret < 0) { + ldpp_dout(sync_env->dpp, 0) << "SYNC_ARCHIVE: sync_object: error versioning archive bucket" << dendl; + return NULL; + } + } + + std::optional dest_key; + + if (versioned_epoch.value_or(0) == 0) { /* force version if not set */ + versioned_epoch = 0; + dest_key = key; + } + + if (key.instance.empty()) { + dest_key = key; + sync_env->driver->getRados()->gen_rand_obj_instance_name(&(*dest_key)); + } + + return new RGWObjFetchCR(sc, sync_pipe, key, dest_key, versioned_epoch, + 
source_trace_entry, zones_trace); +} + +RGWCoroutine *RGWArchiveDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) +{ + ldout(sc->cct, 0) << "SYNC_ARCHIVE: remove_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch << dendl; + return NULL; +} + +RGWCoroutine *RGWArchiveDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) +{ + ldout(sc->cct, 0) << "SYNC_ARCHIVE: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + auto sync_env = sc->env; + return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone, + sync_pipe.dest_bucket_info, key, versioned, versioned_epoch, + &owner.id, &owner.display_name, true, &mtime, zones_trace); +} + +class RGWDataSyncControlCR : public RGWBackoffControlCR +{ + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + uint32_t num_shards; + + RGWSyncTraceNodeRef tn; + + static constexpr bool exit_on_error = false; // retry on all errors +public: + RGWDataSyncControlCR(RGWDataSyncCtx *_sc, uint32_t _num_shards, + RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, exit_on_error), + sc(_sc), sync_env(_sc->env), num_shards(_num_shards) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "sync"); + } + + RGWCoroutine *alloc_cr() override { + return new RGWDataSyncCR(sc, num_shards, tn, backoff_ptr()); + } + + void wakeup(int shard_id, bc::flat_set& entries) { + ceph::mutex& m = cr_lock(); + + m.lock(); + RGWDataSyncCR *cr = static_cast(get_cr()); + 
if (!cr) { + m.unlock(); + return; + } + + cr->get(); + m.unlock(); + + if (cr) { + cr->wakeup(shard_id, entries); + } + + cr->put(); + } +}; + +void RGWRemoteDataLog::wakeup(int shard_id, bc::flat_set& entries) { + std::shared_lock rl{lock}; + if (!data_sync_cr) { + return; + } + data_sync_cr->wakeup(shard_id, entries); +} + +int RGWRemoteDataLog::run_sync(const DoutPrefixProvider *dpp, int num_shards) +{ + lock.lock(); + data_sync_cr = new RGWDataSyncControlCR(&sc, num_shards, tn); + data_sync_cr->get(); // run() will drop a ref, so take another + lock.unlock(); + + int r = run(dpp, data_sync_cr); + + lock.lock(); + data_sync_cr->put(); + data_sync_cr = NULL; + lock.unlock(); + + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to run sync" << dendl; + return r; + } + return 0; +} + +CephContext *RGWDataSyncStatusManager::get_cct() const +{ + return driver->ctx(); +} + +int RGWDataSyncStatusManager::init(const DoutPrefixProvider *dpp) +{ + RGWZone *zone_def; + + if (!(zone_def = driver->svc()->zone->find_zone(source_zone))) { + ldpp_dout(this, 0) << "ERROR: failed to find zone config info for zone=" << source_zone << dendl; + return -EIO; + } + + if (!driver->svc()->sync_modules->get_manager()->supports_data_export(zone_def->tier_type)) { + return -ENOTSUP; + } + + const RGWZoneParams& zone_params = driver->svc()->zone->get_zone_params(); + + if (sync_module == nullptr) { + sync_module = driver->get_sync_module(); + } + + conn = driver->svc()->zone->get_zone_conn(source_zone); + if (!conn) { + ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl; + return -EINVAL; + } + + error_logger = new RGWSyncErrorLogger(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS); + + int r = source_log.init(source_zone, conn, error_logger, driver->getRados()->get_sync_tracer(), + sync_module, counters); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to init remote log, r=" << r << dendl; + finalize(); + return r; + } + + 
rgw_datalog_info datalog_info; + r = source_log.read_log_info(dpp, &datalog_info); + if (r < 0) { + ldpp_dout(this, 5) << "ERROR: master.read_log_info() returned r=" << r << dendl; + finalize(); + return r; + } + + num_shards = datalog_info.num_shards; + + for (int i = 0; i < num_shards; i++) { + shard_objs[i] = rgw_raw_obj(zone_params.log_pool, shard_obj_name(source_zone, i)); + } + + return 0; +} + +void RGWDataSyncStatusManager::finalize() +{ + delete error_logger; + error_logger = nullptr; +} + +unsigned RGWDataSyncStatusManager::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWDataSyncStatusManager::gen_prefix(std::ostream& out) const +{ + auto zone = std::string_view{source_zone.id}; + return out << "data sync zone:" << zone.substr(0, 8) << ' '; +} + +string RGWDataSyncStatusManager::sync_status_oid(const rgw_zone_id& source_zone) +{ + char buf[datalog_sync_status_oid_prefix.size() + source_zone.id.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%s", datalog_sync_status_oid_prefix.c_str(), source_zone.id.c_str()); + + return string(buf); +} + +string RGWDataSyncStatusManager::shard_obj_name(const rgw_zone_id& source_zone, int shard_id) +{ + char buf[datalog_sync_status_shard_prefix.size() + source_zone.id.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_status_shard_prefix.c_str(), source_zone.id.c_str(), shard_id); + + return string(buf); +} + +class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + const rgw_bucket_sync_pair_info& sync_pair; + const string sync_status_oid; + + rgw_bucket_shard_sync_info& status; + RGWObjVersionTracker& objv_tracker; + const BucketIndexShardsManager& marker_mgr; + bool exclusive; +public: + RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncCtx *_sc, + const rgw_bucket_sync_pair_info& _sync_pair, + rgw_bucket_shard_sync_info& _status, + uint64_t gen, + const BucketIndexShardsManager& _marker_mgr, + RGWObjVersionTracker& 
objv_tracker, + bool exclusive) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + sync_pair(_sync_pair), + sync_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, _sync_pair, gen)), + status(_status), objv_tracker(objv_tracker), marker_mgr(_marker_mgr), exclusive(exclusive) + {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield { + rgw_raw_obj obj(sync_env->svc->zone->get_zone_params().log_pool, sync_status_oid); + + // whether or not to do full sync, incremental sync will follow anyway + if (sync_env->sync_module->should_full_sync()) { + const auto max_marker = marker_mgr.get(sync_pair.source_bs.shard_id, ""); + status.inc_marker.position = max_marker; + } + status.inc_marker.timestamp = ceph::real_clock::now(); + status.state = rgw_bucket_shard_sync_info::StateIncrementalSync; + + map attrs; + status.encode_all_attrs(attrs); + call(new RGWSimpleRadosWriteAttrsCR(dpp, sync_env->driver, + obj, attrs, &objv_tracker, exclusive)); + } + + if (retcode < 0) { + ldout(cct, 20) << "ERROR: init marker position failed. error: " << retcode << dendl; + return set_cr_error(retcode); + } + ldout(cct, 20) << "init marker position: " << status.inc_marker.position << + ". written to shard status object: " << sync_status_oid << dendl; + return set_cr_done(); + } + return 0; + } +}; + +#define BUCKET_SYNC_ATTR_PREFIX RGW_ATTR_PREFIX "bucket-sync." 
+ +template +static bool decode_attr(CephContext *cct, map& attrs, const string& attr_name, T *val) +{ + map::iterator iter = attrs.find(attr_name); + if (iter == attrs.end()) { + *val = T(); + return false; + } + + auto biter = iter->second.cbegin(); + try { + decode(*val, biter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode attribute: " << attr_name << dendl; + return false; + } + return true; +} + +void rgw_bucket_shard_sync_info::decode_from_attrs(CephContext *cct, map& attrs) +{ + if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "state", &state)) { + decode_attr(cct, attrs, "state", &state); + } + if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "inc_marker", &inc_marker)) { + decode_attr(cct, attrs, "inc_marker", &inc_marker); + } +} + +void rgw_bucket_shard_sync_info::encode_all_attrs(map& attrs) +{ + encode_state_attr(attrs); + inc_marker.encode_attr(attrs); +} + +void rgw_bucket_shard_sync_info::encode_state_attr(map& attrs) +{ + using ceph::encode; + encode(state, attrs[BUCKET_SYNC_ATTR_PREFIX "state"]); +} + +void rgw_bucket_shard_full_sync_marker::encode_attr(map& attrs) +{ + using ceph::encode; + encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "full_marker"]); +} + +void rgw_bucket_shard_inc_sync_marker::encode_attr(map& attrs) +{ + using ceph::encode; + encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "inc_marker"]); +} + +class RGWReadBucketPipeSyncStatusCoroutine : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + string oid; + rgw_bucket_shard_sync_info *status; + RGWObjVersionTracker* objv_tracker; + map attrs; +public: + RGWReadBucketPipeSyncStatusCoroutine(RGWDataSyncCtx *_sc, + const rgw_bucket_sync_pair_info& sync_pair, + rgw_bucket_shard_sync_info *_status, + RGWObjVersionTracker* objv_tracker, + uint64_t gen) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen)), + status(_status), 
objv_tracker(objv_tracker) + {} + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWReadBucketPipeSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + yield call(new RGWSimpleRadosReadAttrsCR(dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, oid), + &attrs, true, objv_tracker)); + if (retcode == -ENOENT) { + *status = rgw_bucket_shard_sync_info(); + return set_cr_done(); + } + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to call fetch bucket shard info oid=" << oid << " ret=" << retcode << dendl; + return set_cr_error(retcode); + } + status->decode_from_attrs(sync_env->cct, attrs); + return set_cr_done(); + } + return 0; +} + +// wrap ReadSyncStatus and set a flag if it's not in incremental +class CheckBucketShardStatusIsIncremental : public RGWReadBucketPipeSyncStatusCoroutine { + bool* result; + rgw_bucket_shard_sync_info status; + public: + CheckBucketShardStatusIsIncremental(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& sync_pair, + bool* result) + : RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &status, nullptr, 0 /*no gen in compat mode*/), + result(result) + {} + + int operate(const DoutPrefixProvider *dpp) override { + int r = RGWReadBucketPipeSyncStatusCoroutine::operate(dpp); + if (state == RGWCoroutine_Done && + status.state != rgw_bucket_shard_sync_info::StateIncrementalSync) { + *result = false; + } + return r; + } +}; + +class CheckAllBucketShardStatusIsIncremental : public RGWShardCollectCR { + // start with 1 shard, and only spawn more if we detect an existing shard. 
+ // this makes the backward compatilibility check far less expensive in the + // general case where no shards exist + static constexpr int initial_concurrent_shards = 1; + static constexpr int max_concurrent_shards = 16; + + RGWDataSyncCtx* sc; + rgw_bucket_sync_pair_info sync_pair; + const int num_shards; + bool* result; + int shard = 0; + public: + CheckAllBucketShardStatusIsIncremental(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& sync_pair, + int num_shards, bool* result) + : RGWShardCollectCR(sc->cct, initial_concurrent_shards), + sc(sc), sync_pair(sync_pair), num_shards(num_shards), result(result) + {} + + bool spawn_next() override { + // stop spawning if we saw any errors or non-incremental shards + if (shard >= num_shards || status < 0 || !*result) { + return false; + } + sync_pair.source_bs.shard_id = shard++; + spawn(new CheckBucketShardStatusIsIncremental(sc, sync_pair, result), false); + return true; + } + + private: + int handle_result(int r) override { + if (r < 0) { + ldout(cct, 4) << "failed to read bucket shard status: " + << cpp_strerror(r) << dendl; + } else if (shard == 0) { + // enable concurrency once the first shard succeeds + max_concurrent = max_concurrent_shards; + } + return r; + } +}; + +// wrap InitBucketShardSyncStatus with local storage for 'status' and 'objv' +// and a loop to retry on racing writes +class InitBucketShardStatusCR : public RGWCoroutine { + RGWDataSyncCtx* sc; + rgw_bucket_sync_pair_info pair; + rgw_bucket_shard_sync_info status; + RGWObjVersionTracker objv; + const uint64_t gen; + const BucketIndexShardsManager& marker_mgr; + + public: + InitBucketShardStatusCR(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& pair, + uint64_t gen, + const BucketIndexShardsManager& marker_mgr) + : RGWCoroutine(sc->cct), sc(sc), pair(pair), gen(gen), marker_mgr(marker_mgr) + {} + int operate(const DoutPrefixProvider *dpp) { + reenter(this) { + // non exclusive create with empty status + objv.generate_new_write_ver(cct); 
+ yield call(new RGWInitBucketShardSyncStatusCoroutine(sc, pair, status, gen, marker_mgr, objv, false)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +class InitBucketShardStatusCollectCR : public RGWShardCollectCR { + static constexpr int max_concurrent_shards = 16; + RGWDataSyncCtx* sc; + rgw_bucket_sync_pair_info sync_pair; + const uint64_t gen; + const BucketIndexShardsManager& marker_mgr; + + const int num_shards; + int shard = 0; + + int handle_result(int r) override { + if (r < 0) { + ldout(cct, 4) << "failed to init bucket shard status: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + InitBucketShardStatusCollectCR(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& sync_pair, + uint64_t gen, + const BucketIndexShardsManager& marker_mgr, + int num_shards) + : RGWShardCollectCR(sc->cct, max_concurrent_shards), + sc(sc), sync_pair(sync_pair), gen(gen), marker_mgr(marker_mgr), num_shards(num_shards) + {} + + bool spawn_next() override { + if (shard >= num_shards || status < 0) { // stop spawning on any errors + return false; + } + sync_pair.source_bs.shard_id = shard++; + spawn(new InitBucketShardStatusCR(sc, sync_pair, gen, marker_mgr), false); + return true; + } +}; + +class RemoveBucketShardStatusCR : public RGWCoroutine { + RGWDataSyncCtx* const sc; + RGWDataSyncEnv* const sync_env; + + rgw_bucket_sync_pair_info sync_pair; + rgw_raw_obj obj; + RGWObjVersionTracker objv; + +public: + RemoveBucketShardStatusCR(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& sync_pair, uint64_t gen) + : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env), + sync_pair(sync_pair), + obj(sync_env->svc->zone->get_zone_params().log_pool, + RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen)) + {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield call(new RGWRadosRemoveCR(sync_env->driver, obj, &objv)); + if (retcode < 0 && retcode != 
-ENOENT) { + ldout(cct, 20) << "ERROR: failed to remove bucket shard status for: " << sync_pair << + ". with error: " << retcode << dendl; + return set_cr_error(retcode); + } + ldout(cct, 20) << "removed bucket shard status object: " << obj.oid << dendl; + return set_cr_done(); + } + return 0; + } +}; + +class RemoveBucketShardStatusCollectCR : public RGWShardCollectCR { + static constexpr int max_concurrent_shards = 16; + RGWDataSyncCtx* const sc; + RGWDataSyncEnv* const sync_env; + rgw_bucket_sync_pair_info sync_pair; + const uint64_t gen; + + const int num_shards; + int shard = 0; + + int handle_result(int r) override { + if (r < 0) { + ldout(cct, 4) << "failed to remove bucket shard status object: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + RemoveBucketShardStatusCollectCR(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& sync_pair, + uint64_t gen, + int num_shards) + : RGWShardCollectCR(sc->cct, max_concurrent_shards), + sc(sc), sync_env(sc->env), sync_pair(sync_pair), gen(gen), num_shards(num_shards) + {} + + bool spawn_next() override { + if (shard >= num_shards) { + return false; + } + sync_pair.source_bs.shard_id = shard++; + spawn(new RemoveBucketShardStatusCR(sc, sync_pair, gen), false); + return true; + } +}; + +class InitBucketFullSyncStatusCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + const rgw_bucket_sync_pair_info& sync_pair; + const rgw_raw_obj& status_obj; + rgw_bucket_sync_status& status; + RGWObjVersionTracker& objv; + const RGWBucketInfo& source_info; + const bool check_compat; + + const rgw_bucket_index_marker_info& info; + BucketIndexShardsManager marker_mgr; + + bool all_incremental = true; + bool no_zero = false; + +public: + InitBucketFullSyncStatusCR(RGWDataSyncCtx* sc, + const rgw_bucket_sync_pair_info& sync_pair, + const rgw_raw_obj& status_obj, + rgw_bucket_sync_status& status, + RGWObjVersionTracker& objv, + const RGWBucketInfo& source_info, + bool check_compat, + const 
rgw_bucket_index_marker_info& info) + : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env), + sync_pair(sync_pair), status_obj(status_obj), + status(status), objv(objv), source_info(source_info), + check_compat(check_compat), info(info) + {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + retcode = marker_mgr.from_string(info.max_marker, -1); + if (retcode < 0) { + lderr(cct) << "failed to parse bilog shard markers: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + status.state = BucketSyncState::Init; + + if (info.oldest_gen == 0) { + if (check_compat) { + // use shard count from our log gen=0 + // try to convert existing per-shard incremental status for backward compatibility + if (source_info.layout.logs.empty() || + source_info.layout.logs.front().gen > 0) { + ldpp_dout(dpp, 20) << "no generation zero when checking compatibility" << dendl; + no_zero = true; + } else if (auto& log = source_info.layout.logs.front(); + log.layout.type != rgw::BucketLogType::InIndex) { + ldpp_dout(dpp, 20) << "unrecognized log layout type when checking compatibility " << log.layout.type << dendl; + no_zero = true; + } + if (!no_zero) { + yield { + const int num_shards0 = rgw::num_shards( + source_info.layout.logs.front().layout.in_index.layout); + call(new CheckAllBucketShardStatusIsIncremental(sc, sync_pair, + num_shards0, + &all_incremental)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + if (all_incremental) { + // we can use existing status and resume incremental sync + status.state = BucketSyncState::Incremental; + } + } else { + all_incremental = false; + } + } + } + + if (status.state != BucketSyncState::Incremental) { + // initialize all shard sync status. 
this will populate the log marker + // positions where incremental sync will resume after full sync + yield { + const int num_shards = marker_mgr.get().size(); + call(new InitBucketShardStatusCollectCR(sc, sync_pair, info.latest_gen, marker_mgr, num_shards)); + } + if (retcode < 0) { + ldout(cct, 20) << "failed to init bucket shard status: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (sync_env->sync_module->should_full_sync()) { + status.state = BucketSyncState::Full; + } else { + status.state = BucketSyncState::Incremental; + } + } + + status.shards_done_with_gen.resize(marker_mgr.get().size()); + status.incremental_gen = info.latest_gen; + + ldout(cct, 20) << "writing bucket sync status during init. state=" << status.state << ". marker=" << status.full.position << dendl; + + // write bucket sync status + using CR = RGWSimpleRadosWriteCR; + yield call(new CR(dpp, sync_env->driver, + status_obj, status, &objv, false)); + if (retcode < 0) { + ldout(cct, 20) << "failed to write bucket shard status: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +#define OMAP_READ_MAX_ENTRIES 10 +class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw::sal::RadosStore* driver; + + const int shard_id; + int max_entries; + + set& recovering_buckets; + string marker; + string error_oid; + + RGWRadosGetOmapKeysCR::ResultPtr omapkeys; + set error_entries; + int max_omap_entries; + int count; + +public: + RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id, + set& _recovering_buckets, const int _max_entries) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries), + recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES) + { + error_oid = 
RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry"; + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWReadRecoveringBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp) +{ + reenter(this){ + //read recovering bucket shards + count = 0; + do { + omapkeys = std::make_shared(); + yield call(new RGWRadosGetOmapKeysCR(driver, rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, error_oid), + marker, max_omap_entries, omapkeys)); + + if (retcode == -ENOENT) { + break; + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "failed to read recovering bucket shards with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + error_entries = std::move(omapkeys->entries); + if (error_entries.empty()) { + break; + } + + count += error_entries.size(); + marker = *error_entries.rbegin(); + for (const std::string& key : error_entries) { + rgw_bucket_shard bs; + std::optional gen; + if (int r = rgw::error_repo::decode_key(key, bs, gen); r < 0) { + // insert the key as-is + recovering_buckets.insert(std::move(key)); + } else if (gen) { + recovering_buckets.insert(fmt::format("{}[{}]", bucket_shard_str{bs}, *gen)); + } else { + recovering_buckets.insert(fmt::format("{}[full]", bucket_shard_str{bs})); + } + } + } while (omapkeys->more && count < max_entries); + + return set_cr_done(); + } + + return 0; +} + +class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw::sal::RadosStore* driver; + + const int shard_id; + int max_entries; + + set& pending_buckets; + string marker; + string status_oid; + + rgw_data_sync_marker* sync_marker; + int count; + + std::string next_marker; + vector log_entries; + bool truncated; + +public: + RGWReadPendingBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id, + set& _pending_buckets, + rgw_data_sync_marker* _sync_marker, const int _max_entries) + : RGWCoroutine(_sc->cct), sc(_sc), 
sync_env(_sc->env), + driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries), + pending_buckets(_pending_buckets), sync_marker(_sync_marker) + { + status_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id); + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWReadPendingBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp) +{ + reenter(this){ + //read sync status marker + using CR = RGWSimpleRadosReadCR; + yield call(new CR(dpp, sync_env->driver, + rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, status_oid), + sync_marker)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "failed to read sync status marker with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + //read pending bucket shards + marker = sync_marker->marker; + count = 0; + do{ + yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id, marker, + &next_marker, &log_entries, &truncated)); + + if (retcode == -ENOENT) { + break; + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "failed to read remote data log info with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (log_entries.empty()) { + break; + } + + count += log_entries.size(); + for (const auto& entry : log_entries) { + pending_buckets.insert(entry.entry.key); + } + }while(truncated && count < max_entries); + + return set_cr_done(); + } + + return 0; +} + +int RGWRemoteDataLog::read_shard_status(const DoutPrefixProvider *dpp, int shard_id, set& pending_buckets, set& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) +{ + // cannot run concurrently with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry()); + RGWHTTPManager http_manager(driver->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; 
+ } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + RGWDataSyncCtx sc_local = sc; + sc_local.env = &sync_env_local; + list stacks; + RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(driver->ctx(), &crs); + recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sc_local, shard_id, recovering_buckets, max_entries)); + stacks.push_back(recovering_stack); + RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(driver->ctx(), &crs); + pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sc_local, shard_id, pending_buckets, sync_marker, max_entries)); + stacks.push_back(pending_stack); + ret = crs.run(dpp, stacks); + http_manager.stop(); + return ret; +} + +CephContext *RGWBucketPipeSyncStatusManager::get_cct() const +{ + return driver->ctx(); +} + +void rgw_bucket_entry_owner::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("ID", id, obj); + JSONDecoder::decode_json("DisplayName", display_name, obj); +} + +struct bucket_list_entry { + bool delete_marker; + rgw_obj_key key; + bool is_latest; + real_time mtime; + string etag; + uint64_t size; + string storage_class; + rgw_bucket_entry_owner owner; + uint64_t versioned_epoch; + string rgw_tag; + + bucket_list_entry() : delete_marker(false), is_latest(false), size(0), versioned_epoch(0) {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("IsDeleteMarker", delete_marker, obj); + JSONDecoder::decode_json("Key", key.name, obj); + JSONDecoder::decode_json("VersionId", key.instance, obj); + JSONDecoder::decode_json("IsLatest", is_latest, obj); + string mtime_str; + JSONDecoder::decode_json("RgwxMtime", mtime_str, obj); + + struct tm t; + uint32_t nsec; + if (parse_iso8601(mtime_str.c_str(), &t, &nsec)) { + ceph_timespec ts; + ts.tv_sec = (uint64_t)internal_timegm(&t); + ts.tv_nsec = nsec; + mtime = real_clock::from_ceph_timespec(ts); + } + JSONDecoder::decode_json("ETag", etag, obj); + JSONDecoder::decode_json("Size", size, 
obj); + JSONDecoder::decode_json("StorageClass", storage_class, obj); + JSONDecoder::decode_json("Owner", owner, obj); + JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj); + JSONDecoder::decode_json("RgwxTag", rgw_tag, obj); + if (key.instance == "null" && !versioned_epoch) { + key.instance.clear(); + } + } + + RGWModifyOp get_modify_op() const { + if (delete_marker) { + return CLS_RGW_OP_LINK_OLH_DM; + } else if (!key.instance.empty() && key.instance != "null") { + return CLS_RGW_OP_LINK_OLH; + } else { + return CLS_RGW_OP_ADD; + } + } +}; + +struct bucket_list_result { + string name; + string prefix; + string key_marker; + string version_id_marker; + int max_keys; + bool is_truncated; + list entries; + + bucket_list_result() : max_keys(0), is_truncated(false) {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("Name", name, obj); + JSONDecoder::decode_json("Prefix", prefix, obj); + JSONDecoder::decode_json("KeyMarker", key_marker, obj); + JSONDecoder::decode_json("VersionIdMarker", version_id_marker, obj); + JSONDecoder::decode_json("MaxKeys", max_keys, obj); + JSONDecoder::decode_json("IsTruncated", is_truncated, obj); + JSONDecoder::decode_json("Entries", entries, obj); + } +}; + +class RGWListRemoteBucketCR: public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + const rgw_bucket_shard& bs; + rgw_obj_key marker_position; + + bucket_list_result *result; + +public: + RGWListRemoteBucketCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs, + rgw_obj_key& _marker_position, bucket_list_result *_result) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), bs(bs), + marker_position(_marker_position), result(_result) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield { + rgw_http_param_pair pairs[] = { { "versions" , NULL }, + { "format" , "json" }, + { "objs-container" , "true" }, + { "key-marker" , marker_position.name.c_str() }, + { "version-id-marker" , 
marker_position.instance.c_str() }, + { NULL, NULL } }; + string p = string("/") + bs.bucket.get_key(':', 0); + call(new RGWReadRESTResourceCR(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, result)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +struct next_bilog_result { + uint64_t generation = 0; + int num_shards = 0; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("generation", generation, obj); + JSONDecoder::decode_json("num_shards", num_shards, obj); + } +}; + +struct bilog_list_result { + list entries; + bool truncated{false}; + std::optional next_log; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("entries", entries, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("next_log", next_log, obj); + } +}; + +class RGWListBucketIndexLogCR: public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + const string instance_key; + string marker; + + bilog_list_result *result; + std::optional timer; + uint64_t generation; + std::string gen_str = std::to_string(generation); + uint32_t format_ver{1}; + +public: + RGWListBucketIndexLogCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs, string& _marker, + uint64_t _generation, bilog_list_result *_result) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + instance_key(bs.get_key()), marker(_marker), result(_result), generation(_generation) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + if (sync_env->counters) { + timer.emplace(sync_env->counters, sync_counters::l_poll); + } + yield { + rgw_http_param_pair pairs[] = { { "bucket-instance", instance_key.c_str() }, + { "format" , "json" }, + { "marker" , marker.c_str() }, + { "type", "bucket-index" }, + { "generation", gen_str.c_str() }, + { "format-ver", "2"}, + { NULL, NULL } }; + + call(new RGWReadRESTResourceCR(sync_env->cct, sc->conn, sync_env->http_manager, + 
"/admin/log", pairs, result)); + } + timer.reset(); + if (retcode < 0) { + if (sync_env->counters) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +#define BUCKET_SYNC_UPDATE_MARKER_WINDOW 10 + +class RGWBucketFullSyncMarkerTrack : public RGWSyncShardMarkerTrack { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + const rgw_raw_obj& status_obj; + rgw_bucket_sync_status& sync_status; + RGWSyncTraceNodeRef tn; + RGWObjVersionTracker& objv_tracker; + +public: + RGWBucketFullSyncMarkerTrack(RGWDataSyncCtx *_sc, + const rgw_raw_obj& status_obj, + rgw_bucket_sync_status& sync_status, + RGWSyncTraceNodeRef tn, + RGWObjVersionTracker& objv_tracker) + : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW), + sc(_sc), sync_env(_sc->env), status_obj(status_obj), + sync_status(sync_status), tn(std::move(tn)), objv_tracker(objv_tracker) + {} + + + RGWCoroutine *store_marker(const rgw_obj_key& new_marker, uint64_t index_pos, const real_time& timestamp) override { + sync_status.full.position = new_marker; + sync_status.full.count = index_pos; + + tn->log(20, SSTR("updating marker oid=" << status_obj.oid << " marker=" << new_marker)); + return new RGWSimpleRadosWriteCR( + sync_env->dpp, sync_env->driver, + status_obj, sync_status, &objv_tracker); + } + + RGWOrderCallCR *allocate_order_control_cr() override { + return new RGWLastCallerWinsCR(sync_env->cct); + } +}; + +// write the incremental sync status and update 'stable_timestamp' on success +class RGWWriteBucketShardIncSyncStatus : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + rgw_raw_obj obj; + rgw_bucket_shard_inc_sync_marker sync_marker; + ceph::real_time* stable_timestamp; + RGWObjVersionTracker& objv_tracker; + std::map attrs; + public: + RGWWriteBucketShardIncSyncStatus(RGWDataSyncEnv *sync_env, + const rgw_raw_obj& obj, + const rgw_bucket_shard_inc_sync_marker& sync_marker, + ceph::real_time* stable_timestamp, 
+ RGWObjVersionTracker& objv_tracker) + : RGWCoroutine(sync_env->cct), sync_env(sync_env), obj(obj), + sync_marker(sync_marker), stable_timestamp(stable_timestamp), + objv_tracker(objv_tracker) + {} + int operate(const DoutPrefixProvider *dpp) { + reenter(this) { + sync_marker.encode_attr(attrs); + + yield call(new RGWSimpleRadosWriteAttrsCR(sync_env->dpp, sync_env->driver, + obj, attrs, &objv_tracker)); + if (retcode < 0) { + return set_cr_error(retcode); + } + if (stable_timestamp) { + *stable_timestamp = sync_marker.timestamp; + } + return set_cr_done(); + } + return 0; + } +}; + +class RGWBucketIncSyncShardMarkerTrack : public RGWSyncShardMarkerTrack { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + rgw_raw_obj obj; + rgw_bucket_shard_inc_sync_marker sync_marker; + + map key_to_marker; + + struct operation { + rgw_obj_key key; + bool is_olh; + }; + map marker_to_op; + std::set pending_olh; // object names with pending olh operations + + RGWSyncTraceNodeRef tn; + RGWObjVersionTracker& objv_tracker; + ceph::real_time* stable_timestamp; + + void handle_finish(const string& marker) override { + auto iter = marker_to_op.find(marker); + if (iter == marker_to_op.end()) { + return; + } + auto& op = iter->second; + key_to_marker.erase(op.key); + reset_need_retry(op.key); + if (op.is_olh) { + pending_olh.erase(op.key.name); + } + marker_to_op.erase(iter); + } + +public: + RGWBucketIncSyncShardMarkerTrack(RGWDataSyncCtx *_sc, + const string& _marker_oid, + const rgw_bucket_shard_inc_sync_marker& _marker, + RGWSyncTraceNodeRef tn, + RGWObjVersionTracker& objv_tracker, + ceph::real_time* stable_timestamp) + : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW), + sc(_sc), sync_env(_sc->env), + obj(sync_env->svc->zone->get_zone_params().log_pool, _marker_oid), + sync_marker(_marker), tn(std::move(tn)), objv_tracker(objv_tracker), + stable_timestamp(stable_timestamp) + {} + + const rgw_raw_obj& get_obj() const { return obj; } + + RGWCoroutine* store_marker(const 
string& new_marker, uint64_t index_pos, const real_time& timestamp) override { + sync_marker.position = new_marker; + sync_marker.timestamp = timestamp; + + tn->log(20, SSTR("updating marker marker_oid=" << obj.oid << " marker=" << new_marker << " timestamp=" << timestamp)); + return new RGWWriteBucketShardIncSyncStatus(sync_env, obj, sync_marker, + stable_timestamp, objv_tracker); + } + + /* + * create index from key -> , and from marker -> key + * this is useful so that we can insure that we only have one + * entry for any key that is used. This is needed when doing + * incremenatl sync of data, and we don't want to run multiple + * concurrent sync operations for the same bucket shard + * Also, we should make sure that we don't run concurrent operations on the same key with + * different ops. + */ + bool index_key_to_marker(const rgw_obj_key& key, const string& marker, bool is_olh) { + auto result = key_to_marker.emplace(key, marker); + if (!result.second) { // exists + set_need_retry(key); + return false; + } + marker_to_op[marker] = operation{key, is_olh}; + if (is_olh) { + // prevent other olh ops from starting on this object name + pending_olh.insert(key.name); + } + return true; + } + + bool can_do_op(const rgw_obj_key& key, bool is_olh) { + // serialize olh ops on the same object name + if (is_olh && pending_olh.count(key.name)) { + tn->log(20, SSTR("sync of " << key << " waiting for pending olh op")); + return false; + } + return (key_to_marker.find(key) == key_to_marker.end()); + } + + RGWOrderCallCR *allocate_order_control_cr() override { + return new RGWLastCallerWinsCR(sync_env->cct); + } +}; + +static bool ignore_sync_error(int err) { + switch (err) { + case -ENOENT: + case -EPERM: + return true; + default: + break; + } + return false; +} + +template +class RGWBucketSyncSingleEntryCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + rgw_bucket_sync_pipe& sync_pipe; + rgw_bucket_shard& bs; + + rgw_obj_key key; + bool 
versioned; + std::optional versioned_epoch; + rgw_bucket_entry_owner owner; + real_time timestamp; + RGWModifyOp op; + RGWPendingState op_state; + + T entry_marker; + RGWSyncShardMarkerTrack *marker_tracker; + + int sync_status; + + stringstream error_ss; + + bool error_injection; + + RGWDataSyncModule *data_sync_module; + + rgw_zone_set_entry source_trace_entry; + rgw_zone_set zones_trace; + + RGWSyncTraceNodeRef tn; + std::string zone_name; + +public: + RGWBucketSyncSingleEntryCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, + const rgw_obj_key& _key, bool _versioned, + std::optional _versioned_epoch, + real_time& _timestamp, + const rgw_bucket_entry_owner& _owner, + RGWModifyOp _op, RGWPendingState _op_state, + const T& _entry_marker, RGWSyncShardMarkerTrack *_marker_tracker, rgw_zone_set& _zones_trace, + RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs), + key(_key), versioned(_versioned), versioned_epoch(_versioned_epoch), + owner(_owner), + timestamp(_timestamp), op(_op), + op_state(_op_state), + entry_marker(_entry_marker), + marker_tracker(_marker_tracker), + sync_status(0){ + stringstream ss; + ss << bucket_shard_str{bs} << "/" << key << "[" << versioned_epoch.value_or(0) << "]"; + set_description() << "bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state; + set_status("init"); + + tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", SSTR(key)); + + tn->log(20, SSTR("bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state)); + error_injection = (sync_env->cct->_conf->rgw_sync_data_inject_err_probability > 0); + + data_sync_module = sync_env->sync_module->get_data_handler(); + + source_trace_entry.zone = sc->source_zone.id; + 
source_trace_entry.location_key = _sync_pipe.info.source_bs.bucket.get_key(); + + zones_trace = _zones_trace; + zones_trace.insert(sync_env->svc->zone->get_zone().id, _sync_pipe.info.dest_bucket.get_key()); + + if (sc->env->ostr) { + RGWZone* z; + if ((z = sc->env->driver->svc()->zone->find_zone(sc->source_zone))) { + zone_name = z->name; + } + } + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + /* skip entries that are not complete */ + if (op_state != CLS_RGW_STATE_COMPLETE) { + goto done; + } + tn->set_flag(RGW_SNS_FLAG_ACTIVE); + do { + yield { + marker_tracker->reset_need_retry(key); + if (key.name.empty()) { + /* shouldn't happen */ + set_status("skipping empty entry"); + tn->log(0, "entry with empty obj name, skipping"); + goto done; + } + if (error_injection && + rand() % 10000 < cct->_conf->rgw_sync_data_inject_err_probability * 10000.0) { + tn->log(0, SSTR(": injecting data sync error on key=" << key.name)); + retcode = -EIO; + } else if (op == CLS_RGW_OP_ADD || + op == CLS_RGW_OP_LINK_OLH) { + set_status("syncing obj"); + tn->log(5, SSTR("bucket sync: sync obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]")); + if (versioned_epoch) { + pretty_print(sc->env, "Syncing object s3://{}/{} version {} in sync from zone {}\n", + bs.bucket.name, key, *versioned_epoch, zone_name); + } else { + pretty_print(sc->env, "Syncing object s3://{}/{} in sync from zone {}\n", + bs.bucket.name, key, zone_name); + } + call(data_sync_module->sync_object(dpp, sc, sync_pipe, key, versioned_epoch, + source_trace_entry, &zones_trace)); + } else if (op == CLS_RGW_OP_DEL || op == CLS_RGW_OP_UNLINK_INSTANCE) { + set_status("removing obj"); + if (versioned_epoch) { + pretty_print(sc->env, "Deleting object s3://{}/{} version {} in sync from zone {}\n", + bs.bucket.name, key, *versioned_epoch, zone_name); + } else { + pretty_print(sc->env, "Deleting object s3://{}/{} in sync from zone {}\n", + 
bs.bucket.name, key, zone_name); + } + if (op == CLS_RGW_OP_UNLINK_INSTANCE) { + versioned = true; + } + tn->log(10, SSTR("removing obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]")); + call(data_sync_module->remove_object(dpp, sc, sync_pipe, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace)); + // our copy of the object is more recent, continue as if it succeeded + } else if (op == CLS_RGW_OP_LINK_OLH_DM) { + set_status("creating delete marker"); + tn->log(10, SSTR("creating delete marker: obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]")); + call(data_sync_module->create_delete_marker(dpp, sc, sync_pipe, key, timestamp, owner, versioned, versioned_epoch.value_or(0), &zones_trace)); + } + tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key)); + } + if (retcode == -ERR_PRECONDITION_FAILED) { + pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n", + bs.bucket.name, key, zone_name); + set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)"); + tn->log(0, "Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)"); + retcode = 0; + } + } while (marker_tracker->need_retry(key)); + { + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + if (retcode >= 0) { + tn->log(10, "success"); + } else { + tn->log(10, SSTR("failed, retcode=" << retcode << " (" << cpp_strerror(-retcode) << ")")); + } + } + + if (retcode < 0 && retcode != -ENOENT) { + set_status() << "failed to sync obj; retcode=" << retcode; + tn->log(0, SSTR("ERROR: failed to sync object: " + << bucket_shard_str{bs} << "/" << key.name)); + if (!ignore_sync_error(retcode)) { + error_ss << bucket_shard_str{bs} << "/" << key.name; + sync_status = retcode; + } + } + if (!error_ss.str().empty()) { + yield call(sync_env->error_logger->log_error_cr(dpp, 
sc->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status))); + } +done: + if (sync_status == 0) { + /* update marker */ + set_status() << "calling marker_tracker->finish(" << entry_marker << ")"; + yield call(marker_tracker->finish(entry_marker)); + sync_status = retcode; + } + if (sync_status < 0) { + return set_cr_error(sync_status); + } + return set_cr_done(); + } + return 0; + } +}; + +class RGWBucketFullSyncCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_bucket_sync_pipe& sync_pipe; + rgw_bucket_sync_status& sync_status; + rgw_bucket_shard& bs; + boost::intrusive_ptr lease_cr; + bucket_list_result list_result; + list::iterator entries_iter; + rgw_obj_key list_marker; + bucket_list_entry *entry{nullptr}; + + int total_entries{0}; + + int sync_result{0}; + + const rgw_raw_obj& status_obj; + RGWObjVersionTracker& objv; + + rgw_zone_set zones_trace; + + RGWSyncTraceNodeRef tn; + RGWBucketFullSyncMarkerTrack marker_tracker; + + struct _prefix_handler { + RGWBucketSyncFlowManager::pipe_rules_ref rules; + RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator iter; + std::optional cur_prefix; + + void set_rules(RGWBucketSyncFlowManager::pipe_rules_ref& _rules) { + rules = _rules; + } + + bool revalidate_marker(rgw_obj_key *marker) { + if (cur_prefix && + boost::starts_with(marker->name, *cur_prefix)) { + return true; + } + if (!rules) { + return false; + } + iter = rules->prefix_search(marker->name); + if (iter == rules->prefix_end()) { + return false; + } + cur_prefix = iter->first; + marker->name = *cur_prefix; + marker->instance.clear(); + return true; + } + + bool check_key_handled(const rgw_obj_key& key) { + if (!rules) { + return false; + } + if (cur_prefix && + boost::starts_with(key.name, *cur_prefix)) { + return true; + } + iter = rules->prefix_search(key.name); + if (iter == rules->prefix_end()) { + return false; + } + cur_prefix = iter->first; + 
return boost::starts_with(key.name, iter->first); + } + } prefix_handler; + +public: + RGWBucketFullSyncCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, + const rgw_raw_obj& status_obj, + boost::intrusive_ptr lease_cr, + rgw_bucket_sync_status& sync_status, + RGWSyncTraceNodeRef tn_parent, + RGWObjVersionTracker& objv_tracker) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + sync_pipe(_sync_pipe), sync_status(sync_status), + bs(_sync_pipe.info.source_bs), + lease_cr(std::move(lease_cr)), status_obj(status_obj), objv(objv_tracker), + tn(sync_env->sync_tracer->add_node(tn_parent, "full_sync", + SSTR(bucket_shard_str{bs}))), + marker_tracker(sc, status_obj, sync_status, tn, objv_tracker) + { + zones_trace.insert(sc->source_zone.id, sync_pipe.info.dest_bucket.get_key()); + prefix_handler.set_rules(sync_pipe.get_rules()); + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + list_marker = sync_status.full.position; + + total_entries = sync_status.full.count; + do { + if (lease_cr && !lease_cr->is_locked()) { + tn->log(1, "no lease or lease is lost, abort"); + drain_all(); + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + set_status("listing remote bucket"); + tn->log(20, "listing bucket for full sync"); + + if (!prefix_handler.revalidate_marker(&list_marker)) { + set_status() << "finished iterating over all available prefixes: last marker=" << list_marker; + tn->log(20, SSTR("finished iterating over all available prefixes: last marker=" << list_marker)); + break; + } + + yield call(new RGWListRemoteBucketCR(sc, bs, list_marker, &list_result)); + if (retcode < 0 && retcode != -ENOENT) { + set_status("failed bucket listing, going down"); + drain_all(); + yield 
spawn(marker_tracker.flush(), true); + return set_cr_error(retcode); + } + if (list_result.entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + entries_iter = list_result.entries.begin(); + for (; entries_iter != list_result.entries.end(); ++entries_iter) { + if (lease_cr && !lease_cr->is_locked()) { + drain_all(); + yield call(marker_tracker.flush()); + tn->log(1, "no lease or lease is lost, abort"); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + tn->log(20, SSTR("[full sync] syncing object: " + << bucket_shard_str{bs} << "/" << entries_iter->key)); + entry = &(*entries_iter); + list_marker = entries_iter->key; + if (!prefix_handler.check_key_handled(entries_iter->key)) { + set_status() << "skipping entry due to policy rules: " << entries_iter->key; + tn->log(20, SSTR("skipping entry due to policy rules: " << entries_iter->key)); + continue; + } + total_entries++; + if (!marker_tracker.start(entry->key, total_entries, real_time())) { + tn->log(0, SSTR("ERROR: cannot start syncing " << entry->key << ". 
Duplicate entry?")); + } else { + using SyncCR = RGWBucketSyncSingleEntryCR; + yield spawn(new SyncCR(sc, sync_pipe, entry->key, + false, /* versioned, only matters for object removal */ + entry->versioned_epoch, entry->mtime, + entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE, + entry->key, &marker_tracker, zones_trace, tn), + false); + } + drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window), + [&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_result = ret; + } + return 0; + }); + } + } while (list_result.is_truncated && sync_result == 0); + set_status("done iterating over all objects"); + + /* wait for all operations to complete */ + drain_all_cb([&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_result = ret; + } + return 0; + }); + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + if (lease_cr && !lease_cr->is_locked()) { + tn->log(1, "no lease or lease is lost, abort"); + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + /* update sync state to incremental */ + if (sync_result == 0) { + sync_status.state = BucketSyncState::Incremental; + tn->log(5, SSTR("set bucket state=" << sync_status.state)); + yield call(new RGWSimpleRadosWriteCR( + dpp, sync_env->driver, status_obj, sync_status, &objv)); + tn->log(5, SSTR("bucket status objv=" << objv)); + } else { + tn->log(10, SSTR("backing out with sync_status=" << sync_result)); + } + if (retcode < 0 && sync_result == 0) { /* actually tried to set incremental state and failed */ + tn->log(0, SSTR("ERROR: 
failed to set sync state on bucket " + << bucket_shard_str{bs} << " retcode=" << retcode)); + return set_cr_error(retcode); + } + if (sync_result < 0) { + return set_cr_error(sync_result); + } + return set_cr_done(); + } + return 0; +} + +static bool has_olh_epoch(RGWModifyOp op) { + return op == CLS_RGW_OP_LINK_OLH || op == CLS_RGW_OP_UNLINK_INSTANCE; +} + +class RGWBucketShardIsDoneCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_bucket_sync_status bucket_status; + const rgw_raw_obj& bucket_status_obj; + const int shard_id; + RGWObjVersionTracker objv_tracker; + const next_bilog_result& next_log; + const uint64_t generation; + +public: + RGWBucketShardIsDoneCR(RGWDataSyncCtx *_sc, const rgw_raw_obj& _bucket_status_obj, + int _shard_id, const next_bilog_result& _next_log, const uint64_t _gen) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + bucket_status_obj(_bucket_status_obj), + shard_id(_shard_id), next_log(_next_log), generation(_gen) {} + + int operate(const DoutPrefixProvider* dpp) override + { + reenter(this) { + do { + // read bucket sync status + objv_tracker.clear(); + using ReadCR = RGWSimpleRadosReadCR; + yield call(new ReadCR(dpp, sync_env->driver, + bucket_status_obj, &bucket_status, false, &objv_tracker)); + if (retcode < 0) { + ldpp_dout(dpp, 20) << "failed to read bucket shard status: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (bucket_status.state != BucketSyncState::Incremental) { + // exit with success to avoid stale shard being + // retried in error repo if we lost a race + ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR found sync state = " << bucket_status.state << dendl; + return set_cr_done(); + } + + if (bucket_status.incremental_gen != generation) { + // exit with success to avoid stale shard being + // retried in error repo if we lost a race + ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR expected gen: " << generation + << ", got: " << 
bucket_status.incremental_gen << dendl; + return set_cr_done(); + } + + yield { + // update bucket_status after a shard is done with current gen + auto& done = bucket_status.shards_done_with_gen; + done[shard_id] = true; + + // increment gen if all shards are already done with current gen + if (std::all_of(done.begin(), done.end(), + [] (const bool done){return done; } )) { + bucket_status.incremental_gen = next_log.generation; + done.clear(); + done.resize(next_log.num_shards, false); + } + ldpp_dout(dpp, 20) << "bucket status incremental gen is " << bucket_status.incremental_gen << dendl; + using WriteCR = RGWSimpleRadosWriteCR; + call(new WriteCR(dpp, sync_env->driver, + bucket_status_obj, bucket_status, &objv_tracker, false)); + } + if (retcode < 0 && retcode != -ECANCELED) { + ldpp_dout(dpp, 20) << "failed to write bucket sync status: " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } else if (retcode >= 0) { + return set_cr_done(); + } + } while (retcode == -ECANCELED); + } + return 0; + } +}; + +class RGWBucketShardIncrementalSyncCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_bucket_sync_pipe& sync_pipe; + RGWBucketSyncFlowManager::pipe_rules_ref rules; + rgw_bucket_shard& bs; + const rgw_raw_obj& bucket_status_obj; + boost::intrusive_ptr lease_cr; + bilog_list_result extended_result; + list list_result; + int next_num_shards; + uint64_t next_gen; + bool truncated; + + list::iterator entries_iter, entries_end; + map, pair > squash_map; + rgw_bucket_shard_sync_info& sync_info; + uint64_t generation; + rgw_obj_key key; + rgw_bi_log_entry *entry{nullptr}; + bool updated_status{false}; + rgw_zone_id zone_id; + string target_location_key; + + string cur_id; + + int sync_status{0}; + bool syncstopped{false}; + + RGWSyncTraceNodeRef tn; + RGWBucketIncSyncShardMarkerTrack marker_tracker; + +public: + RGWBucketShardIncrementalSyncCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, + const std::string& 
shard_status_oid, + const rgw_raw_obj& _bucket_status_obj, + boost::intrusive_ptr lease_cr, + rgw_bucket_shard_sync_info& sync_info, + uint64_t generation, + RGWSyncTraceNodeRef& _tn_parent, + RGWObjVersionTracker& objv_tracker, + ceph::real_time* stable_timestamp) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs), + bucket_status_obj(_bucket_status_obj), lease_cr(std::move(lease_cr)), + sync_info(sync_info), generation(generation), zone_id(sync_env->svc->zone->get_zone().id), + tn(sync_env->sync_tracer->add_node(_tn_parent, "inc_sync", + SSTR(bucket_shard_str{bs}))), + marker_tracker(sc, shard_status_oid, sync_info.inc_marker, tn, + objv_tracker, stable_timestamp) + { + set_description() << "bucket shard incremental sync bucket=" + << bucket_shard_str{bs}; + set_status("init"); + rules = sync_pipe.get_rules(); + target_location_key = sync_pipe.info.dest_bucket.get_key(); + } + + bool check_key_handled(const rgw_obj_key& key) { + if (!rules) { + return false; + } + auto iter = rules->prefix_search(key.name); + if (iter == rules->prefix_end()) { + return false; + } + return boost::starts_with(key.name, iter->first); + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp) +{ + int ret; + reenter(this) { + do { + if (lease_cr && !lease_cr->is_locked()) { + tn->log(1, "no lease or lease is lost, abort"); + drain_all(); + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + tn->log(20, SSTR("listing bilog for incremental sync; position=" << sync_info.inc_marker.position)); + set_status() << "listing bilog; position=" << sync_info.inc_marker.position; + yield call(new RGWListBucketIndexLogCR(sc, bs, sync_info.inc_marker.position, generation, 
&extended_result)); + if (retcode < 0 && retcode != -ENOENT) { + /* wait for all operations to complete */ + drain_all(); + yield spawn(marker_tracker.flush(), true); + return set_cr_error(retcode); + } + list_result = std::move(extended_result.entries); + truncated = extended_result.truncated; + if (extended_result.next_log) { + next_gen = extended_result.next_log->generation; + next_num_shards = extended_result.next_log->num_shards; + } + + squash_map.clear(); + entries_iter = list_result.begin(); + entries_end = list_result.end(); + for (; entries_iter != entries_end; ++entries_iter) { + auto e = *entries_iter; + if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP) { + ldpp_dout(dpp, 20) << "syncstop at: " << e.timestamp << ". marker: " << e.id << dendl; + syncstopped = true; + entries_end = std::next(entries_iter); // stop after this entry + break; + } + if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC) { + ldpp_dout(dpp, 20) << "syncstart at: " << e.timestamp << ". marker: " << e.id << dendl; + continue; + } + if (e.op == CLS_RGW_OP_CANCEL) { + continue; + } + if (e.state != CLS_RGW_STATE_COMPLETE) { + continue; + } + if (e.zones_trace.exists(zone_id.id, target_location_key)) { + continue; + } + auto& squash_entry = squash_map[make_pair(e.object, e.instance)]; + // don't squash over olh entries - we need to apply their olh_epoch + if (has_olh_epoch(squash_entry.second) && !has_olh_epoch(e.op)) { + continue; + } + if (squash_entry.first <= e.timestamp) { + squash_entry = make_pair<>(e.timestamp, e.op); + } + } + + entries_iter = list_result.begin(); + for (; entries_iter != entries_end; ++entries_iter) { + if (lease_cr && !lease_cr->is_locked()) { + tn->log(1, "no lease or lease is lost, abort"); + drain_all(); + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + return set_cr_error(-ECANCELED); + } + entry = &(*entries_iter); + { + 
ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */ + if (p < 0) { + cur_id = entry->id; + } else { + cur_id = entry->id.substr(p + 1); + } + } + sync_info.inc_marker.position = cur_id; + + if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) { + ldpp_dout(dpp, 20) << "detected syncstop or resync on " << entries_iter->timestamp << ", skipping entry" << dendl; + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + if (!key.set(rgw_obj_index_key{entry->object, entry->instance})) { + set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry"; + tn->log(20, SSTR("parse_raw_oid() on " << entry->object << " returned false, skipping entry")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + tn->log(20, SSTR("parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << key.name << " instance=" << key.instance << " ns=" << key.ns)); + + if (!key.ns.empty()) { + set_status() << "skipping entry in namespace: " << entry->object; + tn->log(20, SSTR("skipping entry in namespace: " << entry->object)); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + if (!check_key_handled(key)) { + set_status() << "skipping entry due to policy rules: " << entry->object; + tn->log(20, SSTR("skipping entry due to policy rules: " << entry->object)); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op; + if (entry->op == CLS_RGW_OP_CANCEL) { + set_status() << "canceled operation, skipping"; + tn->log(20, SSTR("skipping object: " + << bucket_shard_str{bs} << "/" << key << ": canceled operation")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + 
continue; + } + if (entry->state != CLS_RGW_STATE_COMPLETE) { + set_status() << "non-complete operation, skipping"; + tn->log(20, SSTR("skipping object: " + << bucket_shard_str{bs} << "/" << key << ": non-complete operation")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + if (entry->zones_trace.exists(zone_id.id, target_location_key)) { + set_status() << "redundant operation, skipping"; + tn->log(20, SSTR("skipping object: " + <timestamp); + continue; + } + if (make_pair<>(entry->timestamp, entry->op) != squash_map[make_pair(entry->object, entry->instance)]) { + set_status() << "squashed operation, skipping"; + tn->log(20, SSTR("skipping object: " + << bucket_shard_str{bs} << "/" << key << ": squashed operation")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + tn->set_flag(RGW_SNS_FLAG_ACTIVE); + tn->log(20, SSTR("syncing object: " + << bucket_shard_str{bs} << "/" << key)); + updated_status = false; + while (!marker_tracker.can_do_op(key, has_olh_epoch(entry->op))) { + if (!updated_status) { + set_status() << "can't do op, conflicting inflight operation"; + updated_status = true; + } + tn->log(5, SSTR("can't do op on key=" << key << " need to wait for conflicting operation to complete")); + yield wait_for_child(); + bool again = true; + while (again) { + again = collect(&ret, nullptr); + if (ret < 0) { + tn->log(0, SSTR("ERROR: a child operation returned error (ret=" << ret << ")")); + sync_status = ret; + /* we have reported this error */ + } + } + if (sync_status != 0) + break; + } + if (sync_status != 0) { + /* get error, stop */ + break; + } + if (!marker_tracker.index_key_to_marker(key, cur_id, has_olh_epoch(entry->op))) { + set_status() << "can't do op, sync already in progress for object"; + tn->log(20, SSTR("skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + 
continue; + } + // yield { + set_status() << "start object sync"; + if (!marker_tracker.start(cur_id, 0, entry->timestamp)) { + tn->log(0, SSTR("ERROR: cannot start syncing " << cur_id << ". Duplicate entry?")); + } else { + std::optional versioned_epoch; + rgw_bucket_entry_owner owner(entry->owner, entry->owner_display_name); + if (entry->ver.pool < 0) { + versioned_epoch = entry->ver.epoch; + } + tn->log(20, SSTR("entry->timestamp=" << entry->timestamp)); + using SyncCR = RGWBucketSyncSingleEntryCR; + spawn(new SyncCR(sc, sync_pipe, key, + entry->is_versioned(), versioned_epoch, + entry->timestamp, owner, entry->op, entry->state, + cur_id, &marker_tracker, entry->zones_trace, tn), + false); + } + // } + drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window), + [&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_status = ret; + } + return 0; + }); + } + + } while (!list_result.empty() && sync_status == 0 && !syncstopped); + + drain_all_cb([&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_status = ret; + } + return 0; + }); + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + + if (syncstopped) { + // transition to StateStopped in RGWSyncBucketShardCR. if sync is + // still disabled, we'll delete the sync status object. 
otherwise we'll + // restart full sync to catch any changes that happened while sync was + // disabled + sync_info.state = rgw_bucket_shard_sync_info::StateStopped; + return set_cr_done(); + } + + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + if (sync_status < 0) { + tn->log(10, SSTR("backing out with sync_status=" << sync_status)); + return set_cr_error(sync_status); + } + + if (!truncated && extended_result.next_log) { + yield call(new RGWBucketShardIsDoneCR(sc, bucket_status_obj, bs.shard_id, *extended_result.next_log, generation)); + if (retcode < 0) { + ldout(cct, 20) << "failed to update bucket sync status: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + yield { + // delete the shard status object + auto status_obj = sync_env->svc->rados->obj(marker_tracker.get_obj()); + retcode = status_obj.open(dpp); + if (retcode < 0) { + return set_cr_error(retcode); + } + call(new RGWRadosRemoveOidCR(sync_env->driver, std::move(status_obj))); + if (retcode < 0) { + ldpp_dout(dpp, 20) << "failed to remove shard status object: " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + } + } + + return set_cr_done(); + } + return 0; +} + +class RGWGetBucketPeersCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + + std::optional target_bucket; + std::optional source_zone; + std::optional source_bucket; + + rgw_sync_pipe_info_set *pipes; + map buckets_info; + map::iterator siiter; + std::optional target_bucket_info; + std::optional source_bucket_info; + + rgw_sync_pipe_info_set::iterator siter; + + std::shared_ptr source_policy; + std::shared_ptr target_policy; + + RGWSyncTraceNodeRef tn; + + using pipe_const_iter = map::const_iterator; + + static pair get_pipe_iters(const map& m, std::optional zone) { + if (!zone) { + return { m.begin(), m.end() }; + } + + auto b = m.find(*zone); 
+ if (b == m.end()) { + return { b, b }; + } + return { b, std::next(b) }; + } + + void filter_sources(std::optional source_zone, + std::optional source_bucket, + const map& all_sources, + rgw_sync_pipe_info_set *result) { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": source_zone=" << source_zone.value_or(rgw_zone_id("*")).id + << " source_bucket=" << source_bucket.value_or(rgw_bucket()) + << " all_sources.size()=" << all_sources.size() << dendl; + auto iters = get_pipe_iters(all_sources, source_zone); + for (auto i = iters.first; i != iters.second; ++i) { + for (auto& handler : i->second) { + if (!handler.specific()) { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl; + continue; + } + if (source_bucket && + !source_bucket->match(*handler.source.bucket)) { + continue; + } + ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl; + result->insert(handler, source_bucket_info, target_bucket_info); + } + } + } + + void filter_targets(std::optional target_zone, + std::optional target_bucket, + const map& all_targets, + rgw_sync_pipe_info_set *result) { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": target_zone=" << source_zone.value_or(rgw_zone_id("*")).id + << " target_bucket=" << source_bucket.value_or(rgw_bucket()) + << " all_targets.size()=" << all_targets.size() << dendl; + auto iters = get_pipe_iters(all_targets, target_zone); + for (auto i = iters.first; i != iters.second; ++i) { + for (auto& handler : i->second) { + if (target_bucket && + handler.dest.bucket && + !target_bucket->match(*handler.dest.bucket)) { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl; + continue; + } + ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl; + result->insert(handler, source_bucket_info, target_bucket_info); + } + } + } + + void update_from_target_bucket_policy(); + void 
update_from_source_bucket_policy(); + + struct GetHintTargets : public RGWGenericAsyncCR::Action { + RGWDataSyncEnv *sync_env; + rgw_bucket source_bucket; + std::set targets; + + GetHintTargets(RGWDataSyncEnv *_sync_env, + const rgw_bucket& _source_bucket) : sync_env(_sync_env), + source_bucket(_source_bucket) {} + int operate() override { + int r = sync_env->svc->bucket_sync->get_bucket_sync_hints(sync_env->dpp, + source_bucket, + nullptr, + &targets, + null_yield); + if (r < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): failed to fetch bucket sync hints for bucket=" << source_bucket << dendl; + return r; + } + + return 0; + } + }; + + std::shared_ptr get_hint_targets_action; + std::set::iterator hiter; + +public: + RGWGetBucketPeersCR(RGWDataSyncEnv *_sync_env, + std::optional _target_bucket, + std::optional _source_zone, + std::optional _source_bucket, + rgw_sync_pipe_info_set *_pipes, + const RGWSyncTraceNodeRef& _tn_parent) + : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + target_bucket(_target_bucket), + source_zone(_source_zone), + source_bucket(_source_bucket), + pipes(_pipes), + tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_peers", + SSTR( "target=" << target_bucket.value_or(rgw_bucket()) + << ":source=" << target_bucket.value_or(rgw_bucket()) + << ":source_zone=" << source_zone.value_or(rgw_zone_id("*")).id))) { + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +std::ostream& operator<<(std::ostream& out, std::optional& bs) { + if (!bs) { + out << "*"; + } else { + out << *bs; + } + return out; +} + +static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc, + boost::intrusive_ptr lease, + const rgw_bucket_sync_pair_info& sync_pair, + std::optional gen, + const RGWSyncTraceNodeRef& tn, + ceph::real_time* progress); + +RGWRunBucketSourcesSyncCR::RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc, + boost::intrusive_ptr lease_cr, + const rgw_bucket_shard& source_bs, + const RGWSyncTraceNodeRef& 
_tn_parent, + std::optional gen, + ceph::real_time* progress) + : RGWCoroutine(_sc->env->cct), sc(_sc), sync_env(_sc->env), + lease_cr(std::move(lease_cr)), + tn(sync_env->sync_tracer->add_node( + _tn_parent, "bucket_sync_sources", + SSTR( "source=" << source_bs << ":source_zone=" << sc->source_zone))), + progress(progress), + gen(gen) +{ + sync_pair.source_bs = source_bs; +} + +int RGWRunBucketSourcesSyncCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + yield call(new RGWGetBucketPeersCR(sync_env, std::nullopt, sc->source_zone, + sync_pair.source_bs.bucket, &pipes, tn)); + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode)); + return set_cr_error(retcode); + } + + ldpp_dout(dpp, 20) << __func__ << "(): requested source_bs=" << sync_pair.source_bs << dendl; + + if (pipes.empty()) { + ldpp_dout(dpp, 20) << __func__ << "(): no relevant sync pipes found" << dendl; + return set_cr_done(); + } + + shard_progress.resize(pipes.size()); + cur_shard_progress = shard_progress.begin(); + + for (siter = pipes.begin(); siter != pipes.end(); ++siter, ++cur_shard_progress) { + ldpp_dout(dpp, 20) << __func__ << "(): sync pipe=" << *siter << dendl; + + sync_pair.dest_bucket = siter->target.get_bucket(); + sync_pair.handler = siter->handler; + + ldpp_dout(dpp, 20) << __func__ << "(): sync_pair=" << sync_pair << dendl; + + yield_spawn_window(sync_bucket_shard_cr(sc, lease_cr, sync_pair, + gen, tn, &*cur_shard_progress), + sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window), + [&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, SSTR("ERROR: a sync operation returned error: " << ret)); + } + return ret; + }); + } + drain_all_cb([&](uint64_t stack_id, int ret) { + if (ret < 0) { + tn->log(10, SSTR("a sync operation returned error: " << ret)); + } + return ret; + }); + if (progress) { + *progress = *std::min_element(shard_progress.begin(), shard_progress.end()); + } + 
return set_cr_done(); + } + + return 0; +} + +class RGWSyncGetBucketInfoCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + rgw_bucket bucket; + RGWBucketInfo *pbucket_info; + map *pattrs; + RGWMetaSyncEnv meta_sync_env; + + RGWSyncTraceNodeRef tn; + +public: + RGWSyncGetBucketInfoCR(RGWDataSyncEnv *_sync_env, + const rgw_bucket& _bucket, + RGWBucketInfo *_pbucket_info, + map *_pattrs, + const RGWSyncTraceNodeRef& _tn_parent) + : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + bucket(_bucket), + pbucket_info(_pbucket_info), + pattrs(_pattrs), + tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_info", + SSTR(bucket))) { + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWSyncGetBucketInfoCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp)); + if (retcode == -ENOENT) { + /* bucket instance info has not been synced in yet, fetch it now */ + yield { + tn->log(10, SSTR("no local info for bucket:" << ": fetching metadata")); + string raw_key = string("bucket.instance:") + bucket.get_key(); + + meta_sync_env.init(dpp, cct, sync_env->driver, sync_env->svc->zone->get_master_conn(), sync_env->async_rados, + sync_env->http_manager, sync_env->error_logger, sync_env->sync_tracer); + + call(new RGWMetaSyncSingleEntryCR(&meta_sync_env, raw_key, + string() /* no marker */, + MDLOG_STATUS_COMPLETE, + NULL /* no marker tracker */, + tn)); + } + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to fetch bucket instance info for " << bucket_str{bucket})); + return set_cr_error(retcode); + } + + yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp)); + } + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{bucket})); + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; 
+} + +void RGWGetBucketPeersCR::update_from_target_bucket_policy() +{ + if (!target_policy || + !target_policy->policy_handler || + !pipes) { + return; + } + + auto handler = target_policy->policy_handler.get(); + + filter_sources(source_zone, + source_bucket, + handler->get_sources(), + pipes); + + for (siter = pipes->begin(); siter != pipes->end(); ++siter) { + if (!siter->source.has_bucket_info()) { + buckets_info.emplace(siter->source.get_bucket(), all_bucket_info()); + } + if (!siter->target.has_bucket_info()) { + buckets_info.emplace(siter->target.get_bucket(), all_bucket_info()); + } + } +} + +void RGWGetBucketPeersCR::update_from_source_bucket_policy() +{ + if (!source_policy || + !source_policy->policy_handler || + !pipes) { + return; + } + + auto handler = source_policy->policy_handler.get(); + + filter_targets(sync_env->svc->zone->get_zone().id, + target_bucket, + handler->get_targets(), + pipes); + + for (siter = pipes->begin(); siter != pipes->end(); ++siter) { + if (!siter->source.has_bucket_info()) { + buckets_info.emplace(siter->source.get_bucket(), all_bucket_info()); + } + if (!siter->target.has_bucket_info()) { + buckets_info.emplace(siter->target.get_bucket(), all_bucket_info()); + } + } +} + + +class RGWSyncGetBucketSyncPolicyHandlerCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + rgw_bucket bucket; + rgw_bucket_get_sync_policy_params get_policy_params; + + std::shared_ptr policy; + + RGWSyncTraceNodeRef tn; + + int i; + +public: + RGWSyncGetBucketSyncPolicyHandlerCR(RGWDataSyncEnv *_sync_env, + std::optional zone, + const rgw_bucket& _bucket, + std::shared_ptr& _policy, + const RGWSyncTraceNodeRef& _tn_parent) + : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + bucket(_bucket), + policy(_policy), + tn(sync_env->sync_tracer->add_node(_tn_parent, "get_sync_policy_handler", + SSTR(bucket))) { + get_policy_params.zone = zone; + get_policy_params.bucket = bucket; + } + + int operate(const DoutPrefixProvider *dpp) override { + 
reenter(this) { + for (i = 0; i < 2; ++i) { + yield call(new RGWBucketGetSyncPolicyHandlerCR(sync_env->async_rados, + sync_env->driver, + get_policy_params, + policy, + dpp)); + if (retcode < 0 && + retcode != -ENOENT) { + return set_cr_error(retcode); + } + + if (retcode == 0) { + return set_cr_done(); + } + + /* bucket instance was not found, + * try to get bucket instance info, can trigger + * metadata sync of bucket instance + */ + yield call(new RGWSyncGetBucketInfoCR(sync_env, + bucket, + nullptr, + nullptr, + tn)); + if (retcode < 0) { + return set_cr_error(retcode); + } + } + } + + return 0; + } +}; + + +int RGWGetBucketPeersCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + if (pipes) { + pipes->clear(); + } + if (target_bucket) { + target_policy = make_shared(); + yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env, + nullopt, + *target_bucket, + target_policy, + tn)); + if (retcode < 0 && + retcode != -ENOENT) { + return set_cr_error(retcode); + } + + update_from_target_bucket_policy(); + } + + if (source_bucket && source_zone) { + source_policy = make_shared(); + yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env, + source_zone, + *source_bucket, + source_policy, + tn)); + if (retcode < 0 && + retcode != -ENOENT) { + return set_cr_error(retcode); + } + + if (source_policy->policy_handler) { + auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info(); + auto& opt_attrs = source_policy->policy_handler->get_bucket_attrs(); + if (opt_bucket_info && opt_attrs) { + source_bucket_info.emplace(); + source_bucket_info->bucket_info = *opt_bucket_info; + source_bucket_info->attrs = *opt_attrs; + } + } + + if (!target_bucket) { + get_hint_targets_action = make_shared(sync_env, *source_bucket); + + yield call(new RGWGenericAsyncCR(cct, sync_env->async_rados, + get_hint_targets_action)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + /* hints might have incomplete bucket ids, + * in which case we need to 
figure out the current + * bucket_id + */ + for (hiter = get_hint_targets_action->targets.begin(); + hiter != get_hint_targets_action->targets.end(); + ++hiter) { + ldpp_dout(dpp, 20) << "Got sync hint for bucket=" << *source_bucket << ": " << hiter->get_key() << dendl; + + target_policy = make_shared(); + yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env, + nullopt, + *hiter, + target_policy, + tn)); + if (retcode < 0 && + retcode != -ENOENT) { + return set_cr_error(retcode); + } + update_from_target_bucket_policy(); + } + } + } + + update_from_source_bucket_policy(); + + for (siiter = buckets_info.begin(); siiter != buckets_info.end(); ++siiter) { + if (siiter->second.bucket_info.bucket.name.empty()) { + yield call(new RGWSyncGetBucketInfoCR(sync_env, siiter->first, + &siiter->second.bucket_info, + &siiter->second.attrs, + tn)); + } + } + + if (pipes) { + pipes->update_empty_bucket_info(buckets_info); + } + + return set_cr_done(); + } + + return 0; +} + +class RGWSyncBucketShardCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + boost::intrusive_ptr lease_cr; + rgw_bucket_sync_pair_info sync_pair; + rgw_bucket_sync_pipe& sync_pipe; + bool& bucket_stopped; + uint64_t generation; + ceph::real_time* progress; + + const std::string shard_status_oid; + const rgw_raw_obj bucket_status_obj; + rgw_bucket_shard_sync_info sync_status; + RGWObjVersionTracker objv_tracker; + + RGWSyncTraceNodeRef tn; + +public: + RGWSyncBucketShardCR(RGWDataSyncCtx *_sc, + boost::intrusive_ptr lease_cr, + const rgw_bucket_sync_pair_info& _sync_pair, + rgw_bucket_sync_pipe& sync_pipe, + bool& bucket_stopped, + uint64_t generation, + const RGWSyncTraceNodeRef& tn, + ceph::real_time* progress) + : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + lease_cr(std::move(lease_cr)), sync_pair(_sync_pair), + sync_pipe(sync_pipe), bucket_stopped(bucket_stopped), generation(generation), progress(progress), + 
shard_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, generation)), + bucket_status_obj(sc->env->svc->zone->get_zone_params().log_pool, + RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone, + sync_pair.source_bs.bucket, + sync_pair.dest_bucket)), + tn(tn) { + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWSyncBucketShardCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + objv_tracker.clear(); + yield call(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &sync_status, &objv_tracker, generation)); + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode)); + return set_cr_error(retcode); + } + + tn->log(20, SSTR("sync status for source bucket shard: " << sync_status.state)); + sync_status.state = rgw_bucket_shard_sync_info::StateIncrementalSync; + if (progress) { + *progress = sync_status.inc_marker.timestamp; + } + + yield call(new RGWBucketShardIncrementalSyncCR(sc, sync_pipe, + shard_status_oid, bucket_status_obj, lease_cr, + sync_status, generation, tn, + objv_tracker, progress)); + if (retcode < 0) { + tn->log(5, SSTR("incremental sync on bucket failed, retcode=" << retcode)); + return set_cr_error(retcode); + } + + if (sync_status.state == rgw_bucket_shard_sync_info::StateStopped) { + tn->log(20, SSTR("syncstopped indication for source bucket shard")); + bucket_stopped = true; + } + + return set_cr_done(); + } + + return 0; +} + +class RGWSyncBucketCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *env; + boost::intrusive_ptr data_lease_cr; + boost::intrusive_ptr bucket_lease_cr; + rgw_bucket_sync_pair_info sync_pair; + rgw_bucket_sync_pipe sync_pipe; + std::optional gen; + ceph::real_time* progress; + + const std::string lock_name = "bucket sync"; + const uint32_t lock_duration; + const rgw_raw_obj status_obj; + rgw_bucket_sync_status bucket_status; + bool bucket_stopped = 
false; + RGWObjVersionTracker objv; + bool init_check_compat = false; + rgw_bucket_index_marker_info info; + rgw_raw_obj error_repo; + rgw_bucket_shard source_bs; + rgw_pool pool; + uint64_t current_gen = 0; + + RGWSyncTraceNodeRef tn; + +public: + RGWSyncBucketCR(RGWDataSyncCtx *_sc, + boost::intrusive_ptr lease_cr, + const rgw_bucket_sync_pair_info& _sync_pair, + std::optional gen, + const RGWSyncTraceNodeRef& _tn_parent, + ceph::real_time* progress) + : RGWCoroutine(_sc->cct), sc(_sc), env(_sc->env), + data_lease_cr(std::move(lease_cr)), sync_pair(_sync_pair), + gen(gen), progress(progress), + lock_duration(cct->_conf->rgw_sync_lease_period), + status_obj(env->svc->zone->get_zone_params().log_pool, + RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone, + sync_pair.source_bs.bucket, + sync_pair.dest_bucket)), + tn(env->sync_tracer->add_node(_tn_parent, "bucket", + SSTR(bucket_str{_sync_pair.dest_bucket} << "<-" << bucket_shard_str{_sync_pair.source_bs} ))) { + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc, + boost::intrusive_ptr lease, + const rgw_bucket_sync_pair_info& sync_pair, + std::optional gen, + const RGWSyncTraceNodeRef& tn, + ceph::real_time* progress) +{ + return new RGWSyncBucketCR(sc, std::move(lease), sync_pair, + gen, tn, progress); +} + +#define RELEASE_LOCK(cr) \ + if (cr) {cr->go_down(); drain_all(); cr.reset();} + +int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // read source/destination bucket info + yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.source_bs.bucket, &sync_pipe.source_bucket_info, + &sync_pipe.source_bucket_attrs, tn)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket})); + return set_cr_error(retcode); + } + + yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.dest_bucket, &sync_pipe.dest_bucket_info, + 
&sync_pipe.dest_bucket_attrs, tn)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket})); + return set_cr_error(retcode); + } + + sync_pipe.info = sync_pair; + + // read bucket sync status + using ReadCR = RGWSimpleRadosReadCR; + using WriteCR = RGWSimpleRadosWriteCR; + + objv.clear(); + yield call(new ReadCR(dpp, env->driver, + status_obj, &bucket_status, false, &objv)); + if (retcode == -ENOENT) { + // if the full sync status object didn't exist yet, run the backward + // compatability logic in InitBucketFullSyncStatusCR below. if it did + // exist, a `bucket sync init` probably requested its re-initialization, + // and shouldn't try to resume incremental sync + init_check_compat = true; + + // use exclusive create to set state=Init + objv.generate_new_write_ver(cct); + yield call(new WriteCR(dpp, env->driver, status_obj, bucket_status, &objv, true)); + tn->log(20, "bucket status object does not exist, create a new one"); + if (retcode == -EEXIST) { + // raced with another create, read its status + tn->log(20, "raced with another create, read its status"); + objv.clear(); + yield call(new ReadCR(dpp, env->driver, + status_obj, &bucket_status, false, &objv)); + } + } + if (retcode < 0) { + tn->log(20, SSTR("ERROR: failed to read bucket status object. error: " << retcode)); + return set_cr_error(retcode); + } + + do { + tn->log(20, SSTR("sync status for source bucket: " << bucket_status.state << + ". lease is: " << (bucket_lease_cr ? "taken" : "not taken") << ". 
stop indications is: " << bucket_stopped)); + + if (bucket_status.state != BucketSyncState::Incremental || + bucket_stopped) { + + if (!bucket_lease_cr) { + bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj, + lock_name, lock_duration, this, &sc->lcc)); + yield spawn(bucket_lease_cr.get(), false); + while (!bucket_lease_cr->is_locked()) { + if (bucket_lease_cr->is_done()) { + tn->log(5, "failed to take lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(bucket_lease_cr->get_ret_status()); + } + tn->log(5, "waiting on bucket lease"); + yield set_sleeping(true); + } + } + + // if state is Init or Stopped, we query the remote RGW for its state + yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, sync_pair.source_bs.bucket, &info)); + if (retcode < 0) { + RELEASE_LOCK(bucket_lease_cr); + return set_cr_error(retcode); + } + if (info.syncstopped) { + // remote indicates stopped state + tn->log(20, "remote bilog indicates that sync was stopped"); + + // if state was incremental, remove all per-shard status objects + if (bucket_status.state == BucketSyncState::Incremental) { + yield { + const auto num_shards = bucket_status.shards_done_with_gen.size(); + const auto gen = bucket_status.incremental_gen; + call(new RemoveBucketShardStatusCollectCR(sc, sync_pair, gen, num_shards)); + } + } + + // check if local state is "stopped" + objv.clear(); + yield call(new ReadCR(dpp, env->driver, + status_obj, &bucket_status, false, &objv)); + if (retcode < 0) { + tn->log(20, SSTR("ERROR: failed to read status before writing 'stopped'.
error: " << retcode)); + RELEASE_LOCK(bucket_lease_cr); + return set_cr_error(retcode); + } + if (bucket_status.state != BucketSyncState::Stopped) { + // make sure that state is changed to stopped locally + bucket_status.state = BucketSyncState::Stopped; + yield call(new WriteCR(dpp, env->driver, status_obj, bucket_status, + &objv, false)); + if (retcode < 0) { + tn->log(20, SSTR("ERROR: failed to write 'stopped' status. error: " << retcode)); + RELEASE_LOCK(bucket_lease_cr); + return set_cr_error(retcode); + } + } + RELEASE_LOCK(bucket_lease_cr); + return set_cr_done(); + } + if (bucket_stopped) { + tn->log(20, SSTR("ERROR: switched from 'stop' to 'start' sync. while state is: " << bucket_status.state)); + bucket_stopped = false; + bucket_status.state = BucketSyncState::Init; + } + } + + if (bucket_status.state != BucketSyncState::Incremental) { + // if the state wasn't Incremental, take a bucket-wide lease to prevent + // different shards from duplicating the init and full sync + if (!bucket_lease_cr) { + bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj, + lock_name, lock_duration, this, &sc->lcc)); + yield spawn(bucket_lease_cr.get(), false); + while (!bucket_lease_cr->is_locked()) { + if (bucket_lease_cr->is_done()) { + tn->log(5, "failed to take lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(bucket_lease_cr->get_ret_status()); + } + tn->log(5, "waiting on bucket lease"); + yield set_sleeping(true); + } + } + + // reread the status after acquiring the lock + objv.clear(); + yield call(new ReadCR(dpp, env->driver, status_obj, + &bucket_status, false, &objv)); + if (retcode < 0) { + RELEASE_LOCK(bucket_lease_cr); + tn->log(20, SSTR("ERROR: reading the status after acquiring the lock failed.
error: " << retcode)); + return set_cr_error(retcode); + } + tn->log(20, SSTR("status after acquiring the lock is: " << bucket_status.state)); + + yield call(new InitBucketFullSyncStatusCR(sc, sync_pair, status_obj, + bucket_status, objv, + sync_pipe.source_bucket_info, + init_check_compat, info)); + + if (retcode < 0) { + tn->log(20, SSTR("ERROR: init full sync failed. error: " << retcode)); + RELEASE_LOCK(bucket_lease_cr); + return set_cr_error(retcode); + } + } + + assert(bucket_status.state == BucketSyncState::Incremental || + bucket_status.state == BucketSyncState::Full); + + if (bucket_status.state == BucketSyncState::Full) { + assert(bucket_lease_cr); + yield call(new RGWBucketFullSyncCR(sc, sync_pipe, status_obj, + bucket_lease_cr, bucket_status, + tn, objv)); + if (retcode < 0) { + tn->log(20, SSTR("ERROR: full sync failed. error: " << retcode)); + RELEASE_LOCK(bucket_lease_cr); + return set_cr_error(retcode); + } + } + + if (bucket_status.state == BucketSyncState::Incremental) { + // lease not required for incremental sync + RELEASE_LOCK(bucket_lease_cr); + + assert(sync_pair.source_bs.shard_id >= 0); + // if a specific gen was requested, compare that to the sync status + if (gen) { + current_gen = bucket_status.incremental_gen; + source_bs = sync_pair.source_bs; + if (*gen > current_gen) { + /* In case the data log entry is missing for previous gen, it may + * not be marked complete and the sync can get stuck. To avoid it, + * may be we can add this (shardid, gen) to error repo to force + * sync and mark that shard as completed. 
+ */ + pool = sc->env->svc->zone->get_zone_params().log_pool; + if ((static_cast(source_bs.shard_id) < bucket_status.shards_done_with_gen.size()) && + !bucket_status.shards_done_with_gen[source_bs.shard_id]) { + // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs + error_repo = datalog_oid_for_error_repo(sc, sc->env->driver, + pool, source_bs); + yield call(rgw::error_repo::write_cr(sc->env->driver->svc()->rados, error_repo, + rgw::error_repo::encode_key(source_bs, current_gen), + ceph::real_clock::zero())); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to log prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode)); + } else { + tn->log(20, SSTR("logged prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode)); + } + } + retcode = -EAGAIN; + tn->log(10, SSTR("ERROR: requested sync of future generation " + << *gen << " > " << current_gen + << ", returning " << retcode << " for later retry")); + return set_cr_error(retcode); + } else if (*gen < current_gen) { + tn->log(10, SSTR("WARNING: requested sync of past generation " + << *gen << " < " << current_gen + << ", returning success")); + return set_cr_done(); + } + } + + if (static_cast(sync_pair.source_bs.shard_id) >= bucket_status.shards_done_with_gen.size()) { + tn->log(1, SSTR("bucket shard " << sync_pair.source_bs << " index out of bounds")); + return set_cr_done(); // return success so we don't retry + } + if (bucket_status.shards_done_with_gen[sync_pair.source_bs.shard_id]) { + tn->log(10, SSTR("bucket shard " << sync_pair.source_bs << " of gen " << + gen << " already synced.")); + return set_cr_done(); + } + + yield call(new RGWSyncBucketShardCR(sc, data_lease_cr, sync_pair, + sync_pipe, bucket_stopped, + bucket_status.incremental_gen, tn, progress)); + if (retcode 
< 0) { + tn->log(20, SSTR("ERROR: incremental sync failed. error: " << retcode)); + return set_cr_error(retcode); + } + } + // loop back to previous states unless incremental sync returns normally + } while (bucket_status.state != BucketSyncState::Incremental || bucket_stopped); + + return set_cr_done(); + } + + return 0; +} + +int RGWBucketPipeSyncStatusManager::do_init(const DoutPrefixProvider *dpp, + std::ostream* ostr) +{ + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(this, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + + sync_module.reset(new RGWDefaultSyncModuleInstance()); + auto async_rados = driver->svc()->rados->get_async_processor(); + + sync_env.init(this, driver->ctx(), driver, + driver->svc(), async_rados, &http_manager, + error_logger.get(), driver->getRados()->get_sync_tracer(), + sync_module, nullptr); + + sync_env.ostr = ostr; + + rgw_sync_pipe_info_set pipes; + + ret = cr_mgr.run(dpp, new RGWGetBucketPeersCR(&sync_env, + dest_bucket, + source_zone, + source_bucket, + &pipes, + sync_env.sync_tracer->root_node)); + if (ret < 0) { + ldpp_dout(this, 0) << "failed to get bucket source peers info: (ret=" << ret << "): " << cpp_strerror(-ret) << dendl; + return ret; + } + + if (pipes.empty()) { + ldpp_dout(this, 0) << "No peers. This is not a valid multisite configuration." 
<< dendl; + return -EINVAL; + } + + for (auto& pipe : pipes) { + auto& szone = pipe.source.zone; + + auto conn = driver->svc()->zone->get_zone_conn(szone); + if (!conn) { + ldpp_dout(this, 0) << "connection object to zone " << szone << " does not exist" << dendl; + return -EINVAL; + } + + RGWZone* z; + if (!(z = driver->svc()->zone->find_zone(szone))) { + ldpp_dout(this, 0) << "zone " << szone << " does not exist" << dendl; + return -EINVAL; + } + sources.emplace_back(&sync_env, szone, conn, + pipe.source.get_bucket_info(), + pipe.target.get_bucket(), + pipe.handler, z->name); + } + + return 0; +} + +int RGWBucketPipeSyncStatusManager::remote_info(const DoutPrefixProvider *dpp, + source& s, + uint64_t* oldest_gen, + uint64_t* latest_gen, + uint64_t* num_shards) +{ + rgw_bucket_index_marker_info remote_info; + BucketIndexShardsManager remote_markers; + auto r = rgw_read_remote_bilog_info(dpp, s.sc.conn, s.info.bucket, + remote_info, remote_markers, + null_yield); + + if (r < 0) { + ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " rgw_read_remote_bilog_info: r=" + << r << dendl; + return r; + } + if (oldest_gen) + *oldest_gen = remote_info.oldest_gen; + + if (latest_gen) + *latest_gen = remote_info.latest_gen; + + if (num_shards) + *num_shards = remote_markers.get().size(); + + return 0; +} + +tl::expected, int> +RGWBucketPipeSyncStatusManager::construct( + const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* driver, + std::optional source_zone, + std::optional source_bucket, + const rgw_bucket& dest_bucket, + std::ostream* ostr) +{ + std::unique_ptr self{ + new RGWBucketPipeSyncStatusManager(driver, source_zone, source_bucket, + dest_bucket)}; + auto r = self->do_init(dpp, ostr); + if (r < 0) { + return tl::unexpected(r); + } + return self; +} + +int RGWBucketPipeSyncStatusManager::init_sync_status( + const DoutPrefixProvider *dpp) +{ + // Just running one at a time saves us from buildup/teardown and in + // practice we only do one zone at a 
time. + for (auto& source : sources) { + list stacks; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr); + pretty_print(source.sc.env, "Initializing sync state of bucket {} with zone {}.\n", + source.info.bucket.name, source.zone_name); + stack->call(new RGWSimpleRadosWriteCR( + dpp, source.sc.env->driver, + {sync_env.svc->zone->get_zone_params().log_pool, + full_status_oid(source.sc.source_zone, + source.info.bucket, + source.dest)}, + rgw_bucket_sync_status{})); + stacks.push_back(stack); + auto r = cr_mgr.run(dpp, stacks); + if (r < 0) { + pretty_print(source.sc.env, + "Initialization of sync state for bucket {} with zone {} " + "failed with error {}\n", + source.info.bucket.name, source.zone_name, cpp_strerror(r)); + } + } + return 0; +} + +tl::expected, int> +RGWBucketPipeSyncStatusManager::read_sync_status( + const DoutPrefixProvider *dpp) +{ + std::map sync_status; + list stacks; + + auto sz = sources.begin(); + + if (source_zone) { + sz = std::find_if(sources.begin(), sources.end(), + [this](const source& s) { + return s.sc.source_zone == *source_zone; + } + ); + if (sz == sources.end()) { + ldpp_dout(this, 0) << "ERROR: failed to find source zone: " + << *source_zone << dendl; + return tl::unexpected(-ENOENT); + } + } else { + ldpp_dout(this, 5) << "No source zone specified, using source zone: " + << sz->sc.source_zone << dendl; + return tl::unexpected(-ENOENT); + } + uint64_t num_shards, latest_gen; + auto ret = remote_info(dpp, *sz, nullptr, &latest_gen, &num_shards); + if (ret < 0) { + ldpp_dout(this, 5) << "Unable to get remote info: " + << ret << dendl; + return tl::unexpected(ret); + } + auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr); + std::vector pairs(num_shards); + for (auto shard = 0u; shard < num_shards; ++shard) { + auto& pair = pairs[shard]; + pair.source_bs.bucket = sz->info.bucket; + pair.dest_bucket = sz->dest; + pair.source_bs.shard_id = shard; + stack->call(new RGWReadBucketPipeSyncStatusCoroutine( 
+ &sz->sc, pair, &sync_status[shard], + nullptr, latest_gen)); + } + + stacks.push_back(stack); + + ret = cr_mgr.run(dpp, stacks); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to read sync status for " + << bucket_str{dest_bucket} << dendl; + return tl::unexpected(ret); + } + + return sync_status; +} + +namespace rgw::bucket_sync_run { +// Retry-loop over calls to sync_bucket_shard_cr +class ShardCR : public RGWCoroutine { + static constexpr auto allowed_retries = 10u; + + RGWDataSyncCtx& sc; + const rgw_bucket_sync_pair_info& pair; + const uint64_t gen; + unsigned retries = 0; + + ceph::real_time prev_progress; + ceph::real_time progress; + +public: + + ShardCR(RGWDataSyncCtx& sc, const rgw_bucket_sync_pair_info& pair, + const uint64_t gen) + : RGWCoroutine(sc.cct), sc(sc), pair(pair), gen(gen) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + // Since all errors (except ECANCELED) are considered retryable, + // retry other errors so long as we're making progress. 
+ for (retries = 0u, retcode = -EDOM; + (retries < allowed_retries) && (retcode != 0); + ++retries) { + ldpp_dout(dpp, 5) << "ShardCR: syncing bucket shard on: " + << "zone=" << sc.source_zone + << ", bucket=" << pair.source_bs.bucket.name + << ", shard=" << pair.source_bs.shard_id + << ", gen=" << gen + << dendl; + yield call(sync_bucket_shard_cr(&sc, nullptr, pair, gen, + sc.env->sync_tracer->root_node, + &progress)); + + if (retcode == -ECANCELED) { + ldpp_dout(dpp, -1) << "ERROR: Got -ECANCELED for " + << pair.source_bs << dendl; + drain_all(); + return set_cr_error(retcode); + } else if (retcode < 0) { + ldpp_dout(dpp, 5) << "WARNING: Got error, retcode=" << retcode << " for " + << pair.source_bs << "on retry " + << retries + 1 << " of " << allowed_retries + << " allowed" << dendl; + // Reset the retry counter if we made any progress + if (progress != prev_progress) { + retries = 0; + } + prev_progress = progress; + } + } + if (retcode < 0) { + ldpp_dout(dpp, -1) << "ERROR: Exhausted retries for " + << pair.source_bs << " retcode=" + << retcode << dendl; + drain_all(); + return set_cr_error(retcode); + } + + drain_all(); + return set_cr_done(); + } + return 0; + } +}; + +// Loop over calls to ShardCR with limited concurrency +class GenCR : public RGWShardCollectCR { + static constexpr auto MAX_CONCURRENT_SHARDS = 64; + + RGWDataSyncCtx& sc; + const uint64_t gen; + + std::vector pairs; + decltype(pairs)::const_iterator iter; + +public: + GenCR(RGWDataSyncCtx& sc, const rgw_bucket& source, const rgw_bucket& dest, + const uint64_t gen, const uint64_t shards, + const RGWBucketSyncFlowManager::pipe_handler& handler) + : RGWShardCollectCR(sc.cct, MAX_CONCURRENT_SHARDS), + sc(sc), gen(gen) { + pairs.resize(shards); + for (auto shard = 0u; shard < shards; ++shard) { + auto& pair = pairs[shard]; + pair.handler = handler; + pair.source_bs.bucket = source; + pair.dest_bucket = dest; + pair.source_bs.shard_id = shard; + } + iter = pairs.cbegin(); + assert(pairs.size() == 
shards); + } + + virtual bool spawn_next() override { + if (iter == pairs.cend()) { + return false; + } + spawn(new ShardCR(sc, *iter, gen), false); + ++iter; + return true; + } + + int handle_result(int r) override { + if (r < 0) { + ldpp_dout(sc.env->dpp, 4) << "ERROR: Error syncing shard: " + << cpp_strerror(r) << dendl; + } + return r; + } +}; + +// Read sync status, loop over calls to GenCR +class SourceCR : public RGWCoroutine { + RGWDataSyncCtx& sc; + const RGWBucketInfo& info; + const rgw_bucket& dest; + const RGWBucketSyncFlowManager::pipe_handler& handler; + const rgw_raw_obj status_obj{ + sc.env->svc->zone->get_zone_params().log_pool, + RGWBucketPipeSyncStatusManager::full_status_oid(sc.source_zone, info.bucket, + dest)}; + + BucketSyncState state = BucketSyncState::Incremental; + uint64_t gen = 0; + uint64_t num_shards = 0; + rgw_bucket_sync_status status; + std::string zone_name; + +public: + + SourceCR(RGWDataSyncCtx& sc, const RGWBucketInfo& info, + const rgw_bucket& dest, + const RGWBucketSyncFlowManager::pipe_handler& handler, + const std::string& zone_name) + : RGWCoroutine(sc.cct), sc(sc), info(info), dest(dest), handler(handler), + zone_name(zone_name) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + // Get the source's status. In incremental sync, this gives us + // the generation and shard count that is next needed to be run. + yield call(new RGWSimpleRadosReadCR( + dpp, sc.env->driver, status_obj, &status)); + if (retcode < 0) { + ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone=" + << sc.source_zone << " retcode=" + << retcode << dendl; + drain_all(); + return set_cr_error(retcode); + } + + if (status.state == BucketSyncState::Stopped) { + // Nothing to do. + pretty_print(sc.env, "Sync of bucket {} from source zone {} is in state Stopped. " + "Nothing to do.\n", dest.name, zone_name); + ldpp_dout(dpp, 5) << "SourceCR: Bucket is in state Stopped, returning." 
+ << dendl; + drain_all(); + return set_cr_done(); + } + + do { + state = status.state; + gen = status.incremental_gen; + num_shards = status.shards_done_with_gen.size(); + + ldpp_dout(dpp, 5) << "SourceCR: " + << "state=" << state + << ", gen=" << gen + << ", num_shards=" << num_shards + << dendl; + + // Special case to handle full sync. Since full sync no longer + // uses shards and has no generations, we sync shard zero, + // though use the current generation so a following + // incremental sync can carry on. + if (state != BucketSyncState::Incremental) { + pretty_print(sc.env, "Beginning full sync of bucket {} from source zone {}.\n", + dest.name, zone_name); + ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with " + << "gen=" << gen + << ", num_shards=" << 1 + << dendl; + yield call(new GenCR(sc, info.bucket, dest, gen, 1, handler)); + } else { + pretty_print(sc.env, "Beginning incremental sync of bucket {}, generation {} from source zone {}.\n", + dest.name, gen, zone_name); + ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with " + << "gen=" << gen + << ", num_shards=" << num_shards + << dendl; + yield call(new GenCR(sc, info.bucket, dest, gen, num_shards, + handler)); + } + if (retcode < 0) { + ldpp_dout(dpp, -1) << "ERROR: Giving up syncing from " + << sc.source_zone << " retcode=" + << retcode << dendl; + drain_all(); + return set_cr_error(retcode); + } + + pretty_print(sc.env, "Completed.\n"); + + yield call(new RGWSimpleRadosReadCR( + dpp, sc.env->driver, status_obj, &status)); + if (retcode < 0) { + ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone=" + << sc.source_zone << " retcode=" + << retcode << dendl; + drain_all(); + return set_cr_error(retcode); + } + // Repeat until we have done an incremental run and the + // generation remains unchanged. 
+ ldpp_dout(dpp, 5) << "SourceCR: " + << "state=" << state + << ", gen=" << gen + << ", num_shards=" << num_shards + << ", status.state=" << status.state + << ", status.incremental_gen=" << status.incremental_gen + << ", status.shards_done_with_gen.size()=" << status.shards_done_with_gen.size() + << dendl; + } while (state != BucketSyncState::Incremental || + gen != status.incremental_gen); + drain_all(); + return set_cr_done(); + } + return 0; + } +}; +} // namespace rgw::bucket_sync_run + +int RGWBucketPipeSyncStatusManager::run(const DoutPrefixProvider *dpp) +{ + list stacks; + for (auto& source : sources) { + auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr); + stack->call(new rgw::bucket_sync_run::SourceCR( + source.sc, source.info, source.dest, source.handler, + source.zone_name)); + stacks.push_back(stack); + } + auto ret = cr_mgr.run(dpp, stacks); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: Sync unsuccessful on bucket " + << bucket_str{dest_bucket} << dendl; + } + return ret; +} + +unsigned RGWBucketPipeSyncStatusManager::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWBucketPipeSyncStatusManager::gen_prefix(std::ostream& out) const +{ + auto zone = std::string_view{source_zone.value_or(rgw_zone_id("*")).id}; + return out << "bucket sync zone:" << zone.substr(0, 8) + << " bucket:" << dest_bucket << ' '; +} + +string RGWBucketPipeSyncStatusManager::full_status_oid(const rgw_zone_id& source_zone, + const rgw_bucket& source_bucket, + const rgw_bucket& dest_bucket) +{ + if (source_bucket == dest_bucket) { + return bucket_full_status_oid_prefix + "." + source_zone.id + ":" + + dest_bucket.get_key(); + } else { + return bucket_full_status_oid_prefix + "." + source_zone.id + ":" + + dest_bucket.get_key() + ":" + source_bucket.get_key(); + } +} + +inline std::string generation_token(uint64_t gen) { + return (gen == 0) ? 
"" : (":" + std::to_string(gen)); +} + +string RGWBucketPipeSyncStatusManager::inc_status_oid(const rgw_zone_id& source_zone, + const rgw_bucket_sync_pair_info& sync_pair, + uint64_t gen) +{ + if (sync_pair.source_bs.bucket == sync_pair.dest_bucket) { + return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.source_bs.get_key() + + generation_token(gen); + } else { + return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.dest_bucket.get_key() + ":" + sync_pair.source_bs.get_key() + + generation_token(gen); + } +} + +string RGWBucketPipeSyncStatusManager::obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe, + const rgw_zone_id& source_zone, + const rgw_obj& obj) +{ + string prefix = object_status_oid_prefix + "." + source_zone.id + ":" + obj.bucket.get_key(); + if (sync_pipe.source_bucket_info.bucket != + sync_pipe.dest_bucket_info.bucket) { + prefix += string("/") + sync_pipe.dest_bucket_info.bucket.get_key(); + } + return prefix + ":" + obj.key.name + ":" + obj.key.instance; +} + +int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp, + RGWRESTConn* conn, + const rgw_bucket& bucket, + rgw_bucket_index_marker_info& info, + BucketIndexShardsManager& markers, + optional_yield y) +{ + const auto instance_key = bucket.get_key(); + const rgw_http_param_pair params[] = { + { "type" , "bucket-index" }, + { "bucket-instance", instance_key.c_str() }, + { "info" , nullptr }, + { nullptr, nullptr } + }; + int r = conn->get_json_resource(dpp, "/admin/log/", params, y, info); + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl; + return r; + } + // parse shard markers + r = markers.from_string(info.max_marker, -1); + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to decode remote log markers" << dendl; + return r; + } + return 0; +} + +class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR { + static constexpr int max_concurrent_shards = 16; + rgw::sal::RadosStore* const 
driver; + RGWDataSyncCtx *const sc; + RGWDataSyncEnv *const env; + const uint64_t gen; + + rgw_bucket_sync_pair_info sync_pair; + using Vector = std::vector; + Vector::iterator i, end; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to read bucket shard sync status: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + RGWCollectBucketSyncStatusCR(rgw::sal::RadosStore* driver, RGWDataSyncCtx *sc, + const rgw_bucket_sync_pair_info& sync_pair, + uint64_t gen, + Vector *status) + : RGWShardCollectCR(sc->cct, max_concurrent_shards), + driver(driver), sc(sc), env(sc->env), gen(gen), sync_pair(sync_pair), + i(status->begin()), end(status->end()) + {} + + bool spawn_next() override { + if (i == end) { + return false; + } + spawn(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &*i, nullptr, gen), false); + ++i; + ++sync_pair.source_bs.shard_id; + return true; + } +}; + +int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *driver, + const rgw_sync_bucket_pipe& pipe, + rgw_bucket_sync_status *status, + optional_yield y) +{ + auto get_oid = RGWBucketPipeSyncStatusManager::full_status_oid; + const rgw_raw_obj obj{driver->svc()->zone->get_zone_params().log_pool, + get_oid(*pipe.source.zone, *pipe.source.bucket, *pipe.dest.bucket)}; + + auto svc = driver->svc()->sysobj; + auto sysobj = svc->get_obj(obj); + bufferlist bl; + int ret = sysobj.rop().read(dpp, &bl, y); + if (ret < 0) + return ret; + + try { + auto iter = bl.cbegin(); + using ceph::decode; + rgw_bucket_sync_status result; + decode(result, iter); + *status = result; + return 0; + } catch (const buffer::error& err) { + lderr(svc->ctx()) << "error decoding " << obj << ": " << err.what() << dendl; + return -EIO; + } +} + +int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *driver, + const rgw_sync_bucket_pipe& pipe, + 
uint64_t gen, + std::vector *status) +{ + if (!pipe.source.zone || + !pipe.source.bucket || + !pipe.dest.zone || + !pipe.dest.bucket) { + return -EINVAL; + } + + rgw_bucket_sync_pair_info sync_pair; + sync_pair.source_bs.bucket = *pipe.source.bucket; + sync_pair.source_bs.shard_id = 0; + sync_pair.dest_bucket = *pipe.dest.bucket; + + RGWDataSyncEnv env; + RGWSyncModuleInstanceRef module; // null sync module + env.init(dpp, driver->ctx(), driver, driver->svc(), driver->svc()->rados->get_async_processor(), + nullptr, nullptr, nullptr, module, nullptr); + + RGWDataSyncCtx sc; + sc.init(&env, nullptr, *pipe.source.zone); + + RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry()); + return crs.run(dpp, new RGWCollectBucketSyncStatusCR(driver, &sc, + sync_pair, + gen, + status)); +} + +void rgw_data_sync_info::generate_test_instances(list& o) +{ + auto info = new rgw_data_sync_info; + info->state = rgw_data_sync_info::StateBuildingFullSyncMaps; + info->num_shards = 8; + o.push_back(info); + o.push_back(new rgw_data_sync_info); +} + +void rgw_data_sync_marker::generate_test_instances(list& o) +{ + auto marker = new rgw_data_sync_marker; + marker->state = rgw_data_sync_marker::IncrementalSync; + marker->marker = "01234"; + marker->pos = 5; + o.push_back(marker); + o.push_back(new rgw_data_sync_marker); +} + +void rgw_data_sync_status::generate_test_instances(list& o) +{ + o.push_back(new rgw_data_sync_status); +} + +void rgw_bucket_shard_full_sync_marker::dump(Formatter *f) const +{ + encode_json("position", position, f); + encode_json("count", count, f); +} + +void rgw_bucket_shard_inc_sync_marker::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("position", position, obj); + JSONDecoder::decode_json("timestamp", timestamp, obj); +} + +void rgw_bucket_shard_inc_sync_marker::dump(Formatter *f) const +{ + encode_json("position", position, f); + encode_json("timestamp", timestamp, f); +} + +void rgw_bucket_shard_sync_info::decode_json(JSONObj 
*obj) +{ + std::string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "full-sync") { + state = StateFullSync; + } else if (s == "incremental-sync") { + state = StateIncrementalSync; + } else if (s == "stopped") { + state = StateStopped; + } else { + state = StateInit; + } + JSONDecoder::decode_json("inc_marker", inc_marker, obj); +} + +void rgw_bucket_shard_full_sync_marker::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("position", position, obj); + JSONDecoder::decode_json("count", count, obj); +} + +void rgw_bucket_shard_sync_info::dump(Formatter *f) const +{ + const char *s{nullptr}; + switch ((SyncState)state) { + case StateInit: + s = "init"; + break; + case StateFullSync: + s = "full-sync"; + break; + case StateIncrementalSync: + s = "incremental-sync"; + break; + case StateStopped: + s = "stopped"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("inc_marker", inc_marker, f); +} + +void rgw_bucket_full_sync_status::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("position", position, obj); + JSONDecoder::decode_json("count", count, obj); +} + +void rgw_bucket_full_sync_status::dump(Formatter *f) const +{ + encode_json("position", position, f); + encode_json("count", count, f); +} + +void encode_json(const char *name, BucketSyncState state, Formatter *f) +{ + switch (state) { + case BucketSyncState::Init: + encode_json(name, "init", f); + break; + case BucketSyncState::Full: + encode_json(name, "full-sync", f); + break; + case BucketSyncState::Incremental: + encode_json(name, "incremental-sync", f); + break; + case BucketSyncState::Stopped: + encode_json(name, "stopped", f); + break; + default: + encode_json(name, "unknown", f); + break; + } +} + +void decode_json_obj(BucketSyncState& state, JSONObj *obj) +{ + std::string s; + decode_json_obj(s, obj); + if (s == "full-sync") { + state = BucketSyncState::Full; + } else if (s == "incremental-sync") { + state = 
BucketSyncState::Incremental; + } else if (s == "stopped") { + state = BucketSyncState::Stopped; + } else { + state = BucketSyncState::Init; + } +} + +void rgw_bucket_sync_status::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("state", state, obj); + JSONDecoder::decode_json("full", full, obj); + JSONDecoder::decode_json("incremental_gen", incremental_gen, obj); +} + +void rgw_bucket_sync_status::dump(Formatter *f) const +{ + encode_json("state", state, f); + encode_json("full", full, f); + encode_json("incremental_gen", incremental_gen, f); +} + + +void bilog_status_v2::dump(Formatter *f) const +{ + encode_json("sync_status", sync_status, f); + encode_json("inc_status", inc_status, f); +} + +void bilog_status_v2::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("sync_status", sync_status, obj); + JSONDecoder::decode_json("inc_status", inc_status, obj); +} diff --git a/src/rgw/driver/rados/rgw_data_sync.h b/src/rgw/driver/rados/rgw_data_sync.h new file mode 100644 index 000000000..b9a39343f --- /dev/null +++ b/src/rgw/driver/rados/rgw_data_sync.h @@ -0,0 +1,868 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include + +#include "include/encoding.h" + +#include "common/ceph_json.h" +#include "common/likely.h" + +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_http_client.h" +#include "rgw_sal_rados.h" + +#include "rgw_datalog.h" +#include "rgw_sync.h" +#include "rgw_sync_module.h" +#include "rgw_sync_trace.h" +#include "rgw_sync_policy.h" + +#include "rgw_bucket_sync.h" + +// represents an obligation to sync an entry up a given time +struct rgw_data_sync_obligation { + rgw_bucket_shard bs; + std::optional gen; + std::string marker; + ceph::real_time timestamp; + bool retry = false; +}; + +inline std::ostream& operator<<(std::ostream& out, const rgw_data_sync_obligation& o) { + out << "key=" << o.bs; + if (o.gen) { + out << '[' << *o.gen << 
']'; + } + if (!o.marker.empty()) { + out << " marker=" << o.marker; + } + if (o.timestamp != ceph::real_time{}) { + out << " timestamp=" << o.timestamp; + } + if (o.retry) { + out << " retry"; + } + return out; +} + +class JSONObj; +struct rgw_sync_bucket_pipe; + +struct rgw_bucket_sync_pair_info { + RGWBucketSyncFlowManager::pipe_handler handler; /* responsible for sync filters */ + rgw_bucket_shard source_bs; + rgw_bucket dest_bucket; +}; + +inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pair_info& p) { + if (p.source_bs.bucket == p.dest_bucket) { + return out << p.source_bs; + } + return out << p.source_bs << "->" << p.dest_bucket; +} + +struct rgw_bucket_sync_pipe { + rgw_bucket_sync_pair_info info; + RGWBucketInfo source_bucket_info; + std::map source_bucket_attrs; + RGWBucketInfo dest_bucket_info; + std::map dest_bucket_attrs; + + RGWBucketSyncFlowManager::pipe_rules_ref& get_rules() { + return info.handler.rules; + } +}; + +inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pipe& p) { + return out << p.info; +} + +struct rgw_datalog_info { + uint32_t num_shards; + + rgw_datalog_info() : num_shards(0) {} + + void decode_json(JSONObj *obj); +}; + +struct rgw_data_sync_info { + enum SyncState { + StateInit = 0, + StateBuildingFullSyncMaps = 1, + StateSync = 2, + }; + + uint16_t state; + uint32_t num_shards; + + uint64_t instance_id{0}; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(num_shards, bl); + encode(instance_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(state, bl); + decode(num_shards, bl); + if (struct_v >= 2) { + decode(instance_id, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + std::string s; + switch ((SyncState)state) { + case StateInit: + s = "init"; + break; + case StateBuildingFullSyncMaps: + s = "building-full-sync-maps"; + break; + case StateSync: + s = 
"sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("num_shards", num_shards, f); + encode_json("instance_id", instance_id, f); + } + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "building-full-sync-maps") { + state = StateBuildingFullSyncMaps; + } else if (s == "sync") { + state = StateSync; + } else { + state = StateInit; + } + JSONDecoder::decode_json("num_shards", num_shards, obj); + JSONDecoder::decode_json("instance_id", instance_id, obj); + } + static void generate_test_instances(std::list& o); + + rgw_data_sync_info() : state((int)StateInit), num_shards(0) {} +}; +WRITE_CLASS_ENCODER(rgw_data_sync_info) + +struct rgw_data_sync_marker { + enum SyncState { + FullSync = 0, + IncrementalSync = 1, + }; + uint16_t state; + std::string marker; + std::string next_step_marker; + uint64_t total_entries; + uint64_t pos; + real_time timestamp; + + rgw_data_sync_marker() : state(FullSync), total_entries(0), pos(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(state, bl); + encode(marker, bl); + encode(next_step_marker, bl); + encode(total_entries, bl); + encode(pos, bl); + encode(timestamp, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(state, bl); + decode(marker, bl); + decode(next_step_marker, bl); + decode(total_entries, bl); + decode(pos, bl); + decode(timestamp, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + const char *s{nullptr}; + switch ((SyncState)state) { + case FullSync: + s = "full-sync"; + break; + case IncrementalSync: + s = "incremental-sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("marker", marker, f); + encode_json("next_step_marker", next_step_marker, f); + encode_json("total_entries", total_entries, f); + encode_json("pos", pos, f); + encode_json("timestamp", 
utime_t(timestamp), f); + } + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "full-sync") { + state = FullSync; + } else if (s == "incremental-sync") { + state = IncrementalSync; + } + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("next_step_marker", next_step_marker, obj); + JSONDecoder::decode_json("total_entries", total_entries, obj); + JSONDecoder::decode_json("pos", pos, obj); + utime_t t; + JSONDecoder::decode_json("timestamp", t, obj); + timestamp = t.to_real_time(); + } + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(rgw_data_sync_marker) + +struct rgw_data_sync_status { + rgw_data_sync_info sync_info; + std::map sync_markers; + + rgw_data_sync_status() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(sync_info, bl); + /* sync markers are encoded separately */ + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(sync_info, bl); + /* sync markers are decoded separately */ + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + encode_json("info", sync_info, f); + encode_json("markers", sync_markers, f); + } + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("info", sync_info, obj); + JSONDecoder::decode_json("markers", sync_markers, obj); + } + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(rgw_data_sync_status) + +struct rgw_datalog_entry { + std::string key; + ceph::real_time timestamp; + + void decode_json(JSONObj *obj); +}; + +struct rgw_datalog_shard_data { + std::string marker; + bool truncated; + std::vector entries; + + void decode_json(JSONObj *obj); +}; + +class RGWAsyncRadosProcessor; +class RGWDataSyncControlCR; + +struct rgw_bucket_entry_owner { + std::string id; + std::string display_name; + + rgw_bucket_entry_owner() {} + rgw_bucket_entry_owner(const std::string& _id, const std::string& 
_display_name) : id(_id), display_name(_display_name) {} + + void decode_json(JSONObj *obj); +}; + +class RGWSyncErrorLogger; +class RGWRESTConn; +class RGWServices; + +struct RGWDataSyncEnv { + const DoutPrefixProvider *dpp{nullptr}; + CephContext *cct{nullptr}; + rgw::sal::RadosStore* driver{nullptr}; + RGWServices *svc{nullptr}; + RGWAsyncRadosProcessor *async_rados{nullptr}; + RGWHTTPManager *http_manager{nullptr}; + RGWSyncErrorLogger *error_logger{nullptr}; + RGWSyncTraceManager *sync_tracer{nullptr}; + RGWSyncModuleInstanceRef sync_module{nullptr}; + PerfCounters* counters{nullptr}; + + RGWDataSyncEnv() {} + + void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _driver, RGWServices *_svc, + RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager, + RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer, + RGWSyncModuleInstanceRef& _sync_module, + PerfCounters* _counters) { + dpp = _dpp; + cct = _cct; + driver = _driver; + svc = _svc; + async_rados = _async_rados; + http_manager = _http_manager; + error_logger = _error_logger; + sync_tracer = _sync_tracer; + sync_module = _sync_module; + counters = _counters; + } + + std::string shard_obj_name(int shard_id); + std::string status_oid(); + + std::ostream* ostr{nullptr}; // For pretty printing progress +}; + +// pretty ostream output for `radosgw-admin bucket sync run` +#if FMT_VERSION >= 90000 +template +void pretty_print(const RGWDataSyncEnv* env, fmt::format_string fmt, T&& ...t) { +#else +template +void pretty_print(const RGWDataSyncEnv* env, const S& fmt, T&& ...t) { +#endif + if (unlikely(!!env->ostr)) { + fmt::print(*env->ostr, fmt, std::forward(t)...); + env->ostr->flush(); + } +} + +/// \brief Adjust concurrency based on latency +/// +/// Keep a running average of operation latency and scale concurrency +/// down when latency rises. 
+class LatencyConcurrencyControl : public LatencyMonitor { + static constexpr auto dout_subsys = ceph_subsys_rgw; + ceph::coarse_mono_time last_warning; +public: + CephContext* cct; + + LatencyConcurrencyControl(CephContext* cct) + : cct(cct) {} + + /// \brief Lower concurrency when latency rises + /// + /// Since we have multiple spawn windows (data sync overall and + /// bucket), accept a number of concurrent operations to spawn and, + /// if latency is high, cut it in half. If latency is really high, + /// cut it to 1. + int64_t adj_concurrency(int64_t concurrency) { + using namespace std::literals; + auto threshold = (cct->_conf->rgw_sync_lease_period * 1s) / 12; + + if (avg_latency() >= 2 * threshold) [[unlikely]] { + auto now = ceph::coarse_mono_clock::now(); + if (now - last_warning > 5min) { + ldout(cct, -1) + << "WARNING: The OSD cluster is overloaded and struggling to " + << "complete ops. You need more capacity to serve this level " + << "of demand." << dendl; + last_warning = now; + } + return 1; + } else if (avg_latency() >= threshold) [[unlikely]] { + return concurrency / 2; + } else [[likely]] { + return concurrency; + } + } +}; + +struct RGWDataSyncCtx { + RGWDataSyncEnv *env{nullptr}; + CephContext *cct{nullptr}; + + RGWRESTConn *conn{nullptr}; + rgw_zone_id source_zone; + + LatencyConcurrencyControl lcc{nullptr}; + + RGWDataSyncCtx() = default; + + RGWDataSyncCtx(RGWDataSyncEnv* env, + RGWRESTConn* conn, + const rgw_zone_id& source_zone) + : env(env), cct(env->cct), conn(conn), source_zone(source_zone), lcc(cct) {} + + void init(RGWDataSyncEnv *_env, + RGWRESTConn *_conn, + const rgw_zone_id& _source_zone) { + cct = _env->cct; + env = _env; + conn = _conn; + source_zone = _source_zone; + lcc.cct = cct; + } +}; + +class RGWRados; + +class RGWRemoteDataLog : public RGWCoroutinesManager { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* driver; + CephContext *cct; + RGWCoroutinesManagerRegistry *cr_registry; + RGWAsyncRadosProcessor 
*async_rados; + RGWHTTPManager http_manager; + + RGWDataSyncEnv sync_env; + RGWDataSyncCtx sc; + + ceph::shared_mutex lock = ceph::make_shared_mutex("RGWRemoteDataLog::lock"); + RGWDataSyncControlCR *data_sync_cr; + + RGWSyncTraceNodeRef tn; + + bool initialized; + +public: + RGWRemoteDataLog(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* _store, + RGWAsyncRadosProcessor *async_rados); + int init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger, + RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& module, + PerfCounters* _counters); + void finish(); + + int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info); + int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map *shards_info); + int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map shard_markers, std::map *result); + int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status); + int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set& recovering_shards); + int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set& lagging_buckets,std::set& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries); + int init_sync_status(const DoutPrefixProvider *dpp, int num_shards); + int run_sync(const DoutPrefixProvider *dpp, int num_shards); + + void wakeup(int shard_id, bc::flat_set& entries); +}; + +class RGWDataSyncStatusManager : public DoutPrefixProvider { + rgw::sal::RadosStore* driver; + + rgw_zone_id source_zone; + RGWRESTConn *conn; + RGWSyncErrorLogger *error_logger; + RGWSyncModuleInstanceRef sync_module; + PerfCounters* counters; + + RGWRemoteDataLog source_log; + + std::string source_status_oid; + std::string source_shard_status_oid_prefix; + + std::map shard_objs; + + int num_shards; + +public: + RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados, + const 
rgw_zone_id& _source_zone, PerfCounters* counters) + : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL), + sync_module(nullptr), counters(counters), + source_log(this, driver, async_rados), num_shards(0) {} + RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados, + const rgw_zone_id& _source_zone, PerfCounters* counters, + const RGWSyncModuleInstanceRef& _sync_module) + : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL), + sync_module(_sync_module), counters(counters), + source_log(this, driver, async_rados), num_shards(0) {} + ~RGWDataSyncStatusManager() { + finalize(); + } + int init(const DoutPrefixProvider *dpp); + void finalize(); + + static std::string shard_obj_name(const rgw_zone_id& source_zone, int shard_id); + static std::string sync_status_oid(const rgw_zone_id& source_zone); + + int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status) { + return source_log.read_sync_status(dpp, sync_status); + } + + int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set& recovering_shards) { + return source_log.read_recovering_shards(dpp, num_shards, recovering_shards); + } + + int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set& lagging_buckets, std::set& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) { + return source_log.read_shard_status(dpp, shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries); + } + int init_sync_status(const DoutPrefixProvider *dpp) { return source_log.init_sync_status(dpp, num_shards); } + + int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info) { + return source_log.read_log_info(dpp, log_info); + } + int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map *shards_info) { + return source_log.read_source_log_shards_info(dpp, shards_info); + } + int read_source_log_shards_next(const 
DoutPrefixProvider *dpp, std::map shard_markers, std::map *result) { + return source_log.read_source_log_shards_next(dpp, shard_markers, result); + } + + int run(const DoutPrefixProvider *dpp) { return source_log.run_sync(dpp, num_shards); } + + void wakeup(int shard_id, bc::flat_set& entries) { return source_log.wakeup(shard_id, entries); } + + void stop() { + source_log.finish(); + } + + // implements DoutPrefixProvider + CephContext *get_cct() const override; + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; +}; + +class RGWBucketPipeSyncStatusManager; +class RGWBucketSyncCR; + +struct rgw_bucket_shard_full_sync_marker { + rgw_obj_key position; + uint64_t count; + + rgw_bucket_shard_full_sync_marker() : count(0) {} + + void encode_attr(std::map& attrs); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(position, bl); + encode(count, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(position, bl); + decode(count, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_shard_full_sync_marker) + +struct rgw_bucket_shard_inc_sync_marker { + std::string position; + ceph::real_time timestamp; + + void encode_attr(std::map& attrs); + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(position, bl); + encode(timestamp, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(position, bl); + if (struct_v >= 2) { + decode(timestamp, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_shard_inc_sync_marker) + +struct rgw_bucket_shard_sync_info { + enum SyncState { + StateInit = 0, + StateFullSync = 1, + StateIncrementalSync = 2, + StateStopped = 3, + }; + + uint16_t state; + 
rgw_bucket_shard_inc_sync_marker inc_marker; + + void decode_from_attrs(CephContext *cct, std::map& attrs); + void encode_all_attrs(std::map& attrs); + void encode_state_attr(std::map& attrs); + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(inc_marker, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(state, bl); + if (struct_v <= 1) { + rgw_bucket_shard_full_sync_marker full_marker; + decode(full_marker, bl); + } + decode(inc_marker, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + rgw_bucket_shard_sync_info() : state((int)StateInit) {} + +}; +WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info) + +struct rgw_bucket_full_sync_status { + rgw_obj_key position; + uint64_t count = 0; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(position, bl); + encode(count, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(position, bl); + decode(count, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_full_sync_status) + +enum class BucketSyncState : uint8_t { + Init = 0, + Full, + Incremental, + Stopped, +}; +inline std::ostream& operator<<(std::ostream& out, const BucketSyncState& s) { + switch (s) { + case BucketSyncState::Init: out << "init"; break; + case BucketSyncState::Full: out << "full"; break; + case BucketSyncState::Incremental: out << "incremental"; break; + case BucketSyncState::Stopped: out << "stopped"; break; + } + return out; +} + +void encode_json(const char *name, BucketSyncState state, Formatter *f); +void decode_json_obj(BucketSyncState& state, JSONObj *obj); + +struct rgw_bucket_sync_status { + BucketSyncState state = BucketSyncState::Init; + rgw_bucket_full_sync_status full; + uint64_t incremental_gen = 0; + std::vector 
shards_done_with_gen; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(full, bl); + encode(incremental_gen, bl); + encode(shards_done_with_gen, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(state, bl); + decode(full, bl); + if (struct_v > 1) { + decode(incremental_gen, bl); + decode(shards_done_with_gen, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_sync_status) + +struct bilog_status_v2 { + rgw_bucket_sync_status sync_status; + std::vector inc_status; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +struct store_gen_shards { + uint64_t gen = 0; + uint32_t num_shards = 0; + + void dump(Formatter *f) const { + encode_json("gen", gen, f); + encode_json("num_shards", num_shards, f); + } + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("gen", gen, obj); + JSONDecoder::decode_json("num_shards", num_shards, obj); + } +}; + +struct rgw_bucket_index_marker_info { + std::string bucket_ver; + std::string master_ver; + std::string max_marker; + bool syncstopped{false}; + uint64_t oldest_gen = 0; + uint64_t latest_gen = 0; + std::vector generations; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket_ver", bucket_ver, obj); + JSONDecoder::decode_json("master_ver", master_ver, obj); + JSONDecoder::decode_json("max_marker", max_marker, obj); + JSONDecoder::decode_json("syncstopped", syncstopped, obj); + JSONDecoder::decode_json("oldest_gen", oldest_gen, obj); + JSONDecoder::decode_json("latest_gen", latest_gen, obj); + JSONDecoder::decode_json("generations", generations, obj); + } +}; + + +class BucketIndexShardsManager; + +int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp, + RGWRESTConn* conn, + const rgw_bucket& bucket, + rgw_bucket_index_marker_info& info, + BucketIndexShardsManager& markers, + 
optional_yield y); + +class RGWBucketPipeSyncStatusManager : public DoutPrefixProvider { + rgw::sal::RadosStore* driver; + + RGWDataSyncEnv sync_env; + + RGWCoroutinesManager cr_mgr{driver->ctx(), + driver->getRados()->get_cr_registry()}; + + RGWHTTPManager http_manager{driver->ctx(), cr_mgr.get_completion_mgr()}; + + std::optional source_zone; + std::optional source_bucket; + + std::unique_ptr error_logger = + std::make_unique(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, + ERROR_LOGGER_SHARDS); + RGWSyncModuleInstanceRef sync_module; + + rgw_bucket dest_bucket; + + struct source { + RGWDataSyncCtx sc; + RGWBucketInfo info; + rgw_bucket dest; + RGWBucketSyncFlowManager::pipe_handler handler; + std::string zone_name; + + source(RGWDataSyncEnv* env, const rgw_zone_id& zone, RGWRESTConn* conn, + const RGWBucketInfo& info, const rgw_bucket& dest, + const RGWBucketSyncFlowManager::pipe_handler& handler, + const std::string& zone_name) + : sc(env, conn, zone), info(info), dest(dest), handler(handler), + zone_name(zone_name) {} + }; + std::vector sources; + + int do_init(const DoutPrefixProvider *dpp, std::ostream* ostr); + RGWBucketPipeSyncStatusManager(rgw::sal::RadosStore* driver, + std::optional source_zone, + std::optional source_bucket, + const rgw_bucket& dest_bucket) + : driver(driver), source_zone(source_zone), source_bucket(source_bucket), + dest_bucket(dest_bucket) {} + + int remote_info(const DoutPrefixProvider *dpp, source& s, + uint64_t* oldest_gen, uint64_t* latest_gen, + uint64_t* num_shards); +public: + static tl::expected, int> + construct(const DoutPrefixProvider* dpp, rgw::sal::RadosStore* driver, + std::optional source_zone, + std::optional source_bucket, + const rgw_bucket& dest_bucket, std::ostream *ostream); + ~RGWBucketPipeSyncStatusManager() = default; + + + static std::string full_status_oid(const rgw_zone_id& source_zone, + const rgw_bucket& source_bucket, + const rgw_bucket& dest_bucket); + static std::string inc_status_oid(const rgw_zone_id& 
source_zone, + const rgw_bucket_sync_pair_info& bs, + uint64_t gen); + // specific source obj sync status, can be used by sync modules + static std::string obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe, + const rgw_zone_id& source_zone, + const rgw_obj& obj); + + // implements DoutPrefixProvider + CephContext *get_cct() const override; + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + + int init_sync_status(const DoutPrefixProvider *dpp); + tl::expected, int> read_sync_status( + const DoutPrefixProvider *dpp); + int run(const DoutPrefixProvider *dpp); +}; + +/// read the full sync status with respect to a source bucket +int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *driver, + const rgw_sync_bucket_pipe& pipe, + rgw_bucket_sync_status *status, + optional_yield y); + +/// read the incremental sync status of all bucket shards from the given source zone +int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *driver, + const rgw_sync_bucket_pipe& pipe, + uint64_t gen, + std::vector *status); + +class RGWDefaultSyncModule : public RGWSyncModule { +public: + RGWDefaultSyncModule() {} + bool supports_writes() override { return true; } + bool supports_data_export() override { return true; } + int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +class RGWArchiveSyncModule : public RGWDefaultSyncModule { +public: + RGWArchiveSyncModule() {} + bool supports_writes() override { return true; } + bool supports_data_export() override { return false; } + int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; diff --git a/src/rgw/driver/rados/rgw_datalog.cc b/src/rgw/driver/rados/rgw_datalog.cc new file mode 100644 index 000000000..7ca37abf6 --- 
/dev/null +++ b/src/rgw/driver/rados/rgw_datalog.cc @@ -0,0 +1,1090 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include "common/async/yield_context.h" +#include "common/debug.h" +#include "common/containers.h" +#include "common/errno.h" +#include "common/error_code.h" + +#include "common/async/blocked_completion.h" +#include "common/async/librados_completion.h" + +#include "cls/fifo/cls_fifo_types.h" +#include "cls/log/cls_log_client.h" + +#include "cls_fifo_legacy.h" +#include "rgw_bucket_layout.h" +#include "rgw_datalog.h" +#include "rgw_log_backing.h" +#include "rgw_tools.h" + +#define dout_context g_ceph_context +static constexpr auto dout_subsys = ceph_subsys_rgw; + +namespace bs = boost::system; +namespace lr = librados; + +using ceph::containers::tiny_vector; + +void rgw_data_change::dump(ceph::Formatter *f) const +{ + std::string type; + switch (entity_type) { + case ENTITY_TYPE_BUCKET: + type = "bucket"; + break; + default: + type = "unknown"; + } + encode_json("entity_type", type, f); + encode_json("key", key, f); + utime_t ut(timestamp); + encode_json("timestamp", ut, f); + encode_json("gen", gen, f); +} + +void rgw_data_change::decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("entity_type", s, obj); + if (s == "bucket") { + entity_type = ENTITY_TYPE_BUCKET; + } else { + entity_type = ENTITY_TYPE_UNKNOWN; + } + JSONDecoder::decode_json("key", key, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); + JSONDecoder::decode_json("gen", gen, obj); +} + +void rgw_data_change_log_entry::dump(Formatter *f) const +{ + encode_json("log_id", log_id, f); + utime_t ut(log_timestamp); + encode_json("log_timestamp", ut, f); + encode_json("entry", entry, f); +} + +void rgw_data_change_log_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("log_id", log_id, obj); + utime_t ut; + 
JSONDecoder::decode_json("log_timestamp", ut, obj); + log_timestamp = ut.to_real_time(); + JSONDecoder::decode_json("entry", entry, obj); +} + +void rgw_data_notify_entry::dump(Formatter *f) const +{ + encode_json("key", key, f); + encode_json("gen", gen, f); +} + +void rgw_data_notify_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("gen", gen, obj); +} + +class RGWDataChangesOmap final : public RGWDataChangesBE { + using centries = std::list; + std::vector oids; + +public: + RGWDataChangesOmap(lr::IoCtx& ioctx, + RGWDataChangesLog& datalog, + uint64_t gen_id, + int num_shards) + : RGWDataChangesBE(ioctx, datalog, gen_id) { + oids.reserve(num_shards); + for (auto i = 0; i < num_shards; ++i) { + oids.push_back(get_oid(i)); + } + } + ~RGWDataChangesOmap() override = default; + + void prepare(ceph::real_time ut, const std::string& key, + ceph::buffer::list&& entry, entries& out) override { + if (!std::holds_alternative(out)) { + ceph_assert(std::visit([](const auto& v) { return std::empty(v); }, out)); + out = centries(); + } + + cls_log_entry e; + cls_log_add_prepare_entry(e, utime_t(ut), {}, key, entry); + std::get(out).push_back(std::move(e)); + } + int push(const DoutPrefixProvider *dpp, int index, entries&& items, optional_yield y) override { + lr::ObjectWriteOperation op; + cls_log_add(op, std::get(items), true); + auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to push to " << oids[index] << cpp_strerror(-r) + << dendl; + } + return r; + } + int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now, + const std::string& key, ceph::buffer::list&& bl, + optional_yield y) override { + lr::ObjectWriteOperation op; + cls_log_add(op, utime_t(now), {}, key, bl); + auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to push to " << 
oids[index] + << cpp_strerror(-r) << dendl; + } + return r; + } + int list(const DoutPrefixProvider *dpp, int index, int max_entries, + std::vector& entries, + std::optional marker, + std::string* out_marker, bool* truncated, + optional_yield y) override { + std::list log_entries; + lr::ObjectReadOperation op; + cls_log_list(op, {}, {}, std::string(marker.value_or("")), + max_entries, log_entries, out_marker, truncated); + auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, y); + if (r == -ENOENT) { + *truncated = false; + return 0; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to list " << oids[index] + << cpp_strerror(-r) << dendl; + return r; + } + for (auto iter = log_entries.begin(); iter != log_entries.end(); ++iter) { + rgw_data_change_log_entry log_entry; + log_entry.log_id = iter->id; + auto rt = iter->timestamp.to_real_time(); + log_entry.log_timestamp = rt; + auto liter = iter->data.cbegin(); + try { + decode(log_entry.entry, liter); + } catch (ceph::buffer::error& err) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to decode data changes log entry: " + << err.what() << dendl; + return -EIO; + } + entries.push_back(log_entry); + } + return 0; + } + int get_info(const DoutPrefixProvider *dpp, int index, + RGWDataChangesLogInfo *info, optional_yield y) override { + cls_log_header header; + lr::ObjectReadOperation op; + cls_log_info(op, &header); + auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, y); + if (r == -ENOENT) r = 0; + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to get info from " << oids[index] + << cpp_strerror(-r) << dendl; + } else { + info->marker = header.max_marker; + info->last_update = header.max_time.to_real_time(); + } + return r; + } + int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker, + optional_yield y) override { + lr::ObjectWriteOperation op; + cls_log_trim(op, {}, {}, {}, std::string(marker)); + auto r 
= rgw_rados_operate(dpp, ioctx, oids[index], &op, y); + if (r == -ENOENT) r = -ENODATA; + if (r < 0 && r != -ENODATA) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to get info from " << oids[index] + << cpp_strerror(-r) << dendl; + } + return r; + } + int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker, + lr::AioCompletion* c) override { + lr::ObjectWriteOperation op; + cls_log_trim(op, {}, {}, {}, std::string(marker)); + auto r = ioctx.aio_operate(oids[index], c, &op, 0); + if (r == -ENOENT) r = -ENODATA; + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to get info from " << oids[index] + << cpp_strerror(-r) << dendl; + } + return r; + } + std::string_view max_marker() const override { + return "99999999"; + } + int is_empty(const DoutPrefixProvider *dpp, optional_yield y) override { + for (auto shard = 0u; shard < oids.size(); ++shard) { + std::list log_entries; + lr::ObjectReadOperation op; + std::string out_marker; + bool truncated; + cls_log_list(op, {}, {}, {}, 1, log_entries, &out_marker, &truncated); + auto r = rgw_rados_operate(dpp, ioctx, oids[shard], &op, nullptr, y); + if (r == -ENOENT) { + continue; + } + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to list " << oids[shard] + << cpp_strerror(-r) << dendl; + return r; + } + if (!log_entries.empty()) { + return 0; + } + } + return 1; + } +}; + +class RGWDataChangesFIFO final : public RGWDataChangesBE { + using centries = std::vector; + tiny_vector fifos; + +public: + RGWDataChangesFIFO(lr::IoCtx& ioctx, + RGWDataChangesLog& datalog, + uint64_t gen_id, int shards) + : RGWDataChangesBE(ioctx, datalog, gen_id), + fifos(shards, [&ioctx, this](std::size_t i, auto emplacer) { + emplacer.emplace(ioctx, get_oid(i)); + }) {} + ~RGWDataChangesFIFO() override = default; + void prepare(ceph::real_time, const std::string&, + ceph::buffer::list&& entry, entries& out) override { + if (!std::holds_alternative(out)) { + 
ceph_assert(std::visit([](auto& v) { return std::empty(v); }, out)); + out = centries(); + } + std::get(out).push_back(std::move(entry)); + } + int push(const DoutPrefixProvider *dpp, int index, entries&& items, + optional_yield y) override { + auto r = fifos[index].push(dpp, std::get(items), y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to push to FIFO: " << get_oid(index) + << ": " << cpp_strerror(-r) << dendl; + } + return r; + } + int push(const DoutPrefixProvider *dpp, int index, ceph::real_time, + const std::string&, ceph::buffer::list&& bl, + optional_yield y) override { + auto r = fifos[index].push(dpp, std::move(bl), y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to push to FIFO: " << get_oid(index) + << ": " << cpp_strerror(-r) << dendl; + } + return r; + } + int list(const DoutPrefixProvider *dpp, int index, int max_entries, + std::vector& entries, + std::optional marker, std::string* out_marker, + bool* truncated, optional_yield y) override { + std::vector log_entries; + bool more = false; + auto r = fifos[index].list(dpp, max_entries, marker, &log_entries, &more, + y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to list FIFO: " << get_oid(index) + << ": " << cpp_strerror(-r) << dendl; + return r; + } + for (const auto& entry : log_entries) { + rgw_data_change_log_entry log_entry; + log_entry.log_id = entry.marker; + log_entry.log_timestamp = entry.mtime; + auto liter = entry.data.cbegin(); + try { + decode(log_entry.entry, liter); + } catch (const buffer::error& err) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": failed to decode data changes log entry: " + << err.what() << dendl; + return -EIO; + } + entries.push_back(std::move(log_entry)); + } + if (truncated) + *truncated = more; + if (out_marker && !log_entries.empty()) { + *out_marker = log_entries.back().marker; + } + return 0; + } + int get_info(const DoutPrefixProvider *dpp, int index, + 
RGWDataChangesLogInfo *info, optional_yield y) override { + auto& fifo = fifos[index]; + auto r = fifo.read_meta(dpp, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to get FIFO metadata: " << get_oid(index) + << ": " << cpp_strerror(-r) << dendl; + return r; + } + rados::cls::fifo::info m; + fifo.meta(dpp, m, y); + auto p = m.head_part_num; + if (p < 0) { + info->marker = ""; + info->last_update = ceph::real_clock::zero(); + return 0; + } + rgw::cls::fifo::part_info h; + r = fifo.get_part_info(dpp, p, &h, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to get part info: " << get_oid(index) << "/" << p + << ": " << cpp_strerror(-r) << dendl; + return r; + } + info->marker = rgw::cls::fifo::marker{p, h.last_ofs}.to_string(); + info->last_update = h.max_time; + return 0; + } + int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker, + optional_yield y) override { + auto r = fifos[index].trim(dpp, marker, false, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to trim FIFO: " << get_oid(index) + << ": " << cpp_strerror(-r) << dendl; + } + return r; + } + int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker, + librados::AioCompletion* c) override { + int r = 0; + if (marker == rgw::cls::fifo::marker(0, 0).to_string()) { + rgw_complete_aio_completion(c, -ENODATA); + } else { + // This null_yield is used for lazily opening FIFOs. + // + // shouldn't exist, but it can't be eliminated + // since your caller is an RGWCoroutine in the data sync code. + // + // It can be eliminated after Reef when we can get rid of + // AioCompletion entirely. 
+ fifos[index].trim(dpp, marker, false, c, null_yield); + } + return r; + } + std::string_view max_marker() const override { + static const std::string mm = + rgw::cls::fifo::marker::max().to_string(); + return std::string_view(mm); + } + int is_empty(const DoutPrefixProvider *dpp, optional_yield y) override { + std::vector log_entries; + bool more = false; + for (auto shard = 0u; shard < fifos.size(); ++shard) { + auto r = fifos[shard].list(dpp, 1, {}, &log_entries, &more, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": unable to list FIFO: " << get_oid(shard) + << ": " << cpp_strerror(-r) << dendl; + return r; + } + if (!log_entries.empty()) { + return 0; + } + } + return 1; + } +}; + +RGWDataChangesLog::RGWDataChangesLog(CephContext* cct) + : cct(cct), + num_shards(cct->_conf->rgw_data_log_num_shards), + prefix(get_prefix()), + changes(cct->_conf->rgw_data_log_changes_size) {} + +bs::error_code DataLogBackends::handle_init(entries_t e) noexcept { + std::unique_lock l(m); + + for (const auto& [gen_id, gen] : e) { + if (gen.pruned) { + lderr(datalog.cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": ERROR: given empty generation: gen_id=" << gen_id << dendl; + } + if (count(gen_id) != 0) { + lderr(datalog.cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": ERROR: generation already exists: gen_id=" << gen_id << dendl; + } + try { + switch (gen.type) { + case log_type::omap: + emplace(gen_id, new RGWDataChangesOmap(ioctx, datalog, gen_id, shards)); + break; + case log_type::fifo: + emplace(gen_id, new RGWDataChangesFIFO(ioctx, datalog, gen_id, shards)); + break; + default: + lderr(datalog.cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": IMPOSSIBLE: invalid log type: gen_id=" << gen_id + << ", type" << gen.type << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + } catch (const bs::system_error& err) { + lderr(datalog.cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": error setting up backend: 
gen_id=" << gen_id + << ", err=" << err.what() << dendl; + return err.code(); + } + } + return {}; +} +bs::error_code DataLogBackends::handle_new_gens(entries_t e) noexcept { + return handle_init(std::move(e)); +} +bs::error_code DataLogBackends::handle_empty_to(uint64_t new_tail) noexcept { + std::unique_lock l(m); + auto i = cbegin(); + if (i->first < new_tail) { + return {}; + } + if (new_tail >= (cend() - 1)->first) { + lderr(datalog.cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": ERROR: attempt to trim head: new_tail=" << new_tail << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + erase(i, upper_bound(new_tail)); + return {}; +} + + +int RGWDataChangesLog::start(const DoutPrefixProvider *dpp, const RGWZone* _zone, + const RGWZoneParams& zoneparams, + librados::Rados* lr) +{ + zone = _zone; + ceph_assert(zone); + auto defbacking = to_log_type( + cct->_conf.get_val("rgw_default_data_log_backing")); + // Should be guaranteed by `set_enum_allowed` + ceph_assert(defbacking); + auto log_pool = zoneparams.log_pool; + auto r = rgw_init_ioctx(dpp, lr, log_pool, ioctx, true, false); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ + << ": Failed to initialized ioctx, r=" << r + << ", pool=" << log_pool << dendl; + return -r; + } + + // This null_yield is in startup code, so it doesn't matter that much. 
+ auto besr = logback_generations::init( + dpp, ioctx, metadata_log_oid(), [this](uint64_t gen_id, int shard) { + return get_oid(gen_id, shard); + }, + num_shards, *defbacking, null_yield, *this); + + + if (!besr) { + lderr(cct) << __PRETTY_FUNCTION__ + << ": Error initializing backends: " + << besr.error().message() << dendl; + return ceph::from_error_code(besr.error()); + } + + bes = std::move(*besr); + renew_thread = make_named_thread("rgw_dt_lg_renew", + &RGWDataChangesLog::renew_run, this); + return 0; +} + +int RGWDataChangesLog::choose_oid(const rgw_bucket_shard& bs) { + const auto& name = bs.bucket.name; + auto shard_shift = (bs.shard_id > 0 ? bs.shard_id : 0); + auto r = (ceph_str_hash_linux(name.data(), name.size()) + + shard_shift) % num_shards; + return static_cast(r); +} + +int RGWDataChangesLog::renew_entries(const DoutPrefixProvider *dpp) +{ + if (!zone->log_data) + return 0; + + /* we can't keep the bucket name as part of the cls_log_entry, and we need + * it later, so we keep two lists under the map */ + bc::flat_map, + RGWDataChangesBE::entries>> m; + + std::unique_lock l(lock); + decltype(cur_cycle) entries; + entries.swap(cur_cycle); + l.unlock(); + + auto ut = real_clock::now(); + auto be = bes->head(); + for (const auto& [bs, gen] : entries) { + auto index = choose_oid(bs); + + rgw_data_change change; + bufferlist bl; + change.entity_type = ENTITY_TYPE_BUCKET; + change.key = bs.get_key(); + change.timestamp = ut; + change.gen = gen; + encode(change, bl); + + m[index].first.push_back({bs, gen}); + be->prepare(ut, change.key, std::move(bl), m[index].second); + } + + for (auto& [index, p] : m) { + auto& [buckets, entries] = p; + + auto now = real_clock::now(); + + // This null_yield can stay (for now) as we're in our own thread. + auto ret = be->push(dpp, index, std::move(entries), null_yield); + if (ret < 0) { + /* we don't really need to have a special handling for failed cases here, + * as this is just an optimization. 
*/ + ldpp_dout(dpp, -1) << "ERROR: svc.cls->timelog.add() returned " << ret << dendl; + return ret; + } + + auto expiration = now; + expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window); + for (auto& [bs, gen] : buckets) { + update_renewed(bs, gen, expiration); + } + } + + return 0; +} + +auto RGWDataChangesLog::_get_change(const rgw_bucket_shard& bs, + uint64_t gen) + -> ChangeStatusPtr +{ + ceph_assert(ceph_mutex_is_locked(lock)); + ChangeStatusPtr status; + if (!changes.find({bs, gen}, status)) { + status = std::make_shared(); + changes.add({bs, gen}, status); + } + return status; +} + +void RGWDataChangesLog::register_renew(const rgw_bucket_shard& bs, + const rgw::bucket_log_layout_generation& gen) +{ + std::scoped_lock l{lock}; + cur_cycle.insert({bs, gen.gen}); +} + +void RGWDataChangesLog::update_renewed(const rgw_bucket_shard& bs, + uint64_t gen, + real_time expiration) +{ + std::unique_lock l{lock}; + auto status = _get_change(bs, gen); + l.unlock(); + + ldout(cct, 20) << "RGWDataChangesLog::update_renewd() bucket_name=" + << bs.bucket.name << " shard_id=" << bs.shard_id + << " expiration=" << expiration << dendl; + + std::unique_lock sl(status->lock); + status->cur_expiration = expiration; +} + +int RGWDataChangesLog::get_log_shard_id(rgw_bucket& bucket, int shard_id) { + rgw_bucket_shard bs(bucket, shard_id); + return choose_oid(bs); +} + +bool RGWDataChangesLog::filter_bucket(const DoutPrefixProvider *dpp, + const rgw_bucket& bucket, + optional_yield y) const +{ + if (!bucket_filter) { + return true; + } + + return bucket_filter(bucket, y, dpp); +} + +std::string RGWDataChangesLog::get_oid(uint64_t gen_id, int i) const { + return (gen_id > 0 ? 
+ fmt::format("{}@G{}.{}", prefix, gen_id, i) : + fmt::format("{}.{}", prefix, i)); +} + +int RGWDataChangesLog::add_entry(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& gen, + int shard_id, optional_yield y) +{ + auto& bucket = bucket_info.bucket; + + if (!filter_bucket(dpp, bucket, y)) { + return 0; + } + + if (observer) { + observer->on_bucket_changed(bucket.get_key()); + } + + rgw_bucket_shard bs(bucket, shard_id); + + int index = choose_oid(bs); + + mark_modified(index, bs, gen.gen); + + std::unique_lock l(lock); + + auto status = _get_change(bs, gen.gen); + l.unlock(); + + auto now = real_clock::now(); + + std::unique_lock sl(status->lock); + + ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() bucket.name=" << bucket.name + << " shard_id=" << shard_id << " now=" << now + << " cur_expiration=" << status->cur_expiration << dendl; + + if (now < status->cur_expiration) { + /* no need to send, recently completed */ + sl.unlock(); + register_renew(bs, gen); + return 0; + } + + RefCountedCond* cond; + + if (status->pending) { + cond = status->cond; + + ceph_assert(cond); + + status->cond->get(); + sl.unlock(); + + int ret = cond->wait(); + cond->put(); + if (!ret) { + register_renew(bs, gen); + } + return ret; + } + + status->cond = new RefCountedCond; + status->pending = true; + + ceph::real_time expiration; + + int ret; + + do { + status->cur_sent = now; + + expiration = now; + expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window); + + sl.unlock(); + + ceph::buffer::list bl; + rgw_data_change change; + change.entity_type = ENTITY_TYPE_BUCKET; + change.key = bs.get_key(); + change.timestamp = now; + change.gen = gen.gen; + encode(change, bl); + + ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() sending update with now=" << now << " cur_expiration=" << expiration << dendl; + + auto be = bes->head(); + ret = be->push(dpp, index, now, change.key, std::move(bl), y); + + now = 
real_clock::now(); + + sl.lock(); + + } while (!ret && real_clock::now() > expiration); + + cond = status->cond; + + status->pending = false; + /* time of when operation started, not completed */ + status->cur_expiration = status->cur_sent; + status->cur_expiration += make_timespan(cct->_conf->rgw_data_log_window); + status->cond = nullptr; + sl.unlock(); + + cond->done(ret); + cond->put(); + + return ret; +} + +int DataLogBackends::list(const DoutPrefixProvider *dpp, int shard, int max_entries, + std::vector& entries, + std::string_view marker, std::string* out_marker, + bool* truncated, optional_yield y) +{ + const auto [start_id, start_cursor] = cursorgen(marker); + auto gen_id = start_id; + std::string out_cursor; + while (max_entries > 0) { + std::vector gentries; + std::unique_lock l(m); + auto i = lower_bound(gen_id); + if (i == end()) return 0; + auto be = i->second; + l.unlock(); + gen_id = be->gen_id; + auto r = be->list(dpp, shard, max_entries, gentries, + gen_id == start_id ? 
start_cursor : std::string{}, + &out_cursor, truncated, y); + if (r < 0) + return r; + + if (out_marker && !out_cursor.empty()) { + *out_marker = gencursor(gen_id, out_cursor); + } + for (auto& g : gentries) { + g.log_id = gencursor(gen_id, g.log_id); + } + if (int s = gentries.size(); s < 0 || s > max_entries) + max_entries = 0; + else + max_entries -= gentries.size(); + + std::move(gentries.begin(), gentries.end(), + std::back_inserter(entries)); + ++gen_id; + } + return 0; +} + +int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries, + std::vector& entries, + std::string_view marker, + std::string* out_marker, bool* truncated, + optional_yield y) +{ + assert(shard < num_shards); + return bes->list(dpp, shard, max_entries, entries, marker, out_marker, + truncated, y); +} + +int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int max_entries, + std::vector& entries, + LogMarker& marker, bool *ptruncated, + optional_yield y) +{ + bool truncated; + entries.clear(); + for (; marker.shard < num_shards && int(entries.size()) < max_entries; + marker.shard++, marker.marker.clear()) { + int ret = list_entries(dpp, marker.shard, max_entries - entries.size(), + entries, marker.marker, NULL, &truncated, y); + if (ret == -ENOENT) { + continue; + } + if (ret < 0) { + return ret; + } + if (!truncated) { + *ptruncated = false; + return 0; + } + } + *ptruncated = (marker.shard < num_shards); + return 0; +} + +int RGWDataChangesLog::get_info(const DoutPrefixProvider *dpp, int shard_id, + RGWDataChangesLogInfo *info, optional_yield y) +{ + assert(shard_id < num_shards); + auto be = bes->head(); + auto r = be->get_info(dpp, shard_id, info, y); + if (!info->marker.empty()) { + info->marker = gencursor(be->gen_id, info->marker); + } + return r; +} + +int DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, + std::string_view marker, optional_yield y) +{ + auto [target_gen, cursor] = cursorgen(marker); + 
std::unique_lock l(m); + const auto head_gen = (end() - 1)->second->gen_id; + const auto tail_gen = begin()->first; + if (target_gen < tail_gen) return 0; + auto r = 0; + for (auto be = lower_bound(0)->second; + be->gen_id <= target_gen && be->gen_id <= head_gen && r >= 0; + be = upper_bound(be->gen_id)->second) { + l.unlock(); + auto c = be->gen_id == target_gen ? cursor : be->max_marker(); + r = be->trim(dpp, shard_id, c, y); + if (r == -ENOENT) + r = -ENODATA; + if (r == -ENODATA && be->gen_id < target_gen) + r = 0; + if (be->gen_id == target_gen) + break; + l.lock(); + }; + return r; +} + +int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, + std::string_view marker, optional_yield y) +{ + assert(shard_id < num_shards); + return bes->trim_entries(dpp, shard_id, marker, y); +} + +class GenTrim : public rgw::cls::fifo::Completion { +public: + DataLogBackends* const bes; + const int shard_id; + const uint64_t target_gen; + const std::string cursor; + const uint64_t head_gen; + const uint64_t tail_gen; + boost::intrusive_ptr be; + + GenTrim(const DoutPrefixProvider *dpp, DataLogBackends* bes, int shard_id, uint64_t target_gen, + std::string cursor, uint64_t head_gen, uint64_t tail_gen, + boost::intrusive_ptr be, + lr::AioCompletion* super) + : Completion(dpp, super), bes(bes), shard_id(shard_id), target_gen(target_gen), + cursor(std::move(cursor)), head_gen(head_gen), tail_gen(tail_gen), + be(std::move(be)) {} + + void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { + auto gen_id = be->gen_id; + be.reset(); + if (r == -ENOENT) + r = -ENODATA; + if (r == -ENODATA && gen_id < target_gen) + r = 0; + if (r < 0) { + complete(std::move(p), r); + return; + } + + { + std::unique_lock l(bes->m); + auto i = bes->upper_bound(gen_id); + if (i == bes->end() || i->first > target_gen || i->first > head_gen) { + l.unlock(); + complete(std::move(p), -ENODATA); + return; + } + be = i->second; + } + auto c = be->gen_id == target_gen ? 
cursor : be->max_marker(); + be->trim(dpp, shard_id, c, call(std::move(p))); + } +}; + +void DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker, + librados::AioCompletion* c) +{ + auto [target_gen, cursor] = cursorgen(marker); + std::unique_lock l(m); + const auto head_gen = (end() - 1)->second->gen_id; + const auto tail_gen = begin()->first; + if (target_gen < tail_gen) { + l.unlock(); + rgw_complete_aio_completion(c, -ENODATA); + return; + } + auto be = begin()->second; + l.unlock(); + auto gt = std::make_unique(dpp, this, shard_id, target_gen, + std::string(cursor), head_gen, tail_gen, + be, c); + + auto cc = be->gen_id == target_gen ? cursor : be->max_marker(); + be->trim(dpp, shard_id, cc, GenTrim::call(std::move(gt))); +} + +int DataLogBackends::trim_generations(const DoutPrefixProvider *dpp, + std::optional& through, + optional_yield y) { + if (size() != 1) { + std::vector candidates; + { + std::scoped_lock l(m); + auto e = cend() - 1; + for (auto i = cbegin(); i < e; ++i) { + candidates.push_back(i->second); + } + } + + std::optional highest; + for (auto& be : candidates) { + auto r = be->is_empty(dpp, y); + if (r < 0) { + return r; + } else if (r == 1) { + highest = be->gen_id; + } else { + break; + } + } + + through = highest; + if (!highest) { + return 0; + } + auto ec = empty_to(dpp, *highest, y); + if (ec) { + return ceph::from_error_code(ec); + } + } + + return ceph::from_error_code(remove_empty(dpp, y)); +} + + +int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker, + librados::AioCompletion* c) +{ + assert(shard_id < num_shards); + bes->trim_entries(dpp, shard_id, marker, c); + return 0; +} + +bool RGWDataChangesLog::going_down() const +{ + return down_flag; +} + +RGWDataChangesLog::~RGWDataChangesLog() { + down_flag = true; + if (renew_thread.joinable()) { + renew_stop(); + renew_thread.join(); + } +} + +void RGWDataChangesLog::renew_run() noexcept { 
+ static constexpr auto runs_per_prune = 150; + auto run = 0; + for (;;) { + const DoutPrefix dp(cct, dout_subsys, "rgw data changes log: "); + ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: start" << dendl; + int r = renew_entries(&dp); + if (r < 0) { + ldpp_dout(&dp, 0) << "ERROR: RGWDataChangesLog::renew_entries returned error r=" << r << dendl; + } + + if (going_down()) + break; + + if (run == runs_per_prune) { + std::optional through; + ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruning old generations" << dendl; + // This null_yield can stay, for now, as it's in its own thread. + trim_generations(&dp, through, null_yield); + if (r < 0) { + derr << "RGWDataChangesLog::ChangesRenewThread: failed pruning r=" + << r << dendl; + } else if (through) { + ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruned generations " + << "through " << *through << "." << dendl; + } else { + ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: nothing to prune." 
+ << dendl; + } + run = 0; + } else { + ++run; + } + + int interval = cct->_conf->rgw_data_log_window * 3 / 4; + std::unique_lock locker{renew_lock}; + renew_cond.wait_for(locker, std::chrono::seconds(interval)); + } +} + +void RGWDataChangesLog::renew_stop() +{ + std::lock_guard l{renew_lock}; + renew_cond.notify_all(); +} + +void RGWDataChangesLog::mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen) +{ + if (!cct->_conf->rgw_data_notify_interval_msec) { + return; + } + + auto key = bs.get_key(); + { + std::shared_lock rl{modified_lock}; // read lock to check for existence + auto shard = modified_shards.find(shard_id); + if (shard != modified_shards.end() && shard->second.count({key, gen})) { + return; + } + } + + std::unique_lock wl{modified_lock}; // write lock for insertion + modified_shards[shard_id].insert(rgw_data_notify_entry{key, gen}); +} + +std::string RGWDataChangesLog::max_marker() const { + return gencursor(std::numeric_limits::max(), + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"); +} + +int RGWDataChangesLog::change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y) { + return ceph::from_error_code(bes->new_backing(dpp, type, y)); +} + +int RGWDataChangesLog::trim_generations(const DoutPrefixProvider *dpp, + std::optional& through, + optional_yield y) { + return bes->trim_generations(dpp, through, y); +} + +void RGWDataChangesLogInfo::dump(Formatter *f) const +{ + encode_json("marker", marker, f); + utime_t ut(last_update); + encode_json("last_update", ut, f); +} + +void RGWDataChangesLogInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("marker", marker, obj); + utime_t ut; + JSONDecoder::decode_json("last_update", ut, obj); + last_update = ut.to_real_time(); +} + + diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h new file mode 100644 index 000000000..174cf86de --- /dev/null +++ b/src/rgw/driver/rados/rgw_datalog.h @@ -0,0 +1,394 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "common/async/yield_context.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/function2.hpp" + +#include "include/rados/librados.hpp" + +#include "common/ceph_context.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" +#include "common/Formatter.h" +#include "common/lru_map.h" +#include "common/RefCountedObj.h" + +#include "cls/log/cls_log_types.h" + +#include "rgw_basic_types.h" +#include "rgw_log_backing.h" +#include "rgw_sync_policy.h" +#include "rgw_zone.h" +#include "rgw_trim_bilog.h" + +namespace bc = boost::container; + +enum DataLogEntityType { + ENTITY_TYPE_UNKNOWN = 0, + ENTITY_TYPE_BUCKET = 1, +}; + +struct rgw_data_change { + DataLogEntityType entity_type; + std::string key; + ceph::real_time timestamp; + uint64_t gen = 0; + + void encode(ceph::buffer::list& bl) const { + // require decoders to recognize v2 when gen>0 + const uint8_t compat = (gen == 0) ? 
1 : 2; + ENCODE_START(2, compat, bl); + auto t = std::uint8_t(entity_type); + encode(t, bl); + encode(key, bl); + encode(timestamp, bl); + encode(gen, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + std::uint8_t t; + decode(t, bl); + entity_type = DataLogEntityType(t); + decode(key, bl); + decode(timestamp, bl); + if (struct_v < 2) { + gen = 0; + } else { + decode(gen, bl); + } + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter* f) const; + void decode_json(JSONObj* obj); +}; +WRITE_CLASS_ENCODER(rgw_data_change) + +struct rgw_data_change_log_entry { + std::string log_id; + ceph::real_time log_timestamp; + rgw_data_change entry; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(log_id, bl); + encode(log_timestamp, bl); + encode(entry, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(log_id, bl); + decode(log_timestamp, bl); + decode(entry, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter* f) const; + void decode_json(JSONObj* obj); +}; +WRITE_CLASS_ENCODER(rgw_data_change_log_entry) + +struct RGWDataChangesLogInfo { + std::string marker; + ceph::real_time last_update; + + void dump(ceph::Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct RGWDataChangesLogMarker { + int shard = 0; + std::string marker; + + RGWDataChangesLogMarker() = default; +}; + +class RGWDataChangesLog; + +struct rgw_data_notify_entry { + std::string key; + uint64_t gen = 0; + + void dump(ceph::Formatter* f) const; + void decode_json(JSONObj* obj); + + rgw_data_notify_entry& operator=(const rgw_data_notify_entry&) = default; + + bool operator <(const rgw_data_notify_entry& d) const { + if (key < d.key) { + return true; + } + if (d.key < key) { + return false; + } + return gen < d.gen; + } + friend std::ostream& operator <<(std::ostream& m, + const rgw_data_notify_entry& e) { + return m << "[key: " << 
e.key << ", gen: " << e.gen << "]"; + } +}; + +class RGWDataChangesBE; + +class DataLogBackends final + : public logback_generations, + private bc::flat_map> { + friend class logback_generations; + friend class GenTrim; + + std::mutex m; + RGWDataChangesLog& datalog; + + DataLogBackends(librados::IoCtx& ioctx, + std::string oid, + fu2::unique_function&& get_oid, + int shards, RGWDataChangesLog& datalog) noexcept + : logback_generations(ioctx, oid, std::move(get_oid), + shards), datalog(datalog) {} +public: + + boost::intrusive_ptr head() { + std::unique_lock l(m); + auto i = end(); + --i; + return i->second; + } + int list(const DoutPrefixProvider *dpp, int shard, int max_entries, + std::vector& entries, + std::string_view marker, std::string* out_marker, bool* truncated, + optional_yield y); + int trim_entries(const DoutPrefixProvider *dpp, int shard_id, + std::string_view marker, optional_yield y); + void trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker, + librados::AioCompletion* c); + void set_zero(RGWDataChangesBE* be) { + emplace(0, be); + } + + bs::error_code handle_init(entries_t e) noexcept override; + bs::error_code handle_new_gens(entries_t e) noexcept override; + bs::error_code handle_empty_to(uint64_t new_tail) noexcept override; + + int trim_generations(const DoutPrefixProvider *dpp, + std::optional& through, + optional_yield y); +}; + +struct BucketGen { + rgw_bucket_shard shard; + uint64_t gen; + + BucketGen(const rgw_bucket_shard& shard, uint64_t gen) + : shard(shard), gen(gen) {} + + BucketGen(rgw_bucket_shard&& shard, uint64_t gen) + : shard(std::move(shard)), gen(gen) {} + + BucketGen(const BucketGen&) = default; + BucketGen(BucketGen&&) = default; + BucketGen& operator =(const BucketGen&) = default; + BucketGen& operator =(BucketGen&&) = default; + + ~BucketGen() = default; +}; + +inline bool operator ==(const BucketGen& l, const BucketGen& r) { + return (l.shard == r.shard) && (l.gen == r.gen); +} + +inline 
bool operator <(const BucketGen& l, const BucketGen& r) { + if (l.shard < r.shard) { + return true; + } else if (l.shard == r.shard) { + return l.gen < r.gen; + } else { + return false; + } +} + +class RGWDataChangesLog { + friend DataLogBackends; + CephContext *cct; + librados::IoCtx ioctx; + rgw::BucketChangeObserver *observer = nullptr; + const RGWZone* zone; + std::unique_ptr bes; + + const int num_shards; + std::string get_prefix() { + auto prefix = cct->_conf->rgw_data_log_obj_prefix; + return prefix.empty() ? prefix : "data_log"; + } + std::string metadata_log_oid() { + return get_prefix() + "generations_metadata"; + } + std::string prefix; + + ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::lock"); + ceph::shared_mutex modified_lock = + ceph::make_shared_mutex("RGWDataChangesLog::modified_lock"); + bc::flat_map> modified_shards; + + std::atomic down_flag = { false }; + + struct ChangeStatus { + std::shared_ptr sync_policy; + ceph::real_time cur_expiration; + ceph::real_time cur_sent; + bool pending = false; + RefCountedCond* cond = nullptr; + ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::ChangeStatus"); + }; + + using ChangeStatusPtr = std::shared_ptr; + + lru_map changes; + + bc::flat_set cur_cycle; + + ChangeStatusPtr _get_change(const rgw_bucket_shard& bs, uint64_t gen); + void register_renew(const rgw_bucket_shard& bs, + const rgw::bucket_log_layout_generation& gen); + void update_renewed(const rgw_bucket_shard& bs, + uint64_t gen, + ceph::real_time expiration); + + ceph::mutex renew_lock = ceph::make_mutex("ChangesRenewThread::lock"); + ceph::condition_variable renew_cond; + void renew_run() noexcept; + void renew_stop(); + std::thread renew_thread; + + std::function bucket_filter; + bool going_down() const; + bool filter_bucket(const DoutPrefixProvider *dpp, const rgw_bucket& bucket, optional_yield y) const; + int renew_entries(const DoutPrefixProvider *dpp); + +public: + + RGWDataChangesLog(CephContext* cct); + ~RGWDataChangesLog(); 
+ + int start(const DoutPrefixProvider *dpp, const RGWZone* _zone, const RGWZoneParams& zoneparams, + librados::Rados* lr); + int choose_oid(const rgw_bucket_shard& bs); + int add_entry(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& gen, int shard_id, + optional_yield y); + int get_log_shard_id(rgw_bucket& bucket, int shard_id); + int list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries, + std::vector& entries, + std::string_view marker, std::string* out_marker, + bool* truncated, optional_yield y); + int trim_entries(const DoutPrefixProvider *dpp, int shard_id, + std::string_view marker, optional_yield y); + int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker, + librados::AioCompletion* c); // :( + int get_info(const DoutPrefixProvider *dpp, int shard_id, + RGWDataChangesLogInfo *info, optional_yield y); + + using LogMarker = RGWDataChangesLogMarker; + + int list_entries(const DoutPrefixProvider *dpp, int max_entries, + std::vector& entries, + LogMarker& marker, bool* ptruncated, + optional_yield y); + + void mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen); + auto read_clear_modified() { + std::unique_lock wl{modified_lock}; + decltype(modified_shards) modified; + modified.swap(modified_shards); + modified_shards.clear(); + return modified; + } + + void set_observer(rgw::BucketChangeObserver *observer) { + this->observer = observer; + } + + void set_bucket_filter(decltype(bucket_filter)&& f) { + bucket_filter = std::move(f); + } + // a marker that compares greater than any other + std::string max_marker() const; + std::string get_oid(uint64_t gen_id, int shard_id) const; + + + int change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y); + int trim_generations(const DoutPrefixProvider *dpp, + std::optional& through, + optional_yield y); +}; + +class RGWDataChangesBE : public boost::intrusive_ref_counter { 
+protected: + librados::IoCtx& ioctx; + CephContext* const cct; + RGWDataChangesLog& datalog; + + std::string get_oid(int shard_id) { + return datalog.get_oid(gen_id, shard_id); + } +public: + using entries = std::variant, + std::vector>; + + const uint64_t gen_id; + + RGWDataChangesBE(librados::IoCtx& ioctx, + RGWDataChangesLog& datalog, + uint64_t gen_id) + : ioctx(ioctx), cct(static_cast(ioctx.cct())), + datalog(datalog), gen_id(gen_id) {} + virtual ~RGWDataChangesBE() = default; + + virtual void prepare(ceph::real_time now, + const std::string& key, + ceph::buffer::list&& entry, + entries& out) = 0; + virtual int push(const DoutPrefixProvider *dpp, int index, entries&& items, + optional_yield y) = 0; + virtual int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now, + const std::string& key, ceph::buffer::list&& bl, + optional_yield y) = 0; + virtual int list(const DoutPrefixProvider *dpp, int shard, int max_entries, + std::vector& entries, + std::optional marker, + std::string* out_marker, bool* truncated, + optional_yield y) = 0; + virtual int get_info(const DoutPrefixProvider *dpp, int index, + RGWDataChangesLogInfo *info, optional_yield y) = 0; + virtual int trim(const DoutPrefixProvider *dpp, int index, + std::string_view marker, optional_yield y) = 0; + virtual int trim(const DoutPrefixProvider *dpp, int index, + std::string_view marker, librados::AioCompletion* c) = 0; + virtual std::string_view max_marker() const = 0; + // 1 on empty, 0 on non-empty, negative on error. 
+ virtual int is_empty(const DoutPrefixProvider *dpp, optional_yield y) = 0; +}; diff --git a/src/rgw/driver/rados/rgw_datalog_notify.cc b/src/rgw/driver/rados/rgw_datalog_notify.cc new file mode 100644 index 000000000..12cdc532f --- /dev/null +++ b/src/rgw/driver/rados/rgw_datalog_notify.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_datalog_notify.h" +#include "rgw_datalog.h" + +// custom encoding for v1 notify API +struct EntryEncoderV1 { + const rgw_data_notify_entry& entry; +}; +struct SetEncoderV1 { + const bc::flat_set& entries; +}; + +// encode rgw_data_notify_entry as string +void encode_json(const char *name, const EntryEncoderV1& e, Formatter *f) +{ + f->dump_string(name, e.entry.key); // encode the key only +} +// encode set as set +void encode_json(const char *name, const SetEncoderV1& e, Formatter *f) +{ + f->open_array_section(name); + for (auto& entry : e.entries) { + encode_json("obj", EntryEncoderV1{entry}, f); + } + f->close_section(); +} +// encode map> as map> +void encode_json(const char *name, const rgw_data_notify_v1_encoder& e, Formatter *f) +{ + f->open_array_section(name); + for (auto& [key, val] : e.shards) { + f->open_object_section("entry"); + encode_json("key", key, f); + encode_json("val", SetEncoderV1{val}, f); + f->close_section(); + } + f->close_section(); +} + +struct EntryDecoderV1 { + rgw_data_notify_entry& entry; +}; +struct SetDecoderV1 { + bc::flat_set& entries; +}; + +// decode string into rgw_data_notify_entry +void decode_json_obj(EntryDecoderV1& d, JSONObj *obj) +{ + decode_json_obj(d.entry.key, obj); + d.entry.gen = 0; +} +// decode set into set +void decode_json_obj(SetDecoderV1& d, JSONObj *obj) +{ + for (JSONObjIter o = obj->find_first(); !o.end(); ++o) { + rgw_data_notify_entry val; + auto decoder = EntryDecoderV1{val}; + decode_json_obj(decoder, *o); + d.entries.insert(std::move(val)); + } +} +// decode map> into map> 
+// Decode the v1 notify payload: a JSON array of {"key": shard, "val": [...]}
+// objects. Each "val" array is decoded through SetDecoderV1 and stored in
+// d.shards under its shard id, replacing any previous value for that shard.
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj)
+{
+  for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+    int shard_id = 0;
+    JSONDecoder::decode_json("key", shard_id, *o);
+    bc::flat_set<rgw_data_notify_entry> val;
+    SetDecoderV1 decoder{val};
+    JSONDecoder::decode_json("val", decoder, *o);
+    d.shards[shard_id] = std::move(val);
+  }
+}
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.h b/src/rgw/driver/rados/rgw_datalog_notify.h
new file mode 100644
index 000000000..4cd1b3c11
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog_notify.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "rgw_datalog.h"
+
+namespace bc = boost::container;
+
+namespace ceph { class Formatter; }
+class JSONObj;
+
+class RGWCoroutine;
+class RGWHTTPManager;
+class RGWRESTConn;
+
+struct rgw_data_notify_entry;
+
+// json encoder and decoder for notify v1 API
+struct rgw_data_notify_v1_encoder {
+  const bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
+};
+void encode_json(const char *name, const rgw_data_notify_v1_encoder& e,
+                 ceph::Formatter *f);
+struct rgw_data_notify_v1_decoder {
+  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
+};
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj);
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.cc b/src/rgw/driver/rados/rgw_etag_verifier.cc
new file mode 100644
index 000000000..52f7c7948
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_etag_verifier.cc
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_etag_verifier.h"
+#include "rgw_obj_manifest.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::putobj {
+
+/*
+ * Decode the object manifest and construct the matching verifier in
+ * 'verifier': ETagVerifier_Atomic when rule 0 has start_part_num == 0,
+ * otherwise ETagVerifier_MPU seeded with the starting offset of every part.
+ * When 'compression' is set, the part offsets taken from the manifest are
+ * mapped back to uncompressed offsets first.
+ *
+ * Returns 0 on success and -EIO if the manifest cannot be decoded, has no
+ * rule 0, or a part offset has no matching compression block.
+ */
+int create_etag_verifier(const DoutPrefixProvider *dpp,
+                         CephContext* cct, rgw::sal::DataProcessor* filter,
+                         const bufferlist& manifest_bl,
+                         const std::optional<RGWCompressionInfo>& compression,
+                         etag_verifier_ptr& verifier)
+{
+  RGWObjManifest manifest;
+
+  try {
+    auto miter = manifest_bl.cbegin();
+    decode(manifest, miter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+    return -EIO;
+  }
+
+  RGWObjManifestRule rule;
+  bool found = manifest.get_rule(0, &rule);
+  if (!found) {
+    ldpp_dout(dpp, -1) << "ERROR: manifest->get_rule() could not find rule" << dendl;
+    return -EIO;
+  }
+
+  if (rule.start_part_num == 0) {
+    /* Atomic object */
+    verifier.emplace<ETagVerifier_Atomic>(cct, filter);
+    return 0;
+  }
+
+  uint64_t cur_part_ofs = UINT64_MAX;
+  std::vector<uint64_t> part_ofs;
+
+  /*
+   * We must store the offset of each part to calculate the ETAGs for each
+   * MPU part. These part ETags then become the input for the MPU object
+   * Etag.
+   */
+  for (auto mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) {
+    if (cur_part_ofs == mi.get_part_ofs())
+      continue;
+    cur_part_ofs = mi.get_part_ofs();
+    ldpp_dout(dpp, 20) << "MPU Part offset:" << cur_part_ofs << dendl;
+    part_ofs.push_back(cur_part_ofs);
+  }
+
+  if (compression) {
+    // if the source object was compressed, the manifest is storing
+    // compressed part offsets. transform the compressed offsets back to
+    // their original offsets by finding the first block of each part
+    const auto& blocks = compression->blocks;
+    auto block = blocks.begin();
+    for (auto& ofs : part_ofs) {
+      // find the compression_block with new_ofs == ofs
+      constexpr auto less = [] (const compression_block& block, uint64_t ofs) {
+        return block.new_ofs < ofs;
+      };
+      // resume the search from the previous match rather than blocks.begin()
+      block = std::lower_bound(block, blocks.end(), ofs, less);
+      if (block == blocks.end() || block->new_ofs != ofs) {
+        ldpp_dout(dpp, 4) << "no match for compressed offset " << ofs
+            << ", disabling etag verification" << dendl;
+        return -EIO;
+      }
+      ofs = block->old_ofs;
+      ldpp_dout(dpp, 20) << "MPU Part uncompressed offset:" << ofs << dendl;
+    }
+  }
+
+  verifier.emplace<ETagVerifier_MPU>(cct, std::move(part_ofs), filter);
+  return 0;
+}
+
+/* Feed the data into the running MD5 and pass it on unchanged. */
+int ETagVerifier_Atomic::process(bufferlist&& in, uint64_t logical_offset)
+{
+  if (in.length() > 0)
+    hash.Update((const unsigned char *)in.c_str(), in.length());
+
+  return Pipe::process(std::move(in), logical_offset);
+}
+
+/* Finalize the MD5 and store its hex form in calculated_etag. */
+void ETagVerifier_Atomic::calculate_etag()
+{
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+
+  /* Return early if ETag has already been calculated */
+  if (!calculated_etag.empty())
+    return;
+
+  hash.Final(m);
+  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
+  calculated_etag = calc_md5;
+  ldout(cct, 20) << "Single part object: " << " etag:" << calculated_etag
+          << dendl;
+}
+
+/*
+ * Close the current part: fold its MD5 digest into mpu_etag_hash, restart
+ * the per-part hash and advance both part indices.
+ */
+void ETagVerifier_MPU::process_end_of_MPU_part()
+{
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char calc_md5_part[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+  std::string calculated_etag_part;
+
+  hash.Final(m);
+  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
+  hash.Restart();
+
+  // only pay for the hex conversion when the message will be logged
+  if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
+    buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5_part);
+    calculated_etag_part = calc_md5_part;
+    ldout(cct, 20) << "Part etag: " << calculated_etag_part << dendl;
+  }
+
+  cur_part_index++;
+  next_part_index++;
+}
+
+/*
+ * Hash the incoming data, splitting it at the boundary when the buffer spans
+ * two parts, then pass the buffer on unchanged.
+ */
+int ETagVerifier_MPU::process(bufferlist&& in, uint64_t logical_offset)
+{
+  uint64_t bl_end = in.length() + logical_offset;
+
+  /* Handle the last MPU part */
+  if (size_t(next_part_index) == part_ofs.size()) {
+    hash.Update((const unsigned char *)in.c_str(), in.length());
+    goto done;
+  }
+
+  /* Incoming bufferlist spans two MPU parts. Calculate separate ETags */
+  if (bl_end > part_ofs[next_part_index]) {
+
+    uint64_t part_one_len = part_ofs[next_part_index] - logical_offset;
+    hash.Update((const unsigned char *)in.c_str(), part_one_len);
+    process_end_of_MPU_part();
+
+    hash.Update((const unsigned char *)in.c_str() + part_one_len,
+      bl_end - part_ofs[cur_part_index]);
+    /*
+     * If we've moved to the last part of the MPU, avoid usage of
+     * part_ofs[next_part_index] as it will lead to out-of-range access.
+     */
+    if (size_t(next_part_index) == part_ofs.size())
+      goto done;
+  } else {
+    hash.Update((const unsigned char *)in.c_str(), in.length());
+  }
+
+  /* Update the MPU Etag if the current part has ended */
+  if (logical_offset + in.length() + 1 == part_ofs[next_part_index])
+    process_end_of_MPU_part();
+
+done:
+  return Pipe::process(std::move(in), logical_offset);
+}
+
+/*
+ * Finalize the last part, then build the ETag as
+ * hex(MD5(concatenated part digests)) followed by "-<number of parts>".
+ */
+void ETagVerifier_MPU::calculate_etag()
+{
+  const uint32_t parts = part_ofs.size();
+  // digits10 is the number of decimal digits that always fit in the type;
+  // the largest values need one more digit when printed
+  constexpr auto max_digits = std::numeric_limits<uint32_t>::digits10 + 1;
+  constexpr auto extra = 2 + max_digits; // add "-%u\0" at the end
+
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE], mpu_m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + extra];
+
+  /* Return early if ETag has already been calculated */
+  if (!calculated_etag.empty())
+    return;
+
+  hash.Final(m);
+  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
+
+  /* Refer RGWCompleteMultipart::execute() for ETag calculation for MPU object */
+  mpu_etag_hash.Final(mpu_m);
+  buf_to_hex(mpu_m, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+           sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%u", parts);
+
+  calculated_etag = final_etag_str;
+  ldout(cct, 20) << "MPU calculated ETag:" << calculated_etag << dendl;
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.h b/src/rgw/driver/rados/rgw_etag_verifier.h
new file mode 100644
index 000000000..18a4f5a3f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_etag_verifier.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * RGW Etag Verifier is an RGW filter which enables the objects copied using
+ * multisite sync to be verified using their ETag from source i.e. the MD5
+ * checksum of the object is computed at the destination and is verified to be
+ * identical to the ETag stored in the object HEAD at source cluster.
+ *
+ * For MPU objects, a different filter named ETagVerifier_MPU is applied
+ * which re-computes ETag using RGWObjManifest. This computes the ETag using the
+ * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag
+ * on the MPU parts.
+ */
+
+#pragma once
+
+#include "rgw_putobj.h"
+#include "rgw_op.h"
+#include "common/static_ptr.h"
+
+namespace rgw::putobj {
+
+// Common base of the ETag verifiers: a Pipe stage that holds the running MD5
+// and the ETag string produced by calculate_etag().
+class ETagVerifier : public rgw::putobj::Pipe
+{
+protected:
+  CephContext* cct;
+  MD5 hash;
+  std::string calculated_etag;
+
+public:
+  ETagVerifier(CephContext* cct_, rgw::sal::DataProcessor *next)
+    : Pipe(next), cct(cct_) {
+      // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+      hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+    }
+
+  // Finalize the hash and fill calculated_etag.
+  virtual void calculate_etag() = 0;
+  // Empty until calculate_etag() has been called.
+  std::string get_calculated_etag() { return calculated_etag;}
+
+}; /* ETagVerifier */
+
+// Verifier for objects written in a single part: ETag is the MD5 of the data.
+class ETagVerifier_Atomic : public ETagVerifier
+{
+public:
+  ETagVerifier_Atomic(CephContext* cct_, rgw::sal::DataProcessor *next)
+    : ETagVerifier(cct_, next) {}
+
+  int process(bufferlist&& data, uint64_t logical_offset) override;
+  void calculate_etag() override;
+
+}; /* ETagVerifier_Atomic */
+
+// Verifier for multipart uploads: 'hash' covers the current part and
+// 'mpu_etag_hash' accumulates the digests of the finished parts.
+class ETagVerifier_MPU : public ETagVerifier
+{
+  std::vector<uint64_t> part_ofs;
+  uint64_t cur_part_index{0}, next_part_index{1};
+  MD5 mpu_etag_hash;
+
+  void process_end_of_MPU_part();
+
+public:
+  ETagVerifier_MPU(CephContext* cct,
+                             std::vector<uint64_t> part_ofs,
+                             rgw::sal::DataProcessor *next)
+    : ETagVerifier(cct, next),
+      part_ofs(std::move(part_ofs))
+  {
+    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes.
+    // The base class constructor already did this for 'hash'; the second
+    // digest owned by this class needs the flag as well.
+    mpu_etag_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  }
+
+  int process(bufferlist&& data, uint64_t logical_offset) override;
+  void calculate_etag() override;
+
+}; /* ETagVerifier_MPU */
+
+// storage large enough to hold either concrete verifier in place
+constexpr auto max_etag_verifier_size = std::max(
+    sizeof(ETagVerifier_Atomic),
+    sizeof(ETagVerifier_MPU)
+  );
+using etag_verifier_ptr = ceph::static_ptr<ETagVerifier, max_etag_verifier_size>;
+
+int create_etag_verifier(const DoutPrefixProvider *dpp,
+                         CephContext* cct, rgw::sal::DataProcessor* next,
+                         const bufferlist& manifest_bl,
+                         const std::optional<RGWCompressionInfo>& compression,
+                         etag_verifier_ptr& verifier);
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_gc.cc
b/src/rgw/driver/rados/rgw_gc.cc
new file mode 100644
index 000000000..bd16bde1b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc.cc
@@ -0,0 +1,811 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc.h"
+
+#include "rgw_tools.h"
+#include "include/scope_guard.h"
+#include "include/rados/librados.hpp"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "rgw_perf_counters.h"
+#include "cls/lock/cls_lock_client.h"
+#include "include/random.h"
+#include "rgw_gc_log.h"
+
+#include <list> // XXX
+#include <sstream>
+#include "xxhash.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+static string gc_oid_prefix = "gc";
+static string gc_index_lock_name = "gc_process";
+
+// Build the shard object names ("gc.<i>") and issue the queue-initialisation
+// op on every shard. The result of that op is not checked here.
+void RGWGC::initialize(CephContext *_cct, RGWRados *_store) {
+  cct = _cct;
+  store = _store;
+
+  max_objs = min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max());
+
+  obj_names = new string[max_objs];
+
+  for (int i = 0; i < max_objs; i++) {
+    obj_names[i] = gc_oid_prefix;
+    char buf[32];
+    snprintf(buf, 32, ".%d", i);
+    obj_names[i].append(buf);
+
+    auto it = transitioned_objects_cache.begin() + i;
+    transitioned_objects_cache.insert(it, false);
+
+    //version = 0 -> not ready for transition
+    //version = 1 -> marked ready for transition
+    librados::ObjectWriteOperation op;
+    op.create(false);
+    const uint64_t queue_size = cct->_conf->rgw_gc_max_queue_size, num_deferred_entries = cct->_conf->rgw_gc_max_deferred;
+    gc_log_init2(op, queue_size, num_deferred_entries);
+    store->gc_operate(this, obj_names[i], &op);
+  }
+}
+
+void RGWGC::finalize()
+{
+  delete[] obj_names;
+}
+
+// Map a tag to its gc shard index.
+int RGWGC::tag_index(const string& tag)
+{
+  return rgw_shards_mod(XXH64(tag.c_str(), tag.size(), seed), max_objs);
+}
+
+// Send 'chain' to gc, split into pieces whose estimated encoded size stays
+// within rgw_max_chunk_size (unsplit when that option is 0).
+// Returns {0, nullopt} on success, or {error, objects that were not handed
+// to gc} on failure.
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWGC::send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag)
+{
+  ldpp_dout(this, 20) << "RGWGC::send_split_chain - tag is: " << tag << dendl;
+
+  if (cct->_conf->rgw_max_chunk_size) {
+    cls_rgw_obj_chain broken_chain;
+    ldpp_dout(this, 20) << "RGWGC::send_split_chain - rgw_max_chunk_size is: " << cct->_conf->rgw_max_chunk_size << dendl;
+
+    for (auto it = chain.objs.begin(); it != chain.objs.end(); it++) {
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - adding obj with name: " << it->key << dendl;
+      broken_chain.objs.emplace_back(*it);
+      cls_rgw_gc_obj_info info;
+      info.tag = tag;
+      info.chain = broken_chain;
+      cls_rgw_gc_set_entry_op op;
+      op.info = info;
+      size_t total_encoded_size = op.estimate_encoded_size();
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - total_encoded_size is: " << total_encoded_size << dendl;
+
+      if (total_encoded_size > cct->_conf->rgw_max_chunk_size) { //dont add to chain, and send to gc
+        broken_chain.objs.pop_back();
+        --it;
+        ldpp_dout(this, 20) << "RGWGC::send_split_chain - more than, dont add to broken chain and send chain" << dendl;
+        auto ret = send_chain(broken_chain, tag);
+        if (ret < 0) {
+          broken_chain.objs.insert(broken_chain.objs.end(), it, chain.objs.end()); // add all the remainder objs to the list to be deleted inline
+          ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+          return {ret, {broken_chain}};
+        }
+        broken_chain.objs.clear();
+      }
+    }
+    if (!broken_chain.objs.empty()) { //when the chain is smaller than or equal to rgw_max_chunk_size
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - sending leftover objects" << dendl;
+      auto ret = send_chain(broken_chain, tag);
+      if (ret < 0) {
+        ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+        return {ret, {broken_chain}};
+      }
+    }
+  } else {
+    auto ret = send_chain(chain, tag);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+      // 'chain' is a const reference, so this is a copy either way
+      return {ret, {chain}};
+    }
+  }
+  return {0, {}};
+}
+
+// Enqueue 'chain' on the cls_rgw_gc queue of the tag's shard. If that op
+// returns -ECANCELED or -EPERM, fall back to cls_rgw_gc_set_entry.
+int RGWGC::send_chain(const cls_rgw_obj_chain& chain, const string& tag)
+{
+  ObjectWriteOperation op;
+  cls_rgw_gc_obj_info info;
+  info.chain = chain;
+  info.tag = tag;
+  gc_log_enqueue2(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+  int i = tag_index(tag);
+
+  ldpp_dout(this, 20) << "RGWGC::send_chain - on object name: " << obj_names[i] << "tag is: " << tag << dendl;
+
+  auto ret = store->gc_operate(this, obj_names[i], &op);
+  if (ret != -ECANCELED && ret != -EPERM) {
+    return ret;
+  }
+  ObjectWriteOperation set_entry_op;
+  cls_rgw_gc_set_entry(set_entry_op, cct->_conf->rgw_gc_obj_min_wait, info);
+  return store->gc_operate(this, obj_names[i], &set_entry_op);
+}
+
+struct defer_chain_state {
+  librados::AioCompletion* completion = nullptr;
+  // TODO: hold a reference on the state in RGWGC to avoid use-after-free if
+  // RGWGC destructs before this completion fires
+  RGWGC* gc = nullptr;
+  cls_rgw_gc_obj_info info;
+
+  ~defer_chain_state() {
+    if (completion) {
+      completion->release();
+    }
+  }
+};
+
+// Completion callback for async_defer_chain(): takes ownership of the state
+// and retries through the queue path when the omap defer got ECANCELED.
+static void async_defer_callback(librados::completion_t, void* arg)
+{
+  std::unique_ptr<defer_chain_state> state{static_cast<defer_chain_state*>(arg)};
+  if (state->completion->get_return_value() == -ECANCELED) {
+    state->gc->on_defer_canceled(state->info);
+  }
+}
+
+void RGWGC::on_defer_canceled(const cls_rgw_gc_obj_info& info)
+{
+  const std::string& tag = info.tag;
+  const int i = tag_index(tag);
+
+  // ECANCELED from cls_version_check() tells us that we've transitioned
+  transitioned_objects_cache[i] = true;
+
+  ObjectWriteOperation op;
+  cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
+  cls_rgw_gc_remove(op, {tag});
+
+  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+  store->gc_aio_operate(obj_names[i], c, &op);
+  c->release();
+}
+
+int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain)
+{
+  const int i = tag_index(tag);
+  cls_rgw_gc_obj_info info;
+  info.chain = chain;
+  info.tag = tag;
+
+  // if we've transitioned this shard object, we can rely on the cls_rgw_gc queue
+  if (transitioned_objects_cache[i]) {
+    ObjectWriteOperation op;
+    cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+    // this tag may still be present in omap, so remove it once the cls_rgw_gc
+    // enqueue succeeds
+    cls_rgw_gc_remove(op, {tag});
+
+    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+    int ret = store->gc_aio_operate(obj_names[i], c, &op);
+    c->release();
+    return ret;
+  }
+
+  // if we haven't seen the transition yet, write the defer to omap with cls_rgw
+  ObjectWriteOperation op;
+
+  // assert that we haven't initialized cls_rgw_gc queue. this prevents us
+  // from writing new entries to omap after the transition
+  gc_log_defer1(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+  // prepare a callback to detect the transition via ECANCELED from cls_version_check()
+  auto state = std::make_unique<defer_chain_state>();
+  state->gc = this;
+  state->info.chain = chain;
+  state->info.tag = tag;
+  state->completion = librados::Rados::aio_create_completion(
+      state.get(), async_defer_callback);
+
+  int ret = store->gc_aio_operate(obj_names[i], state->completion, &op);
+  if (ret == 0) {
+    state.release(); // release ownership until async_defer_callback()
+  }
+  return ret;
+}
+
+// Asynchronously remove 'tags' from the omap of shard 'index'. On success
+// the caller owns the completion returned through 'pc'.
+int RGWGC::remove(int index, const std::vector<string>& tags, AioCompletion **pc)
+{
+  ObjectWriteOperation op;
+  cls_rgw_gc_remove(op, tags);
+
+  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+  int ret = store->gc_aio_operate(obj_names[index], c, &op);
+  if (ret < 0) {
+    c->release();
+  } else {
+    *pc = c;
+  }
+  return ret;
+}
+
+// Synchronously remove 'num_entries' entries from the queue of shard 'index'.
+int RGWGC::remove(int index, int num_entries)
+{
+  ObjectWriteOperation op;
+  cls_rgw_gc_queue_remove_entries(op, num_entries);
+
+  return store->gc_operate(this, obj_names[index], &op);
+}
+
+// List up to 'max' gc entries starting at shard *index / 'marker', reading
+// the omap and/or the queue of each shard depending on its transition state.
+// *index, marker and processing_queue carry the position between calls.
+int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
+{
+  result.clear();
+  string next_marker;
+  bool check_queue = false;
+
+  for (; *index < max_objs && result.size() < max; (*index)++, marker.clear(), check_queue = false) {
+    std::list<cls_rgw_gc_obj_info> entries, queue_entries;
+    int ret = 0;
+
+    //processing_queue is set to true from previous iteration if the queue was under process and probably has more elements in it.
+    if (! transitioned_objects_cache[*index] && ! check_queue && ! processing_queue) {
+      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated, next_marker);
+      if (ret != -ENOENT && ret < 0) {
+        return ret;
+      }
+      obj_version objv;
+      cls_version_read(store->gc_pool_ctx, obj_names[*index], &objv);
+      if (ret == -ENOENT || entries.size() == 0) {
+        if (objv.ver == 0) {
+          continue;
+        } else {
+          if (! expired_only) {
+            transitioned_objects_cache[*index] = true;
+            marker.clear();
+          } else {
+            std::list<cls_rgw_gc_obj_info> non_expired_entries;
+            ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, 1, false, non_expired_entries, truncated, next_marker);
+            if (non_expired_entries.size() == 0) {
+              transitioned_objects_cache[*index] = true;
+              marker.clear();
+            }
+          }
+        }
+      }
+      if ((objv.ver == 1) && (entries.size() < max - result.size())) {
+        check_queue = true;
+        marker.clear();
+      }
+    }
+    if (transitioned_objects_cache[*index] || check_queue || processing_queue) {
+      processing_queue = false;
+      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[*index], marker, (max - result.size()) - entries.size(), expired_only, queue_entries, truncated, next_marker);
+      if (ret < 0) {
+        return ret;
+      }
+    }
+    if (entries.size() == 0 && queue_entries.size() == 0)
+      continue;
+
+    std::list<cls_rgw_gc_obj_info>::iterator iter;
+    for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      result.push_back(*iter);
+    }
+
+    for (iter = queue_entries.begin(); iter != queue_entries.end(); ++iter) {
+      result.push_back(*iter);
+    }
+
+    marker = next_marker;
+
+    if (*index == max_objs - 1) {
+      if (queue_entries.size() > 0 && *truncated) {
+        processing_queue = true;
+      } else {
+        processing_queue = false;
+      }
+      /* we cut short here, truncated will hold the correct value */
+      return 0;
+    }
+
+    if (result.size() == max) {
+      if (queue_entries.size() > 0 && *truncated) {
+        processing_queue = true;
+      } else {
+        processing_queue = false;
+        *index += 1; //move to next gc object
+      }
+
+      /* close approximation, it might be that the next of the objects don't hold
+       * anything, in this case truncated should have been false, but we can find
+       * that out on the next iteration
+       */
+      *truncated = true;
+      return 0;
+    }
+  }
+  *truncated = false;
+  processing_queue = false;
+
+  return 0;
+}
+
+// Tracks the asynchronous deletions issued by one gc pass, and the tags to
+// trim from each shard once their objects are gone.
+class RGWGCIOManager {
+  const DoutPrefixProvider* dpp;
+  CephContext *cct;
+  RGWGC *gc;
+
+  struct IO {
+    enum Type {
+      UnknownIO = 0,
+      TailIO = 1,
+      IndexIO = 2,
+    } type{UnknownIO};
+    librados::AioCompletion *c{nullptr};
+    string oid;
+    int index{-1};
+    string tag;
+  };
+
+  deque<IO> ios;
+  vector<std::vector<string> > remove_tags;
+  /* tracks the number of remaining shadow objects for a given tag in order to
+   * only remove the tag once all shadow objects have themselves been removed
+   */
+  vector<map<string, size_t> > tag_io_size;
+
+#define MAX_AIO_DEFAULT 10
+  size_t max_aio{MAX_AIO_DEFAULT};
+
+public:
+  RGWGCIOManager(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp),
+                                                                                  cct(_cct),
+                                                                                  gc(_gc) {
+    max_aio = cct->_conf->rgw_gc_max_concurrent_io;
+    remove_tags.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
+    tag_io_size.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
+  }
+
+  ~RGWGCIOManager() {
+    // by reference: each IO carries two strings that need not be copied
+    for (auto& io : ios) {
+      io.c->release();
+    }
+  }
+
+  // Submit 'op' on 'oid', first reaping completions while more than max_aio
+  // are in flight. Returns 0 without submitting if gc is going down.
+  int schedule_io(IoCtx *ioctx, const string& oid, ObjectWriteOperation *op,
+                  int index, const string& tag) {
+    while (ios.size() > max_aio) {
+      if (gc->going_down()) {
+        return 0;
+      }
+      auto ret = handle_next_completion();
+      //Return error if we are using queue, else ignore it
+      if (gc->transitioned_objects_cache[index] && ret < 0) {
+        return ret;
+      }
+    }
+
+    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+    int ret = ioctx->aio_operate(oid, c, op);
+    if (ret < 0) {
+      // the completion was never queued in 'ios', so nobody else frees it
+      c->release();
+      return ret;
+    }
+    ios.push_back(IO{IO::TailIO, c, oid, index, tag});
+
+    return 0;
+  }
+
+  // Wait for the oldest in-flight IO and return its result (-ENOENT counts
+  // as success). For shards still on omap, a successful object removal also
+  // schedules removal of its tag.
+  int handle_next_completion() {
+    ceph_assert(!ios.empty());
+    IO& io = ios.front();
+    io.c->wait_for_complete();
+    int ret = io.c->get_return_value();
+    io.c->release();
+
+    if (ret == -ENOENT) {
+      ret = 0;
+    }
+
+    if (io.type == IO::IndexIO && ! gc->transitioned_objects_cache[io.index]) {
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "WARNING: gc cleanup of tags on gc shard index=" <<
+          io.index << " returned error, ret=" << ret << dendl;
+      }
+      goto done;
+    }
+
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: gc could not remove oid=" << io.oid <<
+        ", ret=" << ret << dendl;
+      goto done;
+    }
+
+    if (! gc->transitioned_objects_cache[io.index]) {
+      schedule_tag_removal(io.index, io.tag);
+    }
+
+  done:
+    ios.pop_front();
+    return ret;
+  }
+
+  /* This is a request to schedule a tag removal. It will be called once when
+   * there are no shadow objects. But it will also be called for every shadow
+   * object when there are any. Since we do not want the tag to be removed
+   * until all shadow objects have been successfully removed, the scheduling
+   * will not happen until the shadow object count goes down to zero
+   */
+  void schedule_tag_removal(int index, string tag) {
+    auto& ts = tag_io_size[index];
+    auto ts_it = ts.find(tag);
+    if (ts_it != ts.end()) {
+      auto& size = ts_it->second;
+      --size;
+      // wait all shadow obj delete return
+      if (size != 0)
+        return;
+
+      ts.erase(ts_it);
+    }
+
+    auto& rt = remove_tags[index];
+
+    rt.push_back(tag);
+    if (rt.size() >= (size_t)cct->_conf->rgw_gc_max_trim_chunk) {
+      flush_remove_tags(index, rt);
+    }
+  }
+
+  void add_tag_io_size(int index, string tag, size_t size) {
+    auto& ts = tag_io_size[index];
+    ts.emplace(tag, size);
+  }
+
+  // Reap all in-flight IOs. Returns the last error seen, or -EAGAIN if gc
+  // started going down before the queue was empty.
+  int drain_ios() {
+    int ret_val = 0;
+    while (!ios.empty()) {
+      if (gc->going_down()) {
+        return -EAGAIN;
+      }
+      auto ret = handle_next_completion();
+      if (ret < 0) {
+        ret_val = ret;
+      }
+    }
+    return ret_val;
+  }
+
+  void drain() {
+    drain_ios();
+    flush_remove_tags();
+    /* the tags draining might have generated more ios, drain those too */
+    drain_ios();
+  }
+
+  // Issue the removal of the tags collected in 'rt' for shard 'index'.
+  // 'rt' is cleared on every exit path.
+  void flush_remove_tags(int index, vector<string>& rt) {
+    IO index_io;
+    index_io.type = IO::IndexIO;
+    index_io.index = index;
+
+    ldpp_dout(dpp, 20) << __func__ <<
+      " removing entries from gc log shard index=" << index << ", size=" <<
+      rt.size() << ", entries=" << rt << dendl;
+
+    auto rt_guard = make_scope_guard(
+      [&]
+      {
+        rt.clear();
+      }
+      );
+
+    int ret = gc->remove(index, rt, &index_io.c);
+    if (ret < 0) {
+      /* we already cleared list of tags, this prevents us from
+       * ballooning in case of a persistent problem
+       */
+      ldpp_dout(dpp, 0) << "WARNING: failed to remove tags on gc shard index=" <<
+        index << " ret=" << ret << dendl;
+      return;
+    }
+    if (perfcounter) {
+      /* log the count of tags retired for rate estimation */
+      perfcounter->inc(l_rgw_gc_retire, rt.size());
+    }
+    ios.push_back(index_io);
+  }
+
+  void flush_remove_tags() {
+    int index = 0;
+    for (auto& rt : remove_tags) {
+      if (! gc->transitioned_objects_cache[index]) {
+        flush_remove_tags(index, rt);
+      }
+      ++index;
+    }
+  }
+
+  int remove_queue_entries(int index, int num_entries) {
+    int ret = gc->remove(index, num_entries);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to remove queue entries on index=" <<
+        index << " ret=" << ret << dendl;
+      return ret;
+    }
+    if (perfcounter) {
+      /* log the count of tags retired for rate estimation */
+      perfcounter->inc(l_rgw_gc_retire, num_entries);
+    }
+    return 0;
+  }
+}; // class RGWGCIOManger
+
+// Process one gc shard under its exclusive lock for at most 'max_secs'.
+// Returns -EAGAIN for a non-positive max_secs and the lock error if locking
+// fails; once the lock is held it returns 0 regardless of later errors.
+int RGWGC::process(int index, int max_secs, bool expired_only,
+                   RGWGCIOManager& io_manager)
+{
+  ldpp_dout(this, 20) << "RGWGC::process entered with GC index_shard=" <<
+    index << ", max_secs=" << max_secs << ", expired_only=" <<
+    expired_only << dendl;
+
+  rados::cls::lock::Lock l(gc_index_lock_name);
+  utime_t end = ceph_clock_now();
+
+  /* max_secs should be greater than zero. We don't want a zero max_secs
+   * to be translated as no timeout, since we'd then need to break the
+   * lock and that would require a manual intervention. In this case
+   * we can just wait it out. */
+  if (max_secs <= 0)
+    return -EAGAIN;
+
+  end += max_secs;
+  utime_t time(max_secs, 0);
+  l.set_duration(time);
+
+  int ret = l.lock_exclusive(&store->gc_pool_ctx, obj_names[index]);
+  if (ret == -EBUSY) { /* already locked by another gc processor */
+    ldpp_dout(this, 10) << "RGWGC::process failed to acquire lock on " <<
+      obj_names[index] << dendl;
+    return 0;
+  }
+  if (ret < 0)
+    return ret;
+
+  string marker;
+  string next_marker;
+  bool truncated;
+  IoCtx *ctx = new IoCtx;
+  do {
+    int max = 100;
+    std::list<cls_rgw_gc_obj_info> entries;
+
+    int ret = 0;
+
+    if (! transitioned_objects_cache[index]) {
+      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
+      ldpp_dout(this, 20) <<
+        "RGWGC::process cls_rgw_gc_list returned with returned:" << ret <<
+        ", entries.size=" << entries.size() << ", truncated=" << truncated <<
+        ", next_marker='" << next_marker << "'" << dendl;
+      obj_version objv;
+      cls_version_read(store->gc_pool_ctx, obj_names[index], &objv);
+      if ((objv.ver == 1) && entries.size() == 0) {
+        std::list<cls_rgw_gc_obj_info> non_expired_entries;
+        ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, 1, false, non_expired_entries, &truncated, next_marker);
+        if (non_expired_entries.size() == 0) {
+          transitioned_objects_cache[index] = true;
+          marker.clear();
+          ldpp_dout(this, 20) << "RGWGC::process cls_rgw_gc_list returned NO non expired entries, so setting cache entry to TRUE" << dendl;
+        } else {
+          ret = 0;
+          goto done;
+        }
+      }
+      if ((objv.ver == 0) && (ret == -ENOENT || entries.size() == 0)) {
+        ret = 0;
+        goto done;
+      }
+    }
+
+    if (transitioned_objects_cache[index]) {
+      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
+      ldpp_dout(this, 20) <<
+        "RGWGC::process cls_rgw_gc_queue_list_entries returned with return value:" << ret <<
+        ", entries.size=" << entries.size() << ", truncated=" << truncated <<
+        ", next_marker='" << next_marker << "'" << dendl;
+      if (entries.size() == 0) {
+        ret = 0;
+        goto done;
+      }
+    }
+
+    if (ret < 0)
+      goto done;
+
+    marker = next_marker;
+
+    string last_pool;
+    std::list<cls_rgw_gc_obj_info>::iterator iter;
+    for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      cls_rgw_gc_obj_info& info = *iter;
+
+      ldpp_dout(this, 20) << "RGWGC::process iterating over entry tag='" <<
+        info.tag << "', time=" << info.time << ", chain.objs.size()=" <<
+        info.chain.objs.size() << dendl;
+
+      std::list<cls_rgw_obj>::iterator liter;
+      cls_rgw_obj_chain& chain = info.chain;
+
+      utime_t now = ceph_clock_now();
+      if (now >= end) {
+        goto done;
+      }
+      if (! transitioned_objects_cache[index]) {
+        if (chain.objs.empty()) {
+          io_manager.schedule_tag_removal(index, info.tag);
+        } else {
+          io_manager.add_tag_io_size(index, info.tag, chain.objs.size());
+        }
+      }
+      if (! chain.objs.empty()) {
+        for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+          cls_rgw_obj& obj = *liter;
+
+          // reuse the IoCtx while consecutive objects live in the same pool
+          if (obj.pool != last_pool) {
+            delete ctx;
+            ctx = new IoCtx;
+            ret = rgw_init_ioctx(this, store->get_rados_handle(), obj.pool, *ctx);
+            if (ret < 0) {
+              if (transitioned_objects_cache[index]) {
+                goto done;
+              }
+              last_pool = "";
+              ldpp_dout(this, 0) << "ERROR: failed to create ioctx pool=" <<
+                obj.pool << dendl;
+              continue;
+            }
+            last_pool = obj.pool;
+          }
+
+          ctx->locator_set_key(obj.loc);
+
+          const string& oid = obj.key.name; /* just stored raw oid there */
+
+          ldpp_dout(this, 5) << "RGWGC::process removing " << obj.pool <<
+            ":" << obj.key.name << dendl;
+          ObjectWriteOperation op;
+          cls_refcount_put(op, info.tag, true);
+
+          ret = io_manager.schedule_io(ctx, oid, &op, index, info.tag);
+          if (ret < 0) {
+            ldpp_dout(this, 0) <<
+              "WARNING: failed to schedule deletion for oid=" << oid << dendl;
+            if (transitioned_objects_cache[index]) {
+              //If deleting oid failed for any of them, we will not delete queue entries
+              goto done;
+            }
+          }
+          if (going_down()) {
+            // leave early, even if tag isn't removed, it's ok since it
+            // will be picked up next time around
+            goto done;
+          }
+        } // chains loop
+      } // else -- chains not empty
+    } // entries loop
+    if (transitioned_objects_cache[index] && entries.size() > 0) {
+      ret = io_manager.drain_ios();
+      if (ret < 0) {
+        goto done;
+      }
+      //Remove the entries from the queue
+      ldpp_dout(this, 5) << "RGWGC::process removing entries, marker: " << marker << dendl;
+      ret = io_manager.remove_queue_entries(index, entries.size());
+      if (ret < 0) {
+        ldpp_dout(this, 0) <<
+          "WARNING: failed to remove queue entries" << dendl;
+        goto done;
+      }
+    }
+  } while (truncated);
+
+done:
+  /* we don't drain here, because if we're going down we don't want to
+   * hold the system if backend is unresponsive
+   */
+  l.unlock(&store->gc_pool_ctx, obj_names[index]);
+  delete ctx;
+
+  return 0;
+}
+
+// Run one gc pass over every shard, starting at a random shard.
+int RGWGC::process(bool expired_only)
+{
+  int max_secs = cct->_conf->rgw_gc_processor_max_time;
+
+  const int start = ceph::util::generate_random_number(0, max_objs - 1);
+
+  RGWGCIOManager io_manager(this, store->ctx(), this);
+
+  for (int i = 0; i < max_objs; i++) {
+    int index = (i + start) % max_objs;
+    int ret = process(index, max_secs, expired_only, io_manager);
+    if (ret < 0)
+      return ret;
+  }
+  if (!going_down()) {
+    io_manager.drain();
+  }
+
+  return 0;
+}
+
+bool RGWGC::going_down()
+{
+  return down_flag;
+}
+
+void RGWGC::start_processor()
+{
+  worker = new GCWorker(this, cct, this);
+  worker->create("rgw_gc");
+}
+
+void RGWGC::stop_processor()
+{
+  down_flag = true;
+  if (worker) {
+    worker->stop();
+    worker->join();
+  }
+  delete worker;
+  worker = NULL;
+}
+
+unsigned RGWGC::get_subsys() const
+{
+  return dout_subsys;
+}
+
+std::ostream& RGWGC::gen_prefix(std::ostream& out) const
+{
+  return out << "garbage collection: ";
+}
+
+// Worker loop: run a gc pass, then sleep for what is left of
+// rgw_gc_processor_period, until the owner signals shutdown.
+void *RGWGC::GCWorker::entry() {
+  do {
+    utime_t start = ceph_clock_now();
+    ldpp_dout(dpp, 2) << "garbage collection: start" << dendl;
+    int r = gc->process(true);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: garbage collection process() returned error r=" << r << dendl;
+    }
+    ldpp_dout(dpp, 2) << "garbage collection: stop" << dendl;
+
+    if (gc->going_down())
+      break;
+
+    utime_t end = ceph_clock_now();
+    end -= start;
+    int secs = cct->_conf->rgw_gc_processor_period;
+
+    if (secs <= end.sec())
+      continue; // next round
+
+    secs -= end.sec();
+
+    std::unique_lock locker{lock};
+    // The predicate is evaluated under 'lock', so a stop() that notified
+    // after the going_down() check above but before this wait is not lost.
+    cond.wait_for(locker, std::chrono::seconds(secs),
+                  [this] { return gc->going_down(); });
+  } while (!gc->going_down());
+
+  return NULL;
+}
+
+void RGWGC::GCWorker::stop()
+{
+  std::lock_guard l{lock};
+  cond.notify_all();
+}
diff --git
a/src/rgw/driver/rados/rgw_gc.h b/src/rgw/driver/rados/rgw_gc.h new file mode 100644 index 000000000..f3df64099 --- /dev/null +++ b/src/rgw/driver/rados/rgw_gc.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "common/ceph_mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" +#include "rgw_common.h" +#include "rgw_sal.h" +#include "rgw_rados.h" +#include "cls/rgw/cls_rgw_types.h" + +#include + +class RGWGCIOManager; + +class RGWGC : public DoutPrefixProvider { + CephContext *cct; + RGWRados *store; + int max_objs; + std::string *obj_names; + std::atomic down_flag = { false }; + + static constexpr uint64_t seed = 8675309; + + int tag_index(const std::string& tag); + int send_chain(const cls_rgw_obj_chain& chain, const std::string& tag); + + class GCWorker : public Thread { + const DoutPrefixProvider *dpp; + CephContext *cct; + RGWGC *gc; + ceph::mutex lock = ceph::make_mutex("GCWorker"); + ceph::condition_variable cond; + + public: + GCWorker(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp), cct(_cct), gc(_gc) {} + void *entry() override; + void stop(); + }; + + GCWorker *worker; +public: + RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {} + ~RGWGC() { + stop_processor(); + finalize(); + } + std::vector transitioned_objects_cache; + std::tuple> send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag); + + // asynchronously defer garbage collection on an object that's still being read + int async_defer_chain(const std::string& tag, const cls_rgw_obj_chain& info); + + // callback for when async_defer_chain() fails with ECANCELED + void on_defer_canceled(const cls_rgw_gc_obj_info& info); + + int remove(int index, const std::vector& tags, librados::AioCompletion **pc); + int remove(int index, int num_entries); + + void 
initialize(CephContext *_cct, RGWRados *_store); + void finalize(); + + int list(int *index, std::string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated, bool& processing_queue); + void list_init(int *index) { *index = 0; } + int process(int index, int process_max_secs, bool expired_only, + RGWGCIOManager& io_manager); + int process(bool expired_only); + + bool going_down(); + void start_processor(); + void stop_processor(); + + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const; + + std::ostream& gen_prefix(std::ostream& out) const; + +}; diff --git a/src/rgw/driver/rados/rgw_gc_log.cc b/src/rgw/driver/rados/rgw_gc_log.cc new file mode 100644 index 000000000..ad819eddc --- /dev/null +++ b/src/rgw/driver/rados/rgw_gc_log.cc @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_gc_log.h" + +#include "cls/rgw/cls_rgw_client.h" +#include "cls/rgw_gc/cls_rgw_gc_client.h" +#include "cls/version/cls_version_client.h" + + +void gc_log_init2(librados::ObjectWriteOperation& op, + uint64_t max_size, uint64_t max_deferred) +{ + obj_version objv; // objv.ver = 0 + cls_version_check(op, objv, VER_COND_EQ); + cls_rgw_gc_queue_init(op, max_size, max_deferred); + objv.ver = 1; + cls_version_set(op, objv); +} + +void gc_log_enqueue1(librados::ObjectWriteOperation& op, + uint32_t expiration, cls_rgw_gc_obj_info& info) +{ + obj_version objv; // objv.ver = 0 + cls_version_check(op, objv, VER_COND_EQ); + cls_rgw_gc_set_entry(op, expiration, info); +} + +void gc_log_enqueue2(librados::ObjectWriteOperation& op, + uint32_t expiration, const cls_rgw_gc_obj_info& info) +{ + obj_version objv; + objv.ver = 1; + cls_version_check(op, objv, VER_COND_EQ); + cls_rgw_gc_queue_enqueue(op, expiration, info); +} + +void gc_log_defer1(librados::ObjectWriteOperation& op, + uint32_t expiration, const cls_rgw_gc_obj_info& info) +{ + obj_version 
objv; // objv.ver = 0 + cls_version_check(op, objv, VER_COND_EQ); + cls_rgw_gc_defer_entry(op, expiration, info.tag); +} + +void gc_log_defer2(librados::ObjectWriteOperation& op, + uint32_t expiration, const cls_rgw_gc_obj_info& info) +{ + obj_version objv; + objv.ver = 1; + cls_version_check(op, objv, VER_COND_EQ); + cls_rgw_gc_queue_defer_entry(op, expiration, info); + // TODO: conditional on whether omap is known to be empty + cls_rgw_gc_remove(op, {info.tag}); +} diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc new file mode 100644 index 000000000..c52acef65 --- /dev/null +++ b/src/rgw/driver/rados/rgw_lc_tier.cc @@ -0,0 +1,1310 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include "common/Formatter.h" +#include +#include "rgw_lc.h" +#include "rgw_lc_tier.h" +#include "rgw_string.h" +#include "rgw_zone.h" +#include "rgw_common.h" +#include "rgw_rest.h" +#include "svc_zone.h" + +#include +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +struct rgw_lc_multipart_part_info { + int part_num{0}; + uint64_t ofs{0}; + uint64_t size{0}; + std::string etag; +}; + +struct rgw_lc_obj_properties { + ceph::real_time mtime; + std::string etag; + uint64_t versioned_epoch{0}; + std::map& target_acl_mappings; + std::string target_storage_class; + + rgw_lc_obj_properties(ceph::real_time _mtime, std::string _etag, + uint64_t _versioned_epoch, std::map& _t_acl_mappings, + std::string _t_storage_class) : + mtime(_mtime), etag(_etag), + versioned_epoch(_versioned_epoch), + target_acl_mappings(_t_acl_mappings), + target_storage_class(_t_storage_class) {} +}; + +struct rgw_lc_multipart_upload_info { + std::string upload_id; + uint64_t obj_size; + ceph::real_time mtime; + std::string etag; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(upload_id, bl); 
+ encode(obj_size, bl); + encode(mtime, bl); + encode(etag, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(upload_id, bl); + decode(obj_size, bl); + decode(mtime, bl); + decode(etag, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info) + +static inline string get_key_instance(const rgw_obj_key& key) +{ + if (!key.instance.empty() && + !key.have_null_instance()) { + return "-" + key.instance; + } + return ""; +} + +static inline string get_key_oid(const rgw_obj_key& key) +{ + string oid = key.name; + if (!key.instance.empty() && + !key.have_null_instance()) { + oid += string("-") + key.instance; + } + return oid; +} + +static inline string obj_to_aws_path(const rgw_obj& obj) +{ + string path = obj.bucket.name + "/" + get_key_oid(obj.key); + return path; +} + +static int read_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver, + const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status) +{ + int ret = 0; + rgw::sal::RadosStore *rados = dynamic_cast(driver); + + if (!rados) { + ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." 
<< dendl; + return -1; + } + + auto& pool = status_obj->pool; + const auto oid = status_obj->oid; + auto sysobj = rados->svc()->sysobj; + bufferlist bl; + + ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr, + null_yield, dpp); + + if (ret < 0) { + return ret; + } + + if (bl.length() > 0) { + try { + auto p = bl.cbegin(); + status->decode(p); + } catch (buffer::error& e) { + ldpp_dout(dpp, 10) << "failed to decode status obj: " + << e.what() << dendl; + return -EIO; + } + } else { + return -EIO; + } + + return 0; +} + +static int put_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver, + const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status) +{ + int ret = 0; + rgw::sal::RadosStore *rados = dynamic_cast(driver); + + if (!rados) { + ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl; + return -1; + } + + auto& pool = status_obj->pool; + const auto oid = status_obj->oid; + auto sysobj = rados->svc()->sysobj; + bufferlist bl; + status->encode(bl); + + ret = rgw_put_system_obj(dpp, sysobj, pool, oid, bl, true, nullptr, + real_time{}, null_yield); + + return ret; +} + +static int delete_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver, + const rgw_raw_obj *status_obj) +{ + int ret = 0; + rgw::sal::RadosStore *rados = dynamic_cast(driver); + + if (!rados) { + ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." 
<< dendl; + return -1; + } + + auto& pool = status_obj->pool; + const auto oid = status_obj->oid; + auto sysobj = rados->svc()->sysobj; + + ret = rgw_delete_system_obj(dpp, sysobj, pool, oid, nullptr, null_yield); + + return ret; +} + +static std::set keep_headers = { "CONTENT_TYPE", + "CONTENT_ENCODING", + "CONTENT_DISPOSITION", + "CONTENT_LANGUAGE" }; + +/* + * mapping between rgw object attrs and output http fields + * + static const struct rgw_http_attr base_rgw_to_http_attrs[] = { + { RGW_ATTR_CONTENT_LANG, "Content-Language" }, + { RGW_ATTR_EXPIRES, "Expires" }, + { RGW_ATTR_CACHE_CONTROL, "Cache-Control" }, + { RGW_ATTR_CONTENT_DISP, "Content-Disposition" }, + { RGW_ATTR_CONTENT_ENC, "Content-Encoding" }, + { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" }, + { RGW_ATTR_X_ROBOTS_TAG , "X-Robots-Tag" }, + { RGW_ATTR_STORAGE_CLASS , "X-Amz-Storage-Class" }, +// RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode: +// S3 endpoint: x-amz-website-redirect-location +// S3Website endpoint: Location +{ RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" }, +}; */ + +static void init_headers(map& attrs, + map& headers) +{ + for (auto& kv : attrs) { + const char * name = kv.first.c_str(); + const auto aiter = rgw_to_http_attrs.find(name); + + if (aiter != std::end(rgw_to_http_attrs)) { + headers[aiter->second] = rgw_bl_str(kv.second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, + sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + name += sizeof(RGW_ATTR_META_PREFIX) - 1; + string sname(name); + string name_prefix = RGW_ATTR_META_PREFIX; + char full_name_buf[name_prefix.size() + sname.size() + 1]; + snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s", + static_cast(name_prefix.length()), + name_prefix.data(), + static_cast(sname.length()), + sname.data()); + headers[full_name_buf] = rgw_bl_str(kv.second); + } else if (strcmp(name,RGW_ATTR_CONTENT_TYPE) == 0) { + headers["CONTENT_TYPE"] = rgw_bl_str(kv.second); + } + } +} + +/* 
Read object or just head from remote endpoint. For now initializes only headers, + * but can be extended to fetch etag, mtime etc if needed. + */ +static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head, + std::map& headers) { + RGWRESTConn::get_obj_params req_params; + std::string target_obj_name; + int ret = 0; + rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag, + tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings, + tier_ctx.target_storage_class); + std::string etag; + RGWRESTStreamRWRequest *in_req; + + rgw_bucket dest_bucket; + dest_bucket.name = tier_ctx.target_bucket_name; + target_obj_name = tier_ctx.bucket_info.bucket.name + "/" + + tier_ctx.obj->get_name(); + if (!tier_ctx.o.is_current()) { + target_obj_name += get_key_instance(tier_ctx.obj->get_key()); + } + + rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name)); + + /* init input connection */ + req_params.get_op = !head; + req_params.prepend_metadata = true; + req_params.rgwx_stat = true; + req_params.sync_manifest = true; + req_params.skip_decrypt = true; + + ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req); + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl; + return ret; + } + + /* fetch headers */ + ret = tier_ctx.conn.complete_request(in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl; + return ret; + } + return 0; +} + +static bool is_already_tiered(const DoutPrefixProvider *dpp, + std::map& headers, + ceph::real_time& mtime) { + char buf[32]; + map attrs = headers; + + for (const auto& a : attrs) { + ldpp_dout(dpp, 20) << "GetCrf attr[" << a.first << "] = " << a.second <iterate(dpp, ofs, end, out_cb, null_yield); + return ret; +} + +int RGWLCCloudStreamPut::init() 
{ + /* init output connection */ + if (multipart.is_multipart) { + char buf[32]; + snprintf(buf, sizeof(buf), "%d", multipart.part_num); + rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() }, + { "partNumber", buf }, + { nullptr, nullptr } }; + conn.put_obj_send_init(dest_obj, params, &out_req); + } else { + conn.put_obj_send_init(dest_obj, nullptr, &out_req); + } + + return 0; +} + +bool RGWLCCloudStreamPut::keep_attr(const string& h) { + return (keep_headers.find(h) != keep_headers.end()); +} + +void RGWLCCloudStreamPut::init_send_attrs(const DoutPrefixProvider *dpp, + const rgw_rest_obj& rest_obj, + const rgw_lc_obj_properties& obj_properties, + std::map& attrs) { + + map& acl_mappings(obj_properties.target_acl_mappings); + const std::string& target_storage_class = obj_properties.target_storage_class; + + attrs.clear(); + + for (auto& hi : rest_obj.attrs) { + if (keep_attr(hi.first)) { + attrs.insert(hi); + } else { + std::string s1 = boost::algorithm::to_lower_copy(hi.first); + const char* k = std::strstr(s1.c_str(), "x-amz"); + if (k) { + attrs[k] = hi.second; + } + } + } + + const auto acl = rest_obj.acls.get_acl(); + + map > access_map; + + if (!acl_mappings.empty()) { + for (auto& grant : acl.get_grant_map()) { + auto& orig_grantee = grant.first; + auto& perm = grant.second; + + string grantee; + + const auto& am = acl_mappings; + + const auto iter = am.find(orig_grantee); + if (iter == am.end()) { + ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. 
ignoring" << dendl; + continue; + } + + grantee = iter->second.dest_id; + + string type; + + switch (iter->second.type) { + case ACL_TYPE_CANON_USER: + type = "id"; + break; + case ACL_TYPE_EMAIL_USER: + type = "emailAddress"; + break; + case ACL_TYPE_GROUP: + type = "uri"; + break; + default: + continue; + } + + string tv = type + "=" + grantee; + + int flags = perm.get_permission().get_permissions(); + if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) { + access_map[flags].push_back(tv); + continue; + } + + for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) { + if (flags & i) { + access_map[i].push_back(tv); + } + } + } + } + + for (const auto& aiter : access_map) { + int grant_type = aiter.first; + + string header_str("x-amz-grant-"); + + switch (grant_type) { + case RGW_PERM_READ: + header_str.append("read"); + break; + case RGW_PERM_WRITE: + header_str.append("write"); + break; + case RGW_PERM_READ_ACP: + header_str.append("read-acp"); + break; + case RGW_PERM_WRITE_ACP: + header_str.append("write-acp"); + break; + case RGW_PERM_FULL_CONTROL: + header_str.append("full-control"); + break; + } + + string s; + + for (const auto& viter : aiter.second) { + if (!s.empty()) { + s.append(", "); + } + s.append(viter); + } + + ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl; + + attrs[header_str] = s; + } + + /* Copy target storage class */ + if (!target_storage_class.empty()) { + attrs["x-amz-storage-class"] = target_storage_class; + } else { + attrs["x-amz-storage-class"] = "STANDARD"; + } + + /* New attribute to specify its transitioned from RGW */ + attrs["x-amz-meta-rgwx-source"] = "rgw"; + attrs["x-rgw-cloud"] = "true"; + attrs["x-rgw-cloud-keep-attrs"] = "true"; + + char buf[32]; + snprintf(buf, sizeof(buf), "%llu", (long long)obj_properties.versioned_epoch); + attrs["x-amz-meta-rgwx-versioned-epoch"] = buf; + + utime_t ut(obj_properties.mtime); + snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + 
(long long)ut.nsec()); + + attrs["x-amz-meta-rgwx-source-mtime"] = buf; + attrs["x-amz-meta-rgwx-source-etag"] = obj_properties.etag; + attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name; + if (!rest_obj.key.instance.empty()) { + attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance; + } + for (const auto& a : attrs) { + ldpp_dout(dpp, 30) << "init_send_attrs attr[" << a.first << "] = " << a.second <(out_req); + + std::map new_attrs; + if (!multipart.is_multipart) { + init_send_attrs(dpp, rest_obj, obj_properties, new_attrs); + } + + r->set_send_length(rest_obj.content_len); + + RGWAccessControlPolicy policy; + + r->send_ready(dpp, conn.get_key(), new_attrs, policy); +} + +void RGWLCCloudStreamPut::handle_headers(const map& headers) { + for (const auto& h : headers) { + if (h.first == "ETAG") { + etag = h.second; + } + } +} + +bool RGWLCCloudStreamPut::get_etag(string *petag) { + if (etag.empty()) { + return false; + } + *petag = etag; + return true; +} + +void RGWLCCloudStreamPut::set_multipart(const string& upload_id, int part_num, uint64_t part_size) { + multipart.is_multipart = true; + multipart.upload_id = upload_id; + multipart.part_num = part_num; + multipart.part_size = part_size; +} + +int RGWLCCloudStreamPut::send() { + int ret = RGWHTTP::send(out_req); + return ret; +} + +RGWGetDataCB *RGWLCCloudStreamPut::get_cb() { + return out_req->get_out_cb(); +} + +int RGWLCCloudStreamPut::complete_request() { + int ret = conn.complete_request(out_req, etag, &obj_properties.mtime, null_yield); + return ret; +} + +/* Read local copy and write to Cloud endpoint */ +static int cloud_tier_transfer_object(const DoutPrefixProvider* dpp, + RGWLCStreamRead* readf, RGWLCCloudStreamPut* writef) { + std::string url; + bufferlist bl; + bool sent_attrs{false}; + int ret{0}; + off_t ofs; + off_t end; + + ret = readf->init(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: fail to initialize in_crf, ret = " << ret << dendl; + return ret; + } + 
readf->get_range(ofs, end); + rgw_rest_obj& rest_obj = readf->get_rest_obj(); + if (!sent_attrs) { + ret = writef->init(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: fail to initialize out_crf, ret = " << ret << dendl; + return ret; + } + + writef->send_ready(dpp, rest_obj); + ret = writef->send(); + if (ret < 0) { + return ret; + } + sent_attrs = true; + } + + ret = readf->read(ofs, end, writef->get_cb()); + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: fail to read from in_crf, ret = " << ret << dendl; + return ret; + } + + ret = writef->complete_request(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: fail to complete request, ret = " << ret << dendl; + return ret; + } + + return 0; +} + +static int cloud_tier_plain_transfer(RGWLCCloudTierCtx& tier_ctx) { + int ret; + + rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag, + tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings, + tier_ctx.target_storage_class); + std::string target_obj_name; + + rgw_bucket dest_bucket; + dest_bucket.name = tier_ctx.target_bucket_name; + + target_obj_name = tier_ctx.bucket_info.bucket.name + "/" + + tier_ctx.obj->get_name(); + if (!tier_ctx.o.is_current()) { + target_obj_name += get_key_instance(tier_ctx.obj->get_key()); + } + + rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name)); + + tier_ctx.obj->set_atomic(); + + /* Prepare Read from source */ + /* TODO: Define readf, writef as stack variables. For some reason, + * when used as stack variables (esp., readf), the transition seems to + * be taking lot of time eventually erroring out at times. 
+ */ + std::shared_ptr readf; + readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp, + tier_ctx.obj, tier_ctx.o.meta.mtime)); + + std::shared_ptr writef; + writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn, + dest_obj)); + + /* actual Read & Write */ + ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get()); + + return ret; +} + +static int cloud_tier_send_multipart_part(RGWLCCloudTierCtx& tier_ctx, + const std::string& upload_id, + const rgw_lc_multipart_part_info& part_info, + std::string *petag) { + int ret; + + rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag, + tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings, + tier_ctx.target_storage_class); + std::string target_obj_name; + off_t end; + + rgw_bucket dest_bucket; + dest_bucket.name = tier_ctx.target_bucket_name; + + target_obj_name = tier_ctx.bucket_info.bucket.name + "/" + + tier_ctx.obj->get_name(); + if (!tier_ctx.o.is_current()) { + target_obj_name += get_key_instance(tier_ctx.obj->get_key()); + } + + rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name)); + + tier_ctx.obj->set_atomic(); + + /* TODO: Define readf, writef as stack variables. For some reason, + * when used as stack variables (esp., readf), the transition seems to + * be taking lot of time eventually erroring out at times. 
*/ + std::shared_ptr readf; + readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp, + tier_ctx.obj, tier_ctx.o.meta.mtime)); + + std::shared_ptr writef; + writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn, + dest_obj)); + + /* Prepare Read from source */ + end = part_info.ofs + part_info.size - 1; + readf->set_multipart(part_info.size, part_info.ofs, end); + + /* Prepare write */ + writef->set_multipart(upload_id, part_info.part_num, part_info.size); + + /* actual Read & Write */ + ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get()); + if (ret < 0) { + return ret; + } + + if (!(writef->get_etag(petag))) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl; + return -EIO; + } + + return 0; +} + +static int cloud_tier_abort_multipart(const DoutPrefixProvider *dpp, + RGWRESTConn& dest_conn, const rgw_obj& dest_obj, + const std::string& upload_id) { + int ret; + bufferlist out_bl; + bufferlist bl; + rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} }; + + string resource = obj_to_aws_path(dest_obj); + ret = dest_conn.send_resource(dpp, "DELETE", resource, params, nullptr, + out_bl, &bl, nullptr, null_yield); + + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (ret=" << ret << ")" << dendl; + return ret; + } + + return 0; +} + +static int cloud_tier_init_multipart(const DoutPrefixProvider *dpp, + RGWRESTConn& dest_conn, const rgw_obj& dest_obj, + uint64_t obj_size, std::map& attrs, + std::string& upload_id) { + bufferlist out_bl; + bufferlist bl; + + struct InitMultipartResult { + std::string bucket; + std::string key; + std::string upload_id; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Bucket", bucket, obj); + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("UploadId", upload_id, obj); + } + } result; + + int ret; + 
rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} }; + + string resource = obj_to_aws_path(dest_obj); + + ret = dest_conn.send_resource(dpp, "POST", resource, params, &attrs, + out_bl, &bl, nullptr, null_yield); + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl; + return ret; + } + /* + * If one of the following fails we cannot abort upload, as we cannot + * extract the upload id. If one of these fail it's very likely that that's + * the least of our problem. + */ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return -EIO; + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: failed to parse xml initmultipart: " << str << dendl; + return -EIO; + } + + try { + RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl; + return -EIO; + } + + ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl; + + upload_id = result.upload_id; + + return 0; +} + +static int cloud_tier_complete_multipart(const DoutPrefixProvider *dpp, + RGWRESTConn& dest_conn, const rgw_obj& dest_obj, + std::string& upload_id, + const std::map& parts) { + rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} }; + + stringstream ss; + XMLFormatter formatter; + int ret; + + bufferlist bl, out_bl; + string resource = obj_to_aws_path(dest_obj); + + struct CompleteMultipartReq { + std::map parts; + + explicit CompleteMultipartReq(const std::map& _parts) : parts(_parts) {} + + void 
dump_xml(Formatter *f) const { + for (const auto& p : parts) { + f->open_object_section("Part"); + encode_xml("PartNumber", p.first, f); + encode_xml("ETag", p.second.etag, f); + f->close_section(); + }; + } + } req_enc(parts); + + struct CompleteMultipartResult { + std::string location; + std::string bucket; + std::string key; + std::string etag; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Location", bucket, obj); + RGWXMLDecoder::decode_xml("Bucket", bucket, obj); + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("ETag", etag, obj); + } + } result; + + encode_xml("CompleteMultipartUpload", req_enc, &formatter); + + formatter.flush(ss); + bl.append(ss.str()); + + ret = dest_conn.send_resource(dpp, "POST", resource, params, nullptr, + out_bl, &bl, nullptr, null_yield); + + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl; + return ret; + } + /* + * If one of the following fails we cannot abort upload, as we cannot + * extract the upload id. If one of these fail it's very likely that that's + * the least of our problem. 
+ */ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return -EIO; + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: failed to parse xml Completemultipart: " << str << dendl; + return -EIO; + } + + try { + RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl; + return -EIO; + } + + ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl; + + return ret; +} + +static int cloud_tier_abort_multipart_upload(RGWLCCloudTierCtx& tier_ctx, + const rgw_obj& dest_obj, const rgw_raw_obj& status_obj, + const std::string& upload_id) { + int ret; + + ret = cloud_tier_abort_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, upload_id); + + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " ret=" << ret << dendl; + /* ignore error, best effort */ + } + /* remove status obj */ + ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj); + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " ret=" << ret << dendl; + // ignore error, best effort + } + return 0; +} + +static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) { + rgw_obj src_obj; + rgw_obj dest_obj; + + uint64_t obj_size; + std::string src_etag; + rgw_rest_obj rest_obj; + + rgw_lc_multipart_upload_info status; + + std::map new_attrs; + + rgw_raw_obj status_obj; + + RGWBucketInfo b; + std::string target_obj_name; + rgw_bucket 
target_bucket; + + int ret; + + rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag, + tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings, + tier_ctx.target_storage_class); + + uint32_t part_size{0}; + uint32_t num_parts{0}; + + int cur_part{0}; + uint64_t cur_ofs{0}; + std::map parts; + + obj_size = tier_ctx.o.meta.size; + + target_bucket.name = tier_ctx.target_bucket_name; + + target_obj_name = tier_ctx.bucket_info.bucket.name + "/" + + tier_ctx.obj->get_name(); + if (!tier_ctx.o.is_current()) { + target_obj_name += get_key_instance(tier_ctx.obj->get_key()); + } + dest_obj.init(target_bucket, target_obj_name); + + rgw_pool pool = static_cast(tier_ctx.driver)->svc()->zone->get_zone_params().log_pool; + status_obj = rgw_raw_obj(pool, "lc_multipart_" + tier_ctx.obj->get_oid()); + + ret = read_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status); + + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " ret=" << ret << dendl; + return ret; + } + + if (ret >= 0) { + // check here that mtime and size did not change + if (status.mtime != obj_properties.mtime || status.obj_size != obj_size || + status.etag != obj_properties.etag) { + cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id); + ret = -ENOENT; + } + } + + if (ret == -ENOENT) { + RGWLCStreamRead readf(tier_ctx.cct, tier_ctx.dpp, tier_ctx.obj, tier_ctx.o.meta.mtime); + + readf.init(); + + rest_obj = readf.get_rest_obj(); + + RGWLCCloudStreamPut::init_send_attrs(tier_ctx.dpp, rest_obj, obj_properties, new_attrs); + + ret = cloud_tier_init_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, obj_size, new_attrs, status.upload_id); + if (ret < 0) { + return ret; + } + + status.obj_size = obj_size; + status.mtime = obj_properties.mtime; + status.etag = obj_properties.etag; + + ret = put_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status); + + if (ret < 0) { + 
ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to driver multipart upload state, ret=" << ret << dendl; + // continue with upload anyway + } + +#define MULTIPART_MAX_PARTS 10000 +#define MULTIPART_MAX_PARTS 10000 + uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS; + uint64_t min_conf_size = tier_ctx.multipart_min_part_size; + + if (min_conf_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) { + min_conf_size = MULTIPART_MIN_POSSIBLE_PART_SIZE; + } + + part_size = std::max(min_conf_size, min_part_size); + num_parts = (obj_size + part_size - 1) / part_size; + cur_part = 1; + cur_ofs = 0; + } + + for (; (uint32_t)cur_part <= num_parts; ++cur_part) { + ldpp_dout(tier_ctx.dpp, 20) << "cur_part = "<< cur_part << ", info.ofs = " << cur_ofs << ", info.size = " << part_size << ", obj size = " << obj_size<< ", num_parts:" << num_parts << dendl; + rgw_lc_multipart_part_info& cur_part_info = parts[cur_part]; + cur_part_info.part_num = cur_part; + cur_part_info.ofs = cur_ofs; + cur_part_info.size = std::min((uint64_t)part_size, obj_size - cur_ofs); + + cur_ofs += cur_part_info.size; + + ret = cloud_tier_send_multipart_part(tier_ctx, + status.upload_id, + cur_part_info, + &cur_part_info.etag); + + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to send multipart part of obj=" << tier_ctx.obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << cur_part << " (error: " << cpp_strerror(-ret) << ")" << dendl; + cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id); + return ret; + } + + } + + ret = cloud_tier_complete_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, status.upload_id, parts); + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << tier_ctx.obj << " (error: " << cpp_strerror(-ret) << ")" << dendl; + cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id); + return ret; + } + + /* remove status obj */ + ret = 
delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj); + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload obj=" << tier_ctx.obj << " upload_id=" << status.upload_id << " part number " << cur_part << " (" << cpp_strerror(-ret) << ")" << dendl; + // ignore error, best effort + } + return 0; +} + +/* Check if object has already been transitioned */ +static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) { + int ret; + std::map headers; + + /* Fetch Head object */ + ret = cloud_tier_get_object(tier_ctx, true, headers); + + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << ret << dendl; + return ret; + } + + already_tiered = is_already_tiered(tier_ctx.dpp, headers, tier_ctx.o.meta.mtime); + + if (already_tiered) { + ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered true" << dendl; + } else { + ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered false..going with out_crf writing" << dendl; + } + + return ret; +} + +static int cloud_tier_create_bucket(RGWLCCloudTierCtx& tier_ctx) { + bufferlist out_bl; + int ret = 0; + pair key(tier_ctx.storage_class, tier_ctx.target_bucket_name); + struct CreateBucketResult { + std::string code; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Code", code, obj); + } + } result; + + ldpp_dout(tier_ctx.dpp, 30) << "Cloud_tier_ctx: creating bucket:" << tier_ctx.target_bucket_name << dendl; + bufferlist bl; + string resource = tier_ctx.target_bucket_name; + + ret = tier_ctx.conn.send_resource(tier_ctx.dpp, "PUT", resource, nullptr, nullptr, + out_bl, &bl, nullptr, null_yield); + + if (ret < 0 ) { + ldpp_dout(tier_ctx.dpp, 0) << "create target bucket : " << tier_ctx.target_bucket_name << " returned ret:" << ret << dendl; + } + if (out_bl.length() > 0) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize 
xml parser for parsing create_bucket response from server" << dendl; + return -EIO; + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(tier_ctx.dpp, 5) << "ERROR: failed to parse xml createbucket: " << str << dendl; + return -EIO; + } + + try { + RGWXMLDecoder::decode_xml("Error", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(tier_ctx.dpp, 5) << "ERROR: unexpected xml: " << str << dendl; + return -EIO; + } + + if (result.code != "BucketAlreadyOwnedByYou" && result.code != "BucketAlreadyExists") { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: Creating target bucket failed with error: " << result.code << dendl; + return -EIO; + } + } + + return 0; +} + +int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set& cloud_targets) { + int ret = 0; + + // check if target_path is already created + std::set::iterator it; + + it = cloud_targets.find(tier_ctx.target_bucket_name); + tier_ctx.target_bucket_created = (it != cloud_targets.end()); + + /* If run first time attempt to create the target bucket */ + if (!tier_ctx.target_bucket_created) { + ret = cloud_tier_create_bucket(tier_ctx); + + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to create target bucket on the cloud endpoint ret=" << ret << dendl; + return ret; + } + tier_ctx.target_bucket_created = true; + cloud_targets.insert(tier_ctx.target_bucket_name); + } + + /* Since multiple zones may try to transition the same object to the cloud, + * verify if the object is already transitioned. And since its just a best + * effort, do not bail out in case of any errors. 
+ */ + bool already_tiered = false; + ret = cloud_tier_check_object(tier_ctx, already_tiered); + + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to check object on the cloud endpoint ret=" << ret << dendl; + } + + if (already_tiered) { + ldpp_dout(tier_ctx.dpp, 20) << "Object (" << tier_ctx.o.key << ") is already tiered" << dendl; + return 0; + } + + uint64_t size = tier_ctx.o.meta.size; + uint64_t multipart_sync_threshold = tier_ctx.multipart_sync_threshold; + + if (multipart_sync_threshold < MULTIPART_MIN_POSSIBLE_PART_SIZE) { + multipart_sync_threshold = MULTIPART_MIN_POSSIBLE_PART_SIZE; + } + + if (size < multipart_sync_threshold) { + ret = cloud_tier_plain_transfer(tier_ctx); + } else { + tier_ctx.is_multipart_upload = true; + ret = cloud_tier_multipart_transfer(tier_ctx); + } + + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to transition object ret=" << ret << dendl; + } + + return ret; +} diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h new file mode 100644 index 000000000..729c4c304 --- /dev/null +++ b/src/rgw/driver/rados/rgw_lc_tier.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_lc.h" +#include "rgw_rest_conn.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_sal_rados.h" +#include "rgw_cr_rest.h" + +#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024) +#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024) + +struct RGWLCCloudTierCtx { + CephContext *cct; + const DoutPrefixProvider *dpp; + + /* Source */ + rgw_bucket_dir_entry& o; + rgw::sal::Driver *driver; + RGWBucketInfo& bucket_info; + std::string storage_class; + + rgw::sal::Object *obj; + + /* Remote */ + RGWRESTConn& conn; + std::string target_bucket_name; + std::string target_storage_class; + + std::map acl_mappings; + uint64_t multipart_min_part_size; + uint64_t multipart_sync_threshold; + + 
/// Classification of one log shard object as produced by probe_shard():
/// does not exist, OMAP backed, FIFO backed, or unreadable.
enum class shard_check { dne, omap, fifo, corrupt };

/// Stream a shard_check by name; values outside the enumeration are printed
/// as "shard_check::UNKNOWN=<number>".
inline std::ostream& operator <<(std::ostream& m, const shard_check& t) {
  switch (t) {
  case shard_check::dne:
    return m << "shard_check::dne";
  case shard_check::omap:
    return m << "shard_check::omap";
  case shard_check::fifo:
    return m << "shard_check::fifo";
  case shard_check::corrupt:
    return m << "shard_check::corrupt";
  }

  // Only reachable for a value that is not one of the enumerators.
  return m << "shard_check::UNKNOWN=" << static_cast<uint32_t>(t);
}
+shard_check +probe_shard(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + bool& fifo_unsupported, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " probing oid=" << oid + << dendl; + if (!fifo_unsupported) { + std::unique_ptr fifo; + auto r = rgw::cls::fifo::FIFO::open(dpp, ioctx, oid, + &fifo, y, + std::nullopt, true); + switch (r) { + case 0: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid << " is FIFO" + << dendl; + return shard_check::fifo; + + case -ENODATA: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid << " is empty and therefore OMAP" + << dendl; + return shard_check::omap; + + case -ENOENT: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid << " does not exist" + << dendl; + return shard_check::dne; + + case -EPERM: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": FIFO is unsupported, marking." + << dendl; + fifo_unsupported = true; + return shard_check::omap; + + default: + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": error probing: r=" << r + << ", oid=" << oid << dendl; + return shard_check::corrupt; + } + } else { + // Since FIFO is unsupported, OMAP is the only alternative + return shard_check::omap; + } +} + +tl::expected +handle_dne(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, + log_type def, + std::string oid, + bool fifo_unsupported, + optional_yield y) +{ + if (def == log_type::fifo) { + if (fifo_unsupported) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " WARNING: FIFO set as default but not supported by OSD. " + << "Falling back to OMAP." 
<< dendl; + return log_type::omap; + } + std::unique_ptr fifo; + auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, + &fifo, y, + std::nullopt); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " error creating FIFO: r=" << r + << ", oid=" << oid << dendl; + return tl::unexpected(bs::error_code(-r, bs::system_category())); + } + } + return def; +} +} + +tl::expected +log_backing_type(const DoutPrefixProvider *dpp, + librados::IoCtx& ioctx, + log_type def, + int shards, + const fu2::unique_function& get_oid, + optional_yield y) +{ + auto check = shard_check::dne; + bool fifo_unsupported = false; + for (int i = 0; i < shards; ++i) { + auto c = probe_shard(dpp, ioctx, get_oid(i), fifo_unsupported, y); + if (c == shard_check::corrupt) + return tl::unexpected(bs::error_code(EIO, bs::system_category())); + if (c == shard_check::dne) continue; + if (check == shard_check::dne) { + check = c; + continue; + } + + if (check != c) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " clashing types: check=" << check + << ", c=" << c << dendl; + return tl::unexpected(bs::error_code(EIO, bs::system_category())); + } + } + if (check == shard_check::corrupt) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " should be unreachable!" << dendl; + return tl::unexpected(bs::error_code(EIO, bs::system_category())); + } + + if (check == shard_check::dne) + return handle_dne(dpp, ioctx, + def, + get_oid(0), + fifo_unsupported, + y); + + return (check == shard_check::fifo ? 
log_type::fifo : log_type::omap); +} + +bs::error_code log_remove(const DoutPrefixProvider *dpp, + librados::IoCtx& ioctx, + int shards, + const fu2::unique_function& get_oid, + bool leave_zero, + optional_yield y) +{ + bs::error_code ec; + for (int i = 0; i < shards; ++i) { + auto oid = get_oid(i); + rados::cls::fifo::info info; + uint32_t part_header_size = 0, part_entry_overhead = 0; + + auto r = rgw::cls::fifo::get_meta(dpp, ioctx, oid, std::nullopt, &info, + &part_header_size, &part_entry_overhead, + 0, y, true); + if (r == -ENOENT) continue; + if (r == 0 && info.head_part_num > -1) { + for (auto j = info.tail_part_num; j <= info.head_part_num; ++j) { + librados::ObjectWriteOperation op; + op.remove(); + auto part_oid = info.part_oid(j); + auto subr = rgw_rados_operate(dpp, ioctx, part_oid, &op, null_yield); + if (subr < 0 && subr != -ENOENT) { + if (!ec) + ec = bs::error_code(-subr, bs::system_category()); + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed removing FIFO part: part_oid=" << part_oid + << ", subr=" << subr << dendl; + } + } + } + if (r < 0 && r != -ENODATA) { + if (!ec) + ec = bs::error_code(-r, bs::system_category()); + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed checking FIFO part: oid=" << oid + << ", r=" << r << dendl; + } + librados::ObjectWriteOperation op; + if (i == 0 && leave_zero) { + // Leave shard 0 in existence, but remove contents and + // omap. cls_lock stores things in the xattrs. And sync needs to + // rendezvous with locks on generation 0 shard 0. 
+ op.omap_set_header({}); + op.omap_clear(); + op.truncate(0); + } else { + op.remove(); + } + r = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield); + if (r < 0 && r != -ENOENT) { + if (!ec) + ec = bs::error_code(-r, bs::system_category()); + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed removing shard: oid=" << oid + << ", r=" << r << dendl; + } + } + return ec; +} + +logback_generations::~logback_generations() { + if (watchcookie > 0) { + auto cct = static_cast(ioctx.cct()); + auto r = ioctx.unwatch2(watchcookie); + if (r < 0) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed unwatching oid=" << oid + << ", r=" << r << dendl; + } + } +} + +bs::error_code logback_generations::setup(const DoutPrefixProvider *dpp, + log_type def, + optional_yield y) noexcept +{ + try { + // First, read. + auto cct = static_cast(ioctx.cct()); + auto res = read(dpp, y); + if (!res && res.error() != bs::errc::no_such_file_or_directory) { + return res.error(); + } + if (res) { + std::unique_lock lock(m); + std::tie(entries_, version) = std::move(*res); + } else { + // Are we the first? Then create generation 0 and the generations + // metadata. 
+ librados::ObjectWriteOperation op; + auto type = log_backing_type(dpp, ioctx, def, shards, + [this](int shard) { + return this->get_oid(0, shard); + }, y); + if (!type) + return type.error(); + + logback_generation l; + l.type = *type; + + std::unique_lock lock(m); + version.ver = 1; + static constexpr auto TAG_LEN = 24; + version.tag.clear(); + append_rand_alpha(cct, version.tag, version.tag, TAG_LEN); + op.create(true); + cls_version_set(op, version); + cb::list bl; + entries_.emplace(0, std::move(l)); + encode(entries_, bl); + lock.unlock(); + + op.write_full(bl); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed writing oid=" << oid + << ", r=" << r << dendl; + bs::system_error(-r, bs::system_category()); + } + // Did someone race us? Then re-read. + if (r != 0) { + res = read(dpp, y); + if (!res) + return res.error(); + if (res->first.empty()) + return bs::error_code(EIO, bs::system_category()); + auto l = res->first.begin()->second; + // In the unlikely event that someone raced us, created + // generation zero, incremented, then erased generation zero, + // don't leave generation zero lying around. 
+ if (l.gen_id != 0) { + auto ec = log_remove(dpp, ioctx, shards, + [this](int shard) { + return this->get_oid(0, shard); + }, true, y); + if (ec) return ec; + } + std::unique_lock lock(m); + std::tie(entries_, version) = std::move(*res); + } + } + // Pass all non-empty generations to the handler + std::unique_lock lock(m); + auto i = lowest_nomempty(entries_); + entries_t e; + std::copy(i, entries_.cend(), + std::inserter(e, e.end())); + m.unlock(); + auto ec = watch(); + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to re-establish watch, unsafe to continue: oid=" + << oid << ", ec=" << ec.message() << dendl; + } + return handle_init(std::move(e)); + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } +} + +bs::error_code logback_generations::update(const DoutPrefixProvider *dpp, optional_yield y) noexcept +{ + try { + auto res = read(dpp, y); + if (!res) { + return res.error(); + } + + std::unique_lock l(m); + auto& [es, v] = *res; + if (v == version) { + // Nothing to do! + return {}; + } + + // Check consistency and prepare update + if (es.empty()) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Read empty update." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + auto cur_lowest = lowest_nomempty(entries_); + // Straight up can't happen + assert(cur_lowest != entries_.cend()); + auto new_lowest = lowest_nomempty(es); + if (new_lowest == es.cend()) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Read update with no active head." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + if (new_lowest->first < cur_lowest->first) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Tail moved wrong way." 
<< dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + + std::optional highest_empty; + if (new_lowest->first > cur_lowest->first && new_lowest != es.begin()) { + --new_lowest; + highest_empty = new_lowest->first; + } + + entries_t new_entries; + + if ((es.end() - 1)->first < (entries_.end() - 1)->first) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Head moved wrong way." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + + if ((es.end() - 1)->first > (entries_.end() - 1)->first) { + auto ei = es.lower_bound((entries_.end() - 1)->first + 1); + std::copy(ei, es.end(), std::inserter(new_entries, new_entries.end())); + } + + // Everything checks out! + + version = v; + entries_ = es; + l.unlock(); + + if (highest_empty) { + auto ec = handle_empty_to(*highest_empty); + if (ec) return ec; + } + + if (!new_entries.empty()) { + auto ec = handle_new_gens(std::move(new_entries)); + if (ec) return ec; + } + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +auto logback_generations::read(const DoutPrefixProvider *dpp, optional_yield y) noexcept -> + tl::expected, bs::error_code> +{ + try { + librados::ObjectReadOperation op; + std::unique_lock l(m); + cls_version_check(op, version, VER_COND_GE); + l.unlock(); + obj_version v2; + cls_version_read(op, &v2); + cb::list bl; + op.read(0, 0, &bl, nullptr); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y); + if (r < 0) { + if (r == -ENOENT) { + ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid + << " not found" << dendl; + } else { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed reading oid=" << oid + << ", r=" << r << dendl; + } + return tl::unexpected(bs::error_code(-r, bs::system_category())); + } + auto bi = bl.cbegin(); + entries_t e; + try { + decode(e, bi); + } catch (const cb::error& err) { + return 
tl::unexpected(err.code()); + } + return std::pair{ std::move(e), std::move(v2) }; + } catch (const std::bad_alloc&) { + return tl::unexpected(bs::error_code(ENOMEM, bs::system_category())); + } +} + +bs::error_code logback_generations::write(const DoutPrefixProvider *dpp, entries_t&& e, + std::unique_lock&& l_, + optional_yield y) noexcept +{ + auto l = std::move(l_); + ceph_assert(l.mutex() == &m && + l.owns_lock()); + try { + librados::ObjectWriteOperation op; + cls_version_check(op, version, VER_COND_GE); + cb::list bl; + encode(e, bl); + op.write_full(bl); + cls_version_inc(op); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r == 0) { + entries_ = std::move(e); + version.inc(); + return {}; + } + l.unlock(); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed reading oid=" << oid + << ", r=" << r << dendl; + return { -r, bs::system_category() }; + } + if (r == -ECANCELED) { + auto ec = update(dpp, y); + if (ec) { + return ec; + } else { + return { ECANCELED, bs::system_category() }; + } + } + } catch (const std::bad_alloc&) { + return { ENOMEM, bs::system_category() }; + } + return {}; +} + + +bs::error_code logback_generations::watch() noexcept { + try { + auto cct = static_cast(ioctx.cct()); + auto r = ioctx.watch2(oid, &watchcookie, this); + if (r < 0) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to set watch oid=" << oid + << ", r=" << r << dendl; + return { -r, bs::system_category() }; + } + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +bs::error_code logback_generations::new_backing(const DoutPrefixProvider *dpp, + log_type type, + optional_yield y) noexcept { + static constexpr auto max_tries = 10; + try { + auto ec = update(dpp, y); + if (ec) return ec; + auto tries = 0; + entries_t new_entries; + do { + std::unique_lock l(m); + auto last = entries_.end() - 1; + if (last->second.type == 
type) { + // Nothing to be done + return {}; + } + auto newgenid = last->first + 1; + logback_generation newgen; + newgen.gen_id = newgenid; + newgen.type = type; + new_entries.emplace(newgenid, newgen); + auto es = entries_; + es.emplace(newgenid, std::move(newgen)); + ec = write(dpp, std::move(es), std::move(l), y); + ++tries; + } while (ec == bs::errc::operation_canceled && + tries < max_tries); + if (tries >= max_tries) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": exhausted retry attempts." << dendl; + return ec; + } + + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": write failed with ec=" << ec.message() << dendl; + return ec; + } + + cb::list bl, rbl; + + auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": notify failed with r=" << r << dendl; + return { -r, bs::system_category() }; + } + ec = handle_new_gens(new_entries); + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +bs::error_code logback_generations::empty_to(const DoutPrefixProvider *dpp, + uint64_t gen_id, + optional_yield y) noexcept { + static constexpr auto max_tries = 10; + try { + auto ec = update(dpp, y); + if (ec) return ec; + auto tries = 0; + uint64_t newtail = 0; + do { + std::unique_lock l(m); + { + auto last = entries_.end() - 1; + if (gen_id >= last->first) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": Attempt to trim beyond the possible." << dendl; + return bs::error_code(EINVAL, bs::system_category()); + } + } + auto es = entries_; + auto ei = es.upper_bound(gen_id); + if (ei == es.begin()) { + // Nothing to be done. 
+ return {}; + } + for (auto i = es.begin(); i < ei; ++i) { + newtail = i->first; + i->second.pruned = ceph::real_clock::now(); + } + ec = write(dpp, std::move(es), std::move(l), y); + ++tries; + } while (ec == bs::errc::operation_canceled && + tries < max_tries); + if (tries >= max_tries) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": exhausted retry attempts." << dendl; + return ec; + } + + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": write failed with ec=" << ec.message() << dendl; + return ec; + } + + cb::list bl, rbl; + + auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": notify failed with r=" << r << dendl; + return { -r, bs::system_category() }; + } + ec = handle_empty_to(newtail); + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +bs::error_code logback_generations::remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept { + static constexpr auto max_tries = 10; + try { + auto ec = update(dpp, y); + if (ec) return ec; + auto tries = 0; + entries_t new_entries; + std::unique_lock l(m); + ceph_assert(!entries_.empty()); + { + auto i = lowest_nomempty(entries_); + if (i == entries_.begin()) { + return {}; + } + } + entries_t es; + auto now = ceph::real_clock::now(); + l.unlock(); + do { + std::copy_if(entries_.cbegin(), entries_.cend(), + std::inserter(es, es.end()), + [now](const auto& e) { + if (!e.second.pruned) + return false; + + auto pruned = *e.second.pruned; + return (now - pruned) >= 1h; + }); + auto es2 = entries_; + for (const auto& [gen_id, e] : es) { + ceph_assert(e.pruned); + auto ec = log_remove(dpp, ioctx, shards, + [this, gen_id = gen_id](int shard) { + return this->get_oid(gen_id, shard); + }, (gen_id == 0), y); + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": Error 
pruning: gen_id=" << gen_id + << " ec=" << ec.message() << dendl; + } + if (auto i = es2.find(gen_id); i != es2.end()) { + es2.erase(i); + } + } + l.lock(); + es.clear(); + ec = write(dpp, std::move(es2), std::move(l), y); + ++tries; + } while (ec == bs::errc::operation_canceled && + tries < max_tries); + if (tries >= max_tries) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": exhausted retry attempts." << dendl; + return ec; + } + + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": write failed with ec=" << ec.message() << dendl; + return ec; + } + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +void logback_generations::handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + auto cct = static_cast(ioctx.cct()); + const DoutPrefix dp(cct, dout_subsys, "logback generations handle_notify: "); + if (notifier_id != my_id) { + auto ec = update(&dp, null_yield); + if (ec) { + lderr(cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": update failed, no one to report to and no safe way to continue." 
+ << dendl; + abort(); + } + } + cb::list rbl; + ioctx.notify_ack(oid, notify_id, watchcookie, rbl); +} + +void logback_generations::handle_error(uint64_t cookie, int err) { + auto cct = static_cast(ioctx.cct()); + auto r = ioctx.unwatch2(watchcookie); + if (r < 0) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to set unwatch oid=" << oid + << ", r=" << r << dendl; + } + + auto ec = watch(); + if (ec) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to re-establish watch, unsafe to continue: oid=" + << oid << ", ec=" << ec.message() << dendl; + } +} diff --git a/src/rgw/driver/rados/rgw_log_backing.h b/src/rgw/driver/rados/rgw_log_backing.h new file mode 100644 index 000000000..3dfdb8ee4 --- /dev/null +++ b/src/rgw/driver/rados/rgw_log_backing.h @@ -0,0 +1,394 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#include "include/rados/librados.hpp" +#include "include/encoding.h" +#include "include/expected.hpp" +#include "include/function2.hpp" + +#include "cls/version/cls_version_types.h" + +#include "common/async/yield_context.h" +#include "common/Formatter.h" +#include "common/strtol.h" + +namespace bc = boost::container; +namespace bs = boost::system; + +#include "cls_fifo_legacy.h" + +/// Type of log backing, stored in the mark used in the quick check, +/// and passed to checking functions. 
/// Type of log backing, stored in the mark used in the quick check,
/// and passed to checking functions.
enum class log_type {
  omap = 0,
  fifo = 1
};

/// Parse a backing type name, ignoring case.
///
/// @return log_type::omap for "omap", log_type::fifo for "fifo", and
///         std::nullopt for anything else, including the empty string and
///         partial names such as "o" or "fi"
inline std::optional<log_type> to_log_type(std::string_view s) {
  // Lengths must match: strncasecmp() alone compares only the first
  // s.length() characters, which accepts "" and every prefix of the name.
  const auto matches = [s](std::string_view name) {
    return s.length() == name.length() &&
      strncasecmp(s.data(), name.data(), name.length()) == 0;
  };
  if (matches("omap")) {
    return log_type::omap;
  } else if (matches("fifo")) {
    return log_type::fifo;
  } else {
    return std::nullopt;
  }
}

/// Stream a log_type by name; values outside the enumeration are printed as
/// "log_type::UNKNOWN=<number>".
inline std::ostream& operator <<(std::ostream& m, const log_type& t) {
  switch (t) {
  case log_type::omap:
    return m << "log_type::omap";
  case log_type::fifo:
    return m << "log_type::fifo";
  }

  return m << "log_type::UNKNOWN=" << static_cast<uint32_t>(t);
}
+ const fu2::unique_function& get_oid, + bool leave_zero, + optional_yield y); + + +struct logback_generation { + uint64_t gen_id = 0; + log_type type; + std::optional pruned; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(gen_id, bl); + encode(type, bl); + encode(pruned, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(gen_id, bl); + decode(type, bl); + decode(pruned, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(logback_generation) +inline std::ostream& operator <<(std::ostream& m, const logback_generation& g) { + return m << "[" << g.gen_id << "," << g.type << "," + << (g.pruned ? "PRUNED" : "NOT PRUNED") << "]"; +} + +class logback_generations : public librados::WatchCtx2 { +public: + using entries_t = bc::flat_map; + +protected: + librados::IoCtx& ioctx; + logback_generations(librados::IoCtx& ioctx, + std::string oid, + fu2::unique_function&& get_oid, + int shards) noexcept + : ioctx(ioctx), oid(oid), get_oid(std::move(get_oid)), + shards(shards) {} + + uint64_t my_id = ioctx.get_instance_id(); + +private: + const std::string oid; + const fu2::unique_function get_oid; + +protected: + const int shards; + +private: + + uint64_t watchcookie = 0; + + obj_version version; + std::mutex m; + entries_t entries_; + + tl::expected, bs::error_code> + read(const DoutPrefixProvider *dpp, optional_yield y) noexcept; + bs::error_code write(const DoutPrefixProvider *dpp, entries_t&& e, std::unique_lock&& l_, + optional_yield y) noexcept; + bs::error_code setup(const DoutPrefixProvider *dpp, log_type def, optional_yield y) noexcept; + + bs::error_code watch() noexcept; + + auto lowest_nomempty(const entries_t& es) { + return std::find_if(es.begin(), es.end(), + [](const auto& e) { + return !e.second.pruned; + }); + } + +public: + + /// For the use of watch/notify. 
+ + void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) override final; + + void handle_error(uint64_t cookie, int err) override final; + + /// Public interface + + virtual ~logback_generations(); + + template + static tl::expected, bs::error_code> + init(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx_, std::string oid_, + fu2::unique_function&& get_oid_, + int shards_, log_type def, optional_yield y, + Args&& ...args) noexcept { + try { + T* lgp = new T(ioctx_, std::move(oid_), + std::move(get_oid_), + shards_, std::forward(args)...); + std::unique_ptr lg(lgp); + lgp = nullptr; + auto ec = lg->setup(dpp, def, y); + if (ec) + return tl::unexpected(ec); + // Obnoxiousness for C++ Compiler in Bionic Beaver + return tl::expected, bs::error_code>(std::move(lg)); + } catch (const std::bad_alloc&) { + return tl::unexpected(bs::error_code(ENOMEM, bs::system_category())); + } + } + + bs::error_code update(const DoutPrefixProvider *dpp, optional_yield y) noexcept; + + entries_t entries() const { + return entries_; + } + + bs::error_code new_backing(const DoutPrefixProvider *dpp, log_type type, optional_yield y) noexcept; + + bs::error_code empty_to(const DoutPrefixProvider *dpp, uint64_t gen_id, optional_yield y) noexcept; + + bs::error_code remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept; + + // Callbacks, to be defined by descendant. + + /// Handle initialization on startup + /// + /// @param e All non-empty generations + virtual bs::error_code handle_init(entries_t e) noexcept = 0; + + /// Handle new generations. 
+ /// + /// @param e Map of generations added since last update + virtual bs::error_code handle_new_gens(entries_t e) noexcept = 0; + + /// Handle generations being marked empty + /// + /// @param new_tail Lowest non-empty generation + virtual bs::error_code handle_empty_to(uint64_t new_tail) noexcept = 0; +}; + +inline std::string gencursor(uint64_t gen_id, std::string_view cursor) { + return (gen_id > 0 ? + fmt::format("G{:0>20}@{}", gen_id, cursor) : + std::string(cursor)); +} + +inline std::pair +cursorgen(std::string_view cursor_) { + if (cursor_.empty()) { + return { 0, "" }; + } + std::string_view cursor = cursor_; + if (cursor[0] != 'G') { + return { 0, cursor }; + } + cursor.remove_prefix(1); + auto gen_id = ceph::consume(cursor); + if (!gen_id || cursor[0] != '@') { + return { 0, cursor_ }; + } + cursor.remove_prefix(1); + return { *gen_id, cursor }; +} + +class LazyFIFO { + librados::IoCtx& ioctx; + std::string oid; + std::mutex m; + std::unique_ptr fifo; + + int lazy_init(const DoutPrefixProvider *dpp, optional_yield y) { + std::unique_lock l(m); + if (fifo) return 0; + auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, &fifo, y); + if (r) { + fifo.reset(); + } + return r; + } + +public: + + LazyFIFO(librados::IoCtx& ioctx, std::string oid) + : ioctx(ioctx), oid(std::move(oid)) {} + + int read_meta(const DoutPrefixProvider *dpp, optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + return fifo->read_meta(dpp, y); + } + + int meta(const DoutPrefixProvider *dpp, rados::cls::fifo::info& info, optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + info = fifo->meta(); + return 0; + } + + int get_part_layout_info(const DoutPrefixProvider *dpp, + std::uint32_t& part_header_size, + std::uint32_t& part_entry_overhead, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + std::tie(part_header_size, part_entry_overhead) + = fifo->get_part_layout_info(); + return 0; + } + + int push(const 
DoutPrefixProvider *dpp, + const ceph::buffer::list& bl, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + return fifo->push(dpp, bl, y); + } + + int push(const DoutPrefixProvider *dpp, + ceph::buffer::list& bl, + librados::AioCompletion* c, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + fifo->push(dpp, bl, c); + return 0; + } + + int push(const DoutPrefixProvider *dpp, + const std::vector& data_bufs, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + return fifo->push(dpp, data_bufs, y); + } + + int push(const DoutPrefixProvider *dpp, + const std::vector& data_bufs, + librados::AioCompletion* c, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + fifo->push(dpp, data_bufs, c); + return 0; + } + + int list(const DoutPrefixProvider *dpp, + int max_entries, std::optional markstr, + std::vector* out, + bool* more, optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + return fifo->list(dpp, max_entries, markstr, out, more, y); + } + + int list(const DoutPrefixProvider *dpp, int max_entries, std::optional markstr, + std::vector* out, bool* more, + librados::AioCompletion* c, optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + fifo->list(dpp, max_entries, markstr, out, more, c); + return 0; + } + + int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + return fifo->trim(dpp, markstr, exclusive, y); + } + + int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, librados::AioCompletion* c, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) return r; + fifo->trim(dpp, markstr, exclusive, c); + return 0; + } + + int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header, + optional_yield y) { + auto r = lazy_init(dpp, y); + if (r < 0) 
/// Build the object name of an explicitly numbered shard.
///
/// @param prefix    common name prefix
/// @param shard_id  shard number, appended in decimal
/// @param name      receives prefix followed by shard_id
void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name)
{
  name = prefix + std::to_string(shard_id);
}
max_entries, entries, ctx->marker, + &next_marker, truncated, null_yield); + if ((ret < 0) && (ret != -ENOENT)) + return ret; + + ctx->marker = std::move(next_marker); + if (last_marker) { + *last_marker = ctx->marker; + } + + if (ret == -ENOENT) + *truncated = false; + + return 0; +} + +int RGWMetadataLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info) +{ + string oid; + get_shard_oid(shard_id, oid); + + cls_log_header header; + + int ret = svc.cls->timelog.info(dpp, oid, &header, null_yield); + if ((ret < 0) && (ret != -ENOENT)) + return ret; + + info->marker = header.max_marker; + info->last_update = header.max_time.to_real_time(); + + return 0; +} + +static void _mdlog_info_completion(librados::completion_t cb, void *arg) +{ + auto infoc = static_cast(arg); + infoc->finish(cb); + infoc->put(); // drop the ref from get_info_async() +} + +RGWMetadataLogInfoCompletion::RGWMetadataLogInfoCompletion(info_callback_t cb) + : completion(librados::Rados::aio_create_completion((void *)this, + _mdlog_info_completion)), + callback(cb) +{ +} + +RGWMetadataLogInfoCompletion::~RGWMetadataLogInfoCompletion() +{ + completion->release(); +} + +int RGWMetadataLog::get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion) +{ + string oid; + get_shard_oid(shard_id, oid); + + completion->get(); // hold a ref until the completion fires + + return svc.cls->timelog.info_async(dpp, completion->get_io_obj(), oid, + &completion->get_header(), + completion->get_completion()); +} + +int RGWMetadataLog::trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time, + const string& start_marker, const string& end_marker) +{ + string oid; + get_shard_oid(shard_id, oid); + + return svc.cls->timelog.trim(dpp, oid, from_time, end_time, start_marker, + end_marker, nullptr, null_yield); +} + +int RGWMetadataLog::lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan 
duration, string& zone_id, string& owner_id) { + string oid; + get_shard_oid(shard_id, oid); + + return svc.cls->lock.lock_exclusive(dpp, svc.zone->get_zone_params().log_pool, oid, duration, zone_id, owner_id); +} + +int RGWMetadataLog::unlock(const DoutPrefixProvider *dpp, int shard_id, string& zone_id, string& owner_id) { + string oid; + get_shard_oid(shard_id, oid); + + return svc.cls->lock.unlock(dpp, svc.zone->get_zone_params().log_pool, oid, zone_id, owner_id); +} + +void RGWMetadataLog::mark_modified(int shard_id) +{ + lock.get_read(); + if (modified_shards.find(shard_id) != modified_shards.end()) { + lock.unlock(); + return; + } + lock.unlock(); + + std::unique_lock wl{lock}; + modified_shards.insert(shard_id); +} + +void RGWMetadataLog::read_clear_modified(set &modified) +{ + std::unique_lock wl{lock}; + modified.swap(modified_shards); + modified_shards.clear(); +} + +void RGWMetadataLogInfo::dump(Formatter *f) const +{ + encode_json("marker", marker, f); + utime_t ut(last_update); + encode_json("last_update", ut, f); +} + +void RGWMetadataLogInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("marker", marker, obj); + utime_t ut; + JSONDecoder::decode_json("last_update", ut, obj); + last_update = ut.to_real_time(); +} + diff --git a/src/rgw/driver/rados/rgw_metadata.h b/src/rgw/driver/rados/rgw_metadata.h new file mode 100644 index 000000000..c83db7c40 --- /dev/null +++ b/src/rgw/driver/rados/rgw_metadata.h @@ -0,0 +1,298 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "include/types.h" +#include "rgw_common.h" +#include "rgw_period_history.h" +#include "rgw_mdlog_types.h" +#include "cls/version/cls_version_types.h" +#include "cls/log/cls_log_types.h" +#include "common/RefCountedObj.h" +#include "common/ceph_time.h" +#include "services/svc_meta_be.h" +#include "rgw_sal_fwd.h" + + +class RGWCoroutine; +class JSONObj; +struct 
RGWObjVersionTracker; + +struct obj_version; + + +class RGWMetadataObject { +protected: + obj_version objv; + ceph::real_time mtime; + std::map *pattrs{nullptr}; + +public: + RGWMetadataObject() {} + RGWMetadataObject(const obj_version& v, + real_time m) : objv(v), mtime(m) {} + virtual ~RGWMetadataObject() {} + obj_version& get_version(); + real_time& get_mtime() { return mtime; } + void set_pattrs(std::map *_pattrs) { + pattrs = _pattrs; + } + std::map *get_pattrs() { + return pattrs; + } + + virtual void dump(Formatter *f) const {} +}; + +class RGWMetadataManager; + +class RGWMetadataHandler { + friend class RGWMetadataManager; + +protected: + CephContext *cct; + +public: + RGWMetadataHandler() {} + virtual ~RGWMetadataHandler(); + virtual std::string get_type() = 0; + + void base_init(CephContext *_cct) { + cct = _cct; + } + + virtual RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) = 0; + + virtual int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) = 0; + virtual int put(std::string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, + bool from_remote_zone) = 0; + virtual int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) = 0; + + virtual int mutate(const std::string& entry, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogStatus op_type, + std::function f) = 0; + + virtual int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) = 0; + virtual int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list& keys, bool *truncated) = 0; + virtual void list_keys_complete(void *handle) = 0; + + virtual std::string get_marker(void *handle) = 0; + + virtual int 
get_shard_id(const std::string& entry, int *shard_id) { + *shard_id = 0; + return 0; + } + virtual int attach(RGWMetadataManager *manager); +}; + +class RGWMetadataHandler_GenericMetaBE : public RGWMetadataHandler { + friend class RGWSI_MetaBackend; + friend class RGWMetadataManager; + friend class Put; + +public: + class Put; + +protected: + RGWSI_MetaBackend_Handler *be_handler; + + virtual int do_get(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) = 0; + virtual int do_put(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, optional_yield y, + const DoutPrefixProvider *dpp, RGWMDLogSyncType type, + bool from_remote_zone) = 0; + virtual int do_put_operate(Put *put_op, const DoutPrefixProvider *dpp); + virtual int do_remove(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0; + +public: + RGWMetadataHandler_GenericMetaBE() {} + + void base_init(CephContext *_cct, + RGWSI_MetaBackend_Handler *_be_handler) { + RGWMetadataHandler::base_init(_cct); + be_handler = _be_handler; + } + + RGWSI_MetaBackend_Handler *get_be_handler() { + return be_handler; + } + + class Put { + protected: + RGWMetadataHandler_GenericMetaBE *handler; + RGWSI_MetaBackend_Handler::Op *op; + std::string& entry; + RGWMetadataObject *obj; + RGWObjVersionTracker& objv_tracker; + RGWMDLogSyncType apply_type; + optional_yield y; + bool from_remote_zone{false}; + + int get(RGWMetadataObject **obj, const DoutPrefixProvider *dpp) { + return handler->do_get(op, entry, obj, y, dpp); + } + public: + Put(RGWMetadataHandler_GenericMetaBE *_handler, RGWSI_MetaBackend_Handler::Op *_op, + std::string& _entry, RGWMetadataObject *_obj, + RGWObjVersionTracker& _objv_tracker, optional_yield _y, + RGWMDLogSyncType _type, bool from_remote_zone); + + virtual ~Put() {} + + virtual int 
put_pre(const DoutPrefixProvider *dpp) { + return 0; + } + virtual int put(const DoutPrefixProvider *dpp) { + return 0; + } + virtual int put_post(const DoutPrefixProvider *dpp) { + return 0; + } + virtual int finalize() { + return 0; + } + }; + + int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) override; + int put(std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override; + int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) override; + + int mutate(const std::string& entry, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogStatus op_type, + std::function f) override; + + int get_shard_id(const std::string& entry, int *shard_id) override; + + int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) override; + int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list& keys, bool *truncated) override; + void list_keys_complete(void *handle) override; + + std::string get_marker(void *handle) override; + + /** + * Compare an incoming versus on-disk tag/version+mtime combo against + * the sync mode to see if the new one should replace the on-disk one. + * + * @return true if the update should proceed, false otherwise. 
+ */ + static bool check_versions(bool exists, + const obj_version& ondisk, const real_time& ondisk_time, + const obj_version& incoming, const real_time& incoming_time, + RGWMDLogSyncType sync_mode) { + switch (sync_mode) { + case APPLY_UPDATES: + if ((ondisk.tag != incoming.tag) || + (ondisk.ver >= incoming.ver)) + return false; + break; + case APPLY_NEWER: + if (ondisk_time >= incoming_time) + return false; + break; + case APPLY_EXCLUSIVE: + if (exists) + return false; + break; + case APPLY_ALWAYS: //deliberate fall-thru -- we always apply! + default: break; + } + return true; + } +}; + +class RGWMetadataTopHandler; + +class RGWMetadataManager { + friend class RGWMetadataHandler; + + CephContext *cct; + RGWSI_Meta *meta_svc; + std::map handlers; + std::unique_ptr md_top_handler; + + int find_handler(const std::string& metadata_key, RGWMetadataHandler **handler, std::string& entry); + int register_handler(RGWMetadataHandler *handler); + +public: + RGWMetadataManager(RGWSI_Meta *_meta_svc); + ~RGWMetadataManager(); + + RGWMetadataHandler *get_handler(const std::string& type); + + int get(std::string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp); + int put(std::string& metadata_key, bufferlist& bl, optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType sync_mode, + bool from_remote_zone, + obj_version *existing_version = NULL); + int remove(std::string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp); + + int mutate(const std::string& metadata_key, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogStatus op_type, + std::function f); + + int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, void **phandle); + int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void **phandle); + int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int 
max, std::list& keys, bool *truncated); + void list_keys_complete(void *handle); + + std::string get_marker(void *handle); + + void dump_log_entry(cls_log_entry& entry, Formatter *f); + + void get_sections(std::list& sections); + + void parse_metadata_key(const std::string& metadata_key, std::string& type, std::string& entry); + + int get_shard_id(const std::string& section, const std::string& key, int *shard_id); +}; + +class RGWMetadataHandlerPut_SObj : public RGWMetadataHandler_GenericMetaBE::Put +{ +protected: + std::unique_ptr oo; + RGWMetadataObject *old_obj{nullptr}; + bool exists{false}; + +public: + RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler, RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, + RGWMDLogSyncType type, bool from_remote_zone); + ~RGWMetadataHandlerPut_SObj(); + + int put_pre(const DoutPrefixProvider *dpp) override; + int put(const DoutPrefixProvider *dpp) override; + virtual int put_check(const DoutPrefixProvider *dpp) { + return 0; + } + virtual int put_checked(const DoutPrefixProvider *dpp); + virtual void encode_obj(bufferlist *bl) {} +}; + +void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id); +void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name); +void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name); + diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc new file mode 100644 index 000000000..b1835016e --- /dev/null +++ b/src/rgw/driver/rados/rgw_notify.cc @@ -0,0 +1,1023 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_notify.h" +#include "cls/2pc_queue/cls_2pc_queue_client.h" +#include "cls/lock/cls_lock_client.h" +#include +#include 
+#include +#include +#include "rgw_sal_rados.h" +#include "rgw_pubsub.h" +#include "rgw_pubsub_push.h" +#include "rgw_perf_counters.h" +#include "common/dout.h" +#include + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::notify { + +struct event_entry_t { + rgw_pubsub_s3_event event; + std::string push_endpoint; + std::string push_endpoint_args; + std::string arn_topic; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(event, bl); + encode(push_endpoint, bl); + encode(push_endpoint_args, bl); + encode(arn_topic, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(event, bl); + decode(push_endpoint, bl); + decode(push_endpoint_args, bl); + decode(arn_topic, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(event_entry_t) + +using queues_t = std::set; + +// use mmap/mprotect to allocate 128k coroutine stacks +auto make_stack_allocator() { + return boost::context::protected_fixedsize_stack{128*1024}; +} + +const std::string Q_LIST_OBJECT_NAME = "queues_list_object"; + +class Manager : public DoutPrefixProvider { + const size_t max_queue_size; + const uint32_t queues_update_period_ms; + const uint32_t queues_update_retry_ms; + const uint32_t queue_idle_sleep_us; + const utime_t failover_time; + CephContext* const cct; + static constexpr auto COOKIE_LEN = 16; + const std::string lock_cookie; + boost::asio::io_context io_context; + boost::asio::executor_work_guard work_guard; + const uint32_t worker_count; + std::vector workers; + const uint32_t stale_reservations_period_s; + const uint32_t reservations_cleanup_period_s; +public: + librados::IoCtx& rados_ioctx; +private: + + CephContext *get_cct() const override { return cct; } + unsigned get_subsys() const override { return dout_subsys; } + std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw notify: "; } + + // read the list of queues from the queue list object + int read_queue_list(queues_t& 
queues, optional_yield y) { + constexpr auto max_chunk = 1024U; + std::string start_after; + bool more = true; + int rval; + while (more) { + librados::ObjectReadOperation op; + queues_t queues_chunk; + op.omap_get_keys2(start_after, max_chunk, &queues_chunk, &more, &rval); + const auto ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, nullptr, y); + if (ret == -ENOENT) { + // queue list object was not created - nothing to do + return 0; + } + if (ret < 0) { + // TODO: do we need to check on rval as well as ret? + ldpp_dout(this, 1) << "ERROR: failed to read queue list. error: " << ret << dendl; + return ret; + } + queues.merge(queues_chunk); + } + return 0; + } + + // set m1 to be the minimum between m1 and m2 + static int set_min_marker(std::string& m1, const std::string m2) { + cls_queue_marker mr1; + cls_queue_marker mr2; + if (mr1.from_str(m1.c_str()) < 0 || mr2.from_str(m2.c_str()) < 0) { + return -EINVAL; + } + if (mr2.gen <= mr1.gen && mr2.offset < mr1.offset) { + m1 = m2; + } + return 0; + } + + using Clock = ceph::coarse_mono_clock; + using Executor = boost::asio::io_context::executor_type; + using Timer = boost::asio::basic_waitable_timer, Executor>; + + class tokens_waiter { + const std::chrono::hours infinite_duration; + size_t pending_tokens; + Timer timer; + + struct token { + tokens_waiter& waiter; + token(tokens_waiter& _waiter) : waiter(_waiter) { + ++waiter.pending_tokens; + } + + ~token() { + --waiter.pending_tokens; + if (waiter.pending_tokens == 0) { + waiter.timer.cancel(); + } + } + }; + + public: + + tokens_waiter(boost::asio::io_context& io_context) : + infinite_duration(1000), + pending_tokens(0), + timer(io_context) {} + + void async_wait(yield_context yield) { + if (pending_tokens == 0) { + return; + } + timer.expires_from_now(infinite_duration); + boost::system::error_code ec; + timer.async_wait(yield[ec]); + ceph_assert(ec == boost::system::errc::operation_canceled); + } + + token make_token() { + return 
token(*this); + } + }; + + // processing of a specific entry + // return whether processing was successfull (true) or not (false) + bool process_entry(const cls_queue_entry& entry, yield_context yield) { + event_entry_t event_entry; + auto iter = entry.data.cbegin(); + try { + decode(event_entry, iter); + } catch (buffer::error& err) { + ldpp_dout(this, 5) << "WARNING: failed to decode entry. error: " << err.what() << dendl; + return false; + } + try { + // TODO move endpoint creation to queue level + const auto push_endpoint = RGWPubSubEndpoint::create(event_entry.push_endpoint, event_entry.arn_topic, + RGWHTTPArgs(event_entry.push_endpoint_args, this), + cct); + ldpp_dout(this, 20) << "INFO: push endpoint created: " << event_entry.push_endpoint << + " for entry: " << entry.marker << dendl; + const auto ret = push_endpoint->send_to_completion_async(cct, event_entry.event, optional_yield(io_context, yield)); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint + << " failed. error: " << ret << " (will retry)" << dendl; + return false; + } else { + ldpp_dout(this, 20) << "INFO: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint + << " ok" << dendl; + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok); + return true; + } + } catch (const RGWPubSubEndpoint::configuration_error& e) { + ldpp_dout(this, 5) << "WARNING: failed to create push endpoint: " + << event_entry.push_endpoint << " for entry: " << entry.marker << ". 
error: " << e.what() << " (will retry) " << dendl; + return false; + } + } + + // clean stale reservation from queue + void cleanup_queue(const std::string& queue_name, yield_context yield) { + while (true) { + ldpp_dout(this, 20) << "INFO: trying to perform stale reservation cleanup for queue: " << queue_name << dendl; + const auto now = ceph::coarse_real_time::clock::now(); + const auto stale_time = now - std::chrono::seconds(stale_reservations_period_s); + librados::ObjectWriteOperation op; + op.assert_exists(); + rados::cls::lock::assert_locked(&op, queue_name+"_lock", + ClsLockType::EXCLUSIVE, + lock_cookie, + "" /*no tag*/); + cls_2pc_queue_expire_reservations(op, stale_time); + // check ownership and do reservation cleanup in one batch + auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield)); + if (ret == -ENOENT) { + // queue was deleted + ldpp_dout(this, 5) << "INFO: queue: " + << queue_name << ". was removed. cleanup will stop" << dendl; + return; + } + if (ret == -EBUSY) { + ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl; + return; + } + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: failed to cleanup stale reservation from queue and/or lock queue: " << queue_name + << ". 
error: " << ret << dendl; + } + Timer timer(io_context); + timer.expires_from_now(std::chrono::seconds(reservations_cleanup_period_s)); + boost::system::error_code ec; + timer.async_wait(yield[ec]); + } + } + + // processing of a specific queue + void process_queue(const std::string& queue_name, yield_context yield) { + constexpr auto max_elements = 1024; + auto is_idle = false; + const std::string start_marker; + + // start a the cleanup coroutine for the queue + spawn::spawn(io_context, [this, queue_name](yield_context yield) { + cleanup_queue(queue_name, yield); + }, make_stack_allocator()); + + while (true) { + // if queue was empty the last time, sleep for idle timeout + if (is_idle) { + Timer timer(io_context); + timer.expires_from_now(std::chrono::microseconds(queue_idle_sleep_us)); + boost::system::error_code ec; + timer.async_wait(yield[ec]); + } + + // get list of entries in the queue + is_idle = true; + bool truncated = false; + std::string end_marker; + std::vector entries; + auto total_entries = 0U; + { + librados::ObjectReadOperation op; + op.assert_exists(); + bufferlist obl; + int rval; + rados::cls::lock::assert_locked(&op, queue_name+"_lock", + ClsLockType::EXCLUSIVE, + lock_cookie, + "" /*no tag*/); + cls_2pc_queue_list_entries(op, start_marker, max_elements, &obl, &rval); + // check ownership and list entries in one batch + auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, nullptr, optional_yield(io_context, yield)); + if (ret == -ENOENT) { + // queue was deleted + ldpp_dout(this, 5) << "INFO: queue: " + << queue_name << ". was removed. processing will stop" << dendl; + return; + } + if (ret == -EBUSY) { + ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl; + return; + } + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: failed to get list of entries in queue and/or lock queue: " + << queue_name << ". 
error: " << ret << " (will retry)" << dendl; + continue; + } + ret = cls_2pc_queue_list_entries_result(obl, entries, &truncated, end_marker); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: failed to parse list of entries in queue: " + << queue_name << ". error: " << ret << " (will retry)" << dendl; + continue; + } + } + total_entries = entries.size(); + if (total_entries == 0) { + // nothing in the queue + continue; + } + // log when queue is not idle + ldpp_dout(this, 20) << "INFO: found: " << total_entries << " entries in: " << queue_name << + ". end marker is: " << end_marker << dendl; + + is_idle = false; + auto has_error = false; + auto remove_entries = false; + auto entry_idx = 1U; + tokens_waiter waiter(io_context); + for (auto& entry : entries) { + if (has_error) { + // bail out on first error + break; + } + // TODO pass entry pointer instead of by-value + spawn::spawn(yield, [this, &queue_name, entry_idx, total_entries, &end_marker, &remove_entries, &has_error, &waiter, entry](yield_context yield) { + const auto token = waiter.make_token(); + if (process_entry(entry, yield)) { + ldpp_dout(this, 20) << "INFO: processing of entry: " << + entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " ok" << dendl; + remove_entries = true; + } else { + if (set_min_marker(end_marker, entry.marker) < 0) { + ldpp_dout(this, 1) << "ERROR: cannot determin minimum between malformed markers: " << end_marker << ", " << entry.marker << dendl; + } else { + ldpp_dout(this, 20) << "INFO: new end marker for removal: " << end_marker << " from: " << queue_name << dendl; + } + has_error = true; + ldpp_dout(this, 20) << "INFO: processing of entry: " << + entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " failed" << dendl; + } + }, make_stack_allocator()); + ++entry_idx; + } + + // wait for all pending work to finish + waiter.async_wait(yield); + + // delete all published entries from queue + if 
(remove_entries) { + librados::ObjectWriteOperation op; + op.assert_exists(); + rados::cls::lock::assert_locked(&op, queue_name+"_lock", + ClsLockType::EXCLUSIVE, + lock_cookie, + "" /*no tag*/); + cls_2pc_queue_remove_entries(op, end_marker); + // check ownership and deleted entries in one batch + const auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield)); + if (ret == -ENOENT) { + // queue was deleted + ldpp_dout(this, 5) << "INFO: queue: " + << queue_name << ". was removed. processing will stop" << dendl; + return; + } + if (ret == -EBUSY) { + ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl; + return; + } + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to remove entries and/or lock queue up to: " << end_marker << " from queue: " + << queue_name << ". error: " << ret << dendl; + } else { + ldpp_dout(this, 20) << "INFO: removed entries up to: " << end_marker << " from queue: " + << queue_name << dendl; + } + } + } + } + + // lits of owned queues + using owned_queues_t = std::unordered_set; + + // process all queues + // find which of the queues is owned by this daemon and process it + void process_queues(yield_context yield) { + auto has_error = false; + owned_queues_t owned_queues; + + // add randomness to the duration between queue checking + // to make sure that different daemons are not synced + std::random_device seed; + std::mt19937 rnd_gen(seed()); + const auto min_jitter = 100; // ms + const auto max_jitter = 500; // ms + std::uniform_int_distribution<> duration_jitter(min_jitter, max_jitter); + + std::vector queue_gc; + std::mutex queue_gc_lock; + while (true) { + Timer timer(io_context); + const auto duration = (has_error ? 
+ std::chrono::milliseconds(queues_update_retry_ms) : std::chrono::milliseconds(queues_update_period_ms)) + + std::chrono::milliseconds(duration_jitter(rnd_gen)); + timer.expires_from_now(duration); + const auto tp = ceph::coarse_real_time::clock::to_time_t(ceph::coarse_real_time::clock::now() + duration); + ldpp_dout(this, 20) << "INFO: next queues processing will happen at: " << std::ctime(&tp) << dendl; + boost::system::error_code ec; + timer.async_wait(yield[ec]); + + queues_t queues; + auto ret = read_queue_list(queues, optional_yield(io_context, yield)); + if (ret < 0) { + has_error = true; + continue; + } + + for (const auto& queue_name : queues) { + // try to lock the queue to check if it is owned by this rgw + // or if ownershif needs to be taken + librados::ObjectWriteOperation op; + op.assert_exists(); + rados::cls::lock::lock(&op, queue_name+"_lock", + ClsLockType::EXCLUSIVE, + lock_cookie, + "" /*no tag*/, + "" /*no description*/, + failover_time, + LOCK_FLAG_MAY_RENEW); + + ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield)); + if (ret == -EBUSY) { + // lock is already taken by another RGW + ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " owned (locked) by another daemon" << dendl; + // if queue was owned by this RGW, processing should be stopped, queue would be deleted from list afterwards + continue; + } + if (ret == -ENOENT) { + // queue is deleted - processing will stop the next time we try to read from the queue + ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " should not be locked - already deleted" << dendl; + continue; + } + if (ret < 0) { + // failed to lock for another reason, continue to process other queues + ldpp_dout(this, 1) << "ERROR: failed to lock queue: " << queue_name << ". 
error: " << ret << dendl; + has_error = true; + continue; + } + // add queue to list of owned queues + if (owned_queues.insert(queue_name).second) { + ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " now owned (locked) by this daemon" << dendl; + // start processing this queue + spawn::spawn(io_context, [this, &queue_gc, &queue_gc_lock, queue_name](yield_context yield) { + process_queue(queue_name, yield); + // if queue processing ended, it measn that the queue was removed or not owned anymore + // mark it for deletion + std::lock_guard lock_guard(queue_gc_lock); + queue_gc.push_back(queue_name); + ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " marked for removal" << dendl; + }, make_stack_allocator()); + } else { + ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " ownership (lock) renewed" << dendl; + } + } + // erase all queue that were deleted + { + std::lock_guard lock_guard(queue_gc_lock); + std::for_each(queue_gc.begin(), queue_gc.end(), [this, &owned_queues](const std::string& queue_name) { + owned_queues.erase(queue_name); + ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " removed" << dendl; + }); + queue_gc.clear(); + } + } + } + +public: + + ~Manager() { + work_guard.reset(); + io_context.stop(); + std::for_each(workers.begin(), workers.end(), [] (auto& worker) { worker.join(); }); + } + + // ctor: start all threads + Manager(CephContext* _cct, uint32_t _max_queue_size, uint32_t _queues_update_period_ms, + uint32_t _queues_update_retry_ms, uint32_t _queue_idle_sleep_us, u_int32_t failover_time_ms, + uint32_t _stale_reservations_period_s, uint32_t _reservations_cleanup_period_s, + uint32_t _worker_count, rgw::sal::RadosStore* store) : + max_queue_size(_max_queue_size), + queues_update_period_ms(_queues_update_period_ms), + queues_update_retry_ms(_queues_update_retry_ms), + queue_idle_sleep_us(_queue_idle_sleep_us), + failover_time(std::chrono::milliseconds(failover_time_ms)), + cct(_cct), + 
lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)), + work_guard(boost::asio::make_work_guard(io_context)), + worker_count(_worker_count), + stale_reservations_period_s(_stale_reservations_period_s), + reservations_cleanup_period_s(_reservations_cleanup_period_s), + rados_ioctx(store->getRados()->get_notif_pool_ctx()) + { + spawn::spawn(io_context, [this] (yield_context yield) { + process_queues(yield); + }, make_stack_allocator()); + + // start the worker threads to do the actual queue processing + const std::string WORKER_THREAD_NAME = "notif-worker"; + for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) { + workers.emplace_back([this]() { + try { + io_context.run(); + } catch (const std::exception& err) { + ldpp_dout(this, 10) << "Notification worker failed with error: " << err.what() << dendl; + throw(err); + } + }); + const auto rc = ceph_pthread_setname(workers.back().native_handle(), + (WORKER_THREAD_NAME+std::to_string(worker_id)).c_str()); + ceph_assert(rc == 0); + } + ldpp_dout(this, 10) << "Started notification manager with: " << worker_count << " workers" << dendl; + } + + int add_persistent_topic(const std::string& topic_name, optional_yield y) { + if (topic_name == Q_LIST_OBJECT_NAME) { + ldpp_dout(this, 1) << "ERROR: topic name cannot be: " << Q_LIST_OBJECT_NAME << " (conflict with queue list object name)" << dendl; + return -EINVAL; + } + librados::ObjectWriteOperation op; + op.create(true); + cls_2pc_queue_init(op, topic_name, max_queue_size); + auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y); + if (ret == -EEXIST) { + // queue already exists - nothing to do + ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already exists. nothing to do" << dendl; + return 0; + } + if (ret < 0) { + // failed to create queue + ldpp_dout(this, 1) << "ERROR: failed to create queue for topic: " << topic_name << ". 
error: " << ret << dendl; + return ret; + } + + bufferlist empty_bl; + std::map new_topic{{topic_name, empty_bl}}; + op.omap_set(new_topic); + ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to add queue: " << topic_name << " to queue list. error: " << ret << dendl; + return ret; + } + ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " added to queue list" << dendl; + return 0; + } +}; + +// singleton manager +// note that the manager itself is not a singleton, and multiple instances may co-exist +// TODO make the pointer atomic in allocation and deallocation to avoid race conditions +static Manager* s_manager = nullptr; + +constexpr size_t MAX_QUEUE_SIZE = 128*1000*1000; // 128MB +constexpr uint32_t Q_LIST_UPDATE_MSEC = 1000*30; // check queue list every 30seconds +constexpr uint32_t Q_LIST_RETRY_MSEC = 1000; // retry every second if queue list update failed +constexpr uint32_t IDLE_TIMEOUT_USEC = 100*1000; // idle sleep 100ms +constexpr uint32_t FAILOVER_TIME_MSEC = 3*Q_LIST_UPDATE_MSEC; // FAILOVER TIME 3x renew time +constexpr uint32_t WORKER_COUNT = 1; // 1 worker thread +constexpr uint32_t STALE_RESERVATIONS_PERIOD_S = 120; // cleanup reservations that are more than 2 minutes old +constexpr uint32_t RESERVATIONS_CLEANUP_PERIOD_S = 30; // reservation cleanup every 30 seconds + +bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp) { + if (s_manager) { + return false; + } + // TODO: take conf from CephContext + s_manager = new Manager(cct, MAX_QUEUE_SIZE, + Q_LIST_UPDATE_MSEC, Q_LIST_RETRY_MSEC, + IDLE_TIMEOUT_USEC, FAILOVER_TIME_MSEC, + STALE_RESERVATIONS_PERIOD_S, RESERVATIONS_CLEANUP_PERIOD_S, + WORKER_COUNT, + store); + return true; +} + +void shutdown() { + delete s_manager; + s_manager = nullptr; +} + +int add_persistent_topic(const std::string& topic_name, optional_yield y) { + if (!s_manager) { + return -EAGAIN; + } + return 
s_manager->add_persistent_topic(topic_name, y); +} + +int remove_persistent_topic(const DoutPrefixProvider* dpp, librados::IoCtx& rados_ioctx, const std::string& topic_name, optional_yield y) { + librados::ObjectWriteOperation op; + op.remove(); + auto ret = rgw_rados_operate(dpp, rados_ioctx, topic_name, &op, y); + if (ret == -ENOENT) { + // queue already removed - nothing to do + ldpp_dout(dpp, 20) << "INFO: queue for topic: " << topic_name << " already removed. nothing to do" << dendl; + return 0; + } + if (ret < 0) { + // failed to remove queue + ldpp_dout(dpp, 1) << "ERROR: failed to remove queue for topic: " << topic_name << ". error: " << ret << dendl; + return ret; + } + + std::set topic_to_remove{{topic_name}}; + op.omap_rm_keys(topic_to_remove); + ret = rgw_rados_operate(dpp, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove queue: " << topic_name << " from queue list. error: " << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << "INFO: queue: " << topic_name << " removed from queue list" << dendl; + return 0; +} + +int remove_persistent_topic(const std::string& topic_name, optional_yield y) { + if (!s_manager) { + return -EAGAIN; + } + return remove_persistent_topic(s_manager, s_manager->rados_ioctx, topic_name, y); +} + +rgw::sal::Object* get_object_with_atttributes( + const reservation_t& res, rgw::sal::Object* obj) { + // in case of copy obj, the tags and metadata are taken from source + const auto src_obj = res.src_object ? res.src_object : obj; + if (src_obj->get_attrs().empty()) { + if (!src_obj->get_bucket()) { + src_obj->set_bucket(res.bucket); + } + const auto ret = src_obj->get_obj_attrs(res.yield, res.dpp); + if (ret < 0) { + ldpp_dout(res.dpp, 20) << "failed to get attributes from object: " << + src_obj->get_key() << ". 
ret = " << ret << dendl; + return nullptr; + } + } + return src_obj; +} + +static inline void filter_amz_meta(meta_map_t& dest, const meta_map_t& src) { + std::copy_if(src.cbegin(), src.cend(), + std::inserter(dest, dest.end()), + [](const auto& m) { + return (boost::algorithm::starts_with(m.first, RGW_AMZ_META_PREFIX)); + }); +} + + +static inline void metadata_from_attributes( + reservation_t& res, rgw::sal::Object* obj) { + auto& metadata = res.x_meta_map; + const auto src_obj = get_object_with_atttributes(res, obj); + if (!src_obj) { + return; + } + res.metadata_fetched_from_attributes = true; + for (auto& attr : src_obj->get_attrs()) { + if (boost::algorithm::starts_with(attr.first, RGW_ATTR_META_PREFIX)) { + std::string_view key(attr.first); + key.remove_prefix(sizeof(RGW_ATTR_PREFIX)-1); + // we want to pass a null terminated version + // of the bufferlist, hence "to_str().c_str()" + metadata.emplace(key, attr.second.to_str().c_str()); + } + } +} + +static inline void tags_from_attributes( + const reservation_t& res, rgw::sal::Object* obj, KeyMultiValueMap& tags) { + const auto src_obj = get_object_with_atttributes(res, obj); + if (!src_obj) { + return; + } + const auto& attrs = src_obj->get_attrs(); + const auto attr_iter = attrs.find(RGW_ATTR_TAGS); + if (attr_iter != attrs.end()) { + auto bliter = attr_iter->second.cbegin(); + RGWObjTags obj_tags; + try { + ::decode(obj_tags, bliter); + } catch(buffer::error&) { + // not able to decode tags + return; + } + tags = std::move(obj_tags.get_tags()); + } +} + +// populate event from request +static inline void populate_event(reservation_t& res, + rgw::sal::Object* obj, + uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + const std::string& version, + EventType event_type, + rgw_pubsub_s3_event& event) { + event.eventTime = mtime; + event.eventName = to_event_string(event_type); + event.userIdentity = res.user_id; // user that triggered the change + event.x_amz_request_id = res.req_id; 
// request ID of the original change + event.x_amz_id_2 = res.store->getRados()->host_id; // RGW on which the change was made + // configurationId is filled from notification configuration + event.bucket_name = res.bucket->get_name(); + event.bucket_ownerIdentity = res.bucket->get_owner() ? + res.bucket->get_owner()->get_id().id : res.bucket->get_info().owner.id; + const auto region = res.store->get_zone()->get_zonegroup().get_api_name(); + rgw::ARN bucket_arn(res.bucket->get_key()); + bucket_arn.region = region; + event.bucket_arn = to_string(bucket_arn); + event.object_key = res.object_name ? *res.object_name : obj->get_name(); + event.object_size = size; + event.object_etag = etag; + event.object_versionId = version; + event.awsRegion = region; + // use timestamp as per key sequence id (hex encoded) + const utime_t ts(real_clock::now()); + boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t), + std::back_inserter(event.object_sequencer)); + set_event_id(event.id, etag, ts); + event.bucket_id = res.bucket->get_bucket_id(); + // pass meta data + if (!res.metadata_fetched_from_attributes) { + // either no metadata exist or no metadata filter was used + metadata_from_attributes(res, obj); + } + event.x_meta_map = res.x_meta_map; + // pass tags + if (!res.tagset || + (*res.tagset).get_tags().empty()) { + // try to fetch the tags from the attributes + tags_from_attributes(res, obj, event.tags); + } else { + event.tags = (*res.tagset).get_tags(); + } + // opaque data will be filled from topic configuration +} + +static inline bool notification_match(reservation_t& res, + const rgw_pubsub_topic_filter& filter, + EventType event, + const RGWObjTags* req_tags) { + if (!match(filter.events, event)) { + return false; + } + const auto obj = res.object; + if (!match(filter.s3_filter.key_filter, + res.object_name ? 
*res.object_name : obj->get_name())) { + return false; + } + + if (!filter.s3_filter.metadata_filter.kv.empty()) { + // metadata filter exists + if (res.s) { + filter_amz_meta(res.x_meta_map, res.s->info.x_meta_map); + } + metadata_from_attributes(res, obj); + if (!match(filter.s3_filter.metadata_filter, res.x_meta_map)) { + return false; + } + } + + if (!filter.s3_filter.tag_filter.kv.empty()) { + // tag filter exists + if (req_tags) { + // tags in the request + if (!match(filter.s3_filter.tag_filter, req_tags->get_tags())) { + return false; + } + } else if (res.tagset && !(*res.tagset).get_tags().empty()) { + // tags were cached in req_state + if (!match(filter.s3_filter.tag_filter, (*res.tagset).get_tags())) { + return false; + } + } else { + // try to fetch tags from the attributes + KeyMultiValueMap tags; + tags_from_attributes(res, obj, tags); + if (!match(filter.s3_filter.tag_filter, tags)) { + return false; + } + } + } + + return true; +} + + int publish_reserve(const DoutPrefixProvider* dpp, + EventType event_type, + reservation_t& res, + const RGWObjTags* req_tags) +{ + const RGWPubSub ps(res.store, res.user_tenant); + const RGWPubSub::Bucket ps_bucket(ps, res.bucket); + rgw_pubsub_bucket_topics bucket_topics; + auto rc = ps_bucket.get_topics(res.dpp, bucket_topics, res.yield); + if (rc < 0) { + // failed to fetch bucket topics + return rc; + } + for (const auto& bucket_topic : bucket_topics.topics) { + const rgw_pubsub_topic_filter& topic_filter = bucket_topic.second; + const rgw_pubsub_topic& topic_cfg = topic_filter.topic; + if (!notification_match(res, topic_filter, event_type, req_tags)) { + // notification does not apply to req_state + continue; + } + ldpp_dout(res.dpp, 20) << "INFO: notification: '" << topic_filter.s3_id << + "' on topic: '" << topic_cfg.dest.arn_topic << + "' and bucket: '" << res.bucket->get_name() << + "' (unique topic: '" << topic_cfg.name << + "') apply to event of type: '" << to_string(event_type) << "'" << dendl; + + 
cls_2pc_reservation::id_t res_id; + if (topic_cfg.dest.persistent) { + // TODO: take default reservation size from conf + constexpr auto DEFAULT_RESERVATION = 4*1024U; // 4K + res.size = DEFAULT_RESERVATION; + librados::ObjectWriteOperation op; + bufferlist obl; + int rval; + const auto& queue_name = topic_cfg.dest.arn_topic; + cls_2pc_queue_reserve(op, res.size, 1, &obl, &rval); + auto ret = rgw_rados_operate( + res.dpp, res.store->getRados()->get_notif_pool_ctx(), + queue_name, &op, res.yield, librados::OPERATION_RETURNVEC); + if (ret < 0) { + ldpp_dout(res.dpp, 1) << + "ERROR: failed to reserve notification on queue: " + << queue_name << ". error: " << ret << dendl; + // if no space is left in queue we ask client to slow down + return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret; + } + ret = cls_2pc_queue_reserve_result(obl, res_id); + if (ret < 0) { + ldpp_dout(res.dpp, 1) << "ERROR: failed to parse reservation id. error: " << ret << dendl; + return ret; + } + } + res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id); + } + return 0; +} + +int publish_commit(rgw::sal::Object* obj, + uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + const std::string& version, + EventType event_type, + reservation_t& res, + const DoutPrefixProvider* dpp) +{ + for (auto& topic : res.topics) { + if (topic.cfg.dest.persistent && + topic.res_id == cls_2pc_reservation::NO_ID) { + // nothing to commit or already committed/aborted + continue; + } + event_entry_t event_entry; + populate_event(res, obj, size, mtime, etag, version, event_type, event_entry.event); + event_entry.event.configurationId = topic.configurationId; + event_entry.event.opaque_data = topic.cfg.opaque_data; + if (topic.cfg.dest.persistent) { + event_entry.push_endpoint = std::move(topic.cfg.dest.push_endpoint); + event_entry.push_endpoint_args = + std::move(topic.cfg.dest.push_endpoint_args); + event_entry.arn_topic = topic.cfg.dest.arn_topic; + bufferlist bl; + encode(event_entry, 
bl); + const auto& queue_name = topic.cfg.dest.arn_topic; + if (bl.length() > res.size) { + // try to make a larger reservation, fail only if this is not possible + ldpp_dout(dpp, 5) << "WARNING: committed size: " << bl.length() + << " exceeded reserved size: " << res.size + << + " . trying to make a larger reservation on queue:" << queue_name + << dendl; + // first cancel the existing reservation + librados::ObjectWriteOperation op; + cls_2pc_queue_abort(op, topic.res_id); + auto ret = rgw_rados_operate( + dpp, res.store->getRados()->get_notif_pool_ctx(), + topic.cfg.dest.arn_topic, &op, + res.yield); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to abort reservation: " + << topic.res_id << + " when trying to make a larger reservation on queue: " << queue_name + << ". error: " << ret << dendl; + return ret; + } + // now try to make a bigger one + buffer::list obl; + int rval; + cls_2pc_queue_reserve(op, bl.length(), 1, &obl, &rval); + ret = rgw_rados_operate( + dpp, res.store->getRados()->get_notif_pool_ctx(), + queue_name, &op, res.yield, librados::OPERATION_RETURNVEC); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to reserve extra space on queue: " + << queue_name + << ". error: " << ret << dendl; + return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret; + } + ret = cls_2pc_queue_reserve_result(obl, topic.res_id); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to parse reservation id for " + "extra space. error: " << ret << dendl; + return ret; + } + } + std::vector bl_data_vec{std::move(bl)}; + librados::ObjectWriteOperation op; + cls_2pc_queue_commit(op, bl_data_vec, topic.res_id); + const auto ret = rgw_rados_operate( + dpp, res.store->getRados()->get_notif_pool_ctx(), + queue_name, &op, res.yield); + topic.res_id = cls_2pc_reservation::NO_ID; + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: " + << queue_name << ". 
error: " << ret + << dendl; + return ret; + } + } else { + try { + // TODO add endpoint LRU cache + const auto push_endpoint = RGWPubSubEndpoint::create( + topic.cfg.dest.push_endpoint, + topic.cfg.dest.arn_topic, + RGWHTTPArgs(topic.cfg.dest.push_endpoint_args, dpp), + dpp->get_cct()); + ldpp_dout(res.dpp, 20) << "INFO: push endpoint created: " + << topic.cfg.dest.push_endpoint << dendl; + const auto ret = push_endpoint->send_to_completion_async( + dpp->get_cct(), event_entry.event, res.yield); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: push to endpoint " + << topic.cfg.dest.push_endpoint + << " failed. error: " << ret << dendl; + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed); + return ret; + } + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok); + } catch (const RGWPubSubEndpoint::configuration_error& e) { + ldpp_dout(dpp, 1) << "ERROR: failed to create push endpoint: " + << topic.cfg.dest.push_endpoint << ". error: " << e.what() << dendl; + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed); + return -EINVAL; + } + } + } + return 0; +} + +int publish_abort(reservation_t& res) { + for (auto& topic : res.topics) { + if (!topic.cfg.dest.persistent || + topic.res_id == cls_2pc_reservation::NO_ID) { + // nothing to abort or already committed/aborted + continue; + } + const auto& queue_name = topic.cfg.dest.arn_topic; + librados::ObjectWriteOperation op; + cls_2pc_queue_abort(op, topic.res_id); + const auto ret = rgw_rados_operate( + res.dpp, res.store->getRados()->get_notif_pool_ctx(), + queue_name, &op, res.yield); + if (ret < 0) { + ldpp_dout(res.dpp, 1) << "ERROR: failed to abort reservation: " + << topic.res_id << + " from queue: " << queue_name << ". 
error: " << ret << dendl; + return ret; + } + topic.res_id = cls_2pc_reservation::NO_ID; + } + return 0; +} + +reservation_t::reservation_t(const DoutPrefixProvider* _dpp, + rgw::sal::RadosStore* _store, + const req_state* _s, + rgw::sal::Object* _object, + rgw::sal::Object* _src_object, + const std::string* _object_name, + optional_yield y) : + dpp(_s), store(_store), s(_s), size(0) /* XXX */, + object(_object), src_object(_src_object), bucket(_s->bucket.get()), + object_name(_object_name), + tagset(_s->tagset), + metadata_fetched_from_attributes(false), + user_id(_s->user->get_id().id), + user_tenant(_s->user->get_id().tenant), + req_id(_s->req_id), + yield(y) +{ + filter_amz_meta(x_meta_map, _s->info.x_meta_map); +} + +reservation_t::reservation_t(const DoutPrefixProvider* _dpp, + rgw::sal::RadosStore* _store, + rgw::sal::Object* _object, + rgw::sal::Object* _src_object, + rgw::sal::Bucket* _bucket, + const std::string& _user_id, + const std::string& _user_tenant, + const std::string& _req_id, + optional_yield y) : + dpp(_dpp), store(_store), s(nullptr), size(0) /* XXX */, + object(_object), src_object(_src_object), bucket(_bucket), + object_name(nullptr), + metadata_fetched_from_attributes(false), + user_id(_user_id), + user_tenant(_user_tenant), + req_id(_req_id), + yield(y) +{} + +reservation_t::~reservation_t() { + publish_abort(*this); +} + +} // namespace rgw::notify diff --git a/src/rgw/driver/rados/rgw_notify.h b/src/rgw/driver/rados/rgw_notify.h new file mode 100644 index 000000000..9269611e4 --- /dev/null +++ b/src/rgw/driver/rados/rgw_notify.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "common/ceph_time.h" +#include "include/common_fwd.h" +#include "rgw_notify_event_type.h" +#include "common/async/yield_context.h" +#include "cls/2pc_queue/cls_2pc_queue_types.h" +#include "rgw_pubsub.h" + +// forward declarations +namespace rgw::sal { 
+ class RadosStore; + class RGWObject; +} + +class RGWRados; +struct rgw_obj_key; + +namespace rgw::notify { + +// initialize the notification manager +// notification manager is dequeuing the 2-phase-commit queues +// and sending the notifications to the endpoints +bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp); + +// shutdown the notification manager +void shutdown(); + +// create persistent delivery queue for a topic (endpoint) +// this operation also adds the topic name to the common (to all RGWs) list of all topics +int add_persistent_topic(const std::string& topic_name, optional_yield y); + +// remove persistent delivery queue for a topic (endpoint) +// this operation also removes the topic name from the common (to all RGWs) list of all topics +int remove_persistent_topic(const std::string& topic_name, optional_yield y); + +// same as the above, except that you need to provide the IoCtx; the above uses rgw::notify::Manager::rados_ioctx +int remove_persistent_topic(const DoutPrefixProvider* dpp, librados::IoCtx& rados_ioctx, const std::string& topic_name, optional_yield y); + +// struct holding reservation information +// populated in the publish_reserve call +// then used to commit or abort the reservation +struct reservation_t { + struct topic_t { + topic_t(const std::string& _configurationId, const rgw_pubsub_topic& _cfg, + cls_2pc_reservation::id_t _res_id) : + configurationId(_configurationId), cfg(_cfg), res_id(_res_id) {} + + const std::string configurationId; + const rgw_pubsub_topic cfg; + // res_id is reset after topic is committed/aborted + cls_2pc_reservation::id_t res_id; + }; + + const DoutPrefixProvider* const dpp; + std::vector topics; + rgw::sal::RadosStore* const store; + const req_state* const s; + size_t size; + rgw::sal::Object* const object; + rgw::sal::Object* const src_object; // may differ from object + rgw::sal::Bucket* const bucket; + const std::string* const object_name; + boost::optional tagset; + 
meta_map_t x_meta_map; // metadata cached by value + bool metadata_fetched_from_attributes; + const std::string user_id; + const std::string user_tenant; + const std::string req_id; + optional_yield yield; + + /* ctor for rgw_op callers */ + reservation_t(const DoutPrefixProvider* _dpp, + rgw::sal::RadosStore* _store, + const req_state* _s, + rgw::sal::Object* _object, + rgw::sal::Object* _src_object, + const std::string* _object_name, + optional_yield y); + + /* ctor for non-request caller (e.g., lifecycle) */ + reservation_t(const DoutPrefixProvider* _dpp, + rgw::sal::RadosStore* _store, + rgw::sal::Object* _object, + rgw::sal::Object* _src_object, + rgw::sal::Bucket* _bucket, + const std::string& _user_id, + const std::string& _user_tenant, + const std::string& _req_id, + optional_yield y); + + // dtor doing resource leak guarding + // aborting the reservation if not already committed or aborted + ~reservation_t(); +}; + +// create a reservation on the 2-phase-commit queue + int publish_reserve(const DoutPrefixProvider *dpp, + EventType event_type, + reservation_t& reservation, + const RGWObjTags* req_tags); + +// commit the reservation to the queue +int publish_commit(rgw::sal::Object* obj, + uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + const std::string& version, + EventType event_type, + reservation_t& reservation, + const DoutPrefixProvider *dpp); + +// cancel the reservation +int publish_abort(reservation_t& reservation); + +} + diff --git a/src/rgw/driver/rados/rgw_obj_manifest.cc b/src/rgw/driver/rados/rgw_obj_manifest.cc new file mode 100644 index 000000000..92ade8120 --- /dev/null +++ b/src/rgw/driver/rados/rgw_obj_manifest.cc @@ -0,0 +1,409 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_obj_manifest.h" + +#include "services/svc_zone.h" +#include "rgw_rados.h" +#include "rgw_bucket.h" + +#define dout_context g_ceph_context +#define dout_subsys 
ceph_subsys_rgw + +using namespace std; + +int RGWObjManifest::generator::create_next(uint64_t ofs) +{ + if (ofs < last_ofs) /* only going forward */ + return -EINVAL; + + uint64_t max_head_size = manifest->get_max_head_size(); + + if (ofs < max_head_size) { + manifest->set_head_size(ofs); + } + + if (ofs >= max_head_size) { + manifest->set_head_size(max_head_size); + cur_stripe = (ofs - max_head_size) / rule.stripe_max_size; + cur_stripe_size = rule.stripe_max_size; + + if (cur_part_id == 0 && max_head_size > 0) { + cur_stripe++; + } + } + + last_ofs = ofs; + manifest->set_obj_size(ofs); + + manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj); + + return 0; +} + +int RGWObjManifest::append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone_params) +{ + if (explicit_objs || m.explicit_objs) { + return append_explicit(dpp, m, zonegroup, zone_params); + } + + if (rules.empty()) { + *this = m; + return 0; + } + + string override_prefix; + + if (prefix.empty()) { + prefix = m.prefix; + } + + if (prefix != m.prefix) { + override_prefix = m.prefix; + } + + map::iterator miter = m.rules.begin(); + if (miter == m.rules.end()) { + return append_explicit(dpp, m, zonegroup, zone_params); + } + + for (; miter != m.rules.end(); ++miter) { + map::reverse_iterator last_rule = rules.rbegin(); + + RGWObjManifestRule& rule = last_rule->second; + + if (rule.part_size == 0) { + rule.part_size = obj_size - rule.start_ofs; + } + + RGWObjManifestRule& next_rule = miter->second; + if (!next_rule.part_size) { + next_rule.part_size = m.obj_size - next_rule.start_ofs; + } + + string rule_prefix = prefix; + if (!rule.override_prefix.empty()) { + rule_prefix = rule.override_prefix; + } + + string next_rule_prefix = m.prefix; + if (!next_rule.override_prefix.empty()) { + next_rule_prefix = next_rule.override_prefix; + } + + if (rule.part_size != next_rule.part_size || + rule.stripe_max_size != 
next_rule.stripe_max_size || + rule_prefix != next_rule_prefix) { + if (next_rule_prefix != prefix) { + append_rules(m, miter, &next_rule_prefix); + } else { + append_rules(m, miter, NULL); + } + break; + } + + uint64_t expected_part_num = rule.start_part_num + 1; + if (rule.part_size > 0) { + expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size; + } + + if (expected_part_num != next_rule.start_part_num) { + append_rules(m, miter, NULL); + break; + } + } + + set_obj_size(obj_size + m.obj_size); + + return 0; +} + +void RGWObjManifest::append_rules(RGWObjManifest& m, map::iterator& miter, + string *override_prefix) +{ + for (; miter != m.rules.end(); ++miter) { + RGWObjManifestRule rule = miter->second; + rule.start_ofs += obj_size; + if (override_prefix) + rule.override_prefix = *override_prefix; + rules[rule.start_ofs] = rule; + } +} + +void RGWObjManifest::convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) +{ + if (explicit_objs) { + return; + } + obj_iterator iter = obj_begin(dpp); + + while (iter != obj_end(dpp)) { + RGWObjManifestPart& part = objs[iter.get_stripe_ofs()]; + const rgw_obj_select& os = iter.get_location(); + const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params); + part.loc_ofs = 0; + + uint64_t ofs = iter.get_stripe_ofs(); + + if (ofs == 0) { + part.loc = obj; + } else { + RGWSI_Tier_RADOS::raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc); + } + ++iter; + uint64_t next_ofs = iter.get_stripe_ofs(); + + part.size = next_ofs - ofs; + } + + explicit_objs = true; + rules.clear(); + prefix.clear(); +} + +int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) +{ + if (!explicit_objs) { + convert_to_explicit(dpp, zonegroup, zone_params); + } + if (!m.explicit_objs) { + m.convert_to_explicit(dpp, zonegroup, 
zone_params); + } + map::iterator iter; + uint64_t base = obj_size; + for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) { + RGWObjManifestPart& part = iter->second; + objs[base + iter->first] = part; + } + obj_size += m.obj_size; + + return 0; +} + +bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) +{ + if (rules.empty()) { + return false; + } + + map::iterator iter = rules.upper_bound(ofs); + if (iter != rules.begin()) { + --iter; + } + + *rule = iter->second; + + return true; +} + +int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, + const rgw_placement_rule& head_placement_rule, + const rgw_placement_rule *tail_placement_rule, + const rgw_bucket& _b, const rgw_obj& _obj) +{ + manifest = _m; + + if (!tail_placement_rule) { + manifest->set_tail_placement(head_placement_rule, _b); + } else { + rgw_placement_rule new_tail_rule = *tail_placement_rule; + new_tail_rule.inherit_from(head_placement_rule); + manifest->set_tail_placement(new_tail_rule, _b); + } + + manifest->set_head(head_placement_rule, _obj, 0); + last_ofs = 0; + + if (manifest->get_prefix().empty()) { + char buf[33]; + gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1); + + string oid_prefix = "."; + oid_prefix.append(buf); + oid_prefix.append("_"); + + manifest->set_prefix(oid_prefix); + } + + bool found = manifest->get_rule(0, &rule); + if (!found) { + derr << "ERROR: manifest->get_rule() could not find rule" << dendl; + return -EIO; + } + + uint64_t head_size = manifest->get_head_size(); + + if (head_size > 0) { + cur_stripe_size = head_size; + } else { + cur_stripe_size = rule.stripe_max_size; + } + + cur_part_id = rule.start_part_num; + + manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj); + + // Normal object which not generated through copy operation + manifest->set_tail_instance(_obj.key.instance); + + return 0; +} + +void RGWObjManifestPart::generate_test_instances(std::list& o) +{ + o.push_back(new 
RGWObjManifestPart); + + RGWObjManifestPart *p = new RGWObjManifestPart; + rgw_bucket b; + init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12"); + + p->loc = rgw_obj(b, "object"); + p->loc_ofs = 512 * 1024; + p->size = 128 * 1024; + o.push_back(p); +} + +void RGWObjManifest::generate_test_instances(std::list& o) +{ + RGWObjManifest *m = new RGWObjManifest; + map objs; + uint64_t total_size = 0; + for (int i = 0; i<10; i++) { + RGWObjManifestPart p; + rgw_bucket b; + init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12"); + p.loc = rgw_obj(b, "object"); + p.loc_ofs = 0; + p.size = 512 * 1024; + total_size += p.size; + objs[total_size] = p; + } + m->set_explicit(total_size, objs); + o.push_back(m); + o.push_back(new RGWObjManifest); +} + +void RGWObjManifestPart::dump(Formatter *f) const +{ + f->open_object_section("loc"); + loc.dump(f); + f->close_section(); + f->dump_unsigned("loc_ofs", loc_ofs); + f->dump_unsigned("size", size); +} + +void RGWObjManifest::obj_iterator::dump(Formatter *f) const +{ + f->dump_unsigned("part_ofs", part_ofs); + f->dump_unsigned("stripe_ofs", stripe_ofs); + f->dump_unsigned("ofs", ofs); + f->dump_unsigned("stripe_size", stripe_size); + f->dump_int("cur_part_id", cur_part_id); + f->dump_int("cur_stripe", cur_stripe); + f->dump_string("cur_override_prefix", cur_override_prefix); + f->dump_object("location", location); +} + +void RGWObjManifest::dump(Formatter *f) const +{ + map::const_iterator iter = objs.begin(); + f->open_array_section("objs"); + for (; iter != objs.end(); ++iter) { + f->dump_unsigned("ofs", iter->first); + f->open_object_section("part"); + iter->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_unsigned("obj_size", obj_size); + ::encode_json("explicit_objs", explicit_objs, f); + ::encode_json("head_size", head_size, f); + ::encode_json("max_head_size", max_head_size, f); + ::encode_json("prefix", prefix, f); + ::encode_json("rules", rules, f); + 
::encode_json("tail_instance", tail_instance, f); + ::encode_json("tail_placement", tail_placement, f); + ::encode_json("tier_type", tier_type, f); + + if (tier_type == "cloud-s3") { + ::encode_json("tier_config", tier_config, f); + } + + // nullptr being passed into iterators since there + // is no cct and we aren't doing anything with these + // iterators that would write do the log + f->dump_object("begin_iter", obj_begin(nullptr)); + f->dump_object("end_iter", obj_end(nullptr)); +} + +void RGWObjManifestRule::dump(Formatter *f) const +{ + encode_json("start_part_num", start_part_num, f); + encode_json("start_ofs", start_ofs, f); + encode_json("part_size", part_size, f); + encode_json("stripe_max_size", stripe_max_size, f); + encode_json("override_prefix", override_prefix, f); +} + +void rgw_obj_select::dump(Formatter *f) const +{ + f->dump_string("placement_rule", placement_rule.to_str()); + f->dump_object("obj", obj); + f->dump_object("raw_obj", raw_obj); + f->dump_bool("is_raw", is_raw); +} + +void RGWObjTier::dump(Formatter *f) const +{ + encode_json("name", name, f); + encode_json("tier_placement", tier_placement, f); + encode_json("is_multipart_upload", is_multipart_upload, f); +} + +// returns true on success, false on failure +static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params, + const rgw_placement_rule& head_placement_rule, + const rgw_obj& obj, rgw_pool *pool) +{ + if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) { + RGWZonePlacementInfo placement; + if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) { + return false; + } + + if (!obj.in_extra_data) { + *pool = placement.get_data_pool(zonegroup.default_placement.storage_class); + } else { + *pool = placement.get_data_extra_pool(); + } + } + + return true; +} + +static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params, + const rgw_placement_rule& head_placement_rule, + const 
rgw_obj& obj, rgw_raw_obj *raw_obj) +{ + get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc); + + return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool); +} + +rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const +{ + if (!is_raw) { + rgw_raw_obj r; + rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r); + return r; + } + return raw_obj; +} + +// returns true on success, false on failure +bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool) +{ + return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool); +} + diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h b/src/rgw/driver/rados/rgw_obj_manifest.h new file mode 100644 index 000000000..6984184aa --- /dev/null +++ b/src/rgw/driver/rados/rgw_obj_manifest.h @@ -0,0 +1,622 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. 
Do not + * introduce changes or include files which can only be compiled in + * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h) + */ + +#pragma once + +#include "rgw_zone_types.h" +#include "rgw_bucket_types.h" +#include "rgw_obj_types.h" +#include "rgw_placement_types.h" + +#include "common/dout.h" +#include "common/Formatter.h" + +class RGWSI_Zone; +struct RGWZoneGroup; +struct RGWZoneParams; +class RGWRados; + +namespace rgw { namespace sal { + class RadosStore; +} }; + +class rgw_obj_select { + rgw_placement_rule placement_rule; + rgw_obj obj; + rgw_raw_obj raw_obj; + bool is_raw; + +public: + rgw_obj_select() : is_raw(false) {} + explicit rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {} + explicit rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {} + rgw_obj_select(const rgw_obj_select& rhs) { + placement_rule = rhs.placement_rule; + is_raw = rhs.is_raw; + if (is_raw) { + raw_obj = rhs.raw_obj; + } else { + obj = rhs.obj; + } + } + + rgw_raw_obj get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const; + rgw_raw_obj get_raw_obj(RGWRados* store) const; + + rgw_obj_select& operator=(const rgw_obj& rhs) { + obj = rhs; + is_raw = false; + return *this; + } + + rgw_obj_select& operator=(const rgw_raw_obj& rhs) { + raw_obj = rhs; + is_raw = true; + return *this; + } + + void set_placement_rule(const rgw_placement_rule& rule) { + placement_rule = rule; + } + void dump(Formatter *f) const; +}; + +struct RGWObjManifestPart { + rgw_obj loc; /* the object where the data is located */ + uint64_t loc_ofs; /* the offset at that object where the data is located */ + uint64_t size; /* the part size */ + + RGWObjManifestPart() : loc_ofs(0), size(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(loc, bl); + encode(loc_ofs, bl); + encode(size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, 
bl); + decode(loc, bl); + decode(loc_ofs, bl); + decode(size, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(RGWObjManifestPart) + +/* + The manifest defines a set of rules for structuring the object parts. + There are a few terms to note: + - head: the head part of the object, which is the part that contains + the first chunk of data. An object might not have a head (as in the + case of multipart-part objects). + - stripe: data portion of a single rgw object that resides on a single + rados object. + - part: a collection of stripes that make a contiguous part of an + object. A regular object will only have one part (although might have + many stripes), a multipart object might have many parts. Each part + has a fixed stripe size, although the last stripe of a part might + be smaller than that. Consecutive parts may be merged if their stripe + value is the same. +*/ + +struct RGWObjManifestRule { + uint32_t start_part_num; + uint64_t start_ofs; + uint64_t part_size; /* each part size, 0 if there's no part size, meaning it's unlimited */ + uint64_t stripe_max_size; /* underlying obj max size */ + std::string override_prefix; + + RGWObjManifestRule() : start_part_num(0), start_ofs(0), part_size(0), stripe_max_size(0) {} + RGWObjManifestRule(uint32_t _start_part_num, uint64_t _start_ofs, uint64_t _part_size, uint64_t _stripe_max_size) : + start_part_num(_start_part_num), start_ofs(_start_ofs), part_size(_part_size), stripe_max_size(_stripe_max_size) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(start_part_num, bl); + encode(start_ofs, bl); + encode(part_size, bl); + encode(stripe_max_size, bl); + encode(override_prefix, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(start_part_num, bl); + decode(start_ofs, bl); + decode(part_size, bl); + decode(stripe_max_size, bl); + if 
(struct_v >= 2) + decode(override_prefix, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWObjManifestRule) + +struct RGWObjTier { + std::string name; + RGWZoneGroupPlacementTier tier_placement; + bool is_multipart_upload{false}; + + RGWObjTier(): name("none") {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(name, bl); + encode(tier_placement, bl); + encode(is_multipart_upload, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(name, bl); + decode(tier_placement, bl); + decode(is_multipart_upload, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWObjTier) + +class RGWObjManifest { +protected: + bool explicit_objs{false}; /* really old manifest? */ + std::map objs; + + uint64_t obj_size{0}; + + rgw_obj obj; + uint64_t head_size{0}; + rgw_placement_rule head_placement_rule; + + uint64_t max_head_size{0}; + std::string prefix; + rgw_bucket_placement tail_placement; /* might be different than the original bucket, + as object might have been copied across pools */ + std::map rules; + + std::string tail_instance; /* tail object's instance */ + + std::string tier_type; + RGWObjTier tier_config; + + void convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params); + int append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params); + void append_rules(RGWObjManifest& m, std::map::iterator& iter, std::string *override_prefix); + +public: + + RGWObjManifest() = default; + RGWObjManifest(const RGWObjManifest& rhs) { + *this = rhs; + } + RGWObjManifest& operator=(const RGWObjManifest& rhs) { + explicit_objs = rhs.explicit_objs; + objs = rhs.objs; + obj_size = rhs.obj_size; + obj = rhs.obj; + head_size = rhs.head_size; + max_head_size = rhs.max_head_size; + 
prefix = rhs.prefix; + tail_placement = rhs.tail_placement; + rules = rhs.rules; + tail_instance = rhs.tail_instance; + tier_type = rhs.tier_type; + tier_config = rhs.tier_config; + /* head_placement_rule is serialized by encode()/decode(), so a copy has to carry it as well */ + head_placement_rule = rhs.head_placement_rule; + return *this; + } + + std::map& get_explicit_objs() { + return objs; + } + + + void set_explicit(uint64_t _size, std::map& _objs) { + explicit_objs = true; + objs.swap(_objs); + set_obj_size(_size); + } + + void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs, + std::string *override_prefix, rgw_obj_select *location) const; + + void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) { + RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size); + rules[0] = rule; + max_head_size = tail_ofs; + } + + void set_multipart_part_rule(uint64_t stripe_max_size, uint64_t part_num) { + RGWObjManifestRule rule(0, 0, 0, stripe_max_size); + rule.start_part_num = part_num; + rules[0] = rule; + max_head_size = 0; + } + + void encode(bufferlist& bl) const { + ENCODE_START(8, 6, bl); + encode(obj_size, bl); + encode(objs, bl); + encode(explicit_objs, bl); + encode(obj, bl); + encode(head_size, bl); + encode(max_head_size, bl); + encode(prefix, bl); + encode(rules, bl); + bool encode_tail_bucket = !(tail_placement.bucket == obj.bucket); + encode(encode_tail_bucket, bl); + if (encode_tail_bucket) { + encode(tail_placement.bucket, bl); + } + bool encode_tail_instance = (tail_instance != obj.key.instance); + encode(encode_tail_instance, bl); + if (encode_tail_instance) { + encode(tail_instance, bl); + } + encode(head_placement_rule, bl); + encode(tail_placement.placement_rule, bl); + encode(tier_type, bl); + encode(tier_config, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(7, 2, 2, bl); + decode(obj_size, bl); + decode(objs, bl); + if (struct_v >= 3) { + decode(explicit_objs, bl); + decode(obj, bl); + decode(head_size, bl); + decode(max_head_size, bl); + decode(prefix, bl); + decode(rules, bl); + } 
else { + explicit_objs = true; + if (!objs.empty()) { + std::map::iterator iter = objs.begin(); + obj = iter->second.loc; + head_size = iter->second.size; + max_head_size = head_size; + } + } + + if (explicit_objs && head_size > 0 && !objs.empty()) { + /* patch up manifest due to issue 16435: + * the first object in the explicit objs list might not be the one we need to access, use the + * head object instead if set. This would happen if we had an old object that was created + * when the explicit objs manifest was around, and it got copied. + */ + rgw_obj& obj_0 = objs[0].loc; + if (!obj_0.get_oid().empty() && obj_0.key.ns.empty()) { + objs[0].loc = obj; + objs[0].size = head_size; + } + } + + if (struct_v >= 4) { + if (struct_v < 6) { + decode(tail_placement.bucket, bl); + } else { + bool need_to_decode; + decode(need_to_decode, bl); + if (need_to_decode) { + decode(tail_placement.bucket, bl); + } else { + tail_placement.bucket = obj.bucket; + } + } + } + + if (struct_v >= 5) { + if (struct_v < 6) { + decode(tail_instance, bl); + } else { + bool need_to_decode; + decode(need_to_decode, bl); + if (need_to_decode) { + decode(tail_instance, bl); + } else { + tail_instance = obj.key.instance; + } + } + } else { // old object created before 'tail_instance' field added to manifest + tail_instance = obj.key.instance; + } + + if (struct_v >= 7) { + decode(head_placement_rule, bl); + decode(tail_placement.placement_rule, bl); + } + + if (struct_v >= 8) { + decode(tier_type, bl); + decode(tier_config, bl); + } + + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone_params); + + bool get_rule(uint64_t ofs, RGWObjManifestRule *rule); + + bool empty() const { + if (explicit_objs) + return objs.empty(); + return rules.empty(); + } + + bool has_explicit_objs() const { + return explicit_objs; + } + + 
bool has_tail() const { + if (explicit_objs) { + if (objs.size() == 1) { + auto iter = objs.begin(); + const rgw_obj& o = iter->second.loc; + return !(obj == o); + } + return (objs.size() >= 2); + } + return (obj_size > head_size); + } + + void set_head(const rgw_placement_rule& placement_rule, const rgw_obj& _o, uint64_t _s) { + head_placement_rule = placement_rule; + obj = _o; + head_size = _s; + + if (explicit_objs && head_size > 0) { + objs[0].loc = obj; + objs[0].size = head_size; + } + } + + const rgw_obj& get_obj() const { + return obj; + } + + void set_tail_placement(const rgw_placement_rule& placement_rule, const rgw_bucket& _b) { + tail_placement.placement_rule = placement_rule; + tail_placement.bucket = _b; + } + + const rgw_bucket_placement& get_tail_placement() const { + return tail_placement; + } + + const rgw_placement_rule& get_head_placement_rule() const { + return head_placement_rule; + } + + void set_prefix(const std::string& _p) { + prefix = _p; + } + + const std::string& get_prefix() const { + return prefix; + } + + void set_tail_instance(const std::string& _ti) { + tail_instance = _ti; + } + + const std::string& get_tail_instance() const { + return tail_instance; + } + + void set_head_size(uint64_t _s) { + head_size = _s; + } + + void set_obj_size(uint64_t s) { + obj_size = s; + } + + uint64_t get_obj_size() const { + return obj_size; + } + + uint64_t get_head_size() const { + return head_size; + } + + uint64_t get_max_head_size() const { + return max_head_size; + } + + const std::string& get_tier_type() { + return tier_type; + } + + inline void set_tier_type(std::string value) { + /* Only "cloud-s3" tier-type is supported for now */ + if (value == "cloud-s3") { + tier_type = value; + } + } + + inline void set_tier_config(RGWObjTier t) { + /* Set only if tier_type set to "cloud-s3" */ + if (tier_type != "cloud-s3") + return; + + tier_config.name = t.name; + tier_config.tier_placement = t.tier_placement; + tier_config.is_multipart_upload = 
t.is_multipart_upload; + } + + inline const void get_tier_config(RGWObjTier* t) { + if (tier_type != "cloud-s3") + return; + + t->name = tier_config.name; + t->tier_placement = tier_config.tier_placement; + t->is_multipart_upload = tier_config.is_multipart_upload; + } + + class obj_iterator { + const DoutPrefixProvider *dpp; + const RGWObjManifest *manifest = nullptr; + uint64_t part_ofs = 0; /* where current part starts */ + uint64_t stripe_ofs = 0; /* where current stripe starts */ + uint64_t ofs = 0; /* current position within the object */ + uint64_t stripe_size = 0; /* current part size */ + + int cur_part_id = 0; + int cur_stripe = 0; + std::string cur_override_prefix; + + rgw_obj_select location; + + std::map::const_iterator rule_iter; + std::map::const_iterator next_rule_iter; + std::map::const_iterator explicit_iter; + + void update_explicit_pos(); + + public: + obj_iterator() = default; + explicit obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m) + : obj_iterator(_dpp, _m, 0) + {} + obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m, uint64_t _ofs) : dpp(_dpp), manifest(_m) { + seek(_ofs); + } + void seek(uint64_t ofs); + + void operator++(); + bool operator==(const obj_iterator& rhs) const { + return (ofs == rhs.ofs); + } + bool operator!=(const obj_iterator& rhs) const { + return (ofs != rhs.ofs); + } + const rgw_obj_select& get_location() { + return location; + } + + /* where current part starts */ + uint64_t get_part_ofs() const { + return part_ofs; + } + + /* start of current stripe */ + uint64_t get_stripe_ofs() { + if (manifest->explicit_objs) { + return explicit_iter->first; + } + return stripe_ofs; + } + + /* current ofs relative to start of rgw object */ + uint64_t get_ofs() const { + return ofs; + } + + int get_cur_part_id() const { + return cur_part_id; + } + + /* stripe number */ + int get_cur_stripe() const { + return cur_stripe; + } + + /* current stripe size */ + uint64_t get_stripe_size() { + if 
(manifest->explicit_objs) { + return explicit_iter->second.size; + } + return stripe_size; + } + + /* offset where data starts within current stripe */ + uint64_t location_ofs() { + if (manifest->explicit_objs) { + return explicit_iter->second.loc_ofs; + } + return 0; /* all stripes start at zero offset */ + } + + void update_location(); + + void dump(Formatter *f) const; + }; // class obj_iterator + + obj_iterator obj_begin(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this}; } + obj_iterator obj_end(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this, obj_size}; } + obj_iterator obj_find(const DoutPrefixProvider *dpp, uint64_t ofs) const { + return obj_iterator{dpp, this, std::min(ofs, obj_size)}; + } + + /* + * simple object generator. Using a simple single rule manifest. + */ + class generator { + RGWObjManifest *manifest; + uint64_t last_ofs; + uint64_t cur_part_ofs; + int cur_part_id; + int cur_stripe; + uint64_t cur_stripe_size; + std::string cur_oid; + + std::string oid_prefix; + + rgw_obj_select cur_obj; + + RGWObjManifestRule rule; + + public: + generator() : manifest(NULL), last_ofs(0), cur_part_ofs(0), cur_part_id(0), + cur_stripe(0), cur_stripe_size(0) {} + int create_begin(CephContext *cct, RGWObjManifest *manifest, + const rgw_placement_rule& head_placement_rule, + const rgw_placement_rule *tail_placement_rule, + const rgw_bucket& bucket, + const rgw_obj& obj); + + int create_next(uint64_t ofs); + + rgw_raw_obj get_cur_obj(RGWZoneGroup& zonegroup, RGWZoneParams& zone_params) { return cur_obj.get_raw_obj(zonegroup, zone_params); } + rgw_raw_obj get_cur_obj(RGWRados* store) const { return cur_obj.get_raw_obj(store); } + + /* total max size of current stripe (including head obj) */ + uint64_t cur_stripe_max_size() const { + return cur_stripe_size; + } + }; +}; +WRITE_CLASS_ENCODER(RGWObjManifest) diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc new file mode 
100644 index 000000000..ec1bf3fb6 --- /dev/null +++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc @@ -0,0 +1,442 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_object_expirer_core.h" +#include "rgw_zone.h" +#include "rgw_sal_rados.h" + +#include "services/svc_rados.h" +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" +#include "services/svc_bi_rados.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/timeindex/cls_timeindex_client.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static string objexp_lock_name = "gc_process"; + +static string objexp_hint_get_shardname(int shard_num) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "obj_delete_at_hint.%010u", (unsigned)shard_num); + return buf; +} + +static int objexp_key_shard(const rgw_obj_index_key& key, int num_shards) +{ + string obj_key = key.name + key.instance; + return RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards); +} + +static string objexp_hint_get_keyext(const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_key& obj_key) { + return tenant_name + (tenant_name.empty() ? 
"" : ":") + bucket_name + ":" + bucket_id + + ":" + obj_key.name + ":" + obj_key.instance; +} + +static void objexp_get_shard(int shard_num, + string *shard) +{ + *shard = objexp_hint_get_shardname(shard_num); +} + +/* Decodes the objexp_hint_entry carried in a timeindex entry's value. + * Returns 0 on success and -EIO when the payload cannot be decoded, + * so that callers testing for ret < 0 can skip the entry instead of + * acting on a partially decoded hint. */ +static int objexp_hint_parse(const DoutPrefixProvider *dpp, CephContext *cct, cls_timeindex_entry &ti_entry, + objexp_hint_entry *hint_entry) +{ + try { + auto iter = ti_entry.value.cbegin(); + decode(*hint_entry, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode removal hint: " << err.what() << dendl; + return -EIO; + } + + return 0; +} + +int RGWObjExpStore::objexp_hint_add(const DoutPrefixProvider *dpp, + const ceph::real_time& delete_at, + const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_index_key& obj_key) +{ + const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name, + bucket_id, obj_key); + objexp_hint_entry he = { + .tenant = tenant_name, + .bucket_name = bucket_name, + .bucket_id = bucket_id, + .obj_key = obj_key, + .exp_time = delete_at }; + bufferlist hebl; + encode(he, hebl); + librados::ObjectWriteOperation op; + cls_timeindex_add(op, utime_t(delete_at), keyext, hebl); + + string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key, cct->_conf->rgw_objexp_hints_num_shards)); + auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, shard_name)); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl; + return r; + } + return obj.operate(dpp, &op, null_yield); +} + +int RGWObjExpStore::objexp_hint_list(const DoutPrefixProvider *dpp, + const string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const int max_entries, + const string& marker, + list& entries, /* out */ + string *out_marker, /* out */ + bool *truncated) /* out */ +{ + librados::ObjectReadOperation op; + cls_timeindex_list(op, 
utime_t(start_time), utime_t(end_time), marker, max_entries, entries, + out_marker, truncated); + + auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid)); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl; + return r; + } + bufferlist obl; + int ret = obj.operate(dpp, &op, &obl, null_yield); + + if ((ret < 0 ) && (ret != -ENOENT)) { + return ret; + } + + if ((ret == -ENOENT) && truncated) { + *truncated = false; + } + + return 0; +} + +static int cls_timeindex_trim_repeat(const DoutPrefixProvider *dpp, + rgw_rados_ref ref, + const string& oid, + const utime_t& from_time, + const utime_t& to_time, + const string& from_marker, + const string& to_marker) +{ + bool done = false; + do { + librados::ObjectWriteOperation op; + cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker); + int r = rgw_rados_operate(dpp, ref.pool.ioctx(), oid, &op, null_yield); + if (r == -ENODATA) + done = true; + else if (r < 0) + return r; + } while (!done); + + return 0; +} + +int RGWObjExpStore::objexp_hint_trim(const DoutPrefixProvider *dpp, + const string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const string& from_marker, + const string& to_marker) +{ + auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid)); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl; + return r; + } + auto& ref = obj.get_ref(); + int ret = cls_timeindex_trim_repeat(dpp, ref, oid, utime_t(start_time), utime_t(end_time), + from_marker, to_marker); + if ((ret < 0 ) && (ret != -ENOENT)) { + return ret; + } + + return 0; +} + +int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint) +{ + RGWBucketInfo bucket_info; + std::unique_ptr bucket; + + int ret 
= driver->get_bucket(dpp, nullptr, rgw_bucket(hint.tenant, hint.bucket_name, hint.bucket_id), &bucket, null_yield); + if (-ENOENT == ret) { + ldpp_dout(dpp, 15) << "NOTICE: cannot find bucket = " \ + << hint.bucket_name << ". The object must be already removed" << dendl; + return -ERR_PRECONDITION_FAILED; + } else if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: could not init bucket = " \ + << hint.bucket_name << "due to ret = " << ret << dendl; + return ret; + } + + rgw_obj_key key = hint.obj_key; + if (key.instance.empty()) { + key.instance = "null"; + } + + std::unique_ptr obj = bucket->get_object(key); + obj->set_atomic(); + ret = obj->delete_object(dpp, null_yield); + + return ret; +} + +void RGWObjectExpirer::garbage_chunk(const DoutPrefixProvider *dpp, + list& entries, /* in */ + bool& need_trim) /* out */ +{ + need_trim = false; + + for (list::iterator iter = entries.begin(); + iter != entries.end(); + ++iter) + { + objexp_hint_entry hint; + ldpp_dout(dpp, 15) << "got removal hint for: " << iter->key_ts.sec() \ + << " - " << iter->key_ext << dendl; + + int ret = objexp_hint_parse(dpp, driver->ctx(), *iter, &hint); + if (ret < 0) { + ldpp_dout(dpp, 1) << "cannot parse removal hint for " << hint.obj_key << dendl; + continue; + } + + /* PRECOND_FAILED simply means that our hint is not valid. + * We can silently ignore that and move forward. 
*/ + ret = garbage_single_object(dpp, hint); + if (ret == -ERR_PRECONDITION_FAILED) { + ldpp_dout(dpp, 15) << "not actual hint for object: " << hint.obj_key << dendl; + } else if (ret < 0) { + ldpp_dout(dpp, 1) << "cannot remove expired object: " << hint.obj_key << dendl; + } + + need_trim = true; + } + + return; +} + +void RGWObjectExpirer::trim_chunk(const DoutPrefixProvider *dpp, + const string& shard, + const utime_t& from, + const utime_t& to, + const string& from_marker, + const string& to_marker) +{ + ldpp_dout(dpp, 20) << "trying to trim removal hints to=" << to + << ", to_marker=" << to_marker << dendl; + + real_time rt_from = from.to_real_time(); + real_time rt_to = to.to_real_time(); + + int ret = exp_store.objexp_hint_trim(dpp, shard, rt_from, rt_to, + from_marker, to_marker); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR during trim: " << ret << dendl; + } + + return; +} + +bool RGWObjectExpirer::process_single_shard(const DoutPrefixProvider *dpp, + const string& shard, + const utime_t& last_run, + const utime_t& round_start) +{ + string marker; + string out_marker; + bool truncated = false; + bool done = true; + + CephContext *cct = driver->ctx(); + int num_entries = cct->_conf->rgw_objexp_chunk_size; + + int max_secs = cct->_conf->rgw_objexp_gc_interval; + utime_t end = ceph_clock_now(); + end += max_secs; + + rados::cls::lock::Lock l(objexp_lock_name); + + utime_t time(max_secs, 0); + l.set_duration(time); + + int ret = l.lock_exclusive(&static_cast(driver)->getRados()->objexp_pool_ctx, shard); + if (ret == -EBUSY) { /* already locked by another processor */ + ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << shard << dendl; + return false; + } + + do { + real_time rt_last = last_run.to_real_time(); + real_time rt_start = round_start.to_real_time(); + + list entries; + ret = exp_store.objexp_hint_list(dpp, shard, rt_last, rt_start, + num_entries, marker, entries, + &out_marker, &truncated); + if (ret < 0) { + ldpp_dout(dpp, 10) 
<< "cannot get removal hints from shard: " << shard + << dendl; + continue; + } + + bool need_trim; + garbage_chunk(dpp, entries, need_trim); + + if (need_trim) { + trim_chunk(dpp, shard, last_run, round_start, marker, out_marker); + } + + utime_t now = ceph_clock_now(); + if (now >= end) { + done = false; + break; + } + + marker = out_marker; + } while (truncated); + + l.unlock(&static_cast(driver)->getRados()->objexp_pool_ctx, shard); + return done; +} + +/* Returns true if all shards have been processed successfully. */ +bool RGWObjectExpirer::inspect_all_shards(const DoutPrefixProvider *dpp, + const utime_t& last_run, + const utime_t& round_start) +{ + CephContext * const cct = driver->ctx(); + int num_shards = cct->_conf->rgw_objexp_hints_num_shards; + bool all_done = true; + + for (int i = 0; i < num_shards; i++) { + string shard; + objexp_get_shard(i, &shard); + + ldpp_dout(dpp, 20) << "processing shard = " << shard << dendl; + + if (! process_single_shard(dpp, shard, last_run, round_start)) { + all_done = false; + } + } + + return all_done; +} + +bool RGWObjectExpirer::going_down() +{ + return down_flag; +} + +void RGWObjectExpirer::start_processor() +{ + worker = new OEWorker(driver->ctx(), this); + worker->create("rgw_obj_expirer"); +} + +void RGWObjectExpirer::stop_processor() +{ + down_flag = true; + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = NULL; +} + +void *RGWObjectExpirer::OEWorker::entry() { + utime_t last_run; + do { + utime_t start = ceph_clock_now(); + ldpp_dout(this, 2) << "object expiration: start" << dendl; + if (oe->inspect_all_shards(this, last_run, start)) { + /* All shards have been processed properly. Next time we can start + * from this moment. 
*/ + last_run = start; + } + ldpp_dout(this, 2) << "object expiration: stop" << dendl; + + + if (oe->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + int secs = cct->_conf->rgw_objexp_gc_interval; + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + std::unique_lock l{lock}; + /* The predicate is evaluated under the lock before sleeping, so a shutdown + * that set down_flag and notified before this thread started waiting is + * not missed (otherwise the join would block for up to 'secs'). */ + cond.wait_for(l, std::chrono::seconds(secs), [this] { return oe->going_down(); }); + } while (!oe->going_down()); + + return NULL; +} + +void RGWObjectExpirer::OEWorker::stop() +{ + std::lock_guard l{lock}; + cond.notify_all(); +} + +CephContext *RGWObjectExpirer::OEWorker::get_cct() const +{ + return cct; +} + +unsigned RGWObjectExpirer::OEWorker::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWObjectExpirer::OEWorker::gen_prefix(std::ostream& out) const +{ + return out << "rgw object expirer Worker thread: "; +} diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.h b/src/rgw/driver/rados/rgw_object_expirer_core.h new file mode 100644 index 000000000..be63815c1 --- /dev/null +++ b/src/rgw/driver/rados/rgw_object_expirer_core.h @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "common/ceph_mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "global/global_init.h" + +#include "include/common_fwd.h" +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_sal_rados.h" + +class RGWSI_RADOS; +class RGWSI_Zone; +class RGWBucketInfo; +class cls_timeindex_entry; + +class RGWObjExpStore { + CephContext *cct; + RGWSI_RADOS *rados_svc; + rgw::sal::RadosStore* driver; +public: + RGWObjExpStore(CephContext *_cct, RGWSI_RADOS *_rados_svc, rgw::sal::RadosStore* 
_driver) : cct(_cct), + rados_svc(_rados_svc), + driver(_driver) {} + + int objexp_hint_add(const DoutPrefixProvider *dpp, + const ceph::real_time& delete_at, + const std::string& tenant_name, + const std::string& bucket_name, + const std::string& bucket_id, + const rgw_obj_index_key& obj_key); + + int objexp_hint_list(const DoutPrefixProvider *dpp, + const std::string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const int max_entries, + const std::string& marker, + std::list& entries, /* out */ + std::string *out_marker, /* out */ + bool *truncated); /* out */ + + int objexp_hint_trim(const DoutPrefixProvider *dpp, + const std::string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const std::string& from_marker, + const std::string& to_marker); +}; + +class RGWObjectExpirer { +protected: + rgw::sal::Driver* driver; + RGWObjExpStore exp_store; + + class OEWorker : public Thread, public DoutPrefixProvider { + CephContext *cct; + RGWObjectExpirer *oe; + ceph::mutex lock = ceph::make_mutex("OEWorker"); + ceph::condition_variable cond; + + public: + OEWorker(CephContext * const cct, + RGWObjectExpirer * const oe) + : cct(cct), + oe(oe) { + } + + void *entry() override; + void stop(); + + CephContext *get_cct() const override; + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + }; + + OEWorker *worker{nullptr}; + std::atomic down_flag = { false }; + +public: + explicit RGWObjectExpirer(rgw::sal::Driver* _driver) + : driver(_driver), + exp_store(_driver->ctx(), static_cast(driver)->svc()->rados, static_cast(driver)), + worker(NULL) { + } + ~RGWObjectExpirer() { + stop_processor(); + } + + int hint_add(const DoutPrefixProvider *dpp, + const ceph::real_time& delete_at, + const std::string& tenant_name, + const std::string& bucket_name, + const std::string& bucket_id, + const rgw_obj_index_key& obj_key) { + return exp_store.objexp_hint_add(dpp, delete_at, 
tenant_name, bucket_name, + bucket_id, obj_key); + } + + int garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint); + + void garbage_chunk(const DoutPrefixProvider *dpp, + std::list& entries, /* in */ + bool& need_trim); /* out */ + + void trim_chunk(const DoutPrefixProvider *dpp, + const std::string& shard, + const utime_t& from, + const utime_t& to, + const std::string& from_marker, + const std::string& to_marker); + + bool process_single_shard(const DoutPrefixProvider *dpp, + const std::string& shard, + const utime_t& last_run, + const utime_t& round_start); + + bool inspect_all_shards(const DoutPrefixProvider *dpp, + const utime_t& last_run, + const utime_t& round_start); + + bool going_down(); + void start_processor(); + void stop_processor(); +}; diff --git a/src/rgw/driver/rados/rgw_otp.cc b/src/rgw/driver/rados/rgw_otp.cc new file mode 100644 index 000000000..07cc14f11 --- /dev/null +++ b/src/rgw/driver/rados/rgw_otp.cc @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "rgw_otp.h" +#include "rgw_zone.h" +#include "rgw_metadata.h" + +#include "include/types.h" + +#include "rgw_common.h" +#include "rgw_tools.h" + +#include "services/svc_zone.h" +#include "services/svc_meta.h" +#include "services/svc_meta_be.h" +#include "services/svc_meta_be_otp.h" +#include "services/svc_otp.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + + +class RGWOTPMetadataHandler; + +class RGWOTPMetadataObject : public RGWMetadataObject { + friend class RGWOTPMetadataHandler; + + otp_devices_list_t devices; +public: + RGWOTPMetadataObject() {} + RGWOTPMetadataObject(otp_devices_list_t&& _devices, const obj_version& v, const real_time m) { + devices = std::move(_devices); + objv = v; + mtime = m; + } + + void dump(Formatter *f) 
const override { + encode_json("devices", devices, f); + } + + otp_devices_list_t& get_devs() { + return devices; + } +}; + + +class RGWOTPMetadataHandler : public RGWOTPMetadataHandlerBase { + friend class RGWOTPCtl; + + struct Svc { + RGWSI_Zone *zone; + RGWSI_MetaBackend *meta_be; + RGWSI_OTP *otp; + } svc; + + int init(RGWSI_Zone *zone, + RGWSI_MetaBackend *_meta_be, + RGWSI_OTP *_otp) { + base_init(zone->ctx(), _otp->get_be_handler().get()); + svc.zone = zone; + svc.meta_be = _meta_be; + svc.otp = _otp; + return 0; + } + + int call(std::function f) { + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + RGWSI_OTP_BE_Ctx ctx(op->ctx()); + return f(ctx); + }); + } + + RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override { + otp_devices_list_t devices; + try { + JSONDecoder::decode_json("devices", devices, jo); + } catch (JSONDecoder::err& e) { + return nullptr; + } + + return new RGWOTPMetadataObject(std::move(devices), objv, mtime); + } + + int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override { + RGWObjVersionTracker objv_tracker; + + std::unique_ptr mdo(new RGWOTPMetadataObject); + + + RGWSI_OTP_BE_Ctx be_ctx(op->ctx()); + + int ret = svc.otp->read_all(be_ctx, + entry, + &mdo->get_devs(), + &mdo->get_mtime(), + &objv_tracker, + y, + dpp); + if (ret < 0) { + return ret; + } + + mdo->objv = objv_tracker.read_version; + + *obj = mdo.release(); + + return 0; + } + + int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) override { + RGWOTPMetadataObject *obj = static_cast(_obj); + + RGWSI_OTP_BE_Ctx be_ctx(op->ctx()); + + int ret = svc.otp->store_all(dpp, be_ctx, + entry, + obj->devices, + obj->mtime, + &objv_tracker, + y); + if (ret < 0) 
{ + return ret; + } + + return STATUS_APPLIED; + } + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + RGWSI_MBOTP_RemoveParams params; + + RGWSI_OTP_BE_Ctx be_ctx(op->ctx()); + + return svc.otp->remove_all(dpp, be_ctx, + entry, + &objv_tracker, + y); + } + +public: + RGWOTPMetadataHandler() {} + + string get_type() override { return "otp"; } +}; + + +RGWOTPCtl::RGWOTPCtl(RGWSI_Zone *zone_svc, + RGWSI_OTP *otp_svc) +{ + svc.zone = zone_svc; + svc.otp = otp_svc; +} + + +void RGWOTPCtl::init(RGWOTPMetadataHandler *_meta_handler) +{ + meta_handler = _meta_handler; + be_handler = meta_handler->get_be_handler(); +} + +int RGWOTPCtl::read_all(const rgw_user& uid, + RGWOTPInfo *info, + optional_yield y, + const DoutPrefixProvider *dpp, + const GetParams& params) +{ + info->uid = uid; + return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) { + return svc.otp->read_all(ctx, uid, &info->devices, params.mtime, params.objv_tracker, y, dpp); + }); +} + +int RGWOTPCtl::store_all(const DoutPrefixProvider *dpp, + const RGWOTPInfo& info, + optional_yield y, + const PutParams& params) +{ + return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) { + return svc.otp->store_all(dpp, ctx, info.uid, info.devices, params.mtime, params.objv_tracker, y); + }); +} + +int RGWOTPCtl::remove_all(const DoutPrefixProvider *dpp, + const rgw_user& uid, + optional_yield y, + const RemoveParams& params) +{ + return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) { + return svc.otp->remove_all(dpp, ctx, uid, params.objv_tracker, y); + }); +} + + +RGWMetadataHandler *RGWOTPMetaHandlerAllocator::alloc() +{ + return new RGWOTPMetadataHandler(); +} diff --git a/src/rgw/driver/rados/rgw_otp.h b/src/rgw/driver/rados/rgw_otp.h new file mode 100644 index 000000000..885e8abb8 --- /dev/null +++ b/src/rgw/driver/rados/rgw_otp.h @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_sal_fwd.h" +#include "cls/otp/cls_otp_types.h" +#include "services/svc_meta_be_otp.h" + +#include "rgw_basic_types.h" +#include "rgw_metadata.h" + + +class RGWObjVersionTracker; +class RGWMetadataHandler; +class RGWOTPMetadataHandler; +class RGWSI_Zone; +class RGWSI_OTP; +class RGWSI_MetaBackend; + +class RGWOTPMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE { +public: + virtual ~RGWOTPMetadataHandlerBase() {} + virtual int init(RGWSI_Zone *zone, + RGWSI_MetaBackend *_meta_be, + RGWSI_OTP *_otp) = 0; +}; + +class RGWOTPMetaHandlerAllocator { +public: + static RGWMetadataHandler *alloc(); +}; + +struct RGWOTPInfo { + rgw_user uid; + otp_devices_list_t devices; +}; + + +class RGWOTPCtl +{ + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_OTP *otp{nullptr}; + } svc; + + RGWOTPMetadataHandler *meta_handler; + RGWSI_MetaBackend_Handler *be_handler; + +public: + RGWOTPCtl(RGWSI_Zone *zone_svc, + RGWSI_OTP *otp_svc); + + void init(RGWOTPMetadataHandler *_meta_handler); + + struct GetParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + ceph::real_time *mtime{nullptr}; + + GetParams() {} + + GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + GetParams& set_mtime(ceph::real_time *_mtime) { + mtime = _mtime; + return *this; + } + }; + + struct PutParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + ceph::real_time mtime; + + PutParams() {} + + PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + PutParams& set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + return *this; + } + }; + + struct RemoveParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + + RemoveParams() {} + + RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + }; + + int 
read_all(const rgw_user& uid, RGWOTPInfo *info, optional_yield y, + const DoutPrefixProvider *dpp, + const GetParams& params = {}); + int store_all(const DoutPrefixProvider *dpp, + const RGWOTPInfo& info, optional_yield y, + const PutParams& params = {}); + int remove_all(const DoutPrefixProvider *dpp, + const rgw_user& user, optional_yield y, + const RemoveParams& params = {}); +}; diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc new file mode 100644 index 000000000..61602b354 --- /dev/null +++ b/src/rgw/driver/rados/rgw_period.cc @@ -0,0 +1,324 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_sync.h" + +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw_zone_defaults; + +int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, + const string& zonegroup_id) const +{ + map::const_iterator iter; + if (!zonegroup_id.empty()) { + iter = period_map.zonegroups.find(zonegroup_id); + } else { + iter = period_map.zonegroups.find("default"); + } + if (iter != period_map.zonegroups.end()) { + zonegroup = iter->second; + return 0; + } + + return -ENOENT; +} + +int RGWPeriod::get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& latest_epoch, optional_yield y) +{ + RGWPeriodLatestEpochInfo info; + + int ret = read_latest_epoch(dpp, info, y); + if (ret < 0) { + return ret; + } + + latest_epoch = info.epoch; + + return 0; +} + +int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y) +{ + rgw_pool pool(get_pool(cct)); + + // delete the object for each period epoch + for (epoch_t e = 1; e <= epoch; e++) { + RGWPeriod p{get_id(), e}; + rgw_raw_obj oid{pool, p.get_period_oid()}; + auto sysobj = sysobj_svc->get_obj(oid); + int ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid + << ": " << cpp_strerror(-ret) << dendl; 
+ } + } + + // delete the .latest_epoch object + rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()}; + auto sysobj = sysobj_svc->get_obj(oid); + int ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid + << ": " << cpp_strerror(-ret) << dendl; + } + return ret; +} + +int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y) +{ + if (zonegroup.realm_id != realm_id) { + return 0; + } + int ret = period_map.update(zonegroup, cct); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl; + return ret; + } + + return store_info(dpp, false, y); +} + +int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y) +{ + auto zone_svc = sysobj_svc->get_zone_svc(); + ldpp_dout(dpp, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl; + list zonegroups; + int ret = zone_svc->list_zonegroups(dpp, zonegroups); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl; + return ret; + } + + // clear zone short ids of removed zones. 
period_map.update() will add the + // remaining zones back + period_map.short_zone_ids.clear(); + + for (auto& iter : zonegroups) { + RGWZoneGroup zg(string(), iter); + ret = zg.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl; + continue; + } + + if (zg.realm_id != realm_id) { + ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl; + continue; + } + + if (zg.master_zone.empty()) { + ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl; + return -EINVAL; + } + + if (zg.zones.find(zg.master_zone) == zg.zones.end()) { + ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() + << " has a non existent master zone "<< dendl; + return -EINVAL; + } + + if (zg.is_master_zonegroup()) { + master_zonegroup = zg.get_id(); + master_zone = zg.master_zone; + } + + int ret = period_map.update(zg, cct); + if (ret < 0) { + return ret; + } + } + + ret = period_config.read(dpp, sysobj_svc, realm_id, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: failed to read period config: " + << cpp_strerror(ret) << dendl; + return ret; + } + return 0; +} + +void RGWPeriod::fork() +{ + ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl; + predecessor_uuid = id; + id = get_staging_id(realm_id); + period_map.reset(); + realm_epoch++; +} + +static int read_sync_status(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw_meta_sync_status *sync_status) +{ + rgw::sal::RadosStore* rados_store = static_cast(driver); + // initialize a sync status manager to read the status + RGWMetaSyncStatusManager mgr(rados_store, rados_store->svc()->rados->get_async_processor()); + int r = mgr.init(dpp); + if (r < 0) { + return r; + } + r = mgr.read_sync_status(dpp, sync_status); + mgr.stop(); + return r; +} + +int 
RGWPeriod::update_sync_status(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, /* for now */ + const RGWPeriod &current_period, + std::ostream& error_stream, + bool force_if_stale) +{ + rgw_meta_sync_status status; + int r = read_sync_status(dpp, driver, &status); + if (r < 0) { + ldpp_dout(dpp, 0) << "period failed to read sync status: " + << cpp_strerror(-r) << dendl; + return r; + } + + std::vector markers; + + const auto current_epoch = current_period.get_realm_epoch(); + if (current_epoch != status.sync_info.realm_epoch) { + // no sync status markers for the current period + ceph_assert(current_epoch > status.sync_info.realm_epoch); + const int behind = current_epoch - status.sync_info.realm_epoch; + if (!force_if_stale && current_epoch > 1) { + error_stream << "ERROR: This zone is " << behind << " period(s) behind " + "the current master zone in metadata sync. If this zone is promoted " + "to master, any metadata changes during that time are likely to " + "be lost.\n" + "Waiting for this zone to catch up on metadata sync (see " + "'radosgw-admin sync status') is recommended.\n" + "To promote this zone to master anyway, add the flag " + "--yes-i-really-mean-it."
<< std::endl; + return -EINVAL; + } + // empty sync status markers - other zones will skip this period during + // incremental metadata sync + markers.resize(status.sync_info.num_shards); + } else { + markers.reserve(status.sync_info.num_shards); + for (auto& i : status.sync_markers) { + auto& marker = i.second; + // filter out markers from other periods + if (marker.realm_epoch != current_epoch) { + marker.marker.clear(); + } + markers.emplace_back(std::move(marker.marker)); + } + } + + std::swap(sync_status, markers); + return 0; +} + +int RGWPeriod::commit(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWRealm& realm, const RGWPeriod& current_period, + std::ostream& error_stream, optional_yield y, + bool force_if_stale) +{ + auto zone_svc = sysobj_svc->get_zone_svc(); + ldpp_dout(dpp, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl; + // gateway must be in the master zone to commit + if (master_zone != zone_svc->get_zone_params().get_id()) { + error_stream << "Cannot commit period on zone " + << zone_svc->get_zone_params().get_id() << ", it must be sent to " + "the period's master zone " << master_zone << '.' << std::endl; + return -EINVAL; + } + // period predecessor must match current period + if (predecessor_uuid != current_period.get_id()) { + error_stream << "Period predecessor " << predecessor_uuid + << " does not match current period " << current_period.get_id() + << ". Use 'period pull' to get the latest period from the master, " + "reapply your changes, and try again." << std::endl; + return -EINVAL; + } + // realm epoch must be 1 greater than current period + if (realm_epoch != current_period.get_realm_epoch() + 1) { + error_stream << "Period's realm epoch " << realm_epoch + << " does not come directly after current realm epoch " + << current_period.get_realm_epoch() << ". 
Use 'realm pull' to get the " + "latest realm and period from the master zone, reapply your changes, " + "and try again." << std::endl; + return -EINVAL; + } + // did the master zone change? + if (master_zone != current_period.get_master_zone()) { + // store the current metadata sync status in the period + int r = update_sync_status(dpp, driver, current_period, error_stream, force_if_stale); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to update metadata sync status: " + << cpp_strerror(-r) << dendl; + return r; + } + // create an object with a new period id + r = create(dpp, y, true); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl; + return r; + } + // set as current period + r = realm.set_current_period(dpp, *this, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to update realm's current period: " + << cpp_strerror(-r) << dendl; + return r; + } + ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period " + << id << dendl; + realm.notify_new_period(dpp, *this, y); + return 0; + } + // period must be based on current epoch + if (epoch != current_period.get_epoch()) { + error_stream << "Period epoch " << epoch << " does not match " + "predecessor epoch " << current_period.get_epoch() + << ". Use 'period pull' to get the latest epoch from the master zone, " + "reapply your changes, and try again." 
<< std::endl; + return -EINVAL; + } + // set period as next epoch + set_id(current_period.get_id()); + set_epoch(current_period.get_epoch() + 1); + set_predecessor(current_period.get_predecessor()); + realm_epoch = current_period.get_realm_epoch(); + // write the period to rados + int r = store_info(dpp, false, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(-r) << dendl; + return r; + } + // set as latest epoch + r = update_latest_epoch(dpp, epoch, y); + if (r == -EEXIST) { + // already have this epoch (or a more recent one) + return 0; + } + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl; + return r; + } + r = reflect(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl; + return r; + } + ldpp_dout(dpp, 4) << "Committed new epoch " << epoch + << " for period " << id << dendl; + realm.notify_new_period(dpp, *this, y); + return 0; +} + +void RGWPeriod::generate_test_instances(list &o) +{ + RGWPeriod *z = new RGWPeriod; + o.push_back(z); + o.push_back(new RGWPeriod); +} + + diff --git a/src/rgw/driver/rados/rgw_pubsub_push.cc b/src/rgw/driver/rados/rgw_pubsub_push.cc new file mode 100644 index 000000000..bdb24ce9a --- /dev/null +++ b/src/rgw/driver/rados/rgw_pubsub_push.cc @@ -0,0 +1,460 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_pubsub_push.h" +#include +#include +#include +#include "include/buffer_fwd.h" +#include "common/Formatter.h" +#include "common/iso_8601.h" +#include "common/async/completion.h" +#include "rgw_common.h" +#include "rgw_data_sync.h" +#include "rgw_pubsub.h" +#include "acconfig.h" +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +#include "rgw_amqp.h" +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +#include "rgw_kafka.h" +#endif +#include +#include +#include +#include "rgw_perf_counters.h" + +using namespace rgw; + +template +std::string 
json_format_pubsub_event(const EventType& event) { + std::stringstream ss; + JSONFormatter f(false); + { + Formatter::ObjectSection s(f, EventType::json_type_plural); + { + Formatter::ArraySection s(f, EventType::json_type_plural); + encode_json("", event, &f); + } + } + f.flush(ss); + return ss.str(); +} + +bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) { + bool value; + bool exists; + if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) { + throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name); + } + if (!exists) { + return default_value; + } + return value; +} + +class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint { +private: + const std::string endpoint; + typedef unsigned ack_level_t; + ack_level_t ack_level; // TODO: not used for now + const bool verify_ssl; + const bool cloudevents; + static const ack_level_t ACK_LEVEL_ANY = 0; + static const ack_level_t ACK_LEVEL_NON_ERROR = 1; + +public: + RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) : + endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false)) + { + bool exists; + const auto& str_ack_level = args.get("http-ack-level", &exists); + if (!exists || str_ack_level == "any") { + // "any" is default + ack_level = ACK_LEVEL_ANY; + } else if (str_ack_level == "non-error") { + ack_level = ACK_LEVEL_NON_ERROR; + } else { + ack_level = std::atoi(str_ack_level.c_str()); + if (ack_level < 100 || ack_level >= 600) { + throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level); + } + } + } + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { + bufferlist read_bl; + RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl); + const auto post_data = json_format_pubsub_event(event); + if (cloudevents) { + // following: 
https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md + // using "Binary Content Mode" + request.append_header("ce-specversion", "1.0"); + request.append_header("ce-type", "com.amazonaws." + event.eventName); + request.append_header("ce-time", to_iso_8601(event.eventTime)); + // default output of iso8601 is also RFC3339 compatible + request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2); + request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name); + request.append_header("ce-subject", event.object_key); + } + request.set_post_data(post_data); + request.set_send_length(post_data.length()); + request.append_header("Content-Type", "application/json"); + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); + const auto rc = RGWHTTP::process(&request, y); + if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); + // TODO: use read_bl to process return code and handle according to ack level + return rc; + } + + std::string to_str() const override { + std::string str("HTTP/S Endpoint"); + str += "\nURI: " + endpoint; + str += (verify_ssl ? 
"\nverify SSL" : "\ndon't verify SSL"); + return str; + } +}; + +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint { +private: + enum class ack_level_t { + None, + Broker, + Routable + }; + CephContext* const cct; + const std::string endpoint; + const std::string topic; + const std::string exchange; + ack_level_t ack_level; + amqp::connection_id_t conn_id; + + bool get_verify_ssl(const RGWHTTPArgs& args) { + bool exists; + auto str_verify_ssl = args.get("verify-ssl", &exists); + if (!exists) { + // verify server certificate by default + return true; + } + boost::algorithm::to_lower(str_verify_ssl); + if (str_verify_ssl == "true") { + return true; + } + if (str_verify_ssl == "false") { + return false; + } + throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl); + } + + std::string get_exchange(const RGWHTTPArgs& args) { + bool exists; + const auto exchange = args.get("amqp-exchange", &exists); + if (!exists) { + throw configuration_error("AMQP: missing amqp-exchange"); + } + return exchange; + } + + ack_level_t get_ack_level(const RGWHTTPArgs& args) { + bool exists; + const auto& str_ack_level = args.get("amqp-ack-level", &exists); + if (!exists || str_ack_level == "broker") { + // "broker" is default + return ack_level_t::Broker; + } + if (str_ack_level == "none") { + return ack_level_t::None; + } + if (str_ack_level == "routable") { + return ack_level_t::Routable; + } + throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level); + } + +public: + RGWPubSubAMQPEndpoint(const std::string& _endpoint, + const std::string& _topic, + const RGWHTTPArgs& args, + CephContext* _cct) : + cct(_cct), + endpoint(_endpoint), + topic(_topic), + exchange(get_exchange(args)), + ack_level(get_ack_level(args)) { + if (!amqp::connect(conn_id, endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) { + throw configuration_error("AMQP: failed to 
create connection to: " + endpoint); + } + } + + // this allows waiting until "finish()" is called from a different thread + // waiting could be blocking the waiting thread or yielding, depending + // on compilation flag support and whether the optional_yield is set + class Waiter { + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + std::unique_ptr completion = nullptr; + int ret; + + mutable std::atomic done = false; + mutable std::mutex lock; + mutable std::condition_variable cond; + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + public: + int wait(optional_yield y) { + if (done) { + return ret; + } + if (y) { + auto& io_ctx = y.get_io_context(); + auto& yield_ctx = y.get_yield_context(); + boost::system::error_code ec; + async_wait(io_ctx, yield_ctx[ec]); + return -ec.value(); + } + std::unique_lock l(lock); + cond.wait(l, [this]{return (done==true);}); + return ret; + } + + void finish(int r) { + std::unique_lock l{lock}; + ret = r; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + }; + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { + if (ack_level == ack_level_t::None) { + return amqp::publish(conn_id, topic, json_format_pubsub_event(event)); + } else { + // TODO: currently broker and routable are the same - this will require different flags but the same mechanism + // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine + auto w = std::unique_ptr(new Waiter); + const auto rc =
amqp::publish_with_confirm(conn_id, + topic, + json_format_pubsub_event(event), + std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return rc; + } + return w->wait(y); + } + } + + std::string to_str() const override { + std::string str("AMQP(0.9.1) Endpoint"); + str += "\nURI: " + endpoint; + str += "\nTopic: " + topic; + str += "\nExchange: " + exchange; + return str; + } +}; + +static const std::string AMQP_0_9_1("0-9-1"); +static const std::string AMQP_1_0("1-0"); +static const std::string AMQP_SCHEMA("amqp"); +#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT + +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint { +private: + enum class ack_level_t { + None, + Broker, + }; + CephContext* const cct; + const std::string topic; + const ack_level_t ack_level; + std::string conn_name; + + + ack_level_t get_ack_level(const RGWHTTPArgs& args) { + bool exists; + const auto& str_ack_level = args.get("kafka-ack-level", &exists); + if (!exists || str_ack_level == "broker") { + // "broker" is default + return ack_level_t::Broker; + } + if (str_ack_level == "none") { + return ack_level_t::None; + } + throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level); + } + +public: + RGWPubSubKafkaEndpoint(const std::string& _endpoint, + const std::string& _topic, + const RGWHTTPArgs& args, + CephContext* _cct) : + cct(_cct), + topic(_topic), + ack_level(get_ack_level(args)) { + if (!kafka::connect(conn_name, _endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), + args.get_optional("ca-location"), args.get_optional("mechanism"))) { + throw configuration_error("Kafka: failed to create connection to: " + _endpoint); + } + } + + // this allows waiting until "finish()" is called from a different thread + // waiting could be blocking the waiting thread or yielding, depending + // on compilation flag support and whether the
optional_yield is set + class Waiter { + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + std::unique_ptr completion = nullptr; + int ret; + + mutable std::atomic done = false; + mutable std::mutex lock; + mutable std::condition_variable cond; + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + public: + int wait(optional_yield y) { + if (done) { + return ret; + } + if (y) { + auto& io_ctx = y.get_io_context(); + auto& yield_ctx = y.get_yield_context(); + boost::system::error_code ec; + async_wait(io_ctx, yield_ctx[ec]); + return -ec.value(); + } + std::unique_lock l(lock); + cond.wait(l, [this]{return (done==true);}); + return ret; + } + + void finish(int r) { + std::unique_lock l{lock}; + ret = r; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + }; + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { + if (ack_level == ack_level_t::None) { + return kafka::publish(conn_name, topic, json_format_pubsub_event(event)); + } else { + // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine + auto w = std::unique_ptr(new Waiter); + const auto rc = kafka::publish_with_confirm(conn_name, + topic, + json_format_pubsub_event(event), + std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return rc; + } + return w->wait(y); + } + } + + std::string to_str() const override { + std::string str("Kafka Endpoint"); + str += "\nBroker: " + conn_name; + str += 
"\nTopic: " + topic; + return str; + } +}; + +static const std::string KAFKA_SCHEMA("kafka"); +#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT + +static const std::string WEBHOOK_SCHEMA("webhook"); +static const std::string UNKNOWN_SCHEMA("unknown"); +static const std::string NO_SCHEMA(""); + +const std::string& get_schema(const std::string& endpoint) { + if (endpoint.empty()) { + return NO_SCHEMA; + } + const auto pos = endpoint.find(':'); + if (pos == std::string::npos) { + return UNKNOWN_SCHEMA; + } + const auto& schema = endpoint.substr(0,pos); + if (schema == "http" || schema == "https") { + return WEBHOOK_SCHEMA; +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + } else if (schema == "amqp" || schema == "amqps") { + return AMQP_SCHEMA; +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + } else if (schema == "kafka") { + return KAFKA_SCHEMA; +#endif + } + return UNKNOWN_SCHEMA; +} + +RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint, + const std::string& topic, + const RGWHTTPArgs& args, + CephContext* cct) { + const auto& schema = get_schema(endpoint); + if (schema == WEBHOOK_SCHEMA) { + return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args)); +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + } else if (schema == AMQP_SCHEMA) { + bool exists; + std::string version = args.get("amqp-version", &exists); + if (!exists) { + version = AMQP_0_9_1; + } + if (version == AMQP_0_9_1) { + return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct)); + } else if (version == AMQP_1_0) { + throw configuration_error("AMQP: v1.0 not supported"); + return nullptr; + } else { + throw configuration_error("AMQP: unknown version: " + version); + return nullptr; + } +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + } else if (schema == KAFKA_SCHEMA) { + return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct)); +#endif + } + + throw configuration_error("unknown schema in: " + endpoint); + return nullptr; +} + diff --git a/src/rgw/driver/rados/rgw_pubsub_push.h 
b/src/rgw/driver/rados/rgw_pubsub_push.h new file mode 100644 index 000000000..17905937c --- /dev/null +++ b/src/rgw/driver/rados/rgw_pubsub_push.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +#pragma once + +#include +#include +#include +#include "include/buffer_fwd.h" +#include "include/common_fwd.h" +#include "common/async/yield_context.h" + +// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes +class RGWDataSyncEnv; +class RGWHTTPArgs; +struct rgw_pubsub_s3_event; + +// endpoint base class - all endpoint types should derive from it +class RGWPubSubEndpoint { +public: + RGWPubSubEndpoint() = default; + // endpoint should not be copied + RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete; + const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete; + + typedef std::unique_ptr Ptr; + + // factory method for the actual notification endpoint + // derived class specific arguments are passed in http args format + // may throw a configuration_error if creation fails + static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr); + + // this method is used in order to send notification (S3 compliant) and wait for completion + // in async manner via a coroutine when invoked in the frontend environment + virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0; + + // present as string + virtual std::string to_str() const { return ""; } + + virtual ~RGWPubSubEndpoint() = default; + + // exception object for configuration error + struct configuration_error : public std::logic_error { + configuration_error(const std::string& what_arg) : + std::logic_error("pubsub endpoint configuration error: " + what_arg) {} + }; +}; + diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc
b/src/rgw/driver/rados/rgw_putobj_processor.cc new file mode 100644 index 000000000..e453db5a9 --- /dev/null +++ b/src/rgw/driver/rados/rgw_putobj_processor.cc @@ -0,0 +1,761 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/rados/librados.hpp" +#include "rgw_aio.h" +#include "rgw_putobj_processor.h" +#include "rgw_multi.h" +#include "rgw_compression.h" +#include "services/svc_sys_obj.h" +#include "services/svc_zone.h" +#include "rgw_sal_rados.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw::putobj { + +/* + * For the cloudtiered objects, update the object manifest with the + * cloudtier config info read from the attrs. + * Since these attrs are used internally for only replication, do not store them + * in the head object. 
+ */ +void read_cloudtier_info_from_attrs(rgw::sal::Attrs& attrs, RGWObjCategory& category, + RGWObjManifest& manifest) { + auto attr_iter = attrs.find(RGW_ATTR_CLOUD_TIER_TYPE); + if (attr_iter != attrs.end()) { + auto i = attr_iter->second; + string m = i.to_str(); + + if (m == "cloud-s3") { + category = RGWObjCategory::CloudTiered; + manifest.set_tier_type("cloud-s3"); + + auto config_iter = attrs.find(RGW_ATTR_CLOUD_TIER_CONFIG); + if (config_iter != attrs.end()) { + auto i = config_iter->second.cbegin(); + RGWObjTier tier_config; + + try { + using ceph::decode; + decode(tier_config, i); + manifest.set_tier_config(tier_config); + attrs.erase(config_iter); + } catch (buffer::error& err) { + } + } + } + attrs.erase(attr_iter); + } +} + +int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset) +{ + const bool flush = (data.length() == 0); + + // capture the first chunk for special handling + if (data_offset < head_chunk_size || data_offset == 0) { + if (flush) { + // flush partial chunk + return process_first_chunk(std::move(head_data), &processor); + } + + auto remaining = head_chunk_size - data_offset; + auto count = std::min(data.length(), remaining); + data.splice(0, count, &head_data); + data_offset += count; + + if (data_offset == head_chunk_size) { + // process the first complete chunk + ceph_assert(head_data.length() == head_chunk_size); + int r = process_first_chunk(std::move(head_data), &processor); + if (r < 0) { + return r; + } + } + if (data.length() == 0) { // avoid flushing stripe processor + return 0; + } + } + ceph_assert(processor); // process_first_chunk() must initialize + + // send everything else through the processor + auto write_offset = data_offset; + data_offset += data.length(); + return processor->process(std::move(data), write_offset); +} + + +static int process_completed(const AioResultList& completed, RawObjSet *written) +{ + std::optional error; + for (auto& r : completed) { + if (r.result >= 0) { + 
written->insert(r.obj.get_ref().obj); + } else if (!error) { // record first error code + error = r.result; + } + } + return error.value_or(0); +} + +void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) { + const RGWObjStateManifest *sm = obj_ctx.get_state(head_obj); + const bool compressed = sm->state.compressed; + uint32_t alloc_hint_flags = 0; + if (compressed) { + alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; + } + + op.set_alloc_hint2(0, 0, alloc_hint_flags); +} + +int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj) +{ + stripe_obj = store->svc.rados->obj(raw_obj); + return stripe_obj.open(dpp); +} + +int RadosWriter::process(bufferlist&& bl, uint64_t offset) +{ + bufferlist data = std::move(bl); + const uint64_t cost = data.length(); + if (cost == 0) { // no empty writes, use aio directly for creates + return 0; + } + librados::ObjectWriteOperation op; + add_write_hint(op); + if (offset == 0) { + op.write_full(data); + } else { + op.write(offset, data); + } + constexpr uint64_t id = 0; // unused + auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id); + return process_completed(c, &written); +} + +int RadosWriter::write_exclusive(const bufferlist& data) +{ + const uint64_t cost = data.length(); + + librados::ObjectWriteOperation op; + op.create(true); // exclusive create + add_write_hint(op); + op.write_full(data); + + constexpr uint64_t id = 0; // unused + auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id); + auto d = aio->drain(); + c.splice(c.end(), d); + return process_completed(c, &written); +} + +int RadosWriter::drain() +{ + return process_completed(aio->drain(), &written); +} + +RadosWriter::~RadosWriter() +{ + // wait on any outstanding aio completions + process_completed(aio->drain(), &written); + + bool need_to_remove_head = false; + std::optional raw_head; + if (!head_obj.empty()) { + raw_head.emplace(); + store->obj_to_raw(bucket_info.placement_rule, head_obj, 
&*raw_head); + } + + /** + * We should delete the object in the "multipart" namespace to avoid race condition. + * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart + * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects + * written by the second upload may be deleted by the first upload. + * details is describled on #11749 + * + * The above comment still stands, but instead of searching for a specific object in the multipart + * namespace, we just make sure that we remove the object that is marked as the head object after + * we remove all the other raw objects. Note that we use different call to remove the head object, + * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme. + */ + for (const auto& obj : written) { + if (raw_head && obj == *raw_head) { + ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl; + need_to_remove_head = true; + continue; + } + + int r = store->delete_raw_obj(dpp, obj); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl; + } + } + + if (need_to_remove_head) { + std::string version_id; + ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl; + int r = store->delete_obj(dpp, obj_ctx, bucket_info, head_obj, 0, 0); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl; + } + } +} + + +// advance to the next stripe +int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size) +{ + // advance the manifest + int r = manifest_gen.create_next(offset); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + uint64_t chunk_size = 0; + r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size, dpp); + if (r < 0) { + return r; + } + r = 
writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + + chunk = ChunkProcessor(&writer, chunk_size); + *pstripe_size = manifest_gen.cur_stripe_max_size(); + return 0; +} + + + +int AtomicObjectProcessor::process_first_chunk(bufferlist&& data, + DataProcessor **processor) +{ + first_chunk = std::move(data); + *processor = &stripe; + return 0; +} + +int AtomicObjectProcessor::prepare(optional_yield y) +{ + uint64_t max_head_chunk_size; + uint64_t head_max_size; + uint64_t chunk_size = 0; + uint64_t alignment; + rgw_pool head_pool; + + if (!store->get_obj_data_pool(bucket_info.placement_rule, head_obj, &head_pool)) { + return -EIO; + } + + int r = store->get_max_chunk_size(head_pool, &max_head_chunk_size, dpp, &alignment); + if (r < 0) { + return r; + } + + bool same_pool = true; + if (bucket_info.placement_rule != tail_placement_rule) { + rgw_pool tail_pool; + if (!store->get_obj_data_pool(tail_placement_rule, head_obj, &tail_pool)) { + return -EIO; + } + + if (tail_pool != head_pool) { + same_pool = false; + + r = store->get_max_chunk_size(tail_pool, &chunk_size, dpp); + if (r < 0) { + return r; + } + + head_max_size = 0; + } + } + + if (same_pool) { + RGWZonePlacementInfo placement_info; + if (!store->svc.zone->get_zone_params().get_placement(bucket_info.placement_rule.name, &placement_info) || placement_info.inline_data) { + head_max_size = max_head_chunk_size; + } else { + head_max_size = 0; + } + chunk_size = max_head_chunk_size; + } + + uint64_t stripe_size; + const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size; + + store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size); + + manifest.set_trivial_rule(head_max_size, stripe_size); + + r = manifest_gen.create_begin(store->ctx(), &manifest, + bucket_info.placement_rule, + &tail_placement_rule, + head_obj.bucket, head_obj); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + r = writer.set_stripe_obj(stripe_obj); 
+ if (r < 0) { + return r; + } + + set_head_chunk_size(head_max_size); + // initialize the processors + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, head_max_size); + return 0; +} + +int AtomicObjectProcessor::complete(size_t accounted_size, + const std::string& etag, + ceph::real_time *mtime, + ceph::real_time set_mtime, + rgw::sal::Attrs& attrs, + ceph::real_time delete_at, + const char *if_match, + const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, + bool *pcanceled, optional_yield y) +{ + int r = writer.drain(); + if (r < 0) { + return r; + } + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + + obj_ctx.set_atomic(head_obj); + + RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj); + + /* some object types shouldn't be versioned, e.g., multipart parts */ + op_target.set_versioning_disabled(!bucket_info.versioning_enabled()); + + RGWRados::Object::Write obj_op(&op_target); + obj_op.meta.data = &first_chunk; + obj_op.meta.manifest = &manifest; + obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ + obj_op.meta.if_match = if_match; + obj_op.meta.if_nomatch = if_nomatch; + obj_op.meta.mtime = mtime; + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.owner = owner; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.olh_epoch = olh_epoch; + obj_op.meta.delete_at = delete_at; + obj_op.meta.user_data = user_data; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + + read_cloudtier_info_from_attrs(attrs, obj_op.meta.category, manifest); + + r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); + if (r < 0) { + if (r == -ETIMEDOUT) { + // The head object write may eventually succeed, clear the set of objects for deletion. 
if it + // doesn't ever succeed, we'll orphan any tail objects as if we'd crashed before that write + writer.clear_written(); + } + return r; + } + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + return 0; +} + + +int MultipartObjectProcessor::process_first_chunk(bufferlist&& data, + DataProcessor **processor) +{ + // write the first chunk of the head object as part of an exclusive create, + // then drain to wait for the result in case of EEXIST + int r = writer.write_exclusive(data); + if (r == -EEXIST) { + // randomize the oid prefix and reprepare the head/manifest + std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32); + + mp.init(target_obj.key.name, upload_id, oid_rand); + manifest.set_prefix(target_obj.key.name + "." + oid_rand); + + r = prepare_head(); + if (r < 0) { + return r; + } + // resubmit the write op on the new head object + r = writer.write_exclusive(data); + } + if (r < 0) { + return r; + } + *processor = &stripe; + return 0; +} + +int MultipartObjectProcessor::prepare_head() +{ + const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size; + uint64_t chunk_size; + uint64_t stripe_size; + uint64_t alignment; + + int r = store->get_max_chunk_size(tail_placement_rule, target_obj, &chunk_size, dpp, &alignment); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl; + return r; + } + store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size); + + manifest.set_multipart_part_rule(stripe_size, part_num); + + r = manifest_gen.create_begin(store->ctx(), &manifest, + bucket_info.placement_rule, + &tail_placement_rule, + target_obj.bucket, target_obj); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + 
RGWSI_Tier_RADOS::raw_obj_to_obj(head_obj.bucket, stripe_obj, &head_obj); + head_obj.index_hash_source = target_obj.key.name; + + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + stripe_size = manifest_gen.cur_stripe_max_size(); + set_head_chunk_size(stripe_size); + + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, stripe_size); + return 0; +} + +int MultipartObjectProcessor::prepare(optional_yield y) +{ + manifest.set_prefix(target_obj.key.name + "." + upload_id); + + return prepare_head(); +} + +int MultipartObjectProcessor::complete(size_t accounted_size, + const std::string& etag, + ceph::real_time *mtime, + ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, + const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, + bool *pcanceled, optional_yield y) +{ + int r = writer.drain(); + if (r < 0) { + return r; + } + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + + RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj); + op_target.set_versioning_disabled(true); + op_target.set_meta_placement_rule(&tail_placement_rule); + + RGWRados::Object::Write obj_op(&op_target); + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.mtime = mtime; + obj_op.meta.owner = owner; + obj_op.meta.delete_at = delete_at; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + + r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); + if (r < 0) + return r; + + RGWUploadPartInfo info; + string p = "part."; + bool sorted_omap = is_v2_upload_id(upload_id); + + if (sorted_omap) { + char buf[32]; + snprintf(buf, sizeof(buf), "%08d", part_num); + p.append(buf); + } else { + p.append(part_num_str); + } + info.num = part_num; + info.etag = etag; + info.size = actual_size; + info.accounted_size = accounted_size; + info.modified = 
real_clock::now(); + info.manifest = manifest; + + bool compressed; + r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); + if (r < 0) { + ldpp_dout(dpp, 1) << "cannot get compression info" << dendl; + return r; + } + + rgw_obj meta_obj; + meta_obj.init_ns(bucket_info.bucket, mp.get_meta(), RGW_OBJ_NS_MULTIPART); + meta_obj.set_in_extra_data(true); + + rgw_raw_obj meta_raw_obj; + store->obj_to_raw(bucket_info.placement_rule, meta_obj, &meta_raw_obj); + + rgw_rados_ref meta_obj_ref; + r = store->get_raw_obj_ref(dpp, meta_raw_obj, &meta_obj_ref); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref of meta obj with ret=" << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + cls_rgw_mp_upload_part_info_update(op, p, info); + r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y); + ldpp_dout(dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl; + + if (r == -EOPNOTSUPP) { + // New CLS call to update part info is not yet supported. Fall back to the old handling. + bufferlist bl; + encode(info, bl); + + map m; + m[p] = bl; + + op = librados::ObjectWriteOperation{}; + op.assert_exists(); // detect races with abort + op.omap_set(m); + r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y); + } + if (r < 0) { + return r == -ENOENT ? 
-ERR_NO_SUCH_UPLOAD : r; + } + + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + return 0; +} + +int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor) +{ + int r = writer.write_exclusive(data); + if (r < 0) { + return r; + } + *processor = &stripe; + return 0; +} + +int AppendObjectProcessor::prepare(optional_yield y) +{ + RGWObjState *astate; + int r = store->get_obj_state(dpp, &obj_ctx, bucket_info, head_obj, + &astate, &cur_manifest, y); + if (r < 0) { + return r; + } + cur_size = astate->size; + *cur_accounted_size = astate->accounted_size; + if (!astate->exists) { + if (position != 0) { + ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl; + return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; + } else { + cur_part_num = 1; + //set the prefix + char buf[33]; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + string oid_prefix = head_obj.key.name; + oid_prefix.append("."); + oid_prefix.append(buf); + oid_prefix.append("_"); + manifest.set_prefix(oid_prefix); + } + } else { + // check whether the object appendable + map::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); + if (iter == astate->attrset.end()) { + ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl; + return -ERR_OBJECT_NOT_APPENDABLE; + } + if (position != *cur_accounted_size) { + ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl; + return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; + } + try { + using ceph::decode; + decode(cur_part_num, iter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl; + return -EIO; + } + cur_part_num++; + //get the current obj etag + iter = astate->attrset.find(RGW_ATTR_ETAG); + if (iter != astate->attrset.end()) { + string s = 
rgw_string_unquote(iter->second.c_str()); + size_t pos = s.find("-"); + cur_etag = s.substr(0, pos); + } + + iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); + if (iter != astate->attrset.end()) { + tail_placement_rule.storage_class = iter->second.to_str(); + } else { + tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD; + } + manifest.set_prefix(cur_manifest->get_prefix()); + astate->keep_tail = true; + } + manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num); + + r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, &tail_placement_rule, head_obj.bucket, head_obj); + if (r < 0) { + return r; + } + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + uint64_t chunk_size = 0; + r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size, dpp); + if (r < 0) { + return r; + } + r = writer.set_stripe_obj(std::move(stripe_obj)); + if (r < 0) { + return r; + } + + uint64_t stripe_size = manifest_gen.cur_stripe_max_size(); + + uint64_t max_head_size = std::min(chunk_size, stripe_size); + set_head_chunk_size(max_head_size); + + // initialize the processors + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, stripe_size); + + return 0; +} + +int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime, + ceph::real_time set_mtime, rgw::sal::Attrs& attrs, + ceph::real_time delete_at, const char *if_match, const char *if_nomatch, + const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled, + optional_yield y) +{ + int r = writer.drain(); + if (r < 0) + return r; + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + obj_ctx.set_atomic(head_obj); + RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj); + //For Append obj, disable versioning + op_target.set_versioning_disabled(true); + RGWRados::Object::Write 
obj_op(&op_target); + if (cur_manifest) { + cur_manifest->append(dpp, manifest, store->svc.zone->get_zonegroup(), store->svc.zone->get_zone_params()); + obj_op.meta.manifest = cur_manifest; + } else { + obj_op.meta.manifest = &manifest; + } + obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ + obj_op.meta.mtime = mtime; + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.owner = owner; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.delete_at = delete_at; + obj_op.meta.user_data = user_data; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + obj_op.meta.appendable = true; + //Add the append part number + bufferlist cur_part_num_bl; + using ceph::encode; + encode(cur_part_num, cur_part_num_bl); + attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl; + //calculate the etag + if (!cur_etag.empty()) { + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + hash.Final((unsigned char *)final_etag); + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)cur_part_num); + bufferlist etag_bl; + etag_bl.append(final_etag_str, strlen(final_etag_str) + 1); + attrs[RGW_ATTR_ETAG] = etag_bl; + } + r = obj_op.write_meta(dpp, actual_size + cur_size, + accounted_size + *cur_accounted_size, + attrs, y); + if (r < 0) { + return r; + } + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for 
deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + *cur_accounted_size += accounted_size; + + return 0; +} + +} // namespace rgw::putobj diff --git a/src/rgw/driver/rados/rgw_putobj_processor.h b/src/rgw/driver/rados/rgw_putobj_processor.h new file mode 100644 index 000000000..fa9200f32 --- /dev/null +++ b/src/rgw/driver/rados/rgw_putobj_processor.h @@ -0,0 +1,282 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include + +#include "rgw_putobj.h" +#include "services/svc_rados.h" +#include "services/svc_tier_rados.h" +#include "rgw_sal.h" +#include "rgw_obj_manifest.h" + +namespace rgw { + +namespace sal { + class RadosStore; +} + +class Aio; + +namespace putobj { + +// an object processor with special handling for the first chunk of the head. 
+// the virtual process_first_chunk() function returns a processor to handle the +// rest of the object +class HeadObjectProcessor : public rgw::sal::ObjectProcessor { + uint64_t head_chunk_size; + // buffer to capture the first chunk of the head object + bufferlist head_data; + // initialized after process_first_chunk() to process everything else + rgw::sal::DataProcessor *processor = nullptr; + uint64_t data_offset = 0; // maximum offset of data written (ie compressed) + protected: + uint64_t get_actual_size() const { return data_offset; } + + // process the first chunk of data and return a processor for the rest + virtual int process_first_chunk(bufferlist&& data, + rgw::sal::DataProcessor **processor) = 0; + public: + HeadObjectProcessor(uint64_t head_chunk_size) + : head_chunk_size(head_chunk_size) + {} + + void set_head_chunk_size(uint64_t size) { head_chunk_size = size; } + + // cache first chunk for process_first_chunk(), then forward everything else + // to the returned processor + int process(bufferlist&& data, uint64_t logical_offset) final override; +}; + +using RawObjSet = std::set; + +// a data sink that writes to rados objects and deletes them on cancelation +class RadosWriter : public rgw::sal::DataProcessor { + Aio *const aio; + RGWRados *const store; + const RGWBucketInfo& bucket_info; + RGWObjectCtx& obj_ctx; + const rgw_obj head_obj; + RGWSI_RADOS::Obj stripe_obj; // current stripe object + RawObjSet written; // set of written objects for deletion + const DoutPrefixProvider *dpp; + optional_yield y; + + public: + RadosWriter(Aio *aio, RGWRados *store, + const RGWBucketInfo& bucket_info, + RGWObjectCtx& obj_ctx, const rgw_obj& _head_obj, + const DoutPrefixProvider *dpp, optional_yield y) + : aio(aio), store(store), bucket_info(bucket_info), + obj_ctx(obj_ctx), head_obj(_head_obj), dpp(dpp), y(y) + {} + ~RadosWriter(); + + // add alloc hint to osd + void add_write_hint(librados::ObjectWriteOperation& op); + + // change the current stripe object + 
int set_stripe_obj(const rgw_raw_obj& obj); + + // write the data at the given offset of the current stripe object + int process(bufferlist&& data, uint64_t stripe_offset) override; + + // write the data as an exclusive create and wait for it to complete + int write_exclusive(const bufferlist& data); + + int drain(); + + // when the operation completes successfully, clear the set of written objects + // so they aren't deleted on destruction + void clear_written() { written.clear(); } + +}; + + +// a rados object processor that stripes according to RGWObjManifest +class ManifestObjectProcessor : public HeadObjectProcessor, + public StripeGenerator { + protected: + RGWRados* const store; + RGWBucketInfo& bucket_info; + rgw_placement_rule tail_placement_rule; + rgw_user owner; + RGWObjectCtx& obj_ctx; + rgw_obj head_obj; + + RadosWriter writer; + RGWObjManifest manifest; + RGWObjManifest::generator manifest_gen; + ChunkProcessor chunk; + StripeProcessor stripe; + const DoutPrefixProvider *dpp; + + // implements StripeGenerator + int next(uint64_t offset, uint64_t *stripe_size) override; + + public: + ManifestObjectProcessor(Aio *aio, RGWRados* store, + RGWBucketInfo& bucket_info, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, RGWObjectCtx& _obj_ctx, + const rgw_obj& _head_obj, + const DoutPrefixProvider* dpp, optional_yield y) + : HeadObjectProcessor(0), + store(store), bucket_info(bucket_info), + owner(owner), + obj_ctx(_obj_ctx), head_obj(_head_obj), + writer(aio, store, bucket_info, obj_ctx, head_obj, dpp, y), + chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) { + if (ptail_placement_rule) { + tail_placement_rule = *ptail_placement_rule; + } + } + + void set_owner(const rgw_user& _owner) { + owner = _owner; + } + + void set_tail_placement(const rgw_placement_rule& tpr) { + tail_placement_rule = tpr; + } + void set_tail_placement(const rgw_placement_rule&& tpr) { + tail_placement_rule = tpr; + } + +}; + + +// a processor that 
completes with an atomic write to the head object as part of +// a bucket index transaction +class AtomicObjectProcessor : public ManifestObjectProcessor { + const std::optional olh_epoch; + const std::string unique_tag; + bufferlist first_chunk; // written with the head in complete() + + int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; + public: + AtomicObjectProcessor(Aio *aio, RGWRados* store, + RGWBucketInfo& bucket_info, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, + RGWObjectCtx& obj_ctx, const rgw_obj& _head_obj, + std::optional olh_epoch, + const std::string& unique_tag, + const DoutPrefixProvider *dpp, optional_yield y) + : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule, + owner, obj_ctx, _head_obj, dpp, y), + olh_epoch(olh_epoch), unique_tag(unique_tag) + {} + + // prepare a trivial manifest + int prepare(optional_yield y) override; + // write the head object atomically in a bucket index transaction + int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + +}; + + +// a processor for multipart parts, which don't require atomic completion. the +// part's head is written with an exclusive create to detect racing uploads of +// the same part/upload id, which are restarted with a random oid prefix +class MultipartObjectProcessor : public ManifestObjectProcessor { + const rgw_obj target_obj; // target multipart object + const std::string upload_id; + const int part_num; + const std::string part_num_str; + RGWMPObj mp; + + // write the first chunk and wait on aio->drain() for its completion. 
+ // on EEXIST, retry with random prefix + int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; + // prepare the head stripe and manifest + int prepare_head(); + public: + MultipartObjectProcessor(Aio *aio, RGWRados* store, + RGWBucketInfo& bucket_info, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, RGWObjectCtx& obj_ctx, + const rgw_obj& _head_obj, + const std::string& upload_id, uint64_t part_num, + const std::string& part_num_str, + const DoutPrefixProvider *dpp, optional_yield y) + : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule, + owner, obj_ctx, _head_obj, dpp, y), + target_obj(head_obj), upload_id(upload_id), + part_num(part_num), part_num_str(part_num_str), + mp(head_obj.key.name, upload_id) + {} + + // prepare a multipart manifest + int prepare(optional_yield y) override; + // write the head object attributes in a bucket index transaction, then + // register the completed part with the multipart meta object + int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + +}; + + class AppendObjectProcessor : public ManifestObjectProcessor { + uint64_t cur_part_num; + uint64_t position; + uint64_t cur_size; + uint64_t *cur_accounted_size; + std::string cur_etag; + const std::string unique_tag; + + RGWObjManifest *cur_manifest; + + int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; + + public: + AppendObjectProcessor(Aio *aio, RGWRados* store, + RGWBucketInfo& bucket_info, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, RGWObjectCtx& obj_ctx, + const rgw_obj& _head_obj, + const std::string& unique_tag, uint64_t position, + uint64_t *cur_accounted_size, + 
const DoutPrefixProvider *dpp, optional_yield y) + : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule, + owner, obj_ctx, _head_obj, dpp, y), + position(position), cur_size(0), cur_accounted_size(cur_accounted_size), + unique_tag(unique_tag), cur_manifest(nullptr) + {} + int prepare(optional_yield y) override; + int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + }; + +} // namespace putobj +} // namespace rgw + diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc new file mode 100644 index 000000000..10018d4a6 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -0,0 +1,10076 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/compat.h" +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "common/ceph_json.h" + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/Throttle.h" +#include "common/BackTrace.h" + +#include "rgw_sal.h" +#include "rgw_zone.h" +#include "rgw_cache.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */ +#include "rgw_aio_throttle.h" +#include "driver/rados/rgw_bucket.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_datalog.h" +#include "rgw_putobj_processor.h" + +#include "cls/rgw/cls_rgw_ops.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/rgw/cls_rgw_const.h" +#include "cls/refcount/cls_refcount_client.h" +#include "cls/version/cls_version_client.h" +#include "osd/osd_types.h" + +#include "rgw_tools.h" +#include "rgw_coroutine.h" +#include "rgw_compression.h" +#include 
"rgw_crypt.h" +#include "rgw_etag_verifier.h" +#include "rgw_worker.h" +#include "rgw_notify.h" +#include "rgw_http_errors.h" + +#undef fork // fails to compile RGWPeriod::fork() below + +#include "common/Clock.h" + +#include +#include +#include +#include +#include +#include +#include "include/random.h" + +#include "rgw_gc.h" +#include "rgw_lc.h" + +#include "rgw_object_expirer_core.h" +#include "rgw_sync.h" +#include "rgw_sync_counters.h" +#include "rgw_sync_trace.h" +#include "rgw_trim_datalog.h" +#include "rgw_trim_mdlog.h" +#include "rgw_data_sync.h" +#include "rgw_realm_watcher.h" +#include "rgw_reshard.h" +#include "rgw_cr_rados.h" + +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_quota.h" +#include "services/svc_sync_modules.h" +#include "services/svc_sys_obj.h" +#include "services/svc_sys_obj_cache.h" +#include "services/svc_bucket.h" +#include "services/svc_mdlog.h" + +#include "compressor/Compressor.h" + +#include "rgw_d3n_datacache.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/rgw_rados.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace librados; + +#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: " +#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: " +#define dendl_bitx dendl ; } + +static string shadow_ns = "shadow"; +static string default_bucket_index_pool_suffix = "rgw.buckets.index"; +static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec"; + +static RGWObjCategory main_category = RGWObjCategory::Main; +#define RGW_USAGE_OBJ_PREFIX "usage." 
+ +rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados* store) const +{ + if (!is_raw) { + rgw_raw_obj r; + store->obj_to_raw(placement_rule, obj, &r); + return r; + } + return raw_obj; +} + +void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op) +{ + obj_version* check_objv = version_for_check(); + + if (check_objv) { + cls_version_check(*op, *check_objv, VER_COND_EQ); + } + + cls_version_read(*op, &read_version); +} + +void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op) +{ + obj_version* check_objv = version_for_check(); + obj_version* modify_version = version_for_write(); + + if (check_objv) { + cls_version_check(*op, *check_objv, VER_COND_EQ); + } + + if (modify_version) { + cls_version_set(*op, *modify_version); + } else { + cls_version_inc(*op); + } +} + +void RGWObjVersionTracker::apply_write() +{ + const bool checked = (read_version.ver != 0); + const bool incremented = (write_version.ver == 0); + + if (checked && incremented) { + // apply cls_version_inc() so our next operation can recheck it + ++read_version.ver; + } else { + read_version = write_version; + } + write_version = obj_version(); +} + +RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) { + RGWObjStateManifest *result; + typename std::map::iterator iter; + lock.lock_shared(); + assert (!obj.empty()); + iter = objs_state.find(obj); + if (iter != objs_state.end()) { + result = &iter->second; + lock.unlock_shared(); + } else { + lock.unlock_shared(); + lock.lock(); + result = &objs_state[obj]; + lock.unlock(); + } + return result; +} + +void RGWObjectCtx::set_compressed(const rgw_obj& obj) { + std::unique_lock wl{lock}; + assert (!obj.empty()); + objs_state[obj].state.compressed = true; +} + +void RGWObjectCtx::set_atomic(const rgw_obj& obj) { + std::unique_lock wl{lock}; + assert (!obj.empty()); + objs_state[obj].state.is_atomic = true; +} +void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) { + std::unique_lock wl{lock}; + assert 
(!obj.empty()); + objs_state[obj].state.prefetch_data = true; +} + +void RGWObjectCtx::invalidate(const rgw_obj& obj) { + std::unique_lock wl{lock}; + auto iter = objs_state.find(obj); + if (iter == objs_state.end()) { + return; + } + bool is_atomic = iter->second.state.is_atomic; + bool prefetch_data = iter->second.state.prefetch_data; + bool compressed = iter->second.state.compressed; + + objs_state.erase(iter); + + if (is_atomic || prefetch_data) { + auto& sm = objs_state[obj]; + sm.state.is_atomic = is_atomic; + sm.state.prefetch_data = prefetch_data; + sm.state.compressed = compressed; + } +} + +class RGWMetaNotifierManager : public RGWCoroutinesManager { + RGWRados* store; + RGWHTTPManager http_manager; + +public: + RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver), + http_manager(store->ctx(), completion_mgr) { + http_manager.start(); + } + + int notify_all(const DoutPrefixProvider *dpp, map& conn_map, set& shards) { + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "notify", NULL }, + { NULL, NULL } }; + + list stacks; + for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); + stack->call(new RGWPostRESTResourceCR, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL)); + + stacks.push_back(stack); + } + return run(dpp, stacks); + } +}; + +class RGWDataNotifierManager : public RGWCoroutinesManager { + RGWRados* store; + RGWHTTPManager http_manager; + +public: + RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver), + http_manager(store->ctx(), completion_mgr) { + http_manager.start(); + } + + int notify_all(const DoutPrefixProvider *dpp, map& conn_map, + bc::flat_map >& shards) { + + list stacks; + const char *source_zone = 
store->svc.zone->get_zone_params().get_id().c_str(); + for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); + stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn)); + stacks.push_back(stack); + } + + return run(dpp, stacks); + } +}; + +/* class RGWRadosThread */ + +void RGWRadosThread::start() +{ + worker = new Worker(cct, this); + worker->create(thread_name.c_str()); +} + +void RGWRadosThread::stop() +{ + down_flag = true; + stop_process(); + if (worker) { + worker->signal(); + worker->join(); + } + delete worker; + worker = NULL; +} + +void *RGWRadosThread::Worker::entry() { + uint64_t msec = processor->interval_msec(); + auto interval = std::chrono::milliseconds(msec); + + do { + auto start = ceph::real_clock::now(); + int r = processor->process(this); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl; + } + + if (processor->going_down()) + break; + + auto end = ceph::real_clock::now() - start; + + uint64_t cur_msec = processor->interval_msec(); + if (cur_msec != msec) { /* was it reconfigured? 
*/ + msec = cur_msec; + interval = std::chrono::milliseconds(msec); + } + + if (cur_msec > 0) { + if (interval <= end) + continue; // next round + + auto wait_time = interval - end; + wait_interval(wait_time); + } else { + wait(); + } + } while (!processor->going_down()); + + return NULL; +} + +class RGWMetaNotifier : public RGWRadosThread { + RGWMetaNotifierManager notify_mgr; + RGWMetadataLog *const log; + + uint64_t interval_msec() override { + return cct->_conf->rgw_md_notify_interval_msec; + } + void stop_process() override { + notify_mgr.stop(); + } +public: + RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log) + : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {} + + int process(const DoutPrefixProvider *dpp) override; +}; + +int RGWMetaNotifier::process(const DoutPrefixProvider *dpp) +{ + set shards; + + log->read_clear_modified(shards); + + if (shards.empty()) { + return 0; + } + + for (set::iterator iter = shards.begin(); iter != shards.end(); ++iter) { + ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl; + } + + notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards); + + return 0; +} + +class RGWDataNotifier : public RGWRadosThread { + RGWDataNotifierManager notify_mgr; + bc::flat_set entry; + + uint64_t interval_msec() override { + return cct->_conf.get_val("rgw_data_notify_interval_msec"); + } + void stop_process() override { + notify_mgr.stop(); + } +public: + RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {} + + int process(const DoutPrefixProvider *dpp) override; +}; + +int RGWDataNotifier::process(const DoutPrefixProvider *dpp) +{ + auto data_log = store->svc.datalog_rados; + if (!data_log) { + return 0; + } + + auto shards = data_log->read_clear_modified(); + + if (shards.empty()) { + return 0; + } + + for (const auto& [shard_id, entries] : shards) { + bc::flat_set::iterator it; + for (const auto& entry : 
entries) { + ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id=" + << shard_id << ":" << entry.gen << ":" << entry.key << dendl; + } + } + + notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards); + + return 0; +} + +class RGWSyncProcessorThread : public RGWRadosThread { +public: + RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {} + RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {} + ~RGWSyncProcessorThread() override {} + int init(const DoutPrefixProvider *dpp) override = 0 ; + int process(const DoutPrefixProvider *dpp) override = 0; +}; + +class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread +{ + RGWMetaSyncStatusManager sync; + + uint64_t interval_msec() override { + return 0; /* no interval associated, it'll run once until stopped */ + } + void stop_process() override { + sync.stop(); + } +public: + RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados) + : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {} + + void wakeup_sync_shards(set& shard_ids) { + for (set::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) { + sync.wakeup(*iter); + } + } + RGWMetaSyncStatusManager* get_manager() { return &sync; } + + int init(const DoutPrefixProvider *dpp) override { + int ret = sync.init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl; + return ret; + } + return 0; + } + + int process(const DoutPrefixProvider *dpp) override { + sync.run(dpp, null_yield); + return 0; + } +}; + +class RGWDataSyncProcessorThread : public RGWSyncProcessorThread +{ + PerfCountersRef counters; + RGWDataSyncStatusManager sync; + bool initialized; + + uint64_t interval_msec() override { + if (initialized) { + return 0; /* no interval associated, it'll run once until stopped */ + } else { +#define 
DATA_SYNC_INIT_WAIT_SEC 20 + return DATA_SYNC_INIT_WAIT_SEC * 1000; + } + } + void stop_process() override { + sync.stop(); + } +public: + RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados, + const RGWZone* source_zone) + : RGWSyncProcessorThread(_driver->getRados(), "data-sync"), + counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)), + sync(_driver, async_rados, source_zone->id, counters.get()), + initialized(false) {} + + void wakeup_sync_shards(bc::flat_map >& entries) { + for (bc::flat_map >::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + sync.wakeup(iter->first, iter->second); + } + } + + RGWDataSyncStatusManager* get_manager() { return &sync; } + + int init(const DoutPrefixProvider *dpp) override { + return 0; + } + + int process(const DoutPrefixProvider *dpp) override { + while (!initialized) { + if (going_down()) { + return 0; + } + int ret = sync.init(dpp); + if (ret >= 0) { + initialized = true; + break; + } + /* we'll be back! 
*/ + return 0; + } + sync.run(dpp); + return 0; + } +}; + +class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider +{ + RGWCoroutinesManager crs; + rgw::sal::RadosStore* store; + rgw::BucketTrimManager *bucket_trim; + RGWHTTPManager http; + const utime_t trim_interval; + + uint64_t interval_msec() override { return 0; } + void stop_process() override { crs.stop(); } +public: + RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim, + int interval) + : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"), + crs(store->ctx(), store->getRados()->get_cr_registry()), store(store), + bucket_trim(bucket_trim), + http(store->ctx(), crs.get_completion_mgr()), + trim_interval(interval, 0) + {} + + int init(const DoutPrefixProvider *dpp) override { + return http.start(); + } + int process(const DoutPrefixProvider *dpp) override { + list stacks; + auto metatrimcr = create_meta_log_trim_cr(this, static_cast(store), &http, + cct->_conf->rgw_md_log_max_shards, + trim_interval); + if (!metatrimcr) { + ldpp_dout(dpp, -1) << "Bailing out of trim thread!" 
<< dendl; + return -EINVAL; + } + auto meta = new RGWCoroutinesStack(store->ctx(), &crs); + meta->call(metatrimcr); + + stacks.push_back(meta); + + if (store->svc()->zone->sync_module_exports_data()) { + auto data = new RGWCoroutinesStack(store->ctx(), &crs); + data->call(create_data_log_trim_cr(dpp, static_cast(store), &http, + cct->_conf->rgw_data_log_num_shards, + trim_interval)); + stacks.push_back(data); + + auto bucket = new RGWCoroutinesStack(store->ctx(), &crs); + bucket->call(bucket_trim->create_bucket_trim_cr(&http)); + stacks.push_back(bucket); + } + + crs.run(dpp, stacks); + return 0; + } + + // implements DoutPrefixProvider + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const override + { + return dout_subsys; + } + + std::ostream& gen_prefix(std::ostream& out) const override + { + return out << "sync log trim: "; + } + +}; + +void RGWRados::wakeup_meta_sync_shards(set& shard_ids) +{ + std::lock_guard l{meta_sync_thread_lock}; + if (meta_sync_processor_thread) { + meta_sync_processor_thread->wakeup_sync_shards(shard_ids); + } +} + +void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map >& entries) +{ + ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl; + for (bc::flat_map >::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + bc::flat_set& entries = iter->second; + for (const auto& [key, gen] : entries) { + ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key + << ", gen=" << gen << dendl; + } + } + + std::lock_guard l{data_sync_thread_lock}; + auto iter = data_sync_processor_threads.find(source_zone); + if (iter == data_sync_processor_threads.end()) { + ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync 
processing" << dendl; + return; + } + + RGWDataSyncProcessorThread *thread = iter->second; + ceph_assert(thread); + thread->wakeup_sync_shards(entries); +} + +RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager() +{ + std::lock_guard l{meta_sync_thread_lock}; + if (meta_sync_processor_thread) { + return meta_sync_processor_thread->get_manager(); + } + return nullptr; +} + +RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone) +{ + std::lock_guard l{data_sync_thread_lock}; + auto thread = data_sync_processor_threads.find(source_zone); + if (thread == data_sync_processor_threads.end()) { + return nullptr; + } + return thread->second->get_manager(); +} + +int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment) +{ + IoCtx ioctx; + int r = open_pool_ctx(dpp, pool, ioctx, false, true); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl; + return r; + } + + bool req; + r = ioctx.pool_requires_alignment2(&req); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned " + << r << dendl; + return r; + } + + if (!req) { + *alignment = 0; + return 0; + } + + uint64_t align; + r = ioctx.pool_required_alignment2(&align); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned " + << r << dendl; + return r; + } + if (align != 0) { + ldpp_dout(dpp, 20) << "required alignment=" << align << dendl; + } + *alignment = align; + return 0; +} + +void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size) +{ + if (alignment == 0) { + *max_size = size; + return; + } + + if (size <= alignment) { + *max_size = alignment; + return; + } + + *max_size = size - (size % alignment); +} + +int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment) +{ + uint64_t alignment; + int r = 
get_required_alignment(dpp, pool, &alignment); + if (r < 0) { + return r; + } + + if (palignment) { + *palignment = alignment; + } + + uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size; + + get_max_aligned_size(config_chunk_size, alignment, max_chunk_size); + + ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl; + + return 0; +} + +int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, + uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment) +{ + rgw_pool pool; + if (!get_obj_data_pool(placement_rule, obj, &pool)) { + ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl; + return -EIO; + } + return get_max_chunk_size(pool, max_chunk_size, dpp, palignment); +} + +void add_datalog_entry(const DoutPrefixProvider* dpp, + RGWDataChangesLog* datalog, + const RGWBucketInfo& bucket_info, + uint32_t shard_id, optional_yield y) +{ + const auto& logs = bucket_info.layout.logs; + if (logs.empty()) { + return; + } + int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id, y); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl; + } // datalog error is not fatal +} + +class RGWIndexCompletionManager; + +struct complete_op_data { + ceph::mutex lock = ceph::make_mutex("complete_op_data"); + AioCompletion *rados_completion{nullptr}; + int manager_shard_id{-1}; + RGWIndexCompletionManager *manager{nullptr}; + rgw_obj obj; + RGWModifyOp op; + string tag; + rgw_bucket_entry_ver ver; + cls_rgw_obj_key key; + rgw_bucket_dir_entry_meta dir_meta; + list remove_objs; + bool log_op; + uint16_t bilog_op; + rgw_zone_set zones_trace; + + bool stopped{false}; + + void stop() { + std::lock_guard l{lock}; + stopped = true; + } +}; + +class RGWIndexCompletionManager { + RGWRados* const store; + const uint32_t num_shards; + ceph::containers::tiny_vector locks; + std::vector> completions; + std::vector retry_completions; + + 
std::condition_variable cond; + std::mutex retry_completions_lock; + bool _stop{false}; + std::thread retry_thread; + + // used to distribute the completions and the locks they use across + // their respective vectors; it will get incremented and can wrap + // around back to 0 without issue + std::atomic cur_shard {0}; + + void process(); + + void add_completion(complete_op_data *completion); + + void stop() { + if (retry_thread.joinable()) { + _stop = true; + cond.notify_all(); + retry_thread.join(); + } + + for (uint32_t i = 0; i < num_shards; ++i) { + std::lock_guard l{locks[i]}; + for (auto c : completions[i]) { + c->stop(); + } + } + completions.clear(); + } + + uint32_t next_shard() { + return cur_shard++ % num_shards; + } + +public: + RGWIndexCompletionManager(RGWRados *_driver) : + store(_driver), + num_shards(store->ctx()->_conf->rgw_thread_pool_size), + locks{ceph::make_lock_container( + num_shards, + [](const size_t i) { + return ceph::make_mutex("RGWIndexCompletionManager::lock::" + + std::to_string(i)); + })}, + completions(num_shards), + retry_thread(&RGWIndexCompletionManager::process, this) + {} + + ~RGWIndexCompletionManager() { + stop(); + } + + void create_completion(const rgw_obj& obj, + RGWModifyOp op, string& tag, + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op, + rgw_zone_set *zones_trace, + complete_op_data **result); + + bool handle_completion(completion_t cb, complete_op_data *arg); + + CephContext* ctx() { + return store->ctx(); + } +}; + +static void obj_complete_cb(completion_t cb, void *arg) +{ + complete_op_data *completion = reinterpret_cast(arg); + completion->lock.lock(); + if (completion->stopped) { + completion->lock.unlock(); /* can drop lock, no one else is referencing us */ + delete completion; + return; + } + bool need_delete = completion->manager->handle_completion(cb, completion); + completion->lock.unlock(); + if 
(need_delete) { + delete completion; + } +} + +void RGWIndexCompletionManager::process() +{ + DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: "); + while(!_stop) { + std::vector comps; + + { + std::unique_lock l{retry_completions_lock}; + cond.wait(l, [this](){return _stop || !retry_completions.empty();}); + if (_stop) { + return; + } + retry_completions.swap(comps); + } + + for (auto c : comps) { + std::unique_ptr up{c}; + + ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl; + + RGWRados::BucketShard bs(store); + RGWBucketInfo bucket_info; + + int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp); + if (r < 0) { + ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl; + /* not much to do */ + continue; + } + + r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info, + [&](RGWRados::BucketShard *bs) -> int { + const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, &dpp, 10) << + "ENTERING " << __func__ << ": bucket-shard=" << bs << + " obj=" << c->obj << " tag=" << c->tag << + " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx; + ldout_bitx(bitx, &dpp, 25) << + "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx; + + librados::ObjectWriteOperation o; + o.assert_exists(); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs, + c->log_op, c->bilog_op, &c->zones_trace); + int ret = bs->bucket_obj.operate(&dpp, &o, null_yield); + ldout_bitx(bitx, &dpp, 10) << + "EXITING " << __func__ << ": ret=" << dendl_bitx; + return ret; + }); + if (r < 0) { + ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl; + /* ignoring error, can't do anything about it */ + continue; + } + + // This null_yield can 
stay, for now, since we're in our own thread + add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, + bs.shard_id, null_yield); + } + } +} + +void RGWIndexCompletionManager::create_completion(const rgw_obj& obj, + RGWModifyOp op, string& tag, + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op, + rgw_zone_set *zones_trace, + complete_op_data **result) +{ + complete_op_data *entry = new complete_op_data; + + int shard_id = next_shard(); + + entry->manager_shard_id = shard_id; + entry->manager = this; + entry->obj = obj; + entry->op = op; + entry->tag = tag; + entry->ver = ver; + entry->key = key; + entry->dir_meta = dir_meta; + entry->log_op = log_op; + entry->bilog_op = bilog_op; + + if (remove_objs) { + for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) { + entry->remove_objs.push_back(*iter); + } + } + + if (zones_trace) { + entry->zones_trace = *zones_trace; + } else { + entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key()); + } + + *result = entry; + + entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb); + + std::lock_guard l{locks[shard_id]}; + const auto ok = completions[shard_id].insert(entry).second; + ceph_assert(ok); +} + +void RGWIndexCompletionManager::add_completion(complete_op_data *completion) { + { + std::lock_guard l{retry_completions_lock}; + retry_completions.push_back(completion); + } + cond.notify_all(); +} + +bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg) +{ + int shard_id = arg->manager_shard_id; + { + std::lock_guard l{locks[shard_id]}; + + auto& comps = completions[shard_id]; + + auto iter = comps.find(arg); + if (iter == comps.end()) { + ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl; + return true; + } + + comps.erase(iter); + } + + int r = 
rados_aio_get_return_value(cb); + if (r != -ERR_BUSY_RESHARDING) { + ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " << + (r == 0 ? "ok" : "failed with " + to_string(r)) << + " for obj=" << arg->key << dendl; + return true; + } + add_completion(arg); + ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl; + return false; +} + +void RGWRados::finalize() +{ + /* Before joining any sync threads, drain outstanding requests & + * mark the async_processor as going_down() */ + if (svc.rados) { + svc.rados->stop_processor(); + } + + if (run_sync_thread) { + std::lock_guard l{meta_sync_thread_lock}; + meta_sync_processor_thread->stop(); + + std::lock_guard dl{data_sync_thread_lock}; + for (auto iter : data_sync_processor_threads) { + RGWDataSyncProcessorThread *thread = iter.second; + thread->stop(); + } + if (sync_log_trimmer) { + sync_log_trimmer->stop(); + } + } + if (run_sync_thread) { + delete meta_sync_processor_thread; + meta_sync_processor_thread = NULL; + std::lock_guard dl{data_sync_thread_lock}; + for (auto iter : data_sync_processor_threads) { + RGWDataSyncProcessorThread *thread = iter.second; + delete thread; + } + data_sync_processor_threads.clear(); + delete sync_log_trimmer; + sync_log_trimmer = nullptr; + bucket_trim = boost::none; + } + if (meta_notifier) { + meta_notifier->stop(); + delete meta_notifier; + } + if (data_notifier) { + data_notifier->stop(); + delete data_notifier; + } + delete sync_tracer; + + delete lc; + lc = NULL; + + delete gc; + gc = NULL; + + delete obj_expirer; + obj_expirer = NULL; + + RGWQuotaHandler::free_handler(quota_handler); + if (cr_registry) { + cr_registry->put(); + } + + svc.shutdown(); + + delete binfo_cache; + delete obj_tombstone_cache; + if (d3n_data_cache) + delete d3n_data_cache; + + if (reshard_wait.get()) { + reshard_wait->stop(); + reshard_wait.reset(); + } + + if (run_reshard_thread) { + reshard->stop_processor(); + } + delete reshard; + delete 
index_completion_manager; + + rgw::notify::shutdown(); +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. + */ +int RGWRados::init_rados() +{ + int ret = 0; + + ret = rados.init_with_context(cct); + if (ret < 0) { + return ret; + } + ret = rados.connect(); + if (ret < 0) { + return ret; + } + + auto crs = std::unique_ptr{ + new RGWCoroutinesManagerRegistry(cct)}; + ret = crs->hook_to_admin_command("cr dump"); + if (ret < 0) { + return ret; + } + + cr_registry = crs.release(); + + if (use_datacache) { + d3n_data_cache = new D3nDataCache(); + d3n_data_cache->init(cct); + } + + return ret; +} + +int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map& meta) +{ + string name = cct->_conf->name.get_id(); + if (name.compare(0, 4, "rgw.") == 0) { + name = name.substr(4); + } + map metadata = meta; + metadata["num_handles"] = "1"s; + metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id(); + metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name(); + metadata["zone_name"] = svc.zone->zone_name(); + metadata["zone_id"] = svc.zone->zone_id().id; + metadata["realm_name"] = svc.zone->get_realm().get_name(); + metadata["realm_id"] = svc.zone->get_realm().get_id(); + metadata["id"] = name; + int ret = rados.service_daemon_register( + daemon_type, + stringify(rados.get_instance_id()), + metadata); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map&& status) +{ + int ret = rados.service_daemon_update_status(move(status)); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +/** + * Initialize the RADOS instance and prepare to do other 
ops
 * Returns 0 on success, -ERR# on failure.
 */
int RGWRados::init_complete(const DoutPrefixProvider *dpp)
{
  int ret;

  /*
   * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
   */
  sync_module = svc.sync_modules->get_sync_module();

  // open every pool context up front; any failure aborts initialization
  ret = open_root_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_gc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_lc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_objexp_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_reshard_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_notif_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  pools_initialized = true;

  if (use_gc) {
    gc = new RGWGC();
    gc->initialize(cct, this);
  } else {
    ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
  }

  obj_expirer = new RGWObjectExpirer(this->driver);

  // the object expirer thread is only started together with the GC thread
  if (use_gc_thread && use_gc) {
    gc->start_processor();
    obj_expirer->start_processor();
  }

  auto& current_period = svc.zone->get_current_period();
  auto& zonegroup = svc.zone->get_zonegroup();
  auto& zone_params = svc.zone->get_zone_params();
  auto& zone = svc.zone->get_zone();

  /* no point of running sync thread if we don't have a master zone configured
    or there is no rest_master_conn */
  if (!svc.zone->need_to_sync()) {
    run_sync_thread = false;
  }

  if (svc.zone->is_meta_master()) {
    auto md_log = svc.mdlog->get_log(current_period.get_id());
    meta_notifier = new RGWMetaNotifier(this, md_log);
    meta_notifier->start();
  }

  /* init it anyway, might run sync through radosgw-admin explicitly */
  sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
  sync_tracer->init(this);
  ret = sync_tracer->hook_to_admin_command();
  if (ret < 0) {
    return ret;
  }

  if (run_sync_thread) {
    // warn about zonegroup placement targets this zone has no pools for
    for (const auto &pt: zonegroup.placement_targets) {
      if (zone_params.placement_pools.find(pt.second.name)
          ==
zone_params.placement_pools.end()){ + ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target " + << pt.second.name << " present in zonegroup" << dendl; + } + } + auto async_processor = svc.rados->get_async_processor(); + std::lock_guard l{meta_sync_thread_lock}; + meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor); + ret = meta_sync_processor_thread->init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl; + return ret; + } + meta_sync_processor_thread->start(); + + // configure the bucket trim manager + rgw::BucketTrimConfig config; + rgw::configure_bucket_trim(cct, config); + + bucket_trim.emplace(this->driver, config); + ret = bucket_trim->init(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl; + return ret; + } + svc.datalog_rados->set_observer(&*bucket_trim); + + std::lock_guard dl{data_sync_thread_lock}; + for (auto source_zone : svc.zone->get_data_sync_source_zones()) { + ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl; + auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone); + ret = thread->init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl; + return ret; + } + thread->start(); + data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread; + } + auto interval = cct->_conf->rgw_sync_log_trim_interval; + if (interval > 0) { + sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval); + ret = sync_log_trimmer->init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl; + return ret; + } + sync_log_trimmer->start(); + } + } + if (cct->_conf->rgw_data_notify_interval_msec) { + data_notifier = new RGWDataNotifier(this); + data_notifier->start(); + } + + binfo_cache = new 
RGWChainedCacheImpl; + binfo_cache->init(svc.cache); + + lc = new RGWLC(); + lc->initialize(cct, this->driver); + + if (use_lc_thread) + lc->start_processor(); + + quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads); + + bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards : + zone.bucket_index_max_shards); + if (bucket_index_max_shards > get_max_bucket_shards()) { + bucket_index_max_shards = get_max_bucket_shards(); + ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: " + << get_max_bucket_shards() << dendl; + } + ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl; + + bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */ + + if (need_tombstone_cache) { + obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size); + } + + reshard_wait = std::make_shared(); + + reshard = new RGWReshard(this->driver); + + // disable reshard thread based on zone/zonegroup support + run_reshard_thread = run_reshard_thread && svc.zone->can_reshard(); + + if (run_reshard_thread) { + reshard->start_processor(); + } + + index_completion_manager = new RGWIndexCompletionManager(this); + ret = rgw::notify::init(cct, driver, dpp); + if (ret < 0 ) { + ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl; + } + + return ret; +} + +int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp) +{ + if (raw) { + return svc.init_raw(cct, use_cache, null_yield, dpp); + } + + return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp); +} + +int RGWRados::init_ctl(const DoutPrefixProvider *dpp) +{ + return ctl.init(&svc, driver, dpp); +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. 
+ */ +int RGWRados::init_begin(const DoutPrefixProvider *dpp) +{ + int ret = init_svc(false, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + + ret = init_ctl(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + + host_id = svc.zone_utils->gen_host_id(); + + return init_rados(); +} + +/** + * Open the pool used as root for this gateway + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true); +} + +int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true); +} + +int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true); +} + +int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true); +} + +int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true); +} + +int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true); +} + +int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, + bool mostly_omap, bool bulk) +{ + constexpr bool create = true; // create the pool if it doesn't exist + return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, 
mostly_omap, bulk); +} + +/**** logs ****/ + +struct log_list_state { + string prefix; + librados::IoCtx io_ctx; + librados::NObjectIterator obit; +}; + +int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle) +{ + log_list_state *state = new log_list_state; + int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); + if (r < 0) { + delete state; + return r; + } + try { + state->prefix = prefix; + state->obit = state->io_ctx.nobjects_begin(); + *handle = (RGWAccessHandle)state; + return 0; + } catch (const std::system_error& e) { + r = -e.code().value(); + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +int RGWRados::log_list_next(RGWAccessHandle handle, string *name) +{ + log_list_state *state = static_cast(handle); + while (true) { + if (state->obit == state->io_ctx.nobjects_end()) { + delete state; + return -ENOENT; + } + if (state->prefix.length() && + state->obit->get_oid().find(state->prefix) != 0) { + state->obit++; + continue; + } + *name = state->obit->get_oid(); + state->obit++; + break; + } + return 0; +} + +int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name) +{ + librados::IoCtx io_ctx; + int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); + if (r < 0) + return r; + return io_ctx.remove(name); +} + +struct log_show_state { + librados::IoCtx io_ctx; + bufferlist bl; + bufferlist::const_iterator p; + string name; + uint64_t pos; + bool eof; + log_show_state() : pos(0), eof(false) {} +}; + +int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle) +{ + log_show_state *state = new log_show_state; + int r = rgw_init_ioctx(dpp, get_rados_handle(), 
svc.zone->get_zone_params().log_pool, state->io_ctx); + if (r < 0) { + delete state; + return r; + } + state->name = name; + *handle = (RGWAccessHandle)state; + return 0; +} + +int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry) +{ + log_show_state *state = static_cast(handle); + off_t off = state->p.get_off(); + + ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length() + << " off " << off + << " eof " << (int)state->eof + << dendl; + // read some? + unsigned chunk = 1024*1024; + if ((state->bl.length() - off) < chunk/2 && !state->eof) { + bufferlist more; + int r = state->io_ctx.read(state->name, more, chunk, state->pos); + if (r < 0) + return r; + state->pos += r; + bufferlist old; + try { + old.substr_of(state->bl, off, state->bl.length() - off); + } catch (buffer::error& err) { + return -EINVAL; + } + state->bl = std::move(old); + state->bl.claim_append(more); + state->p = state->bl.cbegin(); + if ((unsigned)r < chunk) + state->eof = true; + ldpp_dout(dpp, 10) << " read " << r << dendl; + } + + if (state->p.end()) + return 0; // end of file + try { + decode(*entry, state->p); + } + catch (const buffer::error &e) { + return -EINVAL; + } + return 1; +} + +/** + * usage_log_hash: get usage log key hash, based on name and index + * + * Get the usage object name. Since a user may have more than 1 + * object holding that info (multiple shards), we use index to + * specify that shard number. Once index exceeds max shards it + * wraps. + * If name is not being set, results for all users will be returned + * and index will wrap only after total shards number. 
+ * + * @param cct [in] ceph context + * @param name [in] user name + * @param hash [out] hash value + * @param index [in] shard index number + */ +static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index) +{ + uint32_t val = index; + + if (!name.empty()) { + int max_user_shards = cct->_conf->rgw_usage_max_user_shards; + val %= max_user_shards; + val += ceph_str_hash_linux(name.c_str(), name.size()); + } + char buf[17]; + int max_shards = cct->_conf->rgw_usage_max_shards; + snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards)); + hash = buf; +} + +int RGWRados::log_usage(const DoutPrefixProvider *dpp, map& usage_info) +{ + uint32_t index = 0; + + map log_objs; + + string hash; + string last_user; + + /* restructure usage map, zone by object hash */ + map::iterator iter; + for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + RGWUsageBatch& info = iter->second; + + if (ub.user.empty()) { + ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl; + continue; + } + + if (ub.user != last_user) { + /* index *should* be random, but why waste extra cycles + in most cases max user shards is not going to exceed 1, + so just incrementing it */ + usage_log_hash(cct, ub.user, hash, index++); + } + last_user = ub.user; + vector& v = log_objs[hash].entries; + + for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) { + v.push_back(miter->second); + } + } + + map::iterator liter; + + for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) { + int r = cls_obj_usage_log_add(dpp, liter->first, liter->second); + if (r < 0) + return r; + } + return 0; +} + +int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map& 
usage) +{ + uint32_t num = max_entries; + string hash, first_hash; + string user_str = user.to_str(); + usage_log_hash(cct, user_str, first_hash, 0); + + if (usage_iter.index) { + usage_log_hash(cct, user_str, hash, usage_iter.index); + } else { + hash = first_hash; + } + + usage.clear(); + + do { + map ret_usage; + map::iterator iter; + + int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num, + usage_iter.read_iter, ret_usage, is_truncated); + if (ret == -ENOENT) + goto next; + + if (ret < 0) + return ret; + + num -= ret_usage.size(); + + for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) { + usage[iter->first].aggregate(iter->second); + } + +next: + if (!*is_truncated) { + usage_iter.read_iter.clear(); + usage_log_hash(cct, user_str, hash, ++usage_iter.index); + } + } while (num && !*is_truncated && hash != first_hash); + return 0; +} + +int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch) +{ + uint32_t index = 0; + string hash, first_hash; + string user_str = user.to_str(); + usage_log_hash(cct, user_str, first_hash, index); + + hash = first_hash; + do { + int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch); + + if (ret < 0 && ret != -ENOENT) + return ret; + + usage_log_hash(cct, user_str, hash, ++index); + } while (hash != first_hash); + + return 0; +} + + +int RGWRados::clear_usage(const DoutPrefixProvider *dpp) +{ + auto max_shards = cct->_conf->rgw_usage_max_shards; + int ret=0; + for (unsigned i=0; i < max_shards; i++){ + string oid = RGW_USAGE_OBJ_PREFIX + to_string(i); + ret = cls_obj_usage_log_clear(dpp, oid); + if (ret < 0){ + ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl; + return ret; + } + } + return ret; +} + +int RGWRados::decode_policy(const DoutPrefixProvider *dpp, + ceph::buffer::list& bl, + ACLOwner *owner) +{ + auto i = 
bl.cbegin(); + RGWAccessControlPolicy policy(cct); + try { + policy.decode_owner(i); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + *owner = policy.get_owner(); + return 0; +} + +int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp) +{ + rgw_bucket bucket = bucket_info.bucket; + bucket.update_bucket_id(new_bucket_id); + + bucket_info.objv_tracker.clear(); + int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp); + if (ret < 0) { + return ret; + } + + return 0; +} + + +/** + * Get ordered listing of the objects in a bucket. + * + * max_p: maximum number of results to return + * bucket: bucket to list contents of + * prefix: only return results that match this prefix + * delim: do not include results that match this string. + * Any skipped results will have the matching portion of their name + * inserted in common_prefixes with a "true" mark. + * marker: if filled in, begin the listing with this object. + * end_marker: if filled in, end the listing with this object. + * result: the objects are put in here. + * common_prefixes: if delim is filled in, any matching prefixes are + * placed here. + * is_truncated: if number of objects in the bucket is bigger than + * max, then truncated. 
+ */ +int RGWRados::Bucket::List::list_objects_ordered( + const DoutPrefixProvider *dpp, + int64_t max_p, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y) +{ + RGWRados *store = target->get_store(); + CephContext *cct = store->ctx(); + int shard_id = target->get_shard_id(); + const auto& current_index = target->get_bucket_info().layout.current_index; + + int count = 0; + bool truncated = true; + bool cls_filtered = false; + const int64_t max = // protect against memory issues and negative vals + std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); + int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max); + + result->clear(); + + // use a local marker; either the marker will have a previous entry + // or it will be empty; either way it's OK to copy + rgw_obj_key marker_obj(params.marker.name, + params.marker.instance, + params.ns.empty() ? params.marker.ns : params.ns); + rgw_obj_index_key cur_marker; + marker_obj.get_index_key(&cur_marker); + + rgw_obj_key end_marker_obj(params.end_marker.name, + params.end_marker.instance, + params.ns.empty() ? 
params.end_marker.ns : params.ns); + rgw_obj_index_key cur_end_marker; + end_marker_obj.get_index_key(&cur_end_marker); + const bool cur_end_marker_valid = !params.end_marker.empty(); + + rgw_obj_key prefix_obj(params.prefix); + prefix_obj.set_ns(params.ns); + std::string cur_prefix = prefix_obj.get_index_key_name(); + std::string after_delim_s; /* needed in !params.delim.empty() AND later */ + + if (!params.delim.empty()) { + after_delim_s = cls_rgw_after_delim(params.delim); + /* if marker points at a common prefix, fast forward it into its + * upper bound string */ + int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size()); + if (delim_pos >= 0) { + string s = cur_marker.name.substr(0, delim_pos); + s.append(after_delim_s); + cur_marker = s; + } + } + + // we'll stop after this many attempts as long we return at least + // one entry; but we will also go beyond this number of attempts + // until we return at least one entry + constexpr uint16_t SOFT_MAX_ATTEMPTS = 8; + + rgw_obj_index_key prev_marker; + for (uint16_t attempt = 1; /* empty */; ++attempt) { + ldpp_dout(dpp, 20) << __func__ << + ": starting attempt " << attempt << dendl; + + if (attempt > 1 && !(prev_marker < cur_marker)) { + // we've failed to make forward progress + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " marker failed to make forward progress; attempt=" << attempt << + ", prev_marker=" << prev_marker << + ", cur_marker=" << cur_marker << dendl; + break; + } + prev_marker = cur_marker; + + ent_map_t ent_map; + ent_map.reserve(read_ahead); + int r = store->cls_bucket_list_ordered(dpp, + target->get_bucket_info(), + current_index, + shard_id, + cur_marker, + cur_prefix, + params.delim, + read_ahead + 1 - count, + params.list_versions, + attempt, + ent_map, + &truncated, + &cls_filtered, + &cur_marker, + y, + params.force_check_filter); + if (r < 0) { + return r; + } + + for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) { + rgw_bucket_dir_entry& entry = 
eiter->second; + rgw_obj_index_key index_key = entry.key; + rgw_obj_key obj(index_key); + + ldpp_dout(dpp, 20) << __func__ << + ": considering entry " << entry.key << dendl; + + /* note that parse_raw_oid() here will not set the correct + * object's instance, as rgw_obj_index_key encodes that + * separately. We don't need to set the instance because it's + * not needed for the checks here and we end up using the raw + * entry for the return vector + */ + bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj); + if (!valid) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " could not parse object name: " << obj.name << dendl; + continue; + } + + bool matched_ns = (obj.ns == params.ns); + if (!params.list_versions && !entry.is_visible()) { + ldpp_dout(dpp, 10) << __func__ << + ": skipping not visible entry \"" << entry.key << "\"" << dendl; + continue; + } + + if (params.enforce_ns && !matched_ns) { + if (!params.ns.empty()) { + /* we've iterated past the namespace we're searching -- done now */ + truncated = false; + ldpp_dout(dpp, 10) << __func__ << + ": finished due to getting past requested namespace \"" << + params.ns << "\"" << dendl; + goto done; + } + + /* we're skipping past namespaced objects */ + ldpp_dout(dpp, 20) << __func__ << + ": skipping past namespaced objects, including \"" << entry.key << + "\"" << dendl; + continue; + } + + if (cur_end_marker_valid && cur_end_marker <= index_key) { + truncated = false; + ldpp_dout(dpp, 10) << __func__ << + ": finished due to gitting end marker of \"" << cur_end_marker << + "\" with \"" << entry.key << "\"" << dendl; + goto done; + } + + if (count < max) { + params.marker = index_key; + next_marker = index_key; + } + + if (params.access_list_filter && + ! 
params.access_list_filter->filter(obj.name, index_key.name)) { + ldpp_dout(dpp, 20) << __func__ << + ": skipping past namespaced objects, including \"" << entry.key << + "\"" << dendl; + continue; + } + + if (params.prefix.size() && + 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) { + ldpp_dout(dpp, 20) << __func__ << + ": skipping object \"" << entry.key << + "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl; + continue; + } + + if (!params.delim.empty()) { + const int delim_pos = obj.name.find(params.delim, params.prefix.size()); + if (delim_pos >= 0) { + // run either the code where delimiter filtering is done a) + // in the OSD/CLS or b) here. + if (cls_filtered) { + // NOTE: this condition is for the newer versions of the + // OSD that does filtering on the CLS side should only + // find one delimiter at the end if it finds any after the + // prefix + if (delim_pos != + int(obj.name.length() - params.delim.length())) { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << + " found delimiter in place other than the end of " + "the prefix; obj.name=" << obj.name << + ", prefix=" << params.prefix << dendl; + } + if (common_prefixes) { + if (count >= max) { + truncated = true; + ldpp_dout(dpp, 10) << __func__ << + ": stopping early with common prefix \"" << entry.key << + "\" because requested number (" << max << + ") reached (cls filtered)" << dendl; + goto done; + } + + (*common_prefixes)[obj.name] = true; + count++; + } + + ldpp_dout(dpp, 20) << __func__ << + ": finished entry with common prefix \"" << entry.key << + "\" so continuing loop (cls filtered)" << dendl; + continue; + } else { + // NOTE: this condition is for older versions of the OSD + // that do not filter on the CLS side, so the following code + // must do the filtering; once we reach version 16 of ceph, + // this code can be removed along with the conditional that + // can lead this way + + /* extract key -with trailing delimiter- for CommonPrefix */ + string 
prefix_key = + obj.name.substr(0, delim_pos + params.delim.length()); + + if (common_prefixes && + common_prefixes->find(prefix_key) == common_prefixes->end()) { + if (count >= max) { + truncated = true; + ldpp_dout(dpp, 10) << __func__ << + ": stopping early with common prefix \"" << entry.key << + "\" because requested number (" << max << + ") reached (not cls filtered)" << dendl; + goto done; + } + next_marker = prefix_key; + (*common_prefixes)[prefix_key] = true; + + count++; + } + + ldpp_dout(dpp, 20) << __func__ << + ": finished entry with common prefix \"" << entry.key << + "\" so continuing loop (not cls filtered)" << dendl; + continue; + } // if we're running an older OSD version + } // if a delimiter was found after prefix + } // if a delimiter was passed in + + if (count >= max) { + truncated = true; + ldpp_dout(dpp, 10) << __func__ << + ": stopping early with entry \"" << entry.key << + "\" because requested number (" << max << + ") reached" << dendl; + goto done; + } + + ldpp_dout(dpp, 20) << __func__ << + ": adding entry " << entry.key << " to result" << dendl; + + result->emplace_back(std::move(entry)); + count++; + } // eiter for loop + + // NOTE: the following conditional is needed by older versions of + // the OSD that don't do delimiter filtering on the CLS side; once + // we reach version 16 of ceph, the following conditional and the + // code within can be removed + if (!cls_filtered && !params.delim.empty()) { + int marker_delim_pos = + cur_marker.name.find(params.delim, cur_prefix.size()); + if (marker_delim_pos >= 0) { + std::string skip_after_delim = + cur_marker.name.substr(0, marker_delim_pos); + skip_after_delim.append(after_delim_s); + + ldpp_dout(dpp, 20) << __func__ << + ": skip_after_delim=" << skip_after_delim << dendl; + + if (skip_after_delim > cur_marker.name) { + cur_marker = skip_after_delim; + ldpp_dout(dpp, 20) << __func__ << + ": setting cur_marker=" << cur_marker.name << + "[" << cur_marker.instance << "]" << dendl; + } + } 
+ } // if older osd didn't do delimiter filtering + + ldpp_dout(dpp, 10) << __func__ << + ": end of outer loop, truncated=" << truncated << + ", count=" << count << ", attempt=" << attempt << dendl; + + if (!truncated || count >= (max + 1) / 2) { + // if we finished listing, or if we're returning at least half the + // requested entries, that's enough; S3 and swift protocols allow + // returning fewer than max entries + ldpp_dout(dpp, 10) << __func__ << + ": exiting attempt loop because we reached end (" << truncated << + ") or we're returning half the requested entries (" << count << + " of " << max << ")" << dendl; + break; + } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) { + // if we've made at least 8 attempts and we have some, but very + // few, results, return with what we have + ldpp_dout(dpp, 10) << __func__ << + ": exiting attempt loop because we made " << attempt << + " attempts and we're returning " << count << " entries" << dendl; + break; + } + } // for (uint16_t attempt... + +done: + + if (is_truncated) { + *is_truncated = truncated; + } + + return 0; +} // list_objects_ordered + + +/** + * Get listing of the objects in a bucket and allow the results to be out + * of order. + * + * Even though there are key differences with the ordered counterpart, + * the parameters are the same to maintain some compatability. + * + * max: maximum number of results to return + * bucket: bucket to list contents of + * prefix: only return results that match this prefix + * delim: should not be set; if it is we should have indicated an error + * marker: if filled in, begin the listing with this object. + * end_marker: if filled in, end the listing with this object. + * result: the objects are put in here. + * common_prefixes: this is never filled with an unordered list; the param + * is maintained for compatibility + * is_truncated: if number of objects in the bucket is bigger than max, then + * truncated. 
+ */ +int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp, + int64_t max_p, + std::vector* result, + std::map* common_prefixes, + bool* is_truncated, + optional_yield y) +{ + RGWRados *store = target->get_store(); + int shard_id = target->get_shard_id(); + const auto& current_index = target->get_bucket_info().layout.current_index; + + int count = 0; + bool truncated = true; + + const int64_t max = // protect against memory issues and negative vals + std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); + + // read a few extra in each call to cls_bucket_list_unordered in + // case some are filtered out due to namespace matching, versioning, + // filtering, etc. + const int64_t max_read_ahead = 100; + const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead)); + + result->clear(); + + // use a local marker; either the marker will have a previous entry + // or it will be empty; either way it's OK to copy + rgw_obj_key marker_obj(params.marker.name, + params.marker.instance, + params.ns.empty() ? params.marker.ns : params.ns); + rgw_obj_index_key cur_marker; + marker_obj.get_index_key(&cur_marker); + + rgw_obj_key end_marker_obj(params.end_marker.name, + params.end_marker.instance, + params.ns.empty() ? 
params.end_marker.ns : params.ns); + rgw_obj_index_key cur_end_marker; + end_marker_obj.get_index_key(&cur_end_marker); + const bool cur_end_marker_valid = !params.end_marker.empty(); + + rgw_obj_key prefix_obj(params.prefix); + prefix_obj.set_ns(params.ns); + std::string cur_prefix = prefix_obj.get_index_key_name(); + + while (truncated && count <= max) { + std::vector ent_list; + ent_list.reserve(read_ahead); + + int r = store->cls_bucket_list_unordered(dpp, + target->get_bucket_info(), + current_index, + shard_id, + cur_marker, + cur_prefix, + read_ahead, + params.list_versions, + ent_list, + &truncated, + &cur_marker, + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " cls_bucket_list_unordered returned " << r << " for " << + target->get_bucket_info().bucket << dendl; + return r; + } + + // NB: while regions of ent_list will be sorted, we have no + // guarantee that all items will be sorted since they can cross + // shard boundaries + + for (auto& entry : ent_list) { + rgw_obj_index_key index_key = entry.key; + rgw_obj_key obj(index_key); + + if (count < max) { + params.marker.set(index_key); + next_marker.set(index_key); + } + + /* note that parse_raw_oid() here will not set the correct + * object's instance, as rgw_obj_index_key encodes that + * separately. 
We don't need to set the instance because it's + * not needed for the checks here and we end up using the raw + * entry for the return vector + */ + bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj); + if (!valid) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " could not parse object name: " << obj.name << dendl; + continue; + } + + if (!params.list_versions && !entry.is_visible()) { + ldpp_dout(dpp, 20) << __func__ << + ": skippping \"" << index_key << + "\" because not listing versions and entry not visibile" << dendl; + continue; + } + + if (params.enforce_ns && obj.ns != params.ns) { + ldpp_dout(dpp, 20) << __func__ << + ": skippping \"" << index_key << + "\" because namespace does not match" << dendl; + continue; + } + + if (cur_end_marker_valid && cur_end_marker <= index_key) { + // we're not guaranteed items will come in order, so we have + // to loop through all + ldpp_dout(dpp, 20) << __func__ << + ": skippping \"" << index_key << + "\" because after end_marker" << dendl; + continue; + } + + if (params.access_list_filter && + !params.access_list_filter->filter(obj.name, index_key.name)) { + ldpp_dout(dpp, 20) << __func__ << + ": skippping \"" << index_key << + "\" because doesn't match filter" << dendl; + continue; + } + + if (params.prefix.size() && + (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) { + ldpp_dout(dpp, 20) << __func__ << + ": skippping \"" << index_key << + "\" because doesn't match prefix" << dendl; + continue; + } + + if (count >= max) { + truncated = true; + goto done; + } + + result->emplace_back(std::move(entry)); + count++; + } // for (auto& entry : ent_list) + } // while (truncated && count <= max) + +done: + + if (is_truncated) { + *is_truncated = truncated; + } + + return 0; +} // list_objects_unordered + + +/** + * create a rados pool, associated meta info + * returns 0 on success, -ERR# otherwise. 
+ */ +int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool) +{ + librados::IoCtx io_ctx; + constexpr bool create = true; + return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create); +} + +void RGWRados::create_bucket_id(string *bucket_id) +{ + uint64_t iid = instance_id(); + uint64_t bid = next_bucket_id(); + char buf[svc.zone->get_zone_params().get_id().size() + 48]; + snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64, + svc.zone->get_zone_params().get_id().c_str(), iid, bid); + *bucket_id = buf; +} + +int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, + const string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + map& attrs, + RGWBucketInfo& info, + obj_version *pobjv, + obj_version *pep_objv, + real_time creation_time, + rgw_bucket *pmaster_bucket, + uint32_t *pmaster_num_shards, + optional_yield y, + const DoutPrefixProvider *dpp, + bool exclusive) +{ +#define MAX_CREATE_RETRIES 20 /* need to bound retries */ + rgw_placement_rule selected_placement_rule; + RGWZonePlacementInfo rule_info; + + for (int i = 0; i < MAX_CREATE_RETRIES; i++) { + int ret = 0; + ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule, + &selected_placement_rule, &rule_info, y); + if (ret < 0) + return ret; + + if (!pmaster_bucket) { + create_bucket_id(&bucket.marker); + bucket.bucket_id = bucket.marker; + } else { + bucket.marker = pmaster_bucket->marker; + bucket.bucket_id = pmaster_bucket->bucket_id; + } + + RGWObjVersionTracker& objv_tracker = info.objv_tracker; + + objv_tracker.read_version.clear(); + + if (pobjv) { + objv_tracker.write_version = *pobjv; + } else { + objv_tracker.generate_new_write_ver(cct); + } + + info.bucket = bucket; + info.owner = owner.user_id; + info.zonegroup = zonegroup_id; + info.placement_rule = selected_placement_rule; + info.swift_ver_location = swift_ver_location; + 
info.swift_versioning = (!swift_ver_location.empty()); + + init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(), + pmaster_num_shards ? + std::optional{*pmaster_num_shards} : + std::nullopt, + rule_info.index_type); + + info.requester_pays = false; + if (real_clock::is_zero(creation_time)) { + info.creation_time = ceph::real_clock::now(); + } else { + info.creation_time = creation_time; + } + if (pquota_info) { + info.quota = *pquota_info; + } + + int r = svc.bi->init_index(dpp, info, info.layout.current_index); + if (r < 0) { + return r; + } + + ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp, y); + if (ret == -ECANCELED) { + ret = -EEXIST; + } + if (ret == -EEXIST) { + /* we need to reread the info and return it, caller will have a use for it */ + RGWBucketInfo orig_info; + r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL); + if (r < 0) { + if (r == -ENOENT) { + continue; + } + ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl; + return r; + } + + /* only remove it if it's a different bucket instance */ + if (orig_info.bucket.bucket_id != bucket.bucket_id) { + int r = svc.bi->clean_index(dpp, info, info.layout.current_index); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl; + } + r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl; + /* continue anyway */ + } + } + + info = std::move(orig_info); + /* ret == -EEXIST here */ + } + return ret; + } + + /* this is highly unlikely */ + ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl; + return -ENOENT; +} + +bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& 
obj, rgw_raw_obj *raw_obj) +{ + get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc); + + return get_obj_data_pool(placement_rule, obj, &raw_obj->pool); +} + +std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y) +{ + return svc.rados->cluster_fsid(); +} + +int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + librados::IoCtx *ioctx) +{ + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + + rgw_pool pool; + if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) { + ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << + ", probably misconfiguration" << dendl; + return -EIO; + } + + int r = open_pool_ctx(dpp, pool, *ioctx, false, true); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() << + " for obj=" << obj << " with error-code=" << r << dendl; + return r; + } + + ioctx->locator_set_key(key); + + return 0; +} + +int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, + const rgw_placement_rule& target_placement_rule, + const rgw_obj& obj, + rgw_rados_ref *ref) +{ + get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc); + + rgw_pool pool; + if (!get_obj_data_pool(target_placement_rule, obj, &pool)) { + ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl; + return -EIO; + } + + ref->pool = svc.rados->pool(pool); + + int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams() + .set_mostly_omap(false)); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl; + return r; + } + + ref->pool.ioctx().locator_set_key(ref->obj.loc); + + return 0; +} + +int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + rgw_rados_ref *ref) +{ + return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref); +} 
+ +int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref) +{ + ref->obj = obj; + + if (ref->obj.oid.empty()) { + ref->obj.oid = obj.pool.to_str(); + ref->obj.pool = svc.zone->get_zone_params().domain_root; + } + ref->pool = svc.rados->pool(obj.pool); + int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams() + .set_mostly_omap(false)); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl; + return r; + } + + ref->pool.ioctx().locator_set_key(ref->obj.loc); + + return 0; +} + +int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref) +{ + return get_raw_obj_ref(dpp, obj, ref); +} + +/* + * fixes an issue where head objects were supposed to have a locator created, but ended + * up without one + */ +int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key) +{ + const rgw_bucket& bucket = bucket_info.bucket; + string oid; + string locator; + + rgw_obj obj(bucket, key); + + get_obj_bucket_and_oid_loc(obj, oid, locator); + + if (locator.empty()) { + ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl; + return 0; + } + + librados::IoCtx ioctx; + + int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx); + if (ret < 0) { + cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl; + return ret; + } + ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */ + + uint64_t size; + bufferlist data; + + struct timespec mtime_ts; + map attrs; + librados::ObjectReadOperation op; + op.getxattrs(&attrs, NULL); + op.stat2(&size, &mtime_ts, NULL); +#define HEAD_SIZE 512 * 1024 + op.read(0, HEAD_SIZE, &data, NULL); + + ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid 
<< ") returned ret=" << ret << dendl; + return ret; + } + + if (size > HEAD_SIZE) { + ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl; + return -EIO; + } + + if (size != data.length()) { + ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl; + return -EIO; + } + + if (copy_obj) { + librados::ObjectWriteOperation wop; + + wop.mtime2(&mtime_ts); + + map::iterator iter; + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + wop.setxattr(iter->first.c_str(), iter->second); + } + + wop.write(0, data); + + ioctx.locator_set_key(locator); + rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield); + } + + if (remove_bad) { + ioctx.locator_set_key(string()); + + ret = ioctx.remove(oid); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl; + return ret; + } + } + + return 0; +} + +int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp, + librados::IoCtx& src_ioctx, + const string& src_oid, const string& src_locator, + librados::IoCtx& dst_ioctx, + const string& dst_oid, const string& dst_locator) +{ + +#define COPY_BUF_SIZE (4 * 1024 * 1024) + bool done = false; + uint64_t chunk_size = COPY_BUF_SIZE; + uint64_t ofs = 0; + int ret = 0; + real_time mtime; + struct timespec mtime_ts; + uint64_t size; + + if (src_oid == dst_oid && src_locator == dst_locator) { + return 0; + } + + src_ioctx.locator_set_key(src_locator); + dst_ioctx.locator_set_key(dst_locator); + + do { + bufferlist data; + ObjectReadOperation rop; + ObjectWriteOperation wop; + + if (ofs == 0) { + rop.stat2(&size, &mtime_ts, NULL); + mtime = real_clock::from_timespec(mtime_ts); + } + rop.read(ofs, chunk_size, &data, NULL); + ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield); + if (ret < 0) { + goto done_err; + } + + if (data.length() == 0) { + break; + } + + if (ofs == 0) { + wop.create(true); /* make it 
exclusive */ + wop.mtime2(&mtime_ts); + mtime = real_clock::from_timespec(mtime_ts); + } + wop.write(ofs, data); + ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield); + if (ret < 0) { + goto done_err; + } + ofs += data.length(); + done = data.length() != chunk_size; + } while (!done); + + if (ofs != size) { + ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid + << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl; + ret = -EIO; + goto done_err; + } + + src_ioctx.remove(src_oid); + + return 0; + +done_err: + // TODO: clean up dst_oid if we created it + ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl; + return ret; +} + +/* + * fixes an issue where head objects were supposed to have a locator created, but ended + * up without one + */ +int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, rgw_obj_key& key, + bool fix, bool *need_fix, optional_yield y) +{ + const rgw_bucket& bucket = bucket_info.bucket; + rgw_obj obj(bucket, key); + + if (need_fix) { + *need_fix = false; + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + RGWObjState *astate = nullptr; + RGWObjManifest* manifest = nullptr; + RGWObjectCtx rctx(this->driver); + r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y); + if (r < 0) + return r; + + if (manifest) { + RGWObjManifest::obj_iterator miter; + for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) { + rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this); + rgw_obj loc; + string oid; + string locator; + + RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc); + + if (loc.key.ns.empty()) { + /* continue, we're only interested in tail objects */ + continue; + } + + auto& ioctx = ref.pool.ioctx(); + + get_obj_bucket_and_oid_loc(loc, oid, 
locator); + ref.pool.ioctx().locator_set_key(locator); + + ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl; + + r = ioctx.stat(oid, NULL, NULL); + if (r != -ENOENT) { + continue; + } + + string bad_loc; + prepend_bucket_marker(bucket, loc.key.name, bad_loc); + + /* create a new ioctx with the bad locator */ + librados::IoCtx src_ioctx; + src_ioctx.dup(ioctx); + src_ioctx.locator_set_key(bad_loc); + + r = src_ioctx.stat(oid, NULL, NULL); + if (r != 0) { + /* cannot find a broken part */ + continue; + } + ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl; + if (need_fix) { + *need_fix = true; + } + if (fix) { + r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl; + } + } + } + } + + return 0; +} + +int RGWRados::BucketShard::init(const rgw_bucket& _bucket, + const rgw_obj& obj, + RGWBucketInfo* bucket_info_out, + const DoutPrefixProvider *dpp) +{ + bucket = _bucket; + + RGWBucketInfo bucket_info; + RGWBucketInfo* bucket_info_p = + bucket_info_out ? 
bucket_info_out : &bucket_info; + + int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp); + if (ret < 0) { + return ret; + } + + string oid; + + ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + bucket = bucket_info.bucket; + + int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, + obj.get_hash_object(), + &bucket_obj, + &shard_id); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, + int sid) +{ + bucket = bucket_info.bucket; + shard_id = sid; + + int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, index, + shard_id, &bucket_obj); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + + +/* Execute @handler on last item in bucket listing for bucket specified + * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing + * to objects matching these criterias. 
*/ +int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::string& obj_prefix, + const std::string& obj_delim, + std::function handler) +{ + RGWRados::Bucket target(this, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = obj_prefix; + list_op.params.delim = obj_delim; + + ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name + << ", obj_prefix=" << obj_prefix + << ", obj_delim=" << obj_delim + << dendl; + + bool is_truncated = false; + + boost::optional last_entry; + /* We need to rewind to the last object in a listing. */ + do { + /* List bucket entries in chunks. */ + static constexpr int MAX_LIST_OBJS = 100; + std::vector entries(MAX_LIST_OBJS); + + int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr, + &is_truncated, null_yield); + if (ret < 0) { + return ret; + } else if (!entries.empty()) { + last_entry = entries.back(); + } + } while (is_truncated); + + if (last_entry) { + return handler(*last_entry); + } + + /* Empty listing - no items we can run handler on. */ + return 0; +} + +bool RGWRados::swift_versioning_enabled(const RGWBucketInfo& bucket_info) const +{ + return bucket_info.has_swift_versioning() && + bucket_info.swift_ver_location.size(); +} + +int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx, + const rgw_user& user, + RGWBucketInfo& bucket_info, + const rgw_obj& obj, + const DoutPrefixProvider *dpp, + optional_yield y) +{ + if (! 
swift_versioning_enabled(bucket_info)) { + return 0; + } + + obj_ctx.set_atomic(obj); + + RGWObjState * state = nullptr; + RGWObjManifest *manifest = nullptr; + int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &state, &manifest, false, y); + if (r < 0) { + return r; + } + + if (!state->exists) { + return 0; + } + + const string& src_name = obj.get_oid(); + char buf[src_name.size() + 32]; + struct timespec ts = ceph::real_clock::to_timespec(state->mtime); + snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(), + src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000); + + RGWBucketInfo dest_bucket_info; + + r = get_bucket_info(&svc, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, null_yield, NULL); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl; + if (r == -ENOENT) { + return -ERR_PRECONDITION_FAILED; + } + return r; + } + + if (dest_bucket_info.owner != bucket_info.owner) { + return -ERR_PRECONDITION_FAILED; + } + + rgw_obj dest_obj(dest_bucket_info.bucket, buf); + + if (dest_bucket_info.versioning_enabled()){ + gen_rand_obj_instance_name(&dest_obj); + } + + obj_ctx.set_atomic(dest_obj); + + rgw_zone_id no_zone; + + r = copy_obj(obj_ctx, + user, + NULL, /* req_info *info */ + no_zone, + dest_obj, + obj, + dest_bucket_info, + bucket_info, + bucket_info.placement_rule, + NULL, /* time_t *src_mtime */ + NULL, /* time_t *mtime */ + NULL, /* const time_t *mod_ptr */ + NULL, /* const time_t *unmod_ptr */ + false, /* bool high_precision_time */ + NULL, /* const char *if_match */ + NULL, /* const char *if_nomatch */ + RGWRados::ATTRSMOD_NONE, + true, /* bool copy_if_newer */ + state->attrset, + RGWObjCategory::Main, + 0, /* uint64_t olh_epoch */ + real_time(), /* time_t delete_at */ + NULL, /* string *version_id */ + NULL, /* string *ptag */ + NULL, /* string *petag */ + NULL, /* void (*progress_cb)(off_t, void *) */ + NULL, /* void *progress_data */ + dpp, + 
null_yield); + if (r == -ECANCELED || r == -ENOENT) { + /* Has already been overwritten, meaning another rgw process already + * copied it out */ + return 0; + } + + return r; +} + +int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx, + const rgw_user& user, + RGWBucketInfo& bucket_info, + rgw_obj& obj, + bool& restored, + const DoutPrefixProvider *dpp) +{ + if (! swift_versioning_enabled(bucket_info)) { + return 0; + } + + /* Bucket info of the bucket that stores previous versions of our object. */ + RGWBucketInfo archive_binfo; + + int ret = get_bucket_info(&svc, bucket_info.bucket.tenant, + bucket_info.swift_ver_location, + archive_binfo, nullptr, null_yield, nullptr); + if (ret < 0) { + return ret; + } + + /* Abort the operation if the bucket storing our archive belongs to someone + * else. This is a limitation in comparison to Swift as we aren't taking ACLs + * into consideration. For we can live with that. + * + * TODO: delegate this check to un upper layer and compare with ACLs. */ + if (bucket_info.owner != archive_binfo.owner) { + return -EPERM; + } + + /* This code will be executed on latest version of the object. */ + const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int { + rgw_zone_id no_zone; + + /* We don't support object versioning of Swift API on those buckets that + * are already versioned using the S3 mechanism. This affects also bucket + * storing archived objects. Otherwise the delete operation would create + * a deletion marker. */ + if (archive_binfo.versioned()) { + restored = false; + return -ERR_PRECONDITION_FAILED; + } + + /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly + * irrelevant and may be safely skipped. 
*/ + std::map no_attrs; + + rgw_obj archive_obj(archive_binfo.bucket, entry.key); + + if (bucket_info.versioning_enabled()){ + gen_rand_obj_instance_name(&obj); + } + + obj_ctx.set_atomic(archive_obj); + obj_ctx.set_atomic(obj); + + int ret = copy_obj(obj_ctx, + user, + nullptr, /* req_info *info */ + no_zone, + obj, /* dest obj */ + archive_obj, /* src obj */ + bucket_info, /* dest bucket info */ + archive_binfo, /* src bucket info */ + bucket_info.placement_rule, /* placement_rule */ + nullptr, /* time_t *src_mtime */ + nullptr, /* time_t *mtime */ + nullptr, /* const time_t *mod_ptr */ + nullptr, /* const time_t *unmod_ptr */ + false, /* bool high_precision_time */ + nullptr, /* const char *if_match */ + nullptr, /* const char *if_nomatch */ + RGWRados::ATTRSMOD_NONE, + true, /* bool copy_if_newer */ + no_attrs, + RGWObjCategory::Main, + 0, /* uint64_t olh_epoch */ + real_time(), /* time_t delete_at */ + nullptr, /* string *version_id */ + nullptr, /* string *ptag */ + nullptr, /* string *petag */ + nullptr, /* void (*progress_cb)(off_t, void *) */ + nullptr, /* void *progress_data */ + dpp, + null_yield); + if (ret == -ECANCELED || ret == -ENOENT) { + /* Has already been overwritten, meaning another rgw process already + * copied it out */ + return 0; + } else if (ret < 0) { + return ret; + } else { + restored = true; + } + + /* Need to remove the archived copy. 
*/ + ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj, + archive_binfo.versioning_status()); + + return ret; + }; + + const std::string& obj_name = obj.get_oid(); + const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size() + % obj_name); + + return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(), + handler); +} + +int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, + uint64_t size, uint64_t accounted_size, + map& attrs, + bool assume_noent, bool modify_tail, + void *_index_op, optional_yield y) +{ + RGWRados::Bucket::UpdateIndex *index_op = static_cast(_index_op); + RGWRados *store = target->get_store(); + + ObjectWriteOperation op; +#ifdef WITH_LTTNG + const req_state* s = get_req_state(); + string req_id; + if (!s) { + // fake req_id + req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id()); + } else { + req_id = s->req_id; + } +#endif + + RGWObjState *state; + RGWObjManifest *manifest = nullptr; + int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent); + if (r < 0) + return r; + + rgw_obj& obj = target->get_obj(); + + if (obj.get_oid().empty()) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl; + return -EIO; + } + + rgw_rados_ref ref; + r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref); + if (r < 0) + return r; + + bool is_olh = state->is_olh; + + bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0; + + const string *ptag = meta.ptag; + if (!ptag && !index_op->get_optag()->empty()) { + ptag = index_op->get_optag(); + } + r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y); + if (r < 0) + return r; + + if (real_clock::is_zero(meta.set_mtime)) { + meta.set_mtime = real_clock::now(); + } + + if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == 
PUT_OBJ_CREATE) { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter == attrs.end()) { + real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime); + string mode = target->get_bucket_info().obj_lock.get_mode(); + RGWObjectRetention obj_retention(mode, lock_until_date); + bufferlist bl; + obj_retention.encode(bl); + op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl); + } + } + + if (state->is_olh) { + op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag); + } + + struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime); + op.mtime2(&mtime_ts); + + if (meta.data) { + /* if we want to overwrite the data, we also want to overwrite the + xattrs, so just remove the object */ + op.write_full(*meta.data); + if (state->compressed) { + uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; + op.set_alloc_hint2(0, 0, alloc_hint_flags); + } + } + + string etag; + string content_type; + bufferlist acl_bl; + string storage_class; + + map::iterator iter; + if (meta.rmattrs) { + for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + if (meta.manifest) { + storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class; + + /* remove existing manifest attr */ + iter = attrs.find(RGW_ATTR_MANIFEST); + if (iter != attrs.end()) + attrs.erase(iter); + + bufferlist bl; + encode(*meta.manifest, bl); + op.setxattr(RGW_ATTR_MANIFEST, bl); + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_ETAG) == 0) { + etag = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) { + content_type = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_ACL) == 0) { + acl_bl = bl; + } + } + if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) { 
+ cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER); + } + + if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) { + bufferlist bl; + encode(store->svc.zone->get_zone_short_id(), bl); + op.setxattr(RGW_ATTR_SOURCE_ZONE, bl); + } + + if (!storage_class.empty()) { + bufferlist bl; + bl.append(storage_class); + op.setxattr(RGW_ATTR_STORAGE_CLASS, bl); + } + + if (!op.size()) + return 0; + + uint64_t epoch; + int64_t poolid; + bool orig_exists; + uint64_t orig_size; + + if (!reset_obj) { //Multipart upload, it has immutable head. + orig_exists = false; + orig_size = 0; + } else { + orig_exists = state->exists; + orig_size = state->accounted_size; + } + + bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) || + !obj.key.instance.empty(); + + bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target); + + if (versioned_op) { + index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP); + } + + if (!index_op->is_prepared()) { + tracepoint(rgw_rados, prepare_enter, req_id.c_str()); + r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); + tracepoint(rgw_rados, prepare_exit, req_id.c_str()); + if (r < 0) + return r; + } + + auto& ioctx = ref.pool.ioctx(); + + tracepoint(rgw_rados, operate_enter, req_id.c_str()); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + tracepoint(rgw_rados, operate_exit, req_id.c_str()); + if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under, + or -ENOENT if was removed, or -EEXIST if it did not exist + before and now it does */ + if (r == -EEXIST && assume_noent) { + target->invalidate_state(); + return r; + } + goto done_cancel; + } + + epoch = ioctx.get_last_version(); + poolid = ioctx.get_id(); + + r = target->complete_atomic_modification(dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl; + } + + tracepoint(rgw_rados, complete_enter, req_id.c_str()); + r = index_op->complete(dpp, poolid, epoch, 
size, accounted_size, + meta.set_mtime, etag, content_type, + storage_class, &acl_bl, + meta.category, meta.remove_objs, y, + meta.user_data, meta.appendable); + tracepoint(rgw_rados, complete_exit, req_id.c_str()); + if (r < 0) + goto done_cancel; + + if (meta.mtime) { + *meta.mtime = meta.set_mtime; + } + + /* note that index_op was using state so we couldn't invalidate it earlier */ + target->invalidate_state(); + state = NULL; + + if (versioned_op && meta.olh_epoch) { + r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace); + if (r < 0) { + return r; + } + } + + if (!real_clock::is_zero(meta.delete_at)) { + rgw_obj_index_key obj_key; + obj.key.get_index_key(&obj_key); + + r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name, + obj.bucket.bucket_id, obj_key); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; + /* ignoring error, nothing we can do at this point */ + } + } + meta.canceled = false; + + /* update quota cache */ + if (meta.completeMultipart){ + store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1), + 0, orig_size); + } + else { + store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1), + accounted_size, orig_size); + } + return 0; + +done_cancel: + int ret = index_op->cancel(dpp, meta.remove_objs, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; + } + + meta.canceled = true; + + /* we lost in a race. 
There are a few options: + * - existing object was rewritten (ECANCELED) + * - non existing object was created (EEXIST) + * - object was removed (ENOENT) + * should treat it as a success + */ + if (meta.if_match == NULL && meta.if_nomatch == NULL) { + if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) { + r = 0; + } + } else { + if (meta.if_match != NULL) { + // only overwrite existing object + if (strcmp(meta.if_match, "*") == 0) { + if (r == -ENOENT) { + r = -ERR_PRECONDITION_FAILED; + } else if (r == -ECANCELED) { + r = 0; + } + } + } + + if (meta.if_nomatch != NULL) { + // only create a new object + if (strcmp(meta.if_nomatch, "*") == 0) { + if (r == -EEXIST) { + r = -ERR_PRECONDITION_FAILED; + } else if (r == -ENOENT) { + r = 0; + } + } + } + } + + return r; +} + +int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, + map& attrs, optional_yield y) +{ + RGWBucketInfo& bucket_info = target->get_bucket_info(); + + RGWRados::Bucket bop(target->get_store(), bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj()); + index_op.set_zones_trace(meta.zones_trace); + + bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL); + int r; + if (assume_noent) { + r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y); + if (r == -EEXIST) { + assume_noent = false; + } + } + if (!assume_noent) { + r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y); + } + return r; +} + +class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB +{ + const DoutPrefixProvider *dpp; + CephContext* cct; + rgw_obj obj; + rgw::sal::DataProcessor *filter; + boost::optional& compressor; + bool try_etag_verify; + rgw::putobj::etag_verifier_ptr etag_verifier; + boost::optional buffering; + CompressorRef& plugin; + rgw::sal::ObjectProcessor *processor; + void (*progress_cb)(off_t, void *); + void 
*progress_data; + bufferlist extra_data_bl, manifest_bl; + std::optional compression_info; + uint64_t extra_data_left{0}; + bool need_to_process_attrs{true}; + uint64_t data_len{0}; + map src_attrs; + uint64_t ofs{0}; + uint64_t lofs{0}; /* logical ofs */ + std::function&)> attrs_handler; + +public: + RGWRadosPutObj(const DoutPrefixProvider *dpp, + CephContext* cct, + CompressorRef& plugin, + boost::optional& compressor, + rgw::sal::ObjectProcessor *p, + void (*_progress_cb)(off_t, void *), + void *_progress_data, + std::function&)> _attrs_handler) : + dpp(dpp), + cct(cct), + filter(p), + compressor(compressor), + try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify), + plugin(plugin), + processor(p), + progress_cb(_progress_cb), + progress_data(_progress_data), + attrs_handler(_attrs_handler) {} + + + int process_attrs(void) { + bool encrypted = false; + if (extra_data_bl.length()) { + JSONParser jp; + if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) { + ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + + encrypted = src_attrs.count(RGW_ATTR_CRYPT_MODE); + if (encrypted) { + // we won't have access to the decrypted data for checksumming + try_etag_verify = false; + } + + // if the object is both compressed and encrypted, it was transferred + // in its encrypted+compressed form. 
we need to preserve the original + // RGW_ATTR_COMPRESSION instead of falling back to default compression + // settings + auto iter = src_attrs.find(RGW_ATTR_COMPRESSION); + if (iter != src_attrs.end() && !encrypted) { + const bufferlist bl = std::move(iter->second); + src_attrs.erase(iter); // don't preserve source compression info + + if (try_etag_verify) { + // if we're trying to verify etags, we need to convert compressed + // ranges in the manifest back into logical multipart part offsets + RGWCompressionInfo info; + bool compressed = false; + int r = rgw_compression_info_from_attr(bl, compressed, info); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to decode compression info, " + "disabling etag verification" << dendl; + try_etag_verify = false; + } else if (compressed) { + compression_info = std::move(info); + } + } + } + + /* We need the manifest to recompute the ETag for verification */ + iter = src_attrs.find(RGW_ATTR_MANIFEST); + if (iter != src_attrs.end()) { + manifest_bl = std::move(iter->second); + src_attrs.erase(iter); + + // if the source object was encrypted, preserve the part lengths from + // the original object's manifest in RGW_ATTR_CRYPT_PARTS. 
if the object + // already replicated and has the RGW_ATTR_CRYPT_PARTS attr, preserve it + if (src_attrs.count(RGW_ATTR_CRYPT_MODE) && + !src_attrs.count(RGW_ATTR_CRYPT_PARTS)) { + std::vector parts_len; + int r = RGWGetObj_BlockDecrypt::read_manifest_parts(dpp, manifest_bl, + parts_len); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to read part lengths from the manifest" << dendl; + } else { + // store the encoded part lenghts in RGW_ATTR_CRYPT_PARTS + bufferlist parts_bl; + encode(parts_len, parts_bl); + src_attrs[RGW_ATTR_CRYPT_PARTS] = std::move(parts_bl); + } + } + } + + // filter out olh attributes + iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX); + while (iter != src_attrs.end()) { + if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) { + break; + } + iter = src_attrs.erase(iter); + } + } + + int ret = attrs_handler(src_attrs); + if (ret < 0) { + return ret; + } + + // do not compress if object is encrypted + if (plugin && !encrypted) { + compressor = boost::in_place(cct, plugin, filter); + // add a filter that buffers data so we don't try to compress tiny blocks. 
+ // libcurl reads in 16k at a time, and we need at least 64k to get a good + // compression ratio + constexpr unsigned buffer_size = 512 * 1024; + buffering = boost::in_place(&*compressor, buffer_size); + filter = &*buffering; + } + + if (try_etag_verify) { + ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl, + compression_info, + etag_verifier); + if (ret < 0) { + ldpp_dout(dpp, 4) << "failed to initial etag verifier, " + "disabling etag verification" << dendl; + } else { + filter = etag_verifier.get(); + } + } + + need_to_process_attrs = false; + + return 0; + } + + int handle_data(bufferlist& bl, bool *pause) override { + if (progress_cb) { + progress_cb(data_len, progress_data); + } + if (extra_data_left) { + uint64_t extra_len = bl.length(); + if (extra_len > extra_data_left) + extra_len = extra_data_left; + + bufferlist extra; + bl.splice(0, extra_len, &extra); + extra_data_bl.append(extra); + + extra_data_left -= extra_len; + if (extra_data_left == 0) { + int res = process_attrs(); + if (res < 0) + return res; + } + ofs += extra_len; + if (bl.length() == 0) { + return 0; + } + } + if (need_to_process_attrs) { + /* need to call process_attrs() even if we don't get any attrs, + * need it to call attrs_handler(). 
+ */ + int res = process_attrs(); + if (res < 0) { + return res; + } + } + + ceph_assert(uint64_t(ofs) >= extra_data_len); + + uint64_t size = bl.length(); + ofs += size; + + const uint64_t lofs = data_len; + data_len += size; + + return filter->process(std::move(bl), lofs); + } + + int flush() { + return filter->process({}, data_len); + } + + bufferlist& get_extra_data() { return extra_data_bl; } + + map& get_attrs() { return src_attrs; } + + void set_extra_data_len(uint64_t len) override { + extra_data_left = len; + RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len); + } + + uint64_t get_data_len() { + return data_len; + } + + std::string get_verifier_etag() { + if (etag_verifier) { + etag_verifier->calculate_etag(); + return etag_verifier->get_calculated_etag(); + } else { + return ""; + } + } +}; + +/* + * prepare attrset depending on attrs_mod. + */ +static void set_copy_attrs(map& src_attrs, + map& attrs, + RGWRados::AttrsMod attrs_mod) +{ + switch (attrs_mod) { + case RGWRados::ATTRSMOD_NONE: + attrs = src_attrs; + break; + case RGWRados::ATTRSMOD_REPLACE: + if (!attrs[RGW_ATTR_ETAG].length()) { + attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG]; + } + if (!attrs[RGW_ATTR_TAIL_TAG].length()) { + auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG); + if (ttiter != src_attrs.end()) { + attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG]; + } + } + break; + case RGWRados::ATTRSMOD_MERGE: + for (map::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) { + if (attrs.find(it->first) == attrs.end()) { + attrs[it->first] = it->second; + } + } + break; + } +} + +int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y) +{ + RGWObjectCtx rctx(this->driver); + rgw::sal::Attrs attrset; + uint64_t obj_size; + ceph::real_time mtime; + RGWRados::Object op_target(this, dest_bucket_info, rctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrset; + 
read_op.params.obj_size = &obj_size; + read_op.params.lastmod = &mtime; + + int ret = read_op.prepare(y, dpp); + if (ret < 0) + return ret; + + attrset.erase(RGW_ATTR_ID_TAG); + attrset.erase(RGW_ATTR_TAIL_TAG); + attrset.erase(RGW_ATTR_STORAGE_CLASS); + + return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule, + read_op, obj_size - 1, obj, NULL, mtime, + attrset, 0, real_time(), NULL, dpp, y); +} + +int RGWRados::reindex_obj(const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + const DoutPrefixProvider* dpp, + optional_yield y) +{ + if (bucket_info.versioned()) { + ldpp_dout(dpp, 10) << "WARNING: " << __func__ << + ": cannot process versioned bucket \"" << + bucket_info.bucket.get_key() << "\"" << + dendl; + return -ENOTSUP; + } + + Bucket target(this, bucket_info); + RGWRados::Bucket::UpdateIndex update_idx(&target, obj); + const std::string* no_write_tag = nullptr; + + int ret = update_idx.prepare(dpp, RGWModifyOp::CLS_RGW_OP_ADD, no_write_tag, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": update index prepare for \"" << obj << "\" returned: " << + cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +struct obj_time_weight { + real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + bool high_precision; + + obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {} + + bool compare_low_precision(const obj_time_weight& rhs) { + struct timespec l = ceph::real_clock::to_timespec(mtime); + struct timespec r = ceph::real_clock::to_timespec(rhs.mtime); + l.tv_nsec = 0; + r.tv_nsec = 0; + if (l > r) { + return false; + } + if (l < r) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + + } + + bool operator<(const obj_time_weight& rhs) { + if (!high_precision || 
!rhs.high_precision) { + return compare_low_precision(rhs); + } + if (mtime > rhs.mtime) { + return false; + } + if (mtime < rhs.mtime) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + } + + void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) { + mtime = _mtime; + zone_short_id = _short_id; + pg_ver = _pg_ver; + } + + void init(RGWObjState *state) { + mtime = state->mtime; + zone_short_id = state->zone_short_id; + pg_ver = state->pg_ver; + } +}; + +inline ostream& operator<<(ostream& out, const obj_time_weight &o) { + out << o.mtime; + + if (o.zone_short_id != 0 || o.pg_ver != 0) { + out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]"; + } + + return out; +} + +class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB { + bufferlist extra_data; +public: + RGWGetExtraDataCB() {} + int handle_data(bufferlist& bl, bool *pause) override { + int bl_len = (int)bl.length(); + if (extra_data.length() < extra_data_len) { + off_t max = extra_data_len - extra_data.length(); + if (max > bl_len) { + max = bl_len; + } + bl.splice(0, max, &extra_data); + } + return bl_len; + } + + bufferlist& get_extra_data() { + return extra_data; + } +}; + +int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + const rgw_obj& src_obj, + const RGWBucketInfo *src_bucket_info, + real_time *src_mtime, + uint64_t *psize, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + map *pattrs, + map *pheaders, + string *version_id, + string *ptag, + string *petag) +{ + /* source is in a different zonegroup, copy from there */ + + RGWRESTStreamRWRequest 
*in_stream_req; + string tag; + map src_attrs; + append_rand_alpha(cct, tag, tag, 32); + obj_time_weight set_mtime_weight; + set_mtime_weight.high_precision = high_precision_time; + + RGWRESTConn *conn; + if (source_zone.empty()) { + if (!src_bucket_info || src_bucket_info->zonegroup.empty()) { + /* source is in the master zonegroup */ + conn = svc.zone->get_master_conn(); + } else { + auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); + map::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup); + if (iter == zonegroup_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + } else { + auto& zone_conn_map = svc.zone->get_zone_conn_map(); + auto iter = zone_conn_map.find(source_zone); + if (iter == zone_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + + RGWGetExtraDataCB cb; + map req_headers; + real_time set_mtime; + + const real_time *pmod = mod_ptr; + + obj_time_weight dest_mtime_weight; + + constexpr bool prepend_meta = true; + constexpr bool get_op = true; + constexpr bool rgwx_stat = true; + constexpr bool sync_manifest = true; + constexpr bool skip_decrypt = true; + constexpr bool sync_cloudtiered = true; + int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr, + dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, + prepend_meta, get_op, rgwx_stat, + sync_manifest, skip_decrypt, nullptr, sync_cloudtiered, + true, &cb, &in_stream_req); + if (ret < 0) { + return ret; + } + + ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, + nullptr, pheaders, null_yield); + if (ret < 0) { + return ret; + } + + bufferlist& extra_data_bl = cb.get_extra_data(); + if (extra_data_bl.length()) { + JSONParser jp; + if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) { + 
ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + + src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout + } + + if (src_mtime) { + *src_mtime = set_mtime; + } + + if (petag) { + map::iterator iter = src_attrs.find(RGW_ATTR_ETAG); + if (iter != src_attrs.end()) { + bufferlist& etagbl = iter->second; + *petag = etagbl.to_str(); + while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') { + *petag = petag->substr(0, petag->size() - 1); + } + } + } + + if (pattrs) { + *pattrs = std::move(src_attrs); + } + + return 0; +} + +int RGWFetchObjFilter_Default::filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) +{ + const rgw_placement_rule *ptail_rule = (dest_placement_rule ? 
&(*dest_placement_rule) : nullptr); + if (!ptail_rule) { + auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != obj_attrs.end()) { + dest_rule.storage_class = iter->second.to_str(); + dest_rule.inherit_from(dest_bucket_info.placement_rule); + ptail_rule = &dest_rule; + } else { + ptail_rule = &dest_bucket_info.placement_rule; + } + } + *prule = ptail_rule; + return 0; +} + +int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + const rgw_obj& dest_obj, + const rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo *src_bucket_info, + std::optional dest_placement_rule, + real_time *src_mtime, + real_time *mtime, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + rgw::sal::Attrs& attrs, + RGWObjCategory category, + std::optional olh_epoch, + real_time delete_at, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + RGWFetchObjFilter *filter, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace, + std::optional* bytes_transferred) +{ + /* source is in a different zonegroup, copy from there */ + + RGWRESTStreamRWRequest *in_stream_req; + string tag; + int i; + append_rand_alpha(cct, tag, tag, 32); + obj_time_weight set_mtime_weight; + set_mtime_weight.high_precision = high_precision_time; + int ret; + + rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size); + using namespace rgw::putobj; + AtomicObjectProcessor processor(&aio, this, dest_bucket_info, nullptr, + user_id, obj_ctx, dest_obj, olh_epoch, + tag, dpp, null_yield); + RGWRESTConn *conn; + auto& zone_conn_map = svc.zone->get_zone_conn_map(); + auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); + if (source_zone.empty()) { + if (!src_bucket_info || 
src_bucket_info->zonegroup.empty()) { + /* source is in the master zonegroup */ + conn = svc.zone->get_master_conn(); + } else { + map::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup); + if (iter == zonegroup_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + } else { + auto iter = zone_conn_map.find(source_zone); + if (iter == zone_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + + boost::optional compressor; + CompressorRef plugin; + + RGWFetchObjFilter_Default source_filter; + if (!filter) { + filter = &source_filter; + } + + std::optional override_owner; + + RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data, + [&](map& obj_attrs) { + const rgw_placement_rule *ptail_rule; + + int ret = filter->filter(cct, + src_obj.key, + dest_bucket_info, + dest_placement_rule, + obj_attrs, + &override_owner, + &ptail_rule); + if (ret < 0) { + ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl; + return ret; + } + + processor.set_tail_placement(*ptail_rule); + + const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule); + if (compression_type != "none") { + plugin = Compressor::create(cct, compression_type); + if (!plugin) { + ldpp_dout(dpp, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } + } + + ret = processor.prepare(null_yield); + if (ret < 0) { + return ret; + } + return 0; + }); + + string etag; + real_time set_mtime; + uint64_t accounted_size = 0; + + RGWObjState *dest_state = NULL; + RGWObjManifest *manifest = nullptr; + + const real_time *pmod = mod_ptr; + + obj_time_weight dest_mtime_weight; + rgw_zone_set_entry dst_zone_trace(svc.zone->get_zone().id, 
dest_bucket_info.bucket.get_key()); + + if (copy_if_newer) { + /* need to get mtime for destination */ + ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, dest_obj, &dest_state, &manifest, false, null_yield); + if (ret < 0) + goto set_err_state; + + if (!real_clock::is_zero(dest_state->mtime)) { + dest_mtime_weight.init(dest_state); + pmod = &dest_mtime_weight.mtime; + } + } + + static constexpr bool prepend_meta = true; + static constexpr bool get_op = true; + static constexpr bool rgwx_stat = false; + static constexpr bool sync_manifest = true; + static constexpr bool skip_decrypt = true; + static constexpr bool sync_cloudtiered = true; + ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr, + dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, + prepend_meta, get_op, rgwx_stat, + sync_manifest, skip_decrypt, &dst_zone_trace, + sync_cloudtiered, true, + &cb, &in_stream_req); + if (ret < 0) { + goto set_err_state; + } + + ret = conn->complete_request(in_stream_req, &etag, &set_mtime, + &accounted_size, nullptr, nullptr, null_yield); + if (ret < 0) { + goto set_err_state; + } + ret = cb.flush(); + if (ret < 0) { + goto set_err_state; + } + if (cb.get_data_len() != accounted_size) { + ret = -EIO; + ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected " + << accounted_size << " bytes but received " << cb.get_data_len() << dendl; + goto set_err_state; + } + + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = accounted_size; + cs_info.compressor_message = compressor->get_compressor_message(); + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp; + } else if (auto c = cb.get_attrs().find(RGW_ATTR_COMPRESSION); + c != cb.get_attrs().end()) { + // if the object was transferred in its compressed+encrypted form, use its + // original 
uncompressed size + try { + RGWCompressionInfo info; + auto p = c->second.cbegin(); + decode(info, p); + accounted_size = info.orig_size; + } catch (const buffer::error&) { + ldpp_dout(dpp, 0) << "ERROR: could not decode compression attr for " + "replicated object " << dest_obj << dendl; + // decode error isn't fatal, but we might put the wrong size in the index + } + } + + if (override_owner) { + processor.set_owner(*override_owner); + + auto& obj_attrs = cb.get_attrs(); + + RGWUserInfo owner_info; + if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) { + ldpp_dout(dpp, 10) << "owner info does not exist" << dendl; + return -EINVAL; + } + + RGWAccessControlPolicy acl; + + auto aiter = obj_attrs.find(RGW_ATTR_ACL); + if (aiter == obj_attrs.end()) { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl; + acl.create_default(owner_info.user_id, owner_info.display_name); + } else { + auto iter = aiter->second.cbegin(); + try { + acl.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + } + + ACLOwner new_owner; + new_owner.set_id(*override_owner); + new_owner.set_name(owner_info.display_name); + + acl.set_owner(new_owner); + + bufferlist bl; + acl.encode(bl); + obj_attrs[RGW_ATTR_ACL] = std::move(bl); + } + + if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */ + cb.get_attrs().erase(RGW_ATTR_DELETE_AT); + } else { + map::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT); + if (iter != cb.get_attrs().end()) { + try { + decode(delete_at, iter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; + } + } + } + + if (src_mtime) { + *src_mtime = set_mtime; + } + + if (petag) { + const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG); 
+ if (iter != cb.get_attrs().end()) { + *petag = iter->second.to_str(); + } + } + + //erase the append attr + cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM); + + { // add x-amz-replication-status=REPLICA + auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS]; + bl.clear(); // overwrite source's status + bl.append("REPLICA"); + } + { // update replication trace + std::vector trace; + if (auto i = cb.get_attrs().find(RGW_ATTR_OBJ_REPLICATION_TRACE); + i != cb.get_attrs().end()) { + try { + decode(trace, i->second); + } catch (const buffer::error&) {} + } + // add the source entry to the end + trace.push_back(source_trace_entry); + + bufferlist bl; + encode(trace, bl); + cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_TRACE] = std::move(bl); + } + + if (source_zone.empty()) { + set_copy_attrs(cb.get_attrs(), attrs, attrs_mod); + } else { + attrs = cb.get_attrs(); + } + + if (copy_if_newer) { + uint64_t pg_ver = 0; + auto i = attrs.find(RGW_ATTR_PG_VER); + if (i != attrs.end() && i->second.length() > 0) { + auto iter = i->second.cbegin(); + try { + decode(pg_ver, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl; + /* non critical error */ + } + } + set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver); + } + + /* Perform ETag verification is we have computed the object's MD5 sum at our end */ + if (const auto& verifier_etag = cb.get_verifier_etag(); + !verifier_etag.empty()) { + string trimmed_etag = etag; + + /* Remove the leading and trailing double quotes from etag */ + trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'), + trimmed_etag.end()); + + if (verifier_etag != trimmed_etag) { + ret = -EIO; + ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. 
Expected etag:" + << trimmed_etag << " Computed etag:" << verifier_etag << dendl; + goto set_err_state; + } + } + +#define MAX_COMPLETE_RETRY 100 + for (i = 0; i < MAX_COMPLETE_RETRY; i++) { + bool canceled = false; + ret = processor.complete(accounted_size, etag, mtime, set_mtime, + attrs, delete_at, nullptr, nullptr, nullptr, + zones_trace, &canceled, null_yield); + if (ret < 0) { + goto set_err_state; + } + + if (copy_if_newer && canceled) { + ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl; + obj_ctx.invalidate(dest_obj); /* object was overwritten */ + ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, dest_obj, &dest_state, &manifest, false, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; + goto set_err_state; + } + dest_mtime_weight.init(dest_state); + dest_mtime_weight.high_precision = high_precision_time; + if (!dest_state->exists || + dest_mtime_weight < set_mtime_weight) { + ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + continue; + } else { + ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + } + } + break; + } + + if (i == MAX_COMPLETE_RETRY) { + ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" 
<< dendl; + ret = -EIO; + goto set_err_state; + } + + if (bytes_transferred) { + *bytes_transferred = cb.get_data_len(); + } + return 0; +set_err_state: + if (copy_if_newer && ret == -ERR_NOT_MODIFIED) { + // we may have already fetched during sync of OP_ADD, but were waiting + // for OP_LINK_OLH to call set_olh() with a real olh_epoch + if (olh_epoch && *olh_epoch > 0) { + constexpr bool log_data_change = true; + ret = set_olh(dpp, obj_ctx, dest_bucket_info, dest_obj, false, nullptr, + *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change); + } else { + // we already have the latest copy + ret = 0; + } + } + return ret; +} + + +int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp, + RGWObjState *astate, + map& src_attrs, + RGWRados::Object::Read& read_op, + const rgw_user& user_id, + const rgw_obj& dest_obj, + real_time *mtime) +{ + string etag; + + RGWRESTStreamS3PutObj *out_stream_req; + + auto rest_master_conn = svc.zone->get_master_conn(); + + int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req); + if (ret < 0) { + return ret; + } + + out_stream_req->set_send_length(astate->size); + + ret = RGWHTTP::send(out_stream_req); + if (ret < 0) { + delete out_stream_req; + return ret; + } + + ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield); + if (ret < 0) { + delete out_stream_req; + return ret; + } + + ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield); + if (ret < 0) + return ret; + + return 0; +} + +/** + * Copy an object. 
+ * dest_obj: the object to copy into + * src_obj: the object to copy from + * attrs: usage depends on attrs_mod parameter + * attrs_mod: the modification mode of the attrs, may have the following values: + * ATTRSMOD_NONE - the attributes of the source object will be + * copied without modifications, attrs parameter is ignored; + * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs + * parameter, source object attributes are not copied; + * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes + * are overwritten by values contained in attrs parameter. + * err: stores any errors resulting from the get of the original object + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + const rgw_obj& dest_obj, + const rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo& src_bucket_info, + const rgw_placement_rule& dest_placement, + real_time *src_mtime, + real_time *mtime, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + rgw::sal::Attrs& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + real_time delete_at, + string *version_id, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + optional_yield y) +{ + int ret; + uint64_t obj_size; + rgw_obj shadow_obj = dest_obj; + string shadow_oid; + + bool remote_src; + bool remote_dest; + + append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32); + shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns); + + auto& zonegroup = svc.zone->get_zonegroup(); + + remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup); + remote_src = !zonegroup.equals(src_bucket_info.zonegroup); + + if (remote_src && remote_dest) { + ldpp_dout(dpp, 0) 
<< "ERROR: can't copy object when both src and dest buckets are remote" << dendl; + return -EINVAL; + } + + ldpp_dout(dpp, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl; + + if (remote_src || !source_zone.empty()) { + rgw_zone_set_entry source_trace_entry{source_zone.id, std::nullopt}; + return fetch_remote_obj(obj_ctx, user_id, info, source_zone, + dest_obj, src_obj, dest_bucket_info, &src_bucket_info, + dest_placement, src_mtime, mtime, mod_ptr, + unmod_ptr, high_precision_time, + if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category, + olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp, + nullptr /* filter */, source_trace_entry); + } + + map src_attrs; + RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj); + RGWRados::Object::Read read_op(&src_op_target); + + read_op.conds.mod_ptr = mod_ptr; + read_op.conds.unmod_ptr = unmod_ptr; + read_op.conds.high_precision_time = high_precision_time; + read_op.conds.if_match = if_match; + read_op.conds.if_nomatch = if_nomatch; + read_op.params.attrs = &src_attrs; + read_op.params.lastmod = src_mtime; + read_op.params.obj_size = &obj_size; + + ret = read_op.prepare(y, dpp); + if (ret < 0) { + return ret; + } + if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) { + // Current implementation does not follow S3 spec and even + // may result in data corruption silently when copying + // multipart objects acorss pools. So reject COPY operations + //on encrypted objects before it is fully functional. + ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj + << " has not been implemented." 
<< dendl; + return -ERR_NOT_IMPLEMENTED; + } + + src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL]; + src_attrs.erase(RGW_ATTR_DELETE_AT); + + src_attrs.erase(RGW_ATTR_OBJECT_RETENTION); + src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD); + map::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (rt != attrs.end()) + src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second; + map::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (lh != attrs.end()) + src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second; + + set_copy_attrs(src_attrs, attrs, attrs_mod); + attrs.erase(RGW_ATTR_ID_TAG); + attrs.erase(RGW_ATTR_PG_VER); + attrs.erase(RGW_ATTR_SOURCE_ZONE); + map::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION); + if (cmp != src_attrs.end()) + attrs[RGW_ATTR_COMPRESSION] = cmp->second; + + RGWObjManifest manifest; + RGWObjState *astate = NULL; + RGWObjManifest *amanifest = nullptr; + + ret = get_obj_state(dpp, &obj_ctx, src_bucket_info, src_obj, &astate, &amanifest, y); + if (ret < 0) { + return ret; + } + + vector ref_objs; + + if (remote_dest) { + /* dest is in a different zonegroup, copy it there */ + return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime); + } + uint64_t max_chunk_size; + + ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl; + return ret; + } + + rgw_pool src_pool; + rgw_pool dest_pool; + + const rgw_placement_rule *src_rule{nullptr}; + + if (amanifest) { + src_rule = &amanifest->get_tail_placement().placement_rule; + ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl; + } + + if (!src_rule || src_rule->empty()) { + src_rule = &src_bucket_info.placement_rule; + } + + if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) { + ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl; + return -EIO; + 
} + + if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) { + ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl; + return -EIO; + } + + ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool + << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl; + + bool copy_data = (!amanifest) || + (*src_rule != dest_placement) || + (src_pool != dest_pool); + + bool copy_first = false; + if (amanifest) { + if (!amanifest->has_tail()) { + copy_data = true; + } else { + uint64_t head_size = amanifest->get_head_size(); + + if (head_size > 0) { + if (head_size > max_chunk_size) { + copy_data = true; + } else { + copy_first = true; + } + } + } + } + + if (petag) { + const auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + *petag = iter->second.to_str(); + } + } + + if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */ + attrs.erase(RGW_ATTR_TAIL_TAG); + return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj, + mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y); + } + + /* This has been in for 2 years, so we can safely assume amanifest is not NULL */ + RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp); + + if (copy_first) { // we need to copy first chunk, not increase refcount + ++miter; + } + + bufferlist first_chunk; + + const bool copy_itself = (dest_obj == src_obj); + RGWObjManifest *pmanifest; + ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl; + + RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj); + RGWRados::Object::Write write_op(&dest_op_target); + + string tag; + + if (ptag) { + tag = *ptag; + } + + if (tag.empty()) { + append_rand_alpha(cct, tag, tag, 32); + } + + std::unique_ptr aio; + rgw::AioResultList all_results; + if (!copy_itself) { + aio = 
rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y); + attrs.erase(RGW_ATTR_TAIL_TAG); + manifest = *amanifest; + const rgw_bucket_placement& tail_placement = manifest.get_tail_placement(); + if (tail_placement.bucket.name.empty()) { + manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket); + } + string ref_tag; + for (; miter != amanifest->obj_end(dpp); ++miter) { + ObjectWriteOperation op; + ref_tag = tag + '\0'; + cls_refcount_get(op, ref_tag, true); + + auto obj = svc.rados->obj(miter.get_location().get_raw_obj(this)); + ret = obj.open(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl; + goto done_ret; + } + + static constexpr uint64_t cost = 1; // 1 throttle unit per request + static constexpr uint64_t id = 0; // ids unused + rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id); + ret = rgw::check_for_errors(completed); + all_results.splice(all_results.end(), completed); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl; + goto done_ret; + } + } + + rgw::AioResultList completed = aio->drain(); + ret = rgw::check_for_errors(completed); + all_results.splice(all_results.end(), completed); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <keep_tail = true; + } + + if (copy_first) { + ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp); + if (ret < 0) { + goto done_ret; + } + + pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length()); + } else { + pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0); + } + + write_op.meta.data = &first_chunk; + write_op.meta.manifest = pmanifest; + write_op.meta.ptag = &tag; + write_op.meta.owner = dest_bucket_info.owner; + write_op.meta.mtime = mtime; + write_op.meta.flags = PUT_OBJ_CREATE; + write_op.meta.category = category; + 
write_op.meta.olh_epoch = olh_epoch; + write_op.meta.delete_at = delete_at; + write_op.meta.modify_tail = !copy_itself; + + ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y); + if (ret < 0) { + goto done_ret; + } + + return 0; + +done_ret: + if (!copy_itself) { + + /* wait all pending op done */ + rgw::AioResultList completed = aio->drain(); + all_results.splice(all_results.end(), completed); + + /* rollback reference */ + string ref_tag = tag + '\0'; + int ret2 = 0; + for (auto& r : all_results) { + if (r.result < 0) { + continue; // skip errors + } + ObjectWriteOperation op; + cls_refcount_put(op, ref_tag, true); + + static constexpr uint64_t cost = 1; // 1 throttle unit per request + static constexpr uint64_t id = 0; // ids unused + rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id); + ret2 = rgw::check_for_errors(completed); + if (ret2 < 0) { + ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl; + } + } + completed = aio->drain(); + ret2 = rgw::check_for_errors(completed); + if (ret2 < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <_conf->rgw_put_obj_min_window_size); + using namespace rgw::putobj; + // do not change the null_yield in the initialization of this AtomicObjectProcessor + // it causes crashes in the ragweed tests + AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement, + dest_bucket_info.owner, obj_ctx, + dest_obj, olh_epoch, tag, + dpp, null_yield); + int ret = processor.prepare(y); + if (ret < 0) + return ret; + + off_t ofs = 0; + + do { + bufferlist bl; + ret = read_op.read(ofs, end, bl, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl; + return ret; + } + + uint64_t read_len = ret; + ret = processor.process(std::move(bl), ofs); + if (ret < 0) { + return ret; + } + + ofs += read_len; + } while (ofs 
<= end); + + // flush + ret = processor.process({}, ofs); + if (ret < 0) { + return ret; + } + + string etag; + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + etag = bl.to_str(); + if (petag) { + *petag = etag; + } + } + + uint64_t accounted_size; + { + bool compressed{false}; + RGWCompressionInfo cs_info; + ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl; + return ret; + } + // pass original size if compressed + accounted_size = compressed ? cs_info.orig_size : ofs; + } + + return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, + nullptr, nullptr, nullptr, nullptr, nullptr, y); +} + +int RGWRados::transition_obj(RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + const rgw_obj& obj, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider *dpp, + optional_yield y) +{ + rgw::sal::Attrs attrs; + real_time read_mtime; + uint64_t obj_size; + + obj_ctx.set_atomic(obj); + RGWRados::Object op_target(this, bucket_info, obj_ctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + read_op.params.lastmod = &read_mtime; + read_op.params.obj_size = &obj_size; + + int ret = read_op.prepare(y, dpp); + if (ret < 0) { + return ret; + } + + if (read_mtime != mtime) { + /* raced */ + ldpp_dout(dpp, 0) << __func__ << " ERROR: failed to transition obj(" << obj.key << ") read_mtime = " << read_mtime << " doesn't match mtime = " << mtime << dendl; + return -ECANCELED; + } + + attrs.erase(RGW_ATTR_ID_TAG); + attrs.erase(RGW_ATTR_TAIL_TAG); + + ret = copy_obj_data(obj_ctx, + bucket_info, + placement_rule, + read_op, + obj_size - 1, + obj, + nullptr /* pmtime */, + mtime, + attrs, + olh_epoch, + real_time(), + nullptr /* petag */, + dpp, + y); + if (ret < 0) { + return ret; + } + + return 0; +} + 
+int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y) +{ + constexpr uint NUM_ENTRIES = 1000u; + + rgw_obj_index_key marker; + string prefix; + bool is_truncated; + + do { + std::vector ent_list; + ent_list.reserve(NUM_ENTRIES); + + int r = cls_bucket_list_unordered(dpp, + bucket_info, + bucket_info.layout.current_index, + RGW_NO_SHARD, + marker, + prefix, + NUM_ENTRIES, + true, + ent_list, + &is_truncated, + &marker, + y); + if (r < 0) { + return r; + } + + string ns; + for (auto const& dirent : ent_list) { + rgw_obj_key obj; + + if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) { + return -ENOTEMPTY; + } + } + } while (is_truncated); + + return 0; +} + +/** + * Delete a bucket. + * bucket: the name of the bucket to delete + * Returns 0 on success, -ERR# otherwise. + */ +int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty) +{ + const rgw_bucket& bucket = bucket_info.bucket; + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + if (check_empty) { + r = check_bucket_empty(dpp, bucket_info, y); + if (r < 0) { + return r; + } + } + + bool remove_ep = true; + + if (objv_tracker.read_version.empty()) { + RGWBucketEntryPoint ep; + r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket, + &ep, + null_yield, + dpp, + RGWBucketCtl::Bucket::GetParams() + .set_objv_tracker(&objv_tracker)); + if (r < 0 || + (!bucket_info.bucket.bucket_id.empty() && + ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) { + if (r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl; + /* we have no idea what caused the error, will not try to remove it */ + 
} + /* + * either failed to read bucket entrypoint, or it points to a different bucket instance than + * requested + */ + remove_ep = false; + } + } + + if (remove_ep) { + r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp, + RGWBucketCtl::Bucket::RemoveParams() + .set_objv_tracker(&objv_tracker)); + if (r < 0) + return r; + } + + /* if the bucket is not synced we can remove the meta file */ + if (!svc.zone->is_syncing_bucket_meta(bucket)) { + RGWObjVersionTracker objv_tracker; + r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp); + if (r < 0) { + return r; + } + + /* remove bucket index objects asynchronously by best effort */ + (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(), + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); + } + + return 0; +} + +int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp) +{ + RGWBucketInfo info; + map attrs; + int r; + + if (bucket.bucket_id.empty()) { + r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs); + } else { + r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp); + } + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; + return r; + } + + info.owner = owner.get_id(); + + r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; + return r; + } + + return 0; +} + + +int RGWRados::set_buckets_enabled(vector& buckets, bool enabled, const DoutPrefixProvider *dpp) +{ + int ret = 0; + + vector::iterator iter; + + for (iter = buckets.begin(); iter != buckets.end(); ++iter) { + rgw_bucket& bucket = *iter; + if (enabled) { + ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl; + } else { + ldpp_dout(dpp, 20) << 
"disabling bucket name=" << bucket.name << dendl; + } + + RGWBucketInfo info; + map attrs; + int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + if (enabled) { + info.flags &= ~BUCKET_SUSPENDED; + } else { + info.flags |= BUCKET_SUSPENDED; + } + + r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + } + return ret; +} + +int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended) +{ + RGWBucketInfo bucket_info; + int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp); + if (ret < 0) { + return ret; + } + + *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0); + return 0; +} + +int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp) +{ + if ((!manifest)|| state->keep_tail) + return 0; + + cls_rgw_obj_chain chain; + store->update_gc_chain(dpp, obj, *manifest, &chain); + + if (chain.empty()) { + return 0; + } + + string tag = (state->tail_tag.length() > 0 ? 
state->tail_tag.to_str() : state->obj_tag.to_str()); + if (store->gc == nullptr) { + ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl; + //Delete objects inline just in case gc hasn't been initialised, prevents crashes + store->delete_objs_inline(dpp, chain, tag); + } else { + auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously + if (ret < 0 && leftover_chain) { + //Delete objects inline if send chain to gc fails + store->delete_objs_inline(dpp, *leftover_chain, tag); + } + } + return 0; +} + +void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain) +{ + RGWObjManifest::obj_iterator iter; + rgw_raw_obj raw_head; + obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head); + for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) { + const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this); + if (mobj == raw_head) + continue; + cls_rgw_obj_key key(mobj.oid); + chain->push_obj(mobj.pool.to_str(), key, mobj.loc); + } +} + +std::tuple> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag) +{ + if (chain.empty()) { + return {0, std::nullopt}; + } + + return gc->send_split_chain(chain, tag); +} + +void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag) +{ + string last_pool; + std::unique_ptr ctx(new IoCtx); + int ret = 0; + for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) { + cls_rgw_obj& obj = *liter; + if (obj.pool != last_pool) { + ctx.reset(new IoCtx); + ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx); + if (ret < 0) { + last_pool = ""; + ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" << + obj.pool << dendl; + continue; + } + last_pool = obj.pool; + } + ctx->locator_set_key(obj.loc); + const string& oid = obj.key.name; /* just stored raw oid there */ + 
ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool << + ":" << obj.key.name << dendl; + ObjectWriteOperation op; + cls_refcount_put(op, tag, true); + ret = ctx->operate(oid, &op); + if (ret < 0) { + ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl; + } + } +} + +static void accumulate_raw_stats(const rgw_bucket_dir_header& header, + map& stats) +{ + for (const auto& pair : header.stats) { + const RGWObjCategory category = static_cast(pair.first); + const rgw_bucket_category_stats& header_stats = pair.second; + + RGWStorageStats& s = stats[category]; + + s.category = category; + s.size += header_stats.total_size; + s.size_rounded += header_stats.total_size_rounded; + s.size_utilized += header_stats.actual_size; + s.num_objects += header_stats.num_entries; + } +} + +int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + map *existing_stats, + map *calculated_stats) +{ + RGWSI_RADOS::Pool index_pool; + + // key - bucket index object id + // value - bucket index check OP returned result with the given bucket index object (shard) + map oids; + + int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr); + if (ret < 0) { + return ret; + } + + // declare and pre-populate + map bucket_objs_ret; + for (auto& iter : oids) { + bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret()); + } + + ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)(); + if (ret < 0) { + return ret; + } + + // aggregate results (from different shards if there are any) + for (const auto& iter : bucket_objs_ret) { + accumulate_raw_stats(iter.second.existing_header, *existing_stats); + accumulate_raw_stats(iter.second.calculated_header, *calculated_stats); + } + + return 0; +} + +int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info) 
+{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) { + return r; + } + + return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); +} + +static int resync_encrypted_multipart(const DoutPrefixProvider* dpp, + optional_yield y, RGWRados* store, + RGWBucketInfo& bucket_info, + RGWObjectCtx& obj_ctx, + const RGWObjState& state) +{ + // only overwrite if the tag hasn't changed + obj_ctx.set_atomic(state.obj); + + // make a tiny adjustment to the existing mtime so that fetch_remote_obj() + // won't return ERR_NOT_MODIFIED when resyncing the object + const auto set_mtime = state.mtime + std::chrono::nanoseconds(1); + + // use set_attrs() to update the mtime in a bucket index transaction so the + // change is recorded in bilog and datalog entries. this will cause any peer + // zones to resync the object + auto add_attrs = std::map{ + { RGW_ATTR_PREFIX "resync-encrypted-multipart", bufferlist{} }, + }; + + return store->set_attrs(dpp, &obj_ctx, bucket_info, state.obj, + add_attrs, nullptr, y, set_mtime); +} + +static void try_resync_encrypted_multipart(const DoutPrefixProvider* dpp, + optional_yield y, RGWRados* store, + RGWBucketInfo& bucket_info, + RGWObjectCtx& obj_ctx, + const rgw_bucket_dir_entry& dirent, + Formatter* f) +{ + const auto obj = rgw_obj{bucket_info.bucket, dirent.key}; + + RGWObjState* astate = nullptr; + RGWObjManifest* manifest = nullptr; + constexpr bool follow_olh = false; // dirent will have version ids + int ret = store->get_obj_state(dpp, &obj_ctx, bucket_info, obj, + &astate, &manifest, follow_olh, y); + if (ret < 0) { + ldpp_dout(dpp, 4) << obj << " does not exist" << dendl; + return; + } + + // check whether the object is encrypted + if (auto i = astate->attrset.find(RGW_ATTR_CRYPT_MODE); + i == astate->attrset.end()) { + ldpp_dout(dpp, 
4) << obj << " is not encrypted" << dendl; + return; + } + + // check whether the object is multipart + if (!manifest) { + ldpp_dout(dpp, 4) << obj << " has no manifest so is not multipart" << dendl; + return; + } + const RGWObjManifest::obj_iterator end = manifest->obj_end(dpp); + if (end.get_cur_part_id() == 0) { + ldpp_dout(dpp, 4) << obj << " manifest is not multipart" << dendl; + return; + } + + ret = resync_encrypted_multipart(dpp, y, store, bucket_info, + obj_ctx, *astate); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update " << obj + << ": " << cpp_strerror(ret) << dendl; + return; + } + + f->open_object_section("object"); + encode_json("name", obj.key.name, f); + if (!obj.key.instance.empty()) { + encode_json("version", obj.key.instance, f); + } + encode_json("mtime", astate->mtime, f); + f->close_section(); // "object" +} + +int RGWRados::bucket_resync_encrypted_multipart(const DoutPrefixProvider* dpp, + optional_yield y, + rgw::sal::RadosStore* driver, + RGWBucketInfo& bucket_info, + const std::string& marker, + RGWFormatterFlusher& flusher) +{ + RGWRados::Bucket target(this, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.marker.name = marker; + list_op.params.enforce_ns = true; // only empty ns + list_op.params.list_versions = true; + list_op.params.allow_unordered = true; + + /* List bucket entries in chunks. 
*/ + static constexpr int MAX_LIST_OBJS = 100; + std::vector entries; + entries.reserve(MAX_LIST_OBJS); + + int processed = 0; + bool is_truncated = true; + + Formatter* f = flusher.get_formatter(); + f->open_array_section("progress"); + + do { + int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr, + &is_truncated, y); + if (ret < 0) { + return ret; + } + + f->open_object_section("batch"); + f->open_array_section("modified"); + + for (const auto& dirent : entries) { + RGWObjectCtx obj_ctx{driver}; + try_resync_encrypted_multipart(dpp, y, this, bucket_info, + obj_ctx, dirent, f); + } + + f->close_section(); // "modified" + + processed += entries.size(); + encode_json("total processed", processed, f); + encode_json("marker", list_op.get_next_marker().name, f); + f->close_section(); // "batch" + + flusher.flush(); // flush after each 'chunk' + } while (is_truncated); + + f->close_section(); // "progress" array + return 0; +} + +int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": unable to open bucket index, r=" << r << " (" << + cpp_strerror(-r) << ")" << dendl; + return r; + } + + r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": unable to issue set bucket resharding, r=" << r << " (" << + cpp_strerror(-r) << ")" << dendl; + } + return r; +} + +int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y) +{ + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + if (!rctx) + 
return 0; + + RGWObjState *state = NULL; + RGWObjManifest *manifest = nullptr; + + int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y); + if (r < 0) + return r; + + if (!state->is_atomic) { + ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl; + return -EINVAL; + } + + string tag; + + if (state->tail_tag.length() > 0) { + tag = state->tail_tag.c_str(); + } else if (state->obj_tag.length() > 0) { + tag = state->obj_tag.c_str(); + } else { + ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl; + return -EINVAL; + } + + ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl; + + cls_rgw_obj_chain chain; + update_gc_chain(dpp, state->obj, *manifest, &chain); + return gc->async_defer_chain(tag, chain); +} + +void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op) +{ + list prefixes; + prefixes.push_back(RGW_ATTR_OLH_PREFIX); + cls_rgw_remove_obj(op, prefixes); +} + +void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist) +{ + cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist); +} + +void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type) +{ + cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type); +} + +struct tombstone_entry { + ceph::real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + + tombstone_entry() = default; + explicit tombstone_entry(const RGWObjState& state) + : mtime(state.mtime), zone_short_id(state.zone_short_id), + pg_ver(state.pg_ver) {} +}; + +/** + * Delete an object. + * bucket: name of the bucket storing the object + * obj: name of the object to delete + * Returns: 0 on success, -ERR# otherwise. 
+ */ +int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWRados *store = target->get_store(); + const rgw_obj& src_obj = target->get_obj(); + const string& instance = src_obj.key.instance; + rgw_obj obj = target->get_obj(); + + if (instance == "null") { + obj.key.instance.clear(); + } + + bool explicit_marker_version = (!params.marker_version_id.empty()); + + if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) { + if (instance.empty() || explicit_marker_version) { + rgw_obj marker = obj; + marker.key.instance.clear(); + + if (!params.marker_version_id.empty()) { + if (params.marker_version_id != "null") { + marker.key.set_instance(params.marker_version_id); + } + } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) { + store->gen_rand_obj_instance_name(&marker); + } + + result.version_id = marker.key.instance; + if (result.version_id.empty()) + result.version_id = "null"; + result.delete_marker = true; + + struct rgw_bucket_dir_entry_meta meta; + + meta.owner = params.obj_owner.get_id().to_str(); + meta.owner_display_name = params.obj_owner.get_display_name(); + + if (real_clock::is_zero(params.mtime)) { + meta.mtime = real_clock::now(); + } else { + meta.mtime = params.mtime; + } + + int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace); + if (r < 0) { + return r; + } + } else { + rgw_bucket_dir_entry dirent; + + int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent); + if (r < 0) { + return r; + } + result.delete_marker = dirent.is_delete_marker(); + r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace); + if (r < 0) { + return r; + } + result.version_id = instance; + } + + BucketShard *bs = nullptr; + int r = target->get_bucket_shard(&bs, dpp); + if (r 
< 0) { + ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl; + return r; + } + + add_datalog_entry(dpp, store->svc.datalog_rados, + target->get_bucket_info(), bs->shard_id, y); + + return 0; + } + + rgw_rados_ref ref; + int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref); + if (r < 0) { + return r; + } + + RGWObjState *state; + RGWObjManifest *manifest = nullptr; + r = target->get_state(dpp, &state, &manifest, false, y); + if (r < 0) + return r; + + ObjectWriteOperation op; + + if (!real_clock::is_zero(params.unmod_since)) { + struct timespec ctime = ceph::real_clock::to_timespec(state->mtime); + struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since); + if (!params.high_precision_time) { + ctime.tv_nsec = 0; + unmod.tv_nsec = 0; + } + + ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl; + if (ctime > unmod) { + return -ERR_PRECONDITION_FAILED; + } + + /* only delete object if mtime is less than or equal to params.unmod_since */ + store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE); + } + uint64_t obj_accounted_size = state->accounted_size; + + if(params.abortmp) { + obj_accounted_size = params.parts_accounted_size; + } + + if (!real_clock::is_zero(params.expiration_time)) { + bufferlist bl; + real_time delete_at; + + if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) { + try { + auto iter = bl.cbegin(); + decode(delete_at, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; + return -EIO; + } + + if (params.expiration_time != delete_at) { + return -ERR_PRECONDITION_FAILED; + } + } else { + return -ERR_PRECONDITION_FAILED; + } + } + + if (!state->exists) { + target->invalidate_state(); + return -ENOENT; + } + + r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y); + if (r < 0) + return r; + + 
RGWBucketInfo& bucket_info = target->get_bucket_info(); + + RGWRados::Bucket bop(store, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + index_op.set_zones_trace(params.zones_trace); + index_op.set_bilog_flags(params.bilog_flags); + + r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y); + if (r < 0) + return r; + + store->remove_rgw_head_obj(op); + + auto& ioctx = ref.pool.ioctx(); + r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y); + + /* raced with another operation, object state is indeterminate */ + const bool need_invalidate = (r == -ECANCELED); + + int64_t poolid = ioctx.get_id(); + if (r >= 0) { + tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache(); + if (obj_tombstone_cache) { + tombstone_entry entry{*state}; + obj_tombstone_cache->add(obj, entry); + } + r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs, y); + + int ret = target->complete_atomic_modification(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl; + } + /* other than that, no need to propagate error */ + } else { + int ret = index_op.cancel(dpp, params.remove_objs, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; + } + } + + if (need_invalidate) { + target->invalidate_state(); + } + + if (r < 0) + return r; + + /* update quota cache */ + store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size); + + return 0; +} + +int RGWRados::delete_obj(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + int versioning_status, // versioning flags defined in enum RGWBucketFlags + uint16_t bilog_flags, + const real_time& expiration_time, + rgw_zone_set *zones_trace) +{ + RGWRados::Object del_target(this, bucket_info, obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + 
del_op.params.bucket_owner = bucket_info.owner; + del_op.params.versioning_status = versioning_status; + del_op.params.bilog_flags = bilog_flags; + del_op.params.expiration_time = expiration_time; + del_op.params.zones_trace = zones_trace; + + return del_op.delete_obj(null_yield, dpp); +} + +int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + + op.remove(); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, + const DoutPrefixProvider *dpp, optional_yield y) +{ + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + + RGWBucketInfo bucket_info; + int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl; + return ret; + } + + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, nullptr, y); +} + +static void generate_fake_tag(const DoutPrefixProvider *dpp, RGWRados* store, map& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl) +{ + string tag; + + RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp); + if (mi != manifest.obj_end(dpp)) { + if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part + ++mi; + tag = mi.get_location().get_raw_obj(store).oid; + tag.append("_"); + } + + unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + 
hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length()); + + map::iterator iter = attrset.find(RGW_ATTR_ETAG); + if (iter != attrset.end()) { + bufferlist& bl = iter->second; + hash.Update((const unsigned char *)bl.c_str(), bl.length()); + } + + hash.Final(md5); + buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str); + tag.append(md5_str); + + ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl; + + tag_bl.append(tag.c_str(), tag.size() + 1); +} + +static bool is_olh(map& attrs) +{ + map::iterator iter = attrs.find(RGW_ATTR_OLH_VER); + return (iter != attrs.end()); +} + +static bool has_olh_tag(map& attrs) +{ + map::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG); + return (iter != attrs.end()); +} + +int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& + obj_ctx, RGWBucketInfo& bucket_info, + const rgw_obj& obj, RGWObjState *olh_state, + RGWObjState **target_state, + RGWObjManifest **target_manifest, optional_yield y) +{ + ceph_assert(olh_state->is_olh); + + rgw_obj target; + int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */ + if (r < 0) { + return r; + } + + r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state, + target_manifest, false, y); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, + RGWBucketInfo& bucket_info, const rgw_obj& obj, + RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent) +{ + if (obj.empty()) { + return -EINVAL; + } + + bool need_follow_olh = follow_olh && obj.key.instance.empty(); + *manifest = nullptr; + + RGWObjStateManifest *sm = rctx->get_state(obj); + RGWObjState *s = &(sm->state); + ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << 
s->prefetch_data << dendl; + *state = s; + if (sm->manifest) { + *manifest = &(*sm->manifest); + } + if (s->has_attrs) { + if (s->is_olh && need_follow_olh) { + return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); + } + return 0; + } + + s->obj = obj; + + rgw_raw_obj raw_obj; + obj_to_raw(bucket_info.placement_rule, obj, &raw_obj); + + int r = -ENOENT; + + if (!assume_noent) { + r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y); + } + + if (r == -ENOENT) { + s->exists = false; + s->has_attrs = true; + tombstone_entry entry; + if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) { + s->mtime = entry.mtime; + s->zone_short_id = entry.zone_short_id; + s->pg_ver = entry.pg_ver; + ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj + << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl; + } else { + s->mtime = real_time(); + } + return 0; + } + if (r < 0) + return r; + + s->exists = true; + s->has_attrs = true; + s->accounted_size = s->size; + + auto iter = s->attrset.find(RGW_ATTR_ETAG); + if (iter != s->attrset.end()) { + /* get rid of extra null character at the end of the etag, as we used to store it like that */ + bufferlist& bletag = iter->second; + if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') { + bufferlist newbl; + bletag.splice(0, bletag.length() - 1, &newbl); + bletag = std::move(newbl); + } + } + + iter = s->attrset.find(RGW_ATTR_COMPRESSION); + const bool compressed = (iter != s->attrset.end()); + if (compressed) { + // use uncompressed size for accounted_size + try { + RGWCompressionInfo info; + auto p = iter->second.cbegin(); + decode(info, p); + s->accounted_size = info.orig_size; + } catch (buffer::error&) { + ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl; + return -EIO; + } + } + + if (iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); 
iter != s->attrset.end()) { + const bufferlist& bl = iter->second; + auto it = bl.begin(); + it.copy(bl.length(), s->shadow_obj); + s->shadow_obj[bl.length()] = '\0'; + } + if (iter = s->attrset.find(RGW_ATTR_ID_TAG); iter != s->attrset.end()) { + s->obj_tag = iter->second; + } + if (iter = s->attrset.find(RGW_ATTR_TAIL_TAG); iter != s->attrset.end()) { + s->tail_tag = iter->second; + } + + if (iter = s->attrset.find(RGW_ATTR_MANIFEST); iter != s->attrset.end()) { + bufferlist manifest_bl = iter->second; + auto miter = manifest_bl.cbegin(); + try { + sm->manifest.emplace(); + decode(*sm->manifest, miter); + sm->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be + broken due to old bugs */ + s->size = sm->manifest->get_obj_size(); + if (!compressed) + s->accounted_size = s->size; + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl; + return -EIO; + } + *manifest = &(*sm->manifest); + ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl; + if (cct->_conf->subsys.should_gather() && \ + sm->manifest->has_explicit_objs()) { + RGWObjManifest::obj_iterator mi; + for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) { + ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl; + } + } + + if (!s->obj_tag.length()) { + /* + * Uh oh, something's wrong, object with manifest should have tag. 
Let's + * create one out of the manifest, would be unique + */ + generate_fake_tag(dpp, this, s->attrset, *sm->manifest, manifest_bl, s->obj_tag); + s->fake_tag = true; + } + } + if (iter = s->attrset.find(RGW_ATTR_PG_VER); iter != s->attrset.end()) { + const bufferlist& pg_ver_bl = iter->second; + if (pg_ver_bl.length()) { + auto pgbl = pg_ver_bl.cbegin(); + try { + decode(s->pg_ver, pgbl); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl; + } + } + } + if (iter = s->attrset.find(RGW_ATTR_SOURCE_ZONE); iter != s->attrset.end()) { + const bufferlist& zone_short_id_bl = iter->second; + if (zone_short_id_bl.length()) { + auto zbl = zone_short_id_bl.cbegin(); + try { + decode(s->zone_short_id, zbl); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl; + } + } + } + if (s->obj_tag.length()) { + ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl; + } else { + ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl; + } + + /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if + * it exist, and not only if is_olh() returns true + */ + if (iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG); iter != s->attrset.end()) { + s->olh_tag = iter->second; + } + + if (is_olh(s->attrset)) { + s->is_olh = true; + + ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl; + + if (need_follow_olh) { + return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); + } else if (obj.key.have_null_instance() && !sm->manifest) { + // read null version, and the head object only have olh info + s->exists = false; + return -ENOENT; + } + } + + return 0; +} + +int RGWRados::get_obj_state(const 
DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent) +{ + int ret; + + do { + ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent); + } while (ret == -EAGAIN); + + return ret; +} + +int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y) +{ + RGWObjState *astate; + int r = get_state(dpp, &astate, pmanifest, true, y); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y) +{ + RGWObjState *state; + RGWObjManifest *manifest = nullptr; + int r = source->get_state(dpp, &state, &manifest, true, y); + if (r < 0) + return r; + if (!state->exists) + return -ENOENT; + if (!state->get_attr(name, dest)) + return -ENODATA; + + return 0; +} + +int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp) +{ + RGWObjectCtx& ctx = source->get_ctx(); + rgw_obj& obj = source->get_obj(); + RGWRados *store = source->get_store(); + + RGWObjStateManifest *sm = ctx.get_state(obj); + result.obj = obj; + if (sm->state.has_attrs) { + state.ret = 0; + result.size = sm->state.size; + result.mtime = ceph::real_clock::to_timespec(sm->state.mtime); + result.attrs = sm->state.attrset; + result.manifest = sm->manifest; + return 0; + } + + string oid; + string loc; + get_obj_bucket_and_oid_loc(obj, oid, loc); + + int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + op.stat2(&result.size, &result.mtime, NULL); + op.getxattrs(&result.attrs, NULL); + state.completion = librados::Rados::aio_create_completion(nullptr, nullptr); + state.io_ctx.locator_set_key(loc); + r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL); + if 
(r < 0) { + ldpp_dout(dpp, 5) << __func__ + << ": ERROR: aio_operate() returned ret=" << r + << dendl; + return r; + } + + return 0; +} + + +int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp) +{ + if (!state.completion) { + return state.ret; + } + + state.completion->wait_for_complete(); + state.ret = state.completion->get_return_value(); + state.completion->release(); + + if (state.ret != 0) { + return state.ret; + } + + return finish(dpp); +} + +int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp) +{ + map::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST); + if (iter != result.attrs.end()) { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + try { + result.manifest.emplace(); + decode(*result.manifest, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl; + return -EIO; + } + } + + return 0; +} + +int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, + RGWBucketInfo& bucket_info, const rgw_obj& obj, + ObjectOperation& op, RGWObjState **pstate, + RGWObjManifest** pmanifest, optional_yield y) +{ + if (!rctx) + return 0; + + int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, pmanifest, false, y); + if (r < 0) + return r; + + return append_atomic_test(dpp, *pstate, op); +} + +int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, + const RGWObjState* state, + librados::ObjectOperation& op) +{ + if (!state->is_atomic) { + ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl; + return 0; + } + + if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility + op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); + } else { + ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl; + } + return 0; +} + +int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState 
**pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent) +{ + return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, pmanifest, follow_olh, y, assume_noent); +} + +void RGWRados::Object::invalidate_state() +{ + ctx.invalidate(obj); +} + +int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp, + ObjectWriteOperation& op, bool reset_obj, const string *ptag, + const char *if_match, const char *if_nomatch, bool removal_op, + bool modify_tail, optional_yield y) +{ + int r = get_state(dpp, &state, &manifest, false, y); + if (r < 0) + return r; + + bool need_guard = ((manifest) || (state->obj_tag.length() != 0) || + if_match != NULL || if_nomatch != NULL) && + (!state->fake_tag); + + if (!state->is_atomic) { + ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl; + + if (reset_obj) { + op.create(false); + store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object + } + + return 0; + } + + if (need_guard) { + /* first verify that the object wasn't replaced under */ + if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) { + op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); + // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion + } + + if (if_match) { + if (strcmp(if_match, "*") == 0) { + // test the object is existing + if (!state->exists) { + return -ERR_PRECONDITION_FAILED; + } + } else { + bufferlist bl; + if (!state->get_attr(RGW_ATTR_ETAG, bl) || + strncmp(if_match, bl.c_str(), bl.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + } + + if (if_nomatch) { + if (strcmp(if_nomatch, "*") == 0) { + // test the object is NOT existing + if (state->exists) { + return -ERR_PRECONDITION_FAILED; + } + } else { + bufferlist bl; + if (!state->get_attr(RGW_ATTR_ETAG, bl) || + strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) { + return -ERR_PRECONDITION_FAILED; + } + } + } + } + + if 
(reset_obj) { + if (state->exists) { + op.create(false); + store->remove_rgw_head_obj(op); + } else { + op.create(true); + } + } + + if (removal_op) { + /* the object is being removed, no need to update its tag */ + return 0; + } + + if (ptag) { + state->write_tag = *ptag; + } else { + append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32); + } + bufferlist bl; + bl.append(state->write_tag.c_str(), state->write_tag.size() + 1); + + ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl; + + op.setxattr(RGW_ATTR_ID_TAG, bl); + if (modify_tail) { + op.setxattr(RGW_ATTR_TAIL_TAG, bl); + } + + return 0; +} + +/** + * Set an attr on an object. + * bucket: name of the bucket holding the object + * obj: name of the object to set the attr on + * name: the attr to set + * bl: the contents of the attr + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl) +{ + map attrs; + attrs[name] = bl; + return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield); +} + +int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& src_obj, + map& attrs, + map* rmattrs, + optional_yield y, + ceph::real_time set_mtime /* = zero() */) +{ + rgw_obj obj = src_obj; + if (obj.key.instance == "null") { + obj.key.instance.clear(); + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + RGWObjState *state = NULL; + RGWObjManifest *manifest = nullptr; + + r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, &manifest, y); + if (r < 0) + return r; + + // ensure null version object exist + if (src_obj.key.instance == "null" && !manifest) { + return -ENOENT; + } + + map::iterator iter; + if (rmattrs) { + for (iter = rmattrs->begin(); iter != 
rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + const rgw_bucket& bucket = obj.bucket; + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_DELETE_AT) == 0) { + real_time ts; + try { + decode(ts, bl); + + rgw_obj_index_key obj_key; + obj.key.get_index_key(&obj_key); + + obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl; + } + } + } + + if (!op.size()) + return 0; + + bufferlist bl; + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + if (state) { + string tag; + append_rand_alpha(cct, tag, tag, 32); + state->write_tag = tag; + r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); + + if (r < 0) + return r; + + bl.append(tag.c_str(), tag.size() + 1); + op.setxattr(RGW_ATTR_ID_TAG, bl); + } + + + /* As per https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html, + * the only way for users to modify object metadata is to make a copy of the object and + * set the metadata. 
+ * Hence do not update mtime for any other attr changes */ + real_time mtime = state->mtime; + if (set_mtime != ceph::real_clock::zero()) { + mtime = set_mtime; + } + struct timespec mtime_ts = real_clock::to_timespec(mtime); + op.mtime2(&mtime_ts); + auto& ioctx = ref.pool.ioctx(); + r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield); + if (state) { + if (r >= 0) { + bufferlist acl_bl; + if (iter = attrs.find(RGW_ATTR_ACL); iter != attrs.end()) { + acl_bl = iter->second; + } + std::string etag; + if (iter = attrs.find(RGW_ATTR_ETAG); iter != attrs.end()) { + etag = rgw_bl_str(iter->second); + } + std::string content_type; + if (iter = attrs.find(RGW_ATTR_CONTENT_TYPE); iter != attrs.end()) { + content_type = rgw_bl_str(iter->second); + } + string storage_class; + if (iter = attrs.find(RGW_ATTR_STORAGE_CLASS); iter != attrs.end()) { + storage_class = rgw_bl_str(iter->second); + } + uint64_t epoch = ioctx.get_last_version(); + int64_t poolid = ioctx.get_id(); + r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size, + mtime, etag, content_type, storage_class, &acl_bl, + RGWObjCategory::Main, nullptr, y); + } else { + int ret = index_op.cancel(dpp, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl; + } + } + } + if (r < 0) + return r; + + if (state) { + state->obj_tag.swap(bl); + if (rmattrs) { + for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { + state->attrset.erase(iter->first); + } + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + state->attrset[iter->first] = iter->second; + } + + auto iter = state->attrset.find(RGW_ATTR_ID_TAG); + if (iter != state->attrset.end()) { + iter->second = state->obj_tag; + } + + state->mtime = mtime; + } + + return 0; +} + +int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + + 
bufferlist etag; + + map::iterator iter; + + RGWObjState *astate; + RGWObjManifest *manifest = nullptr; + int r = source->get_state(dpp, &astate, &manifest, true, y); + if (r < 0) + return r; + + if (!astate->exists) { + return -ENOENT; + } + + const RGWBucketInfo& bucket_info = source->get_bucket_info(); + + state.obj = astate->obj; + store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj); + + state.cur_pool = state.head_obj.pool; + state.cur_ioctx = &state.io_ctxs[state.cur_pool]; + + r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx); + if (r < 0) { + return r; + } + if (params.target_obj) { + *params.target_obj = state.obj; + } + if (params.attrs) { + *params.attrs = astate->attrset; + if (cct->_conf->subsys.should_gather()) { + for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) { + ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl; + } + } + } + + /* Convert all times go GMT to make them compatible */ + if (conds.mod_ptr || conds.unmod_ptr) { + obj_time_weight src_weight; + src_weight.init(astate); + src_weight.high_precision = conds.high_precision_time; + + obj_time_weight dest_weight; + dest_weight.high_precision = conds.high_precision_time; + + if (conds.mod_ptr && !conds.if_nomatch) { + dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver); + ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; + if (!(dest_weight < src_weight)) { + return -ERR_NOT_MODIFIED; + } + } + + if (conds.unmod_ptr && !conds.if_match) { + dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver); + ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; + if (dest_weight < src_weight) { + return -ERR_PRECONDITION_FAILED; + } + } + } + if (conds.if_match || conds.if_nomatch) { + r = get_attr(dpp, RGW_ATTR_ETAG, etag, y); + if (r < 0) + return r; + + if 
(conds.if_match) { + string if_match_str = rgw_string_unquote(conds.if_match); + ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl; + if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + + if (conds.if_nomatch) { + string if_nomatch_str = rgw_string_unquote(conds.if_nomatch); + ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl; + if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) { + return -ERR_NOT_MODIFIED; + } + } + } + + if (params.obj_size) + *params.obj_size = astate->size; + if (params.lastmod) + *params.lastmod = astate->mtime; + + return 0; +} + +int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) +{ + if (ofs < 0) { + ofs += obj_size; + if (ofs < 0) + ofs = 0; + end = obj_size - 1; + } else if (end < 0) { + end = obj_size - 1; + } + + if (obj_size > 0) { + if (ofs >= (off_t)obj_size) { + return -ERANGE; + } + if (end >= (off_t)obj_size) { + end = obj_size - 1; + } + } + return 0; +} + +int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function call) +{ + RGWRados *store = target->get_store(); + BucketShard *bs = nullptr; + int r; + +#define NUM_RESHARD_RETRIES 10 + for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { + int ret = get_bucket_shard(&bs, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" << + obj_instance.key << ". ret=" << ret << dendl; + return ret; + } + + r = call(bs); + if (r != -ERR_BUSY_RESHARDING) { + break; + } + + ldpp_dout(dpp, 10) << + "NOTICE: resharding operation on bucket index detected, blocking. 
obj=" << + obj_instance.key << dendl; + + r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp); + if (r == -ERR_BUSY_RESHARDING) { + ldpp_dout(dpp, 10) << __func__ << + " NOTICE: block_while_resharding() still busy. obj=" << + obj_instance.key << dendl; + continue; + } else if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: block_while_resharding() failed. obj=" << + obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl; + i = 0; /* resharding is finished, make sure we can retry */ + invalidate_bs(); + } // for loop + + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << + obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; + return r; + } + + if (pbs) { + *pbs = bs; + } + + return 0; +} + +int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + + if (write_tag && write_tag->length()) { + optag = string(write_tag->c_str(), write_tag->length()); + } else { + if (optag.empty()) { + append_rand_alpha(store->ctx(), optag, optag, 32); + } + } + + int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int { + return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace); + }); + + if (r < 0) { + return r; + } + prepared = true; + + return 0; +} + +int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, + uint64_t size, uint64_t accounted_size, + ceph::real_time& ut, const string& etag, + const string& content_type, const string& storage_class, + bufferlist *acl_bl, + RGWObjCategory category, + list *remove_objs, + optional_yield y, + const string *user_data, + bool appendable) +{ + if (blind) { + return 0; + } + RGWRados *store = 
target->get_store(); + BucketShard *bs = nullptr; + + int ret = get_bucket_shard(&bs, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + + rgw_bucket_dir_entry ent; + obj.key.get_index_key(&ent.key); + ent.meta.size = size; + ent.meta.accounted_size = accounted_size; + ent.meta.mtime = ut; + ent.meta.etag = etag; + ent.meta.storage_class = storage_class; + if (user_data) + ent.meta.user_data = *user_data; + + ACLOwner owner; + if (acl_bl && acl_bl->length()) { + int ret = store->decode_policy(dpp, *acl_bl, &owner); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl; + } + } + ent.meta.owner = owner.get_id().to_str(); + ent.meta.owner_display_name = owner.get_display_name(); + ent.meta.content_type = content_type; + ent.meta.appendable = appendable; + + ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace); + + add_datalog_entry(dpp, store->svc.datalog_rados, + target->bucket_info, bs->shard_id, y); + + return ret; +} + +int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp, + int64_t poolid, uint64_t epoch, + real_time& removed_mtime, + list *remove_objs, + optional_yield y) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs = nullptr; + + int ret = get_bucket_shard(&bs, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + + ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace); + + add_datalog_entry(dpp, store->svc.datalog_rados, + target->bucket_info, bs->shard_id, y); + + return ret; +} + + +int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp, + list *remove_objs, + optional_yield y) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + 
BucketShard *bs; + + int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int { + return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace); + }); + + /* + * need to update data log anyhow, so that whoever follows needs to update its internal markers + * for following the specific bucket shard log. Otherwise they end up staying behind, and users + * have no way to tell that they're all caught up + */ + add_datalog_entry(dpp, store->svc.datalog_rados, + target->bucket_info, bs->shard_id, y); + + return ret; +} + +/* + * Read up through index `end` inclusive. Number of bytes read is up + * to `end - ofs + 1`. + */ +int RGWRados::Object::Read::read(int64_t ofs, int64_t end, + bufferlist& bl, optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWRados *store = source->get_store(); + + rgw_raw_obj read_obj; + uint64_t read_ofs = ofs; + uint64_t len, read_len; + bool reading_from_head = true; + ObjectReadOperation op; + + bool merge_bl = false; + bufferlist *pbl = &bl; + bufferlist read_bl; + uint64_t max_chunk_size; + + RGWObjState *astate; + RGWObjManifest *manifest = nullptr; + int r = source->get_state(dpp, &astate, &manifest, true, y); + if (r < 0) + return r; + + if (astate->size == 0) { + end = 0; + } else if (end >= (int64_t)astate->size) { + end = astate->size - 1; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (manifest && manifest->has_tail()) { + /* now get the relevant object part */ + RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs); + + uint64_t stripe_ofs = iter.get_stripe_ofs(); + read_obj = iter.get_location().get_raw_obj(store); + len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); + read_ofs = iter.location_ofs() + (ofs - stripe_ofs); + reading_from_head = (read_obj == state.head_obj); + } else { + read_obj = state.head_obj; + } + + r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: 
failed to get max_chunk_size() for pool " << read_obj.pool << dendl; + return r; + } + + if (len > max_chunk_size) + len = max_chunk_size; + + + read_len = len; + + if (reading_from_head) { + /* only when reading from the head object do we need to do the atomic test */ + r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, &manifest, y); + if (r < 0) + return r; + + if (astate && astate->prefetch_data) { + if (!ofs && astate->data.length() >= len) { + bl = astate->data; + return bl.length(); + } + + if (ofs < astate->data.length()) { + unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len); + astate->data.begin(ofs).copy(copy_len, bl); + read_len -= copy_len; + read_ofs += copy_len; + if (!read_len) + return bl.length(); + + merge_bl = true; + pbl = &read_bl; + } + } + } + + ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl; + op.read(read_ofs, read_len, pbl, NULL); + + if (state.cur_pool != read_obj.pool) { + auto iter = state.io_ctxs.find(read_obj.pool); + if (iter == state.io_ctxs.end()) { + state.cur_ioctx = &state.io_ctxs[read_obj.pool]; + r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false, true); + if (r < 0) { + ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl; + return r; + } + } else { + state.cur_ioctx = &iter->second; + } + state.cur_pool = read_obj.pool; + } + + state.cur_ioctx->locator_set_key(read_obj.loc); + + r = state.cur_ioctx->operate(read_obj.oid, &op, NULL); + ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl; + + if (r < 0) { + return r; + } + + if (merge_bl) { + bl.append(read_bl); + } + + return bl.length(); +} + +int get_obj_data::flush(rgw::AioResultList&& results) { + int r = rgw::check_for_errors(results); + if (r < 0) { + return r; + } + std::list bl_list; + + auto cmp = [](const auto& lhs, 
const auto& rhs) { return lhs.id < rhs.id; }; + results.sort(cmp); // merge() requires results to be sorted first + completed.merge(results, cmp); // merge results in sorted order + + while (!completed.empty() && completed.front().id == offset) { + auto bl = std::move(completed.front().data); + + bl_list.push_back(bl); + offset += bl.length(); + int r = client_cb->handle_data(bl, 0, bl.length()); + if (r < 0) { + return r; + } + + if (rgwrados->get_use_datacache()) { + const std::lock_guard l(d3n_get_data.d3n_lock); + auto oid = completed.front().obj.get_ref().obj.oid; + if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) { + lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl; + rgwrados->d3n_data_cache->put(bl, bl.length(), oid); + } else { + lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl; + } + } + completed.pop_front_and_dispose(std::default_delete{}); + } + return 0; +} + +static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) +{ + struct get_obj_data* d = static_cast(arg); + return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len, + is_head_obj, astate, arg); +} + +int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) +{ + ObjectReadOperation op; + struct get_obj_data* d = static_cast(arg); + string oid, key; + + if (is_head_obj) { + /* only when reading from the head object do we need to do the 
atomic test */ + int r = append_atomic_test(dpp, astate, op); + if (r < 0) + return r; + + if (astate && + obj_ofs < astate->data.length()) { + unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len); + + r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len); + if (r < 0) + return r; + + len -= chunk_len; + d->offset += chunk_len; + read_ofs += chunk_len; + obj_ofs += chunk_len; + if (!len) + return 0; + } + } + + auto obj = d->rgwrados->svc.rados->obj(read_obj); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl; + op.read(read_ofs, len, nullptr, nullptr); + + const uint64_t cost = len; + const uint64_t id = obj_ofs; // use logical object offset for sorting replies + + auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id); + + return d->flush(std::move(completed)); +} + +int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, + optional_yield y) +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size; + const uint64_t window_size = cct->_conf->rgw_get_obj_window_size; + + auto aio = rgw::make_throttle(window_size, y); + get_obj_data data(store, cb, &*aio, ofs, y); + + int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(), state.obj, + ofs, end, chunk_size, _get_obj_iterate_cb, &data, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl; + data.cancel(); // drain completions without writing back to client + return r; + } + + return data.drain(); +} + +int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, + RGWBucketInfo& 
bucket_info, const rgw_obj& obj, + off_t ofs, off_t end, uint64_t max_chunk_size, + iterate_obj_cb cb, void *arg, optional_yield y) +{ + rgw_raw_obj head_obj; + rgw_raw_obj read_obj; + uint64_t read_ofs = ofs; + uint64_t len; + bool reading_from_head = true; + RGWObjState *astate = NULL; + RGWObjManifest *manifest = nullptr; + + obj_to_raw(bucket_info.placement_rule, obj, &head_obj); + + int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y); + if (r < 0) { + return r; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (manifest) { + /* now get the relevant object stripe */ + RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs); + + RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp); + + for (; iter != obj_end && ofs <= end; ++iter) { + off_t stripe_ofs = iter.get_stripe_ofs(); + off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size(); + + while (ofs < next_stripe_ofs && ofs <= end) { + read_obj = iter.get_location().get_raw_obj(this); + uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); + read_ofs = iter.location_ofs() + (ofs - stripe_ofs); + + if (read_len > max_chunk_size) { + read_len = max_chunk_size; + } + + reading_from_head = (read_obj == head_obj); + r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg); + if (r < 0) { + return r; + } + + len -= read_len; + ofs += read_len; + } + } + } else { + while (ofs <= end) { + read_obj = head_obj; + uint64_t read_len = std::min(len, max_chunk_size); + + r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg); + if (r < 0) { + return r; + } + + len -= read_len; + ofs += read_len; + } + } + + return 0; +} + +int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + return 
rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield); +} + +int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + bufferlist outbl; + + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield); +} + +void RGWRados::olh_cancel_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + RGWObjState& state, const rgw_obj& olh_obj, + const std::string& op_tag, optional_yield y) +{ + if (cct->_conf->rgw_debug_inject_olh_cancel_modification_err) { + // simulate the scenario where we fail to remove the pending xattr + return; + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " get_obj_head_ref() returned " << r << dendl; + return; + } + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(op_tag); + + // first remove the relevant pending prefix + ObjectWriteOperation op; + bucket_index_guard_olh_op(dpp, state, op); + op.rmxattr(attr_name.c_str()); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, y); + if (r < 0) { + if (r != -ENOENT && r != -ECANCELED) { + ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " rmxattr rgw_rados_operate() returned " << r << dendl; + } + return; + } + + if (auto iter = state.attrset.find(RGW_ATTR_OLH_INFO); iter == state.attrset.end()) { + // attempt to remove the OLH object if there are no pending ops, + // its olh info attr is empty, and its tag hasn't changed + ObjectWriteOperation rm_op; + bucket_index_guard_olh_op(dpp, state, rm_op); + rm_op.cmpxattr(RGW_ATTR_OLH_INFO, CEPH_OSD_CMPXATTR_OP_EQ, bufferlist()); + cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); + rm_op.remove(); + r = rgw_rados_operate(dpp, 
ref.pool.ioctx(), ref.obj.oid, &rm_op, y); + } + if (r < 0 && (r != -ENOENT && r != -ECANCELED)) { + ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " olh rm rgw_rados_operate() returned " << r << dendl; + } +} + +int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag) +{ + ObjectWriteOperation op; + + ceph_assert(olh_obj.key.instance.empty()); + + bool has_tag = (state.exists && has_olh_tag(state.attrset)); + + if (!state.exists) { + op.create(true); + } else { + op.assert_exists(); + struct timespec mtime_ts = real_clock::to_timespec(state.mtime); + op.mtime2(&mtime_ts); + } + + /* + * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object. + * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two + * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to + * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh + * log will reflect that. + * + * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag + * is used for object data instance, olh_tag for olh instance. + */ + if (has_tag) { + /* guard against racing writes */ + bucket_index_guard_olh_op(dpp, state, op); + } else if (state.exists) { + // This is the case where a null versioned object already exists for this key + // but it hasn't been initialized as an OLH object yet. We immediately add + // the RGW_ATTR_OLH_INFO attr so that the OLH points back to itself and + // therefore effectively makes this an unobservable modification. 
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, bufferlist()); + RGWOLHInfo info; + info.target = olh_obj; + info.removed = false; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + + if (!has_tag) { + /* obj tag */ + string obj_tag = gen_rand_alphanumeric_lower(cct, 32); + + bufferlist bl; + bl.append(obj_tag.c_str(), obj_tag.size()); + op.setxattr(RGW_ATTR_ID_TAG, bl); + + state.attrset[RGW_ATTR_ID_TAG] = bl; + state.obj_tag = bl; + + /* olh tag */ + string olh_tag = gen_rand_alphanumeric_lower(cct, 32); + + bufferlist olh_bl; + olh_bl.append(olh_tag.c_str(), olh_tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl); + + state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl; + state.olh_tag = olh_bl; + state.is_olh = true; + + bufferlist verbl; + op.setxattr(RGW_ATTR_OLH_VER, verbl); + } + + bufferlist bl; + RGWOLHPendingInfo pending_info; + pending_info.time = real_clock::now(); + encode(pending_info, bl); + +#define OLH_PENDING_TAG_LEN 32 + /* tag will start with current time epoch, this so that entries are sorted by time */ + char buf[32]; + utime_t ut(pending_info.time); + snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec()); + *op_tag = buf; + + string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size()); + + op_tag->append(s); + + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(*op_tag); + + op.setxattr(attr_name.c_str(), bl); + + int ret = obj_operate(dpp, bucket_info, olh_obj, &op); + if (ret < 0) { + return ret; + } + + state.exists = true; + state.attrset[attr_name] = bl; + + return 0; +} + +int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag) +{ + int ret; + + ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag); + if (ret == -EEXIST) { + ret = -ECANCELED; + } + + return ret; +} + +int RGWRados::guard_reshard(const DoutPrefixProvider *dpp, + 
BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + std::function call) +{ + rgw_obj obj; + const rgw_obj *pobj = &obj_instance; + int r; + + for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { + r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp); + if (r < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl; + return r; + } + + r = call(bs); + if (r != -ERR_BUSY_RESHARDING) { + break; + } + + ldpp_dout(dpp, 10) << + "NOTICE: resharding operation on bucket index detected, blocking. obj=" << + obj_instance.key << dendl; + + r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp); + if (r == -ERR_BUSY_RESHARDING) { + ldpp_dout(dpp, 10) << __func__ << + " NOTICE: block_while_resharding() still busy. obj=" << + obj_instance.key << dendl; + continue; + } else if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: block_while_resharding() failed. obj=" << + obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "reshard completion identified" << dendl; + i = 0; /* resharding is finished, make sure we can retry */ + } // for loop + + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << + obj_instance.key << ". 
ret=" << cpp_strerror(-r) << dendl; + return r; + } + + return 0; +} + + +int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + int ret = 0; + cls_rgw_bucket_instance_entry entry; + + // gets loaded by fetch_new_bucket_info; can be used by + // clear_resharding + std::map bucket_attrs; + + // since we want to run this recovery code from two distinct places, + // let's just put it in a lambda so we can easily re-use; if the + // lambda successfully fetches a new bucket id, it sets + // new_bucket_id and returns 0, otherwise it returns a negative + // error code + auto fetch_new_bucket_info = + [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int { + int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name, + bucket_info, nullptr, y, dpp, &bucket_attrs); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to refresh bucket info after reshard at " << + log_tag << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = bs->init(dpp, bucket_info, obj_instance); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to refresh bucket shard generation after reshard at " << + log_tag << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen; + ldpp_dout(dpp, 20) << __func__ << + " INFO: refreshed bucket info after reshard at " << + log_tag << ". new shard_id=" << bs->shard_id << ". 
gen=" << gen << dendl; + + return 0; + }; // lambda fetch_new_bucket_info + + constexpr int num_retries = 10; + for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop + auto& ref = bs->bucket_obj.get_ref(); + ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry); + if (ret == -ENOENT) { + ret = fetch_new_bucket_info("get_bucket_resharding_failed"); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " failed to refresh bucket info after reshard when get bucket " + "resharding failed, error: " << cpp_strerror(-ret) << dendl; + return ret; + } + } else if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) << + dendl; + return ret; + } + + if (!entry.resharding_in_progress()) { + ret = fetch_new_bucket_info("get_bucket_resharding_succeeded"); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " failed to refresh bucket info after reshard when get bucket " + "resharding succeeded, error: " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " << + (i < num_retries ? "retrying" : "too many retries") << dendl; + + if (i == num_retries) { + break; + } + + // If bucket is erroneously marked as resharding (e.g., crash or + // other error) then fix it. If we can take the bucket reshard + // lock then it means no other resharding should be taking place, + // and we're free to clear the flags. 
+ { + // since we expect to do this rarely, we'll do our work in a + // block and erase our work after each try + + RGWObjectCtx obj_ctx(this->driver); + const rgw_bucket& b = bs->bucket; + std::string bucket_id = b.get_key(); + RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true); + ret = reshard_lock.lock(dpp); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << + " ERROR: failed to take reshard lock for bucket " << + bucket_id << "; expected if resharding underway" << dendl; + } else { + ldpp_dout(dpp, 10) << __func__ << + " INFO: was able to take reshard lock for bucket " << + bucket_id << dendl; + // the reshard may have finished, so call clear_resharding() + // with its current bucket info; ALSO this will load + // bucket_attrs for call to clear_resharding below + ret = fetch_new_bucket_info("trying_to_clear_resharding"); + if (ret < 0) { + reshard_lock.unlock(); + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to update bucket info before clear resharding for bucket " << + bucket_id << dendl; + continue; // try again + } + + ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp); + reshard_lock.unlock(); + if (ret == -ENOENT) { + ldpp_dout(dpp, 5) << __func__ << + " INFO: no need to reset reshard flags; old shards apparently" + " removed after successful resharding of bucket " << + bucket_id << dendl; + continue; // immediately test again + } else if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to clear resharding flags for bucket " << + bucket_id << ", " << cpp_strerror(-ret) << dendl; + // wait and then test again + } else { + ldpp_dout(dpp, 5) << __func__ << + " INFO: apparently successfully cleared resharding flags for " + "bucket " << bucket_id << dendl; + continue; // if we apparently succeed immediately test again + } // if clear resharding succeeded + } // if taking of lock succeeded + } // block to encapsulate recovery from incomplete 
reshard + + ret = reshard_wait->wait(y); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: bucket is still resharding, please retry" << dendl; + return ret; + } + } // for loop + + ldpp_dout(dpp, 0) << __func__ << + " ERROR: bucket is still resharding, please retry" << dendl; + return -ERR_BUSY_RESHARDING; +} + +int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + RGWObjState& olh_state, const rgw_obj& obj_instance, + bool delete_marker, const string& op_tag, + struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, + real_time unmod_since, bool high_precision_time, + optional_yield y, + rgw_zone_set *_zones_trace, bool log_data_change) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key()); + + BucketShard bs(this); + + r = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); + auto& ref = bs->bucket_obj.get_ref(); + librados::ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag, + delete_marker, op_tag, meta, olh_epoch, + unmod_since, high_precision_time, + svc.zone->need_to_log_data(), zones_trace); + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + }); + if (r < 0) { + ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl; + return r; + } + + if (log_data_change) { + add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id, y); + } + + return 0; +} + +void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, 
RGWObjState& olh_state, ObjectOperation& op) +{ + ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl; + op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag); +} + +int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw_obj& obj_instance, + const string& op_tag, const string& olh_tag, + uint64_t olh_epoch, rgw_zone_set *_zones_trace) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key()); + + BucketShard bs(this); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); + r = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + auto& ref = bs->bucket_obj.get_ref(); + librados::ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_unlink_instance(op, key, op_tag, + olh_tag, olh_epoch, svc.zone->need_to_log_data(), zones_trace); + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + }); + if (r < 0) { + ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl; + return r; + } + + return 0; +} + +int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver_marker, + std::map > *log, + bool *is_truncated) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, 
dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + auto& shard_ref = bs.bucket_obj.get_ref(); + ObjectReadOperation op; + + rgw_cls_read_olh_log_ret log_ret; + int op_ret = 0; + cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret); + bufferlist outbl; + r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield); + if (r < 0) { + return r; + } + if (op_ret < 0) { + ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl; + return op_ret; + } + + *log = std::move(log_ret.log); + *is_truncated = log_ret.is_truncated; + + return 0; +} + +// a multisite sync bug resulted in the OLH head attributes being overwritten by +// the attributes from another zone, causing link_olh() to fail endlessly due to +// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH +// attributes from the bucket index. see http://tracker.ceph.com/issues/37792 +int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + // fetch the current olh entry from the bucket index + rgw_bucket_olh_entry olh; + int r = bi_get_olh(dpp, bucket_info, obj, &olh); + if (r < 0) { + ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl; + return r; + } + if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved? 
+ return 0; + } + + ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag + << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl; + + // rewrite OLH_ID_TAG and OLH_INFO from current olh + ObjectWriteOperation op; + // assert this is the same olh tag we think we're fixing + bucket_index_guard_olh_op(dpp, *state, op); + // preserve existing mtime + struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime); + op.mtime2(&mtime_ts); + { + bufferlist bl; + bl.append(olh.tag.c_str(), olh.tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, bl); + } + { + RGWOLHInfo info; + info.target = rgw_obj(bucket_info.bucket, olh.key); + info.removed = olh.delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + rgw_rados_ref ref; + r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + ret = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + cls_rgw_guard_bucket_resharding(op, 
-ERR_BUSY_RESHARDING); + cls_rgw_trim_olh_log(op, key, ver, olh_tag); + return pbs->bucket_obj.operate(dpp, &op, null_yield); + }); + if (ret < 0) { + ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::string& olh_tag, + const rgw_obj& obj_instance) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + auto& ref = pbs->bucket_obj.get_ref(); + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_clear_olh(op, key, olh_tag); + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + }); + if (ret < 0) { + ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh) +{ + try { + auto biter = bl.cbegin(); + decode(*olh, biter); + return 0; + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl; + return -EIO; + } +} + +int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + RGWObjState& state, + RGWBucketInfo& bucket_info, + const rgw_obj& obj, + bufferlist& olh_tag, + std::map >& log, + uint64_t *plast_ver, + rgw_zone_set* zones_trace) +{ + if (log.empty()) { + return 0; + } + + librados::ObjectWriteOperation op; + + uint64_t last_ver = log.rbegin()->first; + *plast_ver = last_ver; + + map >::iterator iter = log.begin(); + + 
op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); + op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver); + + bufferlist ver_bl; + string last_ver_s = to_string(last_ver); + ver_bl.append(last_ver_s.c_str(), last_ver_s.size()); + op.setxattr(RGW_ATTR_OLH_VER, ver_bl); + + struct timespec mtime_ts = real_clock::to_timespec(state.mtime); + op.mtime2(&mtime_ts); + + bool need_to_link = false; + uint64_t link_epoch = 0; + cls_rgw_obj_key key; + bool delete_marker = false; + list remove_instances; + bool need_to_remove = false; + + // decode current epoch and instance + auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER); + if (olh_ver != state.attrset.end()) { + std::string str = olh_ver->second.to_str(); + std::string err; + link_epoch = strict_strtoll(str.c_str(), 10, &err); + } + auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO); + if (olh_info != state.attrset.end()) { + RGWOLHInfo info; + int r = decode_olh_info(dpp, cct, olh_info->second, &info); + if (r < 0) { + return r; + } + info.target.key.get_index_key(&key); + delete_marker = info.removed; + } + + for (iter = log.begin(); iter != log.end(); ++iter) { + vector::iterator viter = iter->second.begin(); + for (; viter != iter->second.end(); ++viter) { + rgw_bucket_olh_log_entry& entry = *viter; + + ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op + << " key=" << entry.key.name << "[" << entry.key.instance << "] " + << (entry.delete_marker ? 
"(delete)" : "") << dendl; + switch (entry.op) { + case CLS_RGW_OLH_OP_REMOVE_INSTANCE: + remove_instances.push_back(entry.key); + break; + case CLS_RGW_OLH_OP_LINK_OLH: + // only overwrite a link of the same epoch if its key sorts before + if (link_epoch < iter->first || key.instance.empty() || + key.instance > entry.key.instance) { + ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker + << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; + need_to_link = true; + need_to_remove = false; + key = entry.key; + delete_marker = entry.delete_marker; + } else { + ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker + << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; + } + break; + case CLS_RGW_OLH_OP_UNLINK_OLH: + need_to_remove = true; + need_to_link = false; + break; + default: + ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl; + return -EIO; + } + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(entry.op_tag); + op.rmxattr(attr_name.c_str()); + } + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + const rgw_bucket& bucket = obj.bucket; + + if (need_to_link) { + rgw_obj target(bucket, key); + RGWOLHInfo info; + info.target = target; + info.removed = delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + + /* first remove object instances */ + for (list::iterator liter = remove_instances.begin(); + liter != remove_instances.end(); ++liter) { + cls_rgw_obj_key& key = *liter; + rgw_obj obj_instance(bucket, key); + int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace); + if (ret < 
0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl; + return ret; + } + } + + /* update olh object */ + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl; + return r; + } + + + if (need_to_remove) { + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + r = clear_olh(dpp, obj_ctx, obj, bucket_info, ref, olh_tag, last_ver, null_yield); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, 0) << "ERROR: could not clear olh, r=" << r << dendl; + return r; + } + } else { + r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl; + return r; + } + } + + return 0; +} + +int RGWRados::clear_olh(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_obj& obj, + RGWBucketInfo& bucket_info, + const std::string& tag, + const uint64_t ver, + optional_yield y) { + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + return clear_olh(dpp, obj_ctx, obj, bucket_info, ref, tag, ver, y); +} + +int RGWRados::clear_olh(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_obj& obj, + RGWBucketInfo& bucket_info, + rgw_rados_ref& ref, + const std::string& tag, + const uint64_t ver, + optional_yield y) { + ObjectWriteOperation rm_op; + + RGWObjManifest *manifest = nullptr; + RGWObjState *s = nullptr; + + int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &s, &manifest, false, y); + if (r < 0) { + return r; + } + map pending_entries; + rgw_filter_attrset(s->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries); + + map rm_pending_entries; + check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries); + + if (!rm_pending_entries.empty()) { + r = 
remove_olh_pending_entries(dpp, bucket_info, *s, obj, rm_pending_entries); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: rm_pending_entries returned ret=" << r << dendl; + return r; + } + } + + bufferlist tag_bl; + tag_bl.append(tag.c_str(), tag.length()); + rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, tag_bl); + rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, ver); + cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */ + rm_op.remove(); + + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, y); + if (r == -ECANCELED) { + return r; /* someone else made a modification in the meantime */ + } + /* + * only clear if was successful, otherwise we might clobber pending operations on this object + */ + r = bucket_index_clear_olh(dpp, bucket_info, tag, obj); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl; + return r; + } + return 0; +} + +/* + * read olh log and apply it + */ +int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace) +{ + map > log; + bool is_truncated; + uint64_t ver_marker = 0; + + do { + int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated); + if (ret < 0) { + return ret; + } + ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace); + if (ret < 0) { + return ret; + } + } while (is_truncated); + + return 0; +} + +int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + const rgw_obj& target_obj, bool delete_marker, + rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, + optional_yield y, rgw_zone_set *zones_trace, bool log_data_change) +{ + string op_tag; + + rgw_obj olh_obj = 
target_obj; + olh_obj.key.instance.clear(); + + RGWObjState *state = NULL; + RGWObjManifest *manifest = nullptr; + + int ret = 0; + int i; + +#define MAX_ECANCELED_RETRY 100 + for (i = 0; i < MAX_ECANCELED_RETRY; i++) { + if (ret == -ECANCELED) { + obj_ctx.invalidate(olh_obj); + } + + ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, &manifest, false, y); /* don't follow olh */ + if (ret < 0) { + return ret; + } + + ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag); + if (ret < 0) { + ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + if (cct->_conf->rgw_debug_inject_set_olh_err) { + // fail here to simulate the scenario of an unlinked object instance + ret = -cct->_conf->rgw_debug_inject_set_olh_err; + } else { + ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj, + delete_marker, op_tag, meta, olh_epoch, unmod_since, + high_precision_time, y, zones_trace, log_data_change); + } + if (ret < 0) { + ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; + olh_cancel_modification(dpp, bucket_info, *state, olh_obj, op_tag, y); + if (ret == -ECANCELED) { + // the bucket index rejected the link_olh() due to olh tag mismatch; + // attempt to reconstruct olh head attributes based on the bucket index + int r2 = repair_olh(dpp, state, bucket_info, olh_obj); + if (r2 < 0 && r2 != -ECANCELED) { + return r2; + } + continue; + } + // it's possible that the pending xattr from this op prevented the olh + // object from being cleaned by another thread that was deleting the last + // existing version. We invoke a best-effort update_olh here to handle this case. 
+ int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl; + } + return ret; + } + break; + } + + if (i == MAX_ECANCELED_RETRY) { + ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; + return -EIO; + } + + ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj); + if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ + ret = 0; + } + if (ret < 0) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, + uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace) +{ + string op_tag; + + rgw_obj olh_obj = target_obj; + olh_obj.key.instance.clear(); + + RGWObjState *state = NULL; + RGWObjManifest *manifest = NULL; + + int ret = 0; + int i; + + for (i = 0; i < MAX_ECANCELED_RETRY; i++) { + if (ret == -ECANCELED) { + obj_ctx.invalidate(olh_obj); + } + + ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, &manifest, false, y); /* don't follow olh */ + if (ret < 0) + return ret; + + ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag); + if (ret < 0) { + ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + + string olh_tag(state->olh_tag.c_str(), state->olh_tag.length()); + + ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace); + if (ret < 0) { + olh_cancel_modification(dpp, bucket_info, *state, olh_obj, op_tag, y); + ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl; + 
if (ret == -ECANCELED) { + continue; + } + // it's possible that the pending xattr from this op prevented the olh + // object from being cleaned by another thread that was deleting the last + // existing version. We invoke a best-effort update_olh here to handle this case. + int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl; + } + return ret; + } + break; + } + + if (i == MAX_ECANCELED_RETRY) { + ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; + return -EIO; + } + + ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace); + if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ + return 0; + } + if (ret < 0) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; + return ret; + } + + return 0; +} + +void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key) +{ +#define OBJ_INSTANCE_LEN 32 + char buf[OBJ_INSTANCE_LEN + 1]; + + gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped, + no underscore for instance name due to the way we encode the raw keys */ + + target_key->set_instance(buf); +} + +void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj) +{ + gen_rand_obj_instance_name(&target_obj->key); +} + +int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh) +{ + map attrset; + + ObjectReadOperation op; + op.getxattrs(&attrset, NULL); + + int r = obj_operate(dpp, bucket_info, obj, &op); + if (r < 0) { + return r; + } + + auto iter = attrset.find(RGW_ATTR_OLH_VER); + if (iter == attrset.end()) { /* not an olh */ + return -EINVAL; + } + + return decode_olh_info(dpp, cct, iter->second, olh); +} + +void RGWRados::check_pending_olh_entries(const 
DoutPrefixProvider *dpp, + map& pending_entries, + map *rm_pending_entries) +{ + map::iterator iter = pending_entries.begin(); + + real_time now = real_clock::now(); + + while (iter != pending_entries.end()) { + auto biter = iter->second.cbegin(); + RGWOLHPendingInfo pending_info; + try { + decode(pending_info, biter); + } catch (buffer::error& err) { + /* skipping bad entry, we could remove it but it might hide a bug */ + ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl; + ++iter; + continue; + } + + map::iterator cur_iter = iter; + ++iter; + if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) { + (*rm_pending_entries)[cur_iter->first] = cur_iter->second; + pending_entries.erase(cur_iter); + } else { + /* entries names are sorted by time (rounded to a second) */ + break; + } + } +} + +int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map& pending_attrs) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref); + if (r < 0) { + return r; + } + + // trim no more than 1000 entries per osd op + constexpr int max_entries = 1000; + + auto i = pending_attrs.begin(); + while (i != pending_attrs.end()) { + ObjectWriteOperation op; + bucket_index_guard_olh_op(dpp, state, op); + + for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) { + op.rmxattr(i->first.c_str()); + } + + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r == -ENOENT || r == -ECANCELED) { + /* raced with some other change, shouldn't sweat about it */ + return 0; + } + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl; + return r; + } + } + return 0; +} + +int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target) +{ 
+ map pending_entries; + rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries); + + map rm_pending_entries; + check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries); + + if (!rm_pending_entries.empty()) { + int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries); + if (ret < 0) { + ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl; + return ret; + } + } + if (!pending_entries.empty()) { + ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl; + + int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj); + if (ret < 0) { + if (ret == -ECANCELED) { + // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object. + // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We + // return ENOENT to indicate that the OLH object was removed. 
+ ret = -ENOENT; + } + return ret; + } + } + + auto iter = state->attrset.find(RGW_ATTR_OLH_VER); + if (iter == state->attrset.end()) { + return -EINVAL; + } + iter = state->attrset.find(RGW_ATTR_OLH_INFO); + if (iter == state->attrset.end()) { + return -ENOENT; + } + + RGWOLHInfo olh; + int ret = decode_olh_info(dpp, cct, iter->second, &olh); + if (ret < 0) { + return ret; + } + + if (olh.removed) { + return -ENOENT; + } + + *target = olh.target; + + return 0; +} + +int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp, + rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker, optional_yield y) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + map unfiltered_attrset; + uint64_t size = 0; + struct timespec mtime_ts; + + ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + if (attrs) { + op.getxattrs(&unfiltered_attrset, NULL); + } + if (psize || pmtime) { + op.stat2(&size, &mtime_ts, NULL); + } + if (first_chunk) { + op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL); + } + bufferlist outbl; + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y); + + if (epoch) { + *epoch = ref.pool.ioctx().get_last_version(); + } + + if (r < 0) + return r; + + if (psize) + *psize = size; + if (pmtime) + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + if (attrs) { + rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs); + } + + return 0; +} + +int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, string *bucket_ver, string *master_ver, + map& stats, + string *max_marker, bool *syncstopped) +{ + vector headers; + map bucket_instance_ids; + int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids); + if (r 
< 0) { + return r; + } + + ceph_assert(headers.size() == bucket_instance_ids.size()); + + auto iter = headers.begin(); + map::iterator viter = bucket_instance_ids.begin(); + BucketIndexShardsManager ver_mgr; + BucketIndexShardsManager master_ver_mgr; + BucketIndexShardsManager marker_mgr; + char buf[64]; + for(; iter != headers.end(); ++iter, ++viter) { + accumulate_raw_stats(*iter, stats); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver); + ver_mgr.add(viter->first, string(buf)); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver); + master_ver_mgr.add(viter->first, string(buf)); + if (shard_id >= 0) { + *max_marker = iter->max_marker; + } else { + marker_mgr.add(viter->first, iter->max_marker); + } + if (syncstopped != NULL) + *syncstopped = iter->syncstopped; + } + ver_mgr.to_string(bucket_ver); + master_ver_mgr.to_string(master_ver); + if (shard_id < 0) { + marker_mgr.to_string(max_marker); + } + return 0; +} + +class RGWGetBucketStatsContext : public RGWGetDirHeader_CB { + RGWGetBucketStats_CB *cb; + uint32_t pendings; + map stats; + int ret_code; + bool should_cb; + ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext"); + +public: + RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings) + : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true) + {} + + void handle_response(int r, rgw_bucket_dir_header& header) override { + std::lock_guard l{lock}; + if (should_cb) { + if ( r >= 0) { + accumulate_raw_stats(header, stats); + } else { + ret_code = r; + } + + // Are we all done? 
+ if (--pendings == 0) { + if (!ret_code) { + cb->set_response(&stats); + } + cb->handle_response(ret_code); + cb->put(); + } + } + } + + void unset_cb() { + std::lock_guard l{lock}; + should_cb = false; + } +}; + +int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx) +{ + int num_aio = 0; + RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1); + ceph_assert(get_ctx); + int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio); + if (r < 0) { + ctx->put(); + if (num_aio) { + get_ctx->unset_cb(); + } + } + get_ctx->put(); + return r; +} + +int RGWRados::get_bucket_instance_info(const string& meta_key, + RGWBucketInfo& info, + real_time *pmtime, + map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + rgw_bucket bucket; + rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr); + + return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp); +} + +int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, + real_time *pmtime, map *pattrs, optional_yield y, + const DoutPrefixProvider *dpp) +{ + return ctl.bucket->read_bucket_instance_info(bucket, &info, + y, + dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(pmtime) + .set_attrs(pattrs)); +} + +int RGWRados::get_bucket_info(RGWServices *svc, + const string& tenant, const string& bucket_name, + RGWBucketInfo& info, + real_time *pmtime, + optional_yield y, + const DoutPrefixProvider *dpp, map *pattrs) +{ + rgw_bucket bucket; + bucket.tenant = tenant; + bucket.name = bucket_name; + return ctl.bucket->read_bucket_info(bucket, &info, y, dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(pmtime) + .set_attrs(pattrs)); +} + +int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info, + 
ceph::real_time *pmtime, + const DoutPrefixProvider *dpp, + map *pattrs) +{ + rgw_bucket bucket = info.bucket; + bucket.bucket_id.clear(); + + auto rv = info.objv_tracker.read_version; + + return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(pmtime) + .set_attrs(pattrs) + .set_refresh_version(rv)); +} + +int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, + real_time mtime, map *pattrs, + const DoutPrefixProvider *dpp, optional_yield y) +{ + return ctl.bucket->store_bucket_instance_info(info.bucket, info, y, dpp, + RGWBucketCtl::BucketInstance::PutParams() + .set_exclusive(exclusive) + .set_mtime(mtime) + .set_attrs(pattrs)); +} + +int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv, + map *pattrs, bool create_entry_point, + const DoutPrefixProvider *dpp, optional_yield y) +{ + bool create_head = !info.has_instance_obj || create_entry_point; + + int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp, y); + if (ret < 0) { + return ret; + } + + if (!create_head) + return 0; /* done! 
*/ + + RGWBucketEntryPoint entry_point; + entry_point.bucket = info.bucket; + entry_point.owner = info.owner; + entry_point.creation_time = info.creation_time; + entry_point.linked = true; + RGWObjVersionTracker ot; + if (pep_objv && !pep_objv->tag.empty()) { + ot.write_version = *pep_objv; + } else { + ot.generate_new_write_ver(cct); + if (pep_objv) { + *pep_objv = ot.write_version; + } + } + ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, y, dpp, RGWBucketCtl::Bucket::PutParams() + .set_exclusive(exclusive) + .set_objv_tracker(&ot) + .set_mtime(mtime)); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::update_containers_stats(map& m, const DoutPrefixProvider *dpp) +{ + map::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt& ent = iter->second; + rgw_bucket& bucket = ent.bucket; + ent.count = 0; + ent.size = 0; + ent.size_rounded = 0; + + vector headers; + + RGWBucketInfo bucket_info; + int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp); + if (ret < 0) { + return ret; + } + + int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers); + if (r < 0) + return r; + + auto hiter = headers.begin(); + for (; hiter != headers.end(); ++hiter) { + RGWObjCategory category = main_category; + auto iter = (hiter->stats).find(category); + if (iter != hiter->stats.end()) { + struct rgw_bucket_category_stats& stats = iter->second; + ent.count += stats.num_entries; + ent.size += stats.total_size; + ent.size_rounded += stats.total_size_rounded; + } + } + + // fill in placement_rule from the bucket instance for use in swift's + // per-storage policy statistics + ent.placement_rule = std::move(bucket_info.placement_rule); + } + + return m.size(); +} + +int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + 
return r; + } + librados::Rados *rad = get_rados_handle(); + librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr); + + r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size); + completion->release(); + return r; +} + +int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + int r = open_pool_ctx(dpp, pool, io_ctx, false, false); + if (r < 0) + return r; + + iter = io_ctx.nobjects_begin(); + + return 0; +} + +int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + int r = open_pool_ctx(dpp, pool, io_ctx, false, false); + if (r < 0) + return r; + + librados::ObjectCursor oc; + if (!oc.from_str(cursor)) { + ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl; + return -EINVAL; + } + + try { + iter = io_ctx.nobjects_begin(oc); + return 0; + } catch (const std::system_error& e) { + r = -e.code().value(); + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx) +{ + return ctx.iter.get_cursor().to_str(); +} + +static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num, + vector& objs, + bool *is_truncated, RGWAccessListFilter *filter) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + if (iter == io_ctx.nobjects_end()) + return -ENOENT; + + uint32_t i; + + for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) { + rgw_bucket_dir_entry e; + + 
string oid = iter->get_oid(); + ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + + // fill it in with initial values; we may correct later + if (filter && !filter->filter(oid, oid)) + continue; + + e.key = oid; + objs.push_back(e); + } + + if (is_truncated) + *is_truncated = (iter != io_ctx.nobjects_end()); + + return objs.size(); +} + +int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector& objs, + bool *is_truncated, RGWAccessListFilter *filter) +{ + // catch exceptions from NObjectIterator::operator++() + try { + return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter); + } catch (const std::system_error& e) { + int r = -e.code().value(); + ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx) +{ + if (!ctx->initialized) { + int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl; + return r; + } + ctx->initialized = true; + } + return 0; +} + +int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + return -EINVAL; + } + RGWAccessListFilterPrefix filter(prefix_filter); + vector objs; + int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter); + if (r < 0) { + if(r != -ENOENT) + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl; + return r; + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); 
++iter) { + oids.push_back(iter->key.name); + } + + return oids.size(); +} + +int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter, + int max, RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + int r = list_raw_objects_init(dpp, pool, string(), &ctx); + if (r < 0) { + return r; + } + } + + return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated); +} + +string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx) +{ + return pool_iterate_get_cursor(ctx.iter_ctx); +} + +int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_dir_entry *dirent) +{ + rgw_cls_bi_entry bi_entry; + int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } + if (r < 0) { + return r; + } + auto iter = bi_entry.data.cbegin(); + try { + decode(*dirent, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} + +int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_olh_entry *olh) +{ + rgw_cls_bi_entry bi_entry; + int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } + if (r < 0) { + return r; + } + auto iter = bi_entry.data.cbegin(); + try { + decode(*olh, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} + +int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + BIIndexType index_type, rgw_cls_bi_entry *entry) +{ + BucketShard bs(this); + int ret = bs.init(dpp, 
bucket_info, obj); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); + + auto& ref = bs.bucket_obj.get_ref(); + + return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry); +} + +void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry) +{ + auto& ref = bs.bucket_obj.get_ref(); + cls_rgw_bi_put(op, ref.obj.oid, entry); +} + +int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry) +{ + auto& ref = bs.bucket_obj.get_ref(); + int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry) +{ + // make sure incomplete multipart uploads are hashed correctly + if (obj.key.ns == RGW_OBJ_NS_MULTIPART) { + RGWMPObj mp; + mp.from_meta(obj.key.name); + obj.index_hash_source = mp.get_key(); + } + BucketShard bs(this); + + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + return bi_put(bs, entry); +} + +int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, + const string& obj_name_filter, const string& marker, uint32_t max, + list *entries, bool *is_truncated) +{ + rgw_obj obj(bucket, obj_name_filter); + BucketShard bs(this); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + auto& ref = bs.bucket_obj.get_ref(); + ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated); + if (ret == -ENOENT) { + *is_truncated = false; + } + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_list(BucketShard& bs, const string& 
obj_name_filter, const string& marker, uint32_t max, + list *entries, bool *is_truncated) +{ + auto& ref = bs.bucket_obj.get_ref(); + int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_list(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max, + list *entries, bool *is_truncated) +{ + BucketShard bs(this); + int ret = bs.init(dpp, bucket_info, + bucket_info.layout.current_index, + shard_id); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated); +} + +int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs) +{ + auto& ref = bs.bucket_obj.get_ref(); + int ret = ref.pool.ioctx().remove(ref.obj.oid); + if (ret == -ENOENT) { + ret = 0; + } + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op) +{ + return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield); +} + +int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c, + librados::ObjectWriteOperation *op) +{ + return gc_pool_ctx.aio_operate(oid, c, op); +} + +int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl) +{ + return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield); +} + +int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated, bool& processing_queue) +{ + return gc->list(index, marker, max, expired_only, result, truncated, processing_queue); +} + +int RGWRados::process_gc(bool 
expired_only) +{ + return gc->process(expired_only); +} + +int RGWRados::list_lc_progress(string& marker, uint32_t max_entries, + vector>& progress_map, + int& index) +{ + return lc->list_lc_progress(marker, max_entries, progress_map, index); +} + +int RGWRados::process_lc(const std::unique_ptr& optional_bucket) +{ + RGWLC lc; + lc.initialize(cct, this->driver); + RGWLC::LCWorker worker(&lc, cct, &lc, 0); + auto ret = lc.process(&worker, optional_bucket, true /* once */); + lc.stop_processor(); // sets down_flag, but returns immediately + return ret; +} + +bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp) +{ + return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now()); +} + +int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag, + rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key()); + + ObjectWriteOperation o; + o.assert_exists(); // bucket index shard must exist + + cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->need_to_log_data(), bilog_flags, zones_trace); + int ret = bs.bucket_obj.operate(dpp, &o, y); + ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx; + return ret; +} + +int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag, + int64_t 
pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, + list *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << + " obj=" << obj << " tag=" << tag << " op=" << op << + ", remove_objs=" << (remove_objs ? *remove_objs : std::list()) << dendl_bitx; + ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + ObjectWriteOperation o; + o.assert_exists(); // bucket index shard must exist + + rgw_bucket_dir_entry_meta dir_meta; + dir_meta = ent.meta; + dir_meta.category = category; + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key()); + + rgw_bucket_entry_ver ver; + ver.pool = pool; + ver.epoch = epoch; + cls_rgw_obj_key key(ent.key.name, ent.key.instance); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs, + svc.zone->need_to_log_data(), bilog_flags, &zones_trace); + complete_op_data *arg; + index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs, + svc.zone->need_to_log_data(), bilog_flags, &zones_trace, &arg); + librados::AioCompletion *completion = arg->rados_completion; + int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o); + completion->release(); /* can't reference arg here, as it might have already been released */ + + ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx; + return ret; +} + +int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag, + int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, + list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace) +{ + return cls_obj_complete_op(bs, 
obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+}
+
+// Completes a DEL bucket-index transaction: builds a directory entry that
+// carries the object's index key and the removal mtime, then hands it to
+// cls_obj_complete_op() with CLS_RGW_OP_DEL and category None.
+// NOTE(review): the element type of `list *remove_objs` is not visible in
+// this text -- confirm against the upstream declaration.
+int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
+                                   int64_t pool, uint64_t epoch,
+                                   rgw_obj& obj,
+                                   real_time& removed_mtime,
+                                   list *remove_objs,
+                                   uint16_t bilog_flags,
+                                   rgw_zone_set *zones_trace)
+{
+  rgw_bucket_dir_entry removed_entry;
+  obj.key.get_index_key(&removed_entry.key);
+  removed_entry.meta.mtime = removed_mtime;
+
+  const int ret = cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
+                                      removed_entry, RGWObjCategory::None,
+                                      remove_objs, bilog_flags, zones_trace);
+  return ret;
+}
+
+// Completes a CANCEL bucket-index transaction for `obj`: only the index key
+// is filled in; pool id -1 and epoch 0 are passed to cls_obj_complete_op().
+int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
+                                      list *remove_objs,
+                                      uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+  rgw_bucket_dir_entry cancelled_entry;
+  obj.key.get_index_key(&cancelled_entry.key);
+
+  const int ret = cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
+                                      -1 /* pool id */, 0, cancelled_entry,
+                                      RGWObjCategory::None, remove_objs,
+                                      bilog_flags, zones_trace);
+  return ret;
+}
+
+// Opens the bucket's current index (no specific shard requested) and issues
+// CLSRGWIssueSetTagTimeout with `timeout` against the returned shard objects.
+// Returns the open_bucket_index() error if that fails, otherwise the result
+// of the issued operation.
+int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map bucket_objs;
+  const int open_ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
+                                                       bucket_info.layout.current_index,
+                                                       &index_pool, &bucket_objs, nullptr);
+  if (open_ret < 0) {
+    return open_ret;
+  }
+
+  return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
+}
+
+
+// returns 0 if there is an error in calculation
+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                      uint32_t num_shards)
+{
+  if (num_shards == 0) {
+    // we'll get a floating point exception since we divide by
+    // num_shards
+    return 0;
+  }
+
+  // We want to minimize the chances that when num_shards >>
+  // num_entries that we return much fewer than num_entries to the
+  // client. Given all the overhead of making a cls call to the osd,
+  // returning a few entries is not much more work than returning one
+  // entry.
This minimum might be better tuned based on future + // experiments where num_shards >> num_entries. (Note: ">>" should + // be interpreted as "much greater than".) + constexpr uint32_t min_read = 8; + + // The following is based on _"Balls into Bins" -- A Simple and + // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle + // cases when num_shards >> num_entries (it almost serves as a + // ceiling calculation). We also assume alpha is 1.0 and extract it + // from the calculation. Future work could involve memoizing some of + // the transcendental functions to minimize repeatedly re-calling + // them with the same parameters, which we expect to be the case the + // majority of the time. + uint32_t calc_read = + 1 + + static_cast((num_entries / num_shards) + + sqrt((2 * num_entries) * + log(num_shards) / num_shards)); + + return std::max(min_read, calc_read); +} + + +int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + const int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + const std::string& delimiter, + const uint32_t num_entries, + const bool list_versions, + const uint16_t expansion_factor, + ent_map_t& m, + bool* is_truncated, + bool* cls_filtered, + rgw_obj_index_key* last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + + /* expansion_factor allows the number of entries to read to grow + * exponentially; this is used when earlier reads are producing too + * few results, perhaps due to filtering or to a series of + * namespaced entries */ + + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after.to_string() << + "\", prefix=\"" << prefix << + ", delimiter=\"" << delimiter << + "\", shard_id=" << shard_id << + "\", num_entries=" << 
num_entries << + ", shard_id=" << shard_id << + ", list_versions=" << list_versions << + ", expansion_factor=" << expansion_factor << + ", force_check_filter is " << + (force_check_filter ? "set" : "unset") << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + m.clear(); + + RGWSI_RADOS::Pool index_pool; + // key - oid (for different shards if there is any) + // value - list result for the corresponding oid (shard), it is filled by + // the AIO callback + std::map shard_oids; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, + &index_pool, &shard_oids, + nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl; + return r; + } + + const uint32_t shard_count = shard_oids.size(); + if (shard_count == 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": the bucket index shard count appears to be 0, " + "which is an illegal value" << dendl; + return -ERR_INVALID_BUCKET_STATE; + } + + uint32_t num_entries_per_shard; + if (expansion_factor == 0) { + num_entries_per_shard = + calc_ordered_bucket_list_per_shard(num_entries, shard_count); + } else if (expansion_factor <= 11) { + // we'll max out the exponential multiplication factor at 1024 (2<<10) + num_entries_per_shard = + std::min(num_entries, + (uint32_t(1 << (expansion_factor - 1)) * + calc_ordered_bucket_list_per_shard(num_entries, shard_count))); + } else { + num_entries_per_shard = num_entries; + } + + if (num_entries_per_shard == 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": unable to calculate the number of entries to read from each " + "bucket index shard" << dendl; + return -ERR_INVALID_BUCKET_STATE; + } + + ldpp_dout(dpp, 10) << __func__ << + ": request from each of " << shard_count << + " shard(s) for " << num_entries_per_shard << " entries to get " << + num_entries << " total entries" << dendl; + + auto& ioctx = 
index_pool.ioctx(); + std::map shard_list_results; + cls_rgw_obj_key start_after_key(start_after.name, start_after.instance); + r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter, + num_entries_per_shard, + list_versions, shard_oids, shard_list_results, + cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": CLSRGWIssueBucketList for " << bucket_info.bucket << + " failed" << dendl; + return r; + } + + // to manage the iterators through each shard's list results + struct ShardTracker { + const size_t shard_idx; + rgw_cls_list_ret& result; + const std::string& oid_name; + RGWRados::ent_map_t::iterator cursor; + RGWRados::ent_map_t::iterator end; + + // manages an iterator through a shard and provides other + // accessors + ShardTracker(size_t _shard_idx, + rgw_cls_list_ret& _result, + const std::string& _oid_name): + shard_idx(_shard_idx), + result(_result), + oid_name(_oid_name), + cursor(_result.dir.m.begin()), + end(_result.dir.m.end()) + {} + + inline const std::string& entry_name() const { + return cursor->first; + } + rgw_bucket_dir_entry& dir_entry() const { + return cursor->second; + } + inline bool is_truncated() const { + return result.is_truncated; + } + inline ShardTracker& advance() { + ++cursor; + // return a self-reference to allow for chaining of calls, such + // as x.advance().at_end() + return *this; + } + inline bool at_end() const { + return cursor == end; + } + }; // ShardTracker + + // add the next unique candidate, or return false if we reach the end + auto next_candidate = [] (CephContext *cct, ShardTracker& t, + std::multimap& candidates, + size_t tracker_idx) { + if (!t.at_end()) { + candidates.emplace(t.entry_name(), tracker_idx); + } + return; + }; + + // one tracker per shard requested (may not be all shards) + std::vector results_trackers; + results_trackers.reserve(shard_list_results.size()); + for (auto& r : shard_list_results) { + results_trackers.emplace_back(r.first, 
r.second, shard_oids[r.first]); + + // if any *one* shard's result is trucated, the entire result is + // truncated + *is_truncated = *is_truncated || r.second.is_truncated; + + // unless *all* are shards are cls_filtered, the entire result is + // not filtered + *cls_filtered = *cls_filtered && r.second.cls_filtered; + } + + // create a map to track the next candidate entry from ShardTracker + // (key=candidate, value=index into results_trackers); as we consume + // entries from shards, we replace them with the next entries in the + // shards until we run out + std::multimap candidates; + size_t tracker_idx = 0; + std::vector vidx; + vidx.reserve(shard_list_results.size()); + for (auto& t : results_trackers) { + // it's important that the values in the map refer to the index + // into the results_trackers vector, which may not be the same + // as the shard number (i.e., when not all shards are requested) + next_candidate(cct, t, candidates, tracker_idx); + ++tracker_idx; + } + + rgw_bucket_dir_entry* + last_entry_visited = nullptr; // to set last_entry (marker) + std::map updates; + uint32_t count = 0; + while (count < num_entries && !candidates.empty()) { + r = 0; + // select the next entry in lexical order (first key in map); + // again tracker_idx is not necessarily shard number, but is index + // into results_trackers vector + tracker_idx = candidates.begin()->second; + auto& tracker = results_trackers.at(tracker_idx); + + const std::string& name = tracker.entry_name(); + rgw_bucket_dir_entry& dirent = tracker.dir_entry(); + + ldpp_dout(dpp, 20) << __func__ << ": currently processing " << + dirent.key << " from shard " << tracker.shard_idx << dendl; + + const bool force_check = + force_check_filter && force_check_filter(dirent.key.name); + + if ((!dirent.exists && + !dirent.is_delete_marker() && + !dirent.is_common_prefix()) || + !dirent.pending_map.empty() || + force_check) { + /* there are uncommitted ops. 
We need to check the current + * state, and if the tags are old we need to do clean-up as + * well. */ + librados::IoCtx sub_ctx; + sub_ctx.dup(ioctx); + ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << + " calling check_disk_state bucket=" << bucket_info.bucket << + " entry=" << dirent.key << dendl_bitx; + r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, + updates[tracker.oid_name], y); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << __func__ << + ": check_disk_state for \"" << dirent.key << + "\" failed with r=" << r << dendl; + return r; + } + } else { + r = 0; + } + + // at this point either r >= 0 or r == -ENOENT + if (r >= 0) { // i.e., if r != -ENOENT + ldpp_dout(dpp, 10) << __func__ << ": got " << + dirent.key << dendl; + + auto [it, inserted] = m.insert_or_assign(name, std::move(dirent)); + last_entry_visited = &it->second; + if (inserted) { + ++count; + } else { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << + " reassigned map value at \"" << name << + "\", which should not happen" << dendl; + } + } else { + ldpp_dout(dpp, 10) << __func__ << ": skipping " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; + last_entry_visited = &tracker.dir_entry(); + } + + // refresh the candidates map + vidx.clear(); + bool need_to_stop = false; + auto range = candidates.equal_range(name); + for (auto i = range.first; i != range.second; ++i) { + vidx.push_back(i->second); + } + candidates.erase(range.first, range.second); + for (auto idx : vidx) { + auto& tracker_match = results_trackers.at(idx); + tracker_match.advance(); + next_candidate(cct, tracker_match, candidates, idx); + if (tracker_match.at_end() && tracker_match.is_truncated()) { + need_to_stop = true; + break; + } + } + if (need_to_stop) { + // once we exhaust one shard that is truncated, we need to stop, + // as we cannot be certain that one of the next entries needs to + // come from that shard; S3 and swift protocols allow returning + // fewer than what was 
requested + ldpp_dout(dpp, 10) << __func__ << + ": stopped accumulating results at count=" << count << + ", dirent=\"" << dirent.key << + "\", because its shard is truncated and exhausted" << dendl; + break; + } + } // while we haven't provided requested # of result entries + + // suggest updates if there are any + for (auto& miter : updates) { + if (miter.second.length()) { + ObjectWriteOperation o; + cls_rgw_suggest_changes(o, miter.second); + // we don't care if we lose suggested updates, send them off blindly + AioCompletion *c = + librados::Rados::aio_create_completion(nullptr, nullptr); + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + ": doing dir_suggest on " << miter.first << dendl_bitx; + ioctx.aio_operate(miter.first, c, &o); + c->release(); + } + } // updates loop + + // determine truncation by checking if all the returned entries are + // consumed or not + *is_truncated = false; + for (const auto& t : results_trackers) { + if (!t.at_end() || t.is_truncated()) { + *is_truncated = true; + break; + } + } + + ldpp_dout(dpp, 20) << __func__ << + ": returning, count=" << count << ", is_truncated=" << *is_truncated << + dendl; + + if (*is_truncated && count < num_entries) { + ldpp_dout(dpp, 10) << __func__ << + ": requested " << num_entries << " entries but returning " << + count << ", which is truncated" << dendl; + } + + if (last_entry_visited != nullptr && last_entry) { + *last_entry = last_entry_visited->key; + ldpp_dout(dpp, 20) << __func__ << + ": returning, last_entry=" << *last_entry << dendl; + } else { + ldpp_dout(dpp, 20) << __func__ << + ": returning, last_entry NOT SET" << dendl; + } + + ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx; + return 0; +} // RGWRados::cls_bucket_list_ordered + + +// A helper function to retrieve the hash source from an incomplete +// multipart entry by removing everything from the second to last +// period on. 
+//
+// On success stores the leading part of oid_wo_ns (everything before the
+// second to last period) in *index_hash_source and returns 0. Returns
+// -EINVAL when the name holds fewer than two periods or when either of the
+// last two periods is the first character of the name.
+static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
+  const std::size_t last_dot = oid_wo_ns.rfind('.');
+  if (last_dot == std::string::npos || last_dot == 0) {
+    return -EINVAL;
+  }
+  // continue the backwards search just before the period found above
+  const std::size_t second_last_dot = oid_wo_ns.rfind('.', last_dot - 1);
+  if (second_last_dot == std::string::npos || second_last_dot == 0) {
+    return -EINVAL;
+  }
+  *index_hash_source = oid_wo_ns.substr(0, second_last_dot);
+  return 0;
+}
+
+
+// Lists bucket index entries without ordering across shards: shards are read
+// one after another with an empty delimiter, and entries are appended to
+// ent_list until num_entries have been collected.
+//
+//   shard_id            when >= 0 only that shard is read; otherwise the
+//                       starting shard is derived from start_after
+//   start_after         marker; an unparsable marker yields -EINVAL
+//   ent_list            cleared, then filled with the listed entries
+//   is_truncated        set to true when one more entry than requested was
+//                       found
+//   last_entry          if non-null and ent_list is not empty, receives the
+//                       key of the last entry considered
+//   force_check_filter  if set, entries whose name it accepts are re-checked
+//                       through check_disk_state()
+//
+// Returns 0 on success or a negative error code.
+int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+                                        RGWBucketInfo& bucket_info,
+                                        const rgw::bucket_index_layout_generation& idx_layout,
+                                        int shard_id,
+                                        const rgw_obj_index_key& start_after,
+                                        const std::string& prefix,
+                                        uint32_t num_entries,
+                                        bool list_versions,
+                                        std::vector& ent_list,
+                                        bool *is_truncated,
+                                        rgw_obj_index_key *last_entry,
+                                        optional_yield y,
+                                        RGWBucketListNameFilter force_check_filter) {
+  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+  // string-valued fields are wrapped in a matched pair of double quotes;
+  // numeric fields are printed bare, and the filter state gets its own label
+  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+    " start_after=\"" << start_after <<
+    "\", prefix=\"" << prefix <<
+    "\", shard_id=" << shard_id <<
+    ", num_entries=" << num_entries <<
+    ", list_versions=" << list_versions <<
+    ", force_check_filter is " <<
+    (force_check_filter ? 
"set" : "unset") << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + ent_list.clear(); + static MultipartMetaFilter multipart_meta_filter; + + *is_truncated = false; + RGWSI_RADOS::Pool index_pool; + + std::map oids; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr); + if (r < 0) { + return r; + } + + auto& ioctx = index_pool.ioctx(); + + const uint32_t num_shards = oids.size(); + + rgw_obj_index_key marker = start_after; + uint32_t current_shard; + if (shard_id >= 0) { + current_shard = shard_id; + } else if (start_after.empty()) { + current_shard = 0u; + } else { + // at this point we have a marker (start_after) that has something + // in it, so we need to get to the bucket shard index, so we can + // start reading from there + + + // now convert the key (oid) to an rgw_obj_key since that will + // separate out the namespace, name, and instance + rgw_obj_key obj_key; + bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key); + if (!parsed) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " received an invalid start marker: \"" << start_after << "\"" << + dendl; + return -EINVAL; + } else if (obj_key.name.empty()) { + // if the name is empty that means the object name came in with + // a namespace only, and therefore we need to start our scan at + // the first bucket index shard + current_shard = 0u; + } else { + // so now we have the key used to compute the bucket index shard + // and can extract the specific shard from it + if (obj_key.ns == RGW_OBJ_NS_MULTIPART) { + // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of + // the implementation relying on MultipartMetaFilter + // because MultipartMetaFilter only checks .meta suffix, which may + // exclude data multiparts but include some regular objects with .meta suffix + // by mistake. 
+ string index_hash_source; + r = parse_index_hash_source(obj_key.name, &index_hash_source); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " parse_index_hash_source unable to parse \"" << obj_key.name << + "\", r=" << r << dendl; + return r; + } + current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards); + } else { + current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards); + } + } + } + + uint32_t count = 0u; + std::map updates; + rgw_obj_index_key last_added_entry; + while (count <= num_entries && + ((shard_id >= 0 && current_shard == uint32_t(shard_id)) || + current_shard < num_shards)) { + const std::string& oid = oids[current_shard]; + rgw_cls_list_ret result; + + librados::ObjectReadOperation op; + const std::string empty_delimiter; + cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter, + num_entries, + list_versions, &result); + r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": error in rgw_rados_operate (bucket list op), r=" << r << dendl; + return r; + } + + for (auto& entry : result.dir.m) { + rgw_bucket_dir_entry& dirent = entry.second; + + bool force_check = force_check_filter && + force_check_filter(dirent.key.name); + if ((!dirent.exists && !dirent.is_delete_marker()) || + !dirent.pending_map.empty() || + force_check) { + /* there are uncommitted ops. We need to check the current state, + * and if the tags are old we need to do cleanup as well. 
*/ + librados::IoCtx sub_ctx; + sub_ctx.dup(ioctx); + ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << + ": calling check_disk_state bucket=" << bucket_info.bucket << + " entry=" << dirent.key << dendl_bitx; + r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": error in check_disk_state, r=" << r << dendl; + return r; + } + } else { + r = 0; + } + + // at this point either r >= 0 or r == -ENOENT + if (r >= 0) { // i.e., if r != -ENOENT + ldpp_dout(dpp, 10) << __func__ << ": got " << + dirent.key << dendl; + + if (count < num_entries) { + marker = last_added_entry = dirent.key; // double assign + ent_list.emplace_back(std::move(dirent)); + ++count; + } else { + last_added_entry = dirent.key; + *is_truncated = true; + ldpp_dout(dpp, 10) << "INFO: " << __func__ << + ": reached max entries (" << num_entries << ") to return at \"" << + dirent.key << "\"" << dendl; + goto check_updates; + } + } else { // r == -ENOENT + // in the case of -ENOENT, make sure we're advancing marker + // for possible next call to CLSRGWIssueBucketList + marker = dirent.key; + } + } // entry for loop + + if (!result.is_truncated) { + // if we reached the end of the shard read next shard + ++current_shard; + marker = rgw_obj_index_key(); + } + } // shard loop + +check_updates: + + // suggest updates if there is any + std::map::iterator miter = updates.begin(); + for (; miter != updates.end(); ++miter) { + if (miter->second.length()) { + ObjectWriteOperation o; + cls_rgw_suggest_changes(o, miter->second); + // we don't care if we lose suggested updates, send them off blindly + AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + " doing dir_suggest on " << miter->first << dendl_bitx; + ioctx.aio_operate(miter->first, c, &o); + c->release(); + } + } + + if (last_entry && !ent_list.empty()) { + 
*last_entry = last_added_entry;
+  }
+
+  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+  return 0;
+} // RGWRados::cls_bucket_list_unordered
+
+
+// Appends `info` to the usage log object `oid` in the zone's usage log pool.
+// Returns the error from resolving the object reference, otherwise the
+// result of the write operation.
+int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
+                                    rgw_usage_log_info& info)
+{
+  rgw_raw_obj usage_obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  const int ref_ret = get_raw_obj_ref(dpp, usage_obj, &ref);
+  if (ref_ret < 0) {
+    return ref_ret;
+  }
+
+  ObjectWriteOperation add_op;
+  cls_rgw_usage_log_add(add_op, info);
+
+  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &add_op, null_yield);
+}
+
+// Reads usage log entries for user/bucket within [start_epoch, end_epoch]
+// from the usage log object `oid` via cls_rgw_usage_log_read().
+// *is_truncated is reset to false once the object reference has been
+// resolved, before the read is issued.
+int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+                                     uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+                                     string& read_iter, map& usage,
+                                     bool *is_truncated)
+{
+  rgw_raw_obj usage_obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  const int ref_ret = get_raw_obj_ref(dpp, usage_obj, &ref);
+  if (ref_ret < 0) {
+    return ref_ret;
+  }
+
+  *is_truncated = false;
+
+  return cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket,
+                                start_epoch, end_epoch, max_entries,
+                                read_iter, usage, is_truncated);
+}
+
+// Issues the usage log trim operation over and over until the OSD answers
+// -ENODATA (treated as success, returns 0); any other negative result is
+// returned immediately.
+static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
+{
+  for (;;) {
+    librados::ObjectWriteOperation trim_op;
+    cls_rgw_usage_log_trim(trim_op, user, bucket, start_epoch, end_epoch);
+    const int ret = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &trim_op, null_yield);
+    if (ret == -ENODATA) {
+      return 0;
+    }
+    if (ret < 0) {
+      return ret;
+    }
+  }
+}
+
+int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+                                     uint64_t start_epoch, uint64_t end_epoch)
+{
+  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  int r = 
get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch); + return r; +} + +int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + librados::ObjectWriteOperation op; + cls_rgw_usage_log_clear(op); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + return r; +} + + +// note: this removes entries from the rados bucket index objects +// without going through CLS; this is known to be called from +// "radosgw-admin unlink" and "radosgw-admin bucket check --fix" +int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::list& entry_key_list) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket << + " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + const auto& current_index = bucket_info.get_current_index(); + if (is_layout_indexless(current_index)) { + return -EINVAL; + } + const uint32_t num_shards = current_index.layout.normal.num_shards; + + RGWSI_RADOS::Pool index_pool; + std::map index_oids; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, + bucket_info.layout.current_index, + &index_pool, &index_oids, nullptr); + if (r < 0) { + ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ << + " open_bucket_index returned " << r << dendl_bitx; + return r; + } + + // split up removals by shard + std::map> sharded_removals; + for (const auto& entry_key : entry_key_list) { + const rgw_obj_key obj_key(entry_key); + const uint32_t shard = + 
RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards); + + // entry_key already combines namespace and name, so we first have + // to break that apart before we can then combine with instance + std::string name; + std::string ns; // namespace + rgw_obj_key::parse_index_key(entry_key.name, &name, &ns); + rgw_obj_key full_key(name, entry_key.instance, ns); + std::string combined_key = full_key.get_oid(); + + sharded_removals[shard].insert(combined_key); + + ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << + ": removal from bucket index, bucket=" << bucket_info.bucket << + " key=" << combined_key << " designated for shard " << shard << + dendl_bitx; + } + + for (const auto& removals : sharded_removals) { + const int shard = removals.first; + const std::string& oid = index_oids[shard]; + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + ": removal from bucket index, bucket=" << bucket_info.bucket << + ", shard=" << shard << ", oid=" << oid << ", num_keys=" << + removals.second.size() << dendl_bitx; + + r = index_pool.ioctx().omap_rm_keys(oid, removals.second); + if (r < 0) { + ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ << + ": omap_rm_keys returned ret=" << r << + dendl_bitx; + return r; + } + } + + ldout_bitx(bitx, dpp, 5) << + "EXITING " << __func__ << " and returning " << r << dendl_bitx; + + return r; +} + +int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, + librados::IoCtx io_ctx, + RGWBucketInfo& bucket_info, + rgw_bucket_dir_entry& list_state, + rgw_bucket_dir_entry& object, + bufferlist& suggested_updates, + optional_yield y) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << + bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx; + + uint8_t suggest_flag = (svc.zone->need_to_log_data() ? 
CEPH_RGW_DIR_SUGGEST_LOG_OP : 0); + + std::string loc; + + rgw_obj obj(bucket_info.bucket, list_state.key); + + MultipartMetaFilter multipart_meta_filter; + string temp_key; + if (multipart_meta_filter.filter(list_state.key.name, temp_key)) { + obj.in_extra_data = true; + } + + string oid; + get_obj_bucket_and_oid_loc(obj, oid, loc); + + if (loc != list_state.locator) { + ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl; + } + + io_ctx.locator_set_key(list_state.locator); + + RGWObjState *astate = NULL; + RGWObjManifest *manifest = nullptr; + RGWObjectCtx rctx(this->driver); + int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y); + if (r < 0) + return r; + + list_state.pending_map.clear(); // we don't need this and it inflates size + if (!list_state.is_delete_marker() && !astate->exists) { + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx; + /* object doesn't exist right now -- hopefully because it's + * marked as !exists and got deleted */ + if (list_state.exists) { + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx; + /* FIXME: what should happen now? Work out if there are any + * non-bad ways this could happen (there probably are, but annoying + * to handle!) 
*/ + } + + // encode a suggested removal of that key + list_state.ver.epoch = io_ctx.get_last_version(); + list_state.ver.pool = io_ctx.get_id(); + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx; + cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates); + return -ENOENT; + } + + string etag; + string content_type; + string storage_class; + ACLOwner owner; + bool appendable = false; + + object.meta.size = astate->size; + object.meta.accounted_size = astate->accounted_size; + object.meta.mtime = astate->mtime; + + map::iterator iter = astate->attrset.find(RGW_ATTR_ETAG); + if (iter != astate->attrset.end()) { + etag = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE); + if (iter != astate->attrset.end()) { + content_type = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); + if (iter != astate->attrset.end()) { + storage_class = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_ACL); + if (iter != astate->attrset.end()) { + r = decode_policy(dpp, iter->second, &owner); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl; + } + } + iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); + if (iter != astate->attrset.end()) { + appendable = true; + } + + if (manifest) { + RGWObjManifest::obj_iterator miter; + for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) { + const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this); + rgw_obj loc; + RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc); + + if (loc.key.ns == RGW_OBJ_NS_MULTIPART) { + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx; + r = delete_obj_index(loc, astate->mtime, dpp, y); + if (r < 0) { + ldout_bitx(bitx, dpp, 0) << + 
"WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx; + } + } + } + } + + object.meta.etag = etag; + object.meta.content_type = content_type; + object.meta.storage_class = storage_class; + object.meta.owner = owner.get_id().to_str(); + object.meta.owner_display_name = owner.get_display_name(); + object.meta.appendable = appendable; + + // encode suggested updates + + list_state.meta.size = object.meta.size; + list_state.meta.accounted_size = object.meta.accounted_size; + list_state.meta.mtime = object.meta.mtime; + list_state.meta.category = main_category; + list_state.meta.etag = etag; + list_state.meta.appendable = appendable; + list_state.meta.content_type = content_type; + list_state.meta.storage_class = storage_class; + + librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id + r = get_obj_head_ioctx(dpp, bucket_info, obj, &head_obj_ctx); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + " WARNING: unable to find head object data pool for \"" << + obj << "\", not updating version pool/epoch" << dendl; + } else { + list_state.ver.pool = head_obj_ctx.get_id(); + list_state.ver.epoch = astate->epoch; + } + + if (astate->obj_tag.length() > 0) { + list_state.tag = astate->obj_tag.c_str(); + } + + list_state.meta.owner = owner.get_id().to_str(); + list_state.meta.owner_display_name = owner.get_display_name(); + + list_state.exists = true; + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx; + cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates); + + ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx; + return 0; +} // RGWRados::check_disk_state + +int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector& headers, map *bucket_instance_ids) +{ + RGWSI_RADOS::Pool 
index_pool; + map oids; + map list_results; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids); + if (r < 0) { + ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned " + << r << dendl; + return r; + } + + r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) { + ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned " + << r << dendl; + return r; + } + + map::iterator iter = list_results.begin(); + for(; iter != list_results.end(); ++iter) { + headers.push_back(std::move(iter->second.dir.header)); + } + return 0; +} + +int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + map::iterator iter = bucket_objs.begin(); + for (; iter != bucket_objs.end(); ++iter) { + r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast(ctx->get())); + if (r < 0) { + ctx->put(); + break; + } else { + (*num_aio)++; + } + } + return r; +} + +int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, + const rgw_bucket& bucket, + uint64_t num_objs, + const DoutPrefixProvider *dpp) +{ + if (! 
cct->_conf.get_val("rgw_dynamic_resharding")) { + return 0; + } + + bool need_resharding = false; + uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout); + const uint32_t max_dynamic_shards = + uint32_t(cct->_conf.get_val("rgw_max_dynamic_shards")); + + if (num_source_shards >= max_dynamic_shards) { + return 0; + } + + uint32_t suggested_num_shards = 0; + const uint64_t max_objs_per_shard = + cct->_conf.get_val("rgw_max_objs_per_shard"); + + // TODO: consider per-bucket sync policy here? + const bool is_multisite = svc.zone->need_to_log_data(); + + quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards, + num_objs, is_multisite, need_resharding, + &suggested_num_shards); + if (! need_resharding) { + return 0; + } + + const uint32_t final_num_shards = + RGWBucketReshard::get_preferred_shards(suggested_num_shards, + max_dynamic_shards); + // final verification, so we don't reduce number of shards + if (final_num_shards <= num_source_shards) { + return 0; + } + + ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name << + " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards << + "; new num shards " << final_num_shards << " (suggested " << + suggested_num_shards << ")" << dendl; + + return add_bucket_to_reshard(dpp, bucket_info, final_num_shards); +} + +int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards) +{ + RGWReshard reshard(this->driver, dpp); + + uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout); + + new_num_shards = std::min(new_num_shards, get_max_bucket_shards()); + if (new_num_shards <= num_source_shards) { + ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl; + return 0; + } + + cls_rgw_reshard_entry entry; + entry.time = real_clock::now(); 
+ entry.tenant = bucket_info.owner.tenant; + entry.bucket_name = bucket_info.bucket.name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.old_num_shards = num_source_shards; + entry.new_num_shards = new_num_shards; + + return reshard.add(dpp, entry); +} + +int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuota& quota, + uint64_t obj_size, optional_yield y, + bool check_size_only) +{ + // if we only check size, then num_objs will set to 0 + if(check_size_only) + return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y); + + return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y); +} + +int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key, + int *shard_id) +{ + int r = 0; + switch (layout.hash_type) { + case rgw::BucketHashType::Mod: + if (!layout.num_shards) { + if (shard_id) { + *shard_id = -1; + } + } else { + uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards); + if (shard_id) { + *shard_id = (int)sid; + } + } + break; + default: + r = -ENOTSUP; + } + return r; +} + +uint64_t RGWRados::instance_id() +{ + return get_rados_handle()->get_instance_id(); +} + +uint64_t RGWRados::next_bucket_id() +{ + std::lock_guard l{bucket_id_lock}; + return ++max_bucket_id; +} + +librados::Rados* RGWRados::get_rados_handle() +{ + return &rados; +} + +int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list& handles) +{ + rgw_rados_ref ref; + int ret = get_raw_obj_ref(dpp, obj, &ref); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); + ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op); + if (ret < 0) { + ldpp_dout(dpp, 
-1) << "ERROR: AioOperate failed with ret=" << ret << dendl; + c->release(); + return ret; + } + + handles.push_back(c); + + return 0; +} + +int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, + RGWBucketInfo& bucket_info, RGWObjState *astate, + list& handles, bool keep_index_consistent, + optional_yield y) +{ + rgw_rados_ref ref; + int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + if (keep_index_consistent) { + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl; + return ret; + } + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); + ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl; + c->release(); + return ret; + } + + handles.push_back(c); + + if (keep_index_consistent) { + ret = delete_obj_index(obj, astate->mtime, dpp, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl; + return ret; + } + } + return ret; +} + +void objexp_hint_entry::generate_test_instances(list& o) +{ + auto it = new objexp_hint_entry; + it->tenant = "tenant1"; + it->bucket_name = "bucket1"; + it->bucket_id = "1234"; + it->obj_key = rgw_obj_key("obj"); + o.push_back(it); + o.push_back(new objexp_hint_entry); +} + +void objexp_hint_entry::dump(Formatter *f) const +{ + f->open_object_section("objexp_hint_entry"); + encode_json("tenant", tenant, f); + encode_json("bucket_name", bucket_name, f); + encode_json("bucket_id", bucket_id, f); + 
encode_json("rgw_obj_key", obj_key, f); + utime_t ut(exp_time); + encode_json("exp_time", ut, f); + f->close_section(); +} + +void RGWOLHInfo::generate_test_instances(list &o) +{ + RGWOLHInfo *olh = new RGWOLHInfo; + olh->removed = false; + o.push_back(olh); + o.push_back(new RGWOLHInfo); +} + +void RGWOLHInfo::dump(Formatter *f) const +{ + encode_json("target", target, f); +} + +void RGWOLHPendingInfo::dump(Formatter *f) const +{ + utime_t ut(time); + encode_json("time", ut, f); +} + diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h new file mode 100644 index 000000000..75a5e1b54 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rados.h @@ -0,0 +1,1661 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include "include/rados/librados.hpp" +#include "include/Context.h" +#include "include/random.h" +#include "common/RefCountedObj.h" +#include "common/ceph_time.h" +#include "common/Timer.h" +#include "rgw_common.h" +#include "cls/rgw/cls_rgw_types.h" +#include "cls/version/cls_version_types.h" +#include "cls/log/cls_log_types.h" +#include "cls/timeindex/cls_timeindex_types.h" +#include "cls/otp/cls_otp_types.h" +#include "rgw_quota.h" +#include "rgw_log.h" +#include "rgw_metadata.h" +#include "rgw_meta_sync_status.h" +#include "rgw_period_puller.h" +#include "rgw_obj_manifest.h" +#include "rgw_sync_module.h" +#include "rgw_trim_bilog.h" +#include "rgw_service.h" +#include "rgw_sal.h" +#include "rgw_aio.h" +#include "rgw_d3n_cacherequest.h" + +#include "services/svc_rados.h" +#include "services/svc_bi_rados.h" +#include "common/Throttle.h" +#include "common/ceph_mutex.h" +#include "rgw_cache.h" +#include "rgw_sal_fwd.h" + +struct D3nDataCache; + +class RGWWatcher; +class ACLOwner; +class RGWGC; +class RGWMetaNotifier; +class RGWDataNotifier; +class RGWLC; +class RGWObjectExpirer; +class RGWMetaSyncProcessorThread; +class 
RGWDataSyncProcessorThread; +class RGWSyncLogTrimThread; +class RGWSyncTraceManager; +struct RGWZoneGroup; +struct RGWZoneParams; +class RGWReshard; +class RGWReshardWait; + +struct get_obj_data; + +/* flags for put_obj_meta() */ +#define PUT_OBJ_CREATE 0x01 +#define PUT_OBJ_EXCL 0x02 +#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL) + +static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid) +{ + if (bucket.marker.empty() || orig_oid.empty()) { + oid = orig_oid; + } else { + oid = bucket.marker; + oid.append("_"); + oid.append(orig_oid); + } +} + +static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator) +{ + const rgw_bucket& bucket = obj.bucket; + prepend_bucket_marker(bucket, obj.get_oid(), oid); + const std::string& loc = obj.key.get_loc(); + if (!loc.empty()) { + prepend_bucket_marker(bucket, loc, locator); + } else { + locator.clear(); + } +} + +struct RGWOLHInfo { + rgw_obj target; + bool removed; + + RGWOLHInfo() : removed(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(target, bl); + encode(removed, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(target, bl); + decode(removed, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(std::list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOLHInfo) + +struct RGWOLHPendingInfo { + ceph::real_time time; + + RGWOLHPendingInfo() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(time, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(time, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOLHPendingInfo) + +struct RGWUsageBatch { + std::map m; + + void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) { + bool exists = 
m.find(t) != m.end(); + *account = !exists; + m[t].aggregate(entry); + } +}; + +struct RGWCloneRangeInfo { + rgw_obj src; + off_t src_ofs; + off_t dst_ofs; + uint64_t len; +}; + +class RGWFetchObjFilter { +public: + virtual ~RGWFetchObjFilter() {} + + virtual int filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const std::map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) = 0; +}; + +class RGWFetchObjFilter_Default : public RGWFetchObjFilter { +protected: + rgw_placement_rule dest_rule; +public: + RGWFetchObjFilter_Default() {} + + int filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const std::map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) override; +}; + +struct RGWObjStateManifest { + RGWObjState state; + std::optional manifest; +}; + +class RGWObjectCtx { + rgw::sal::Driver* driver; + ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx"); + + std::map objs_state; +public: + explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {} + RGWObjectCtx(RGWObjectCtx& _o) { + std::unique_lock wl{lock}; + this->driver = _o.driver; + this->objs_state = _o.objs_state; + } + + rgw::sal::Driver* get_driver() { + return driver; + } + + RGWObjStateManifest *get_state(const rgw_obj& obj); + + void set_compressed(const rgw_obj& obj); + void set_atomic(const rgw_obj& obj); + void set_prefetch_data(const rgw_obj& obj); + void invalidate(const rgw_obj& obj); +}; + + +struct RGWRawObjState { + rgw_raw_obj obj; + bool has_attrs{false}; + bool exists{false}; + uint64_t size{0}; + ceph::real_time mtime; + uint64_t epoch{0}; + bufferlist obj_tag; + bool has_data{false}; + bufferlist data; + bool prefetch_data{false}; + uint64_t pg_ver{0}; + + /* important! 
don't forget to update copy constructor */ + + RGWObjVersionTracker objv_tracker; + + std::map attrset; + RGWRawObjState() {} + RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) { + has_attrs = rhs.has_attrs; + exists = rhs.exists; + size = rhs.size; + mtime = rhs.mtime; + epoch = rhs.epoch; + if (rhs.obj_tag.length()) { + obj_tag = rhs.obj_tag; + } + has_data = rhs.has_data; + if (rhs.data.length()) { + data = rhs.data; + } + prefetch_data = rhs.prefetch_data; + pg_ver = rhs.pg_ver; + objv_tracker = rhs.objv_tracker; + } +}; + +struct RGWPoolIterCtx { + librados::IoCtx io_ctx; + librados::NObjectIterator iter; +}; + +struct RGWListRawObjsCtx { + bool initialized; + RGWPoolIterCtx iter_ctx; + + RGWListRawObjsCtx() : initialized(false) {} +}; + +struct objexp_hint_entry { + std::string tenant; + std::string bucket_name; + std::string bucket_id; + rgw_obj_key obj_key; + ceph::real_time exp_time; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(bucket_name, bl); + encode(bucket_id, bl); + encode(obj_key, bl); + encode(exp_time, bl); + encode(tenant, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ? 
+ DECODE_START(2, bl); + decode(bucket_name, bl); + decode(bucket_id, bl); + decode(obj_key, bl); + decode(exp_time, bl); + if (struct_v >= 2) { + decode(tenant, bl); + } else { + tenant.clear(); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(objexp_hint_entry) + +class RGWMetaSyncStatusManager; +class RGWDataSyncStatusManager; +class RGWCoroutinesManagerRegistry; + +class RGWGetDirHeader_CB; +class RGWGetUserHeader_CB; +namespace rgw { namespace sal { + class RadosStore; + class MPRadosSerializer; + class LCRadosSerializer; +} } + +class RGWAsyncRadosProcessor; + +template +class RGWChainedCacheImpl; + +struct bucket_info_entry { + RGWBucketInfo info; + real_time mtime; + std::map attrs; +}; + +struct tombstone_entry; + +template +class lru_map; +using tombstone_cache_t = lru_map; + +class RGWIndexCompletionManager; + +class RGWRados +{ + friend class RGWGC; + friend class RGWMetaNotifier; + friend class RGWDataNotifier; + friend class RGWObjectExpirer; + friend class RGWMetaSyncProcessorThread; + friend class RGWDataSyncProcessorThread; + friend class RGWReshard; + friend class RGWBucketReshard; + friend class RGWBucketReshardLock; + friend class BucketIndexLockGuard; + friend class rgw::sal::MPRadosSerializer; + friend class rgw::sal::LCRadosSerializer; + friend class rgw::sal::RadosStore; + + /** Open the pool used as root for this gateway */ + int open_root_pool_ctx(const DoutPrefixProvider *dpp); + int open_gc_pool_ctx(const DoutPrefixProvider *dpp); + int open_lc_pool_ctx(const DoutPrefixProvider *dpp); + int open_objexp_pool_ctx(const DoutPrefixProvider *dpp); + int open_reshard_pool_ctx(const DoutPrefixProvider *dpp); + int open_notif_pool_ctx(const DoutPrefixProvider *dpp); + + int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, + bool mostly_omap, bool bulk); + + + ceph::mutex lock = 
ceph::make_mutex("rados_timer_lock"); + SafeTimer *timer; + + rgw::sal::RadosStore* driver = nullptr; + RGWGC *gc = nullptr; + RGWLC *lc; + RGWObjectExpirer *obj_expirer; + bool use_gc_thread; + bool use_lc_thread; + bool quota_threads; + bool run_sync_thread; + bool run_reshard_thread; + + RGWMetaNotifier *meta_notifier; + RGWDataNotifier *data_notifier; + RGWMetaSyncProcessorThread *meta_sync_processor_thread; + RGWSyncTraceManager *sync_tracer = nullptr; + std::map data_sync_processor_threads; + + boost::optional bucket_trim; + RGWSyncLogTrimThread *sync_log_trimmer{nullptr}; + + ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock"); + ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock"); + + librados::IoCtx root_pool_ctx; // .rgw + + ceph::mutex bucket_id_lock{ceph::make_mutex("rados_bucket_id")}; + + // This field represents the number of bucket index object shards + uint32_t bucket_index_max_shards; + + std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y); + + int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref); + int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref); + int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref); + uint64_t max_bucket_id; + + int clear_olh(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_obj& obj, + RGWBucketInfo& bucket_info, + rgw_rados_ref& ref, + const std::string& tag, + const uint64_t ver, + optional_yield y); + + int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx, + RGWBucketInfo& bucket_info, const rgw_obj& obj, + RGWObjState *olh_state, RGWObjState **target_state, + RGWObjManifest **target_manifest, optional_yield y); + int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& 
bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent = false); + int append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, + librados::ObjectOperation& op, RGWObjState **state, + RGWObjManifest** pmanifest, optional_yield y); + + int update_placement_map(); + int store_bucket_info(RGWBucketInfo& info, std::map *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive); + + void remove_rgw_head_obj(librados::ObjectWriteOperation& op); + void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist); + void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type); +protected: + CephContext *cct; + + librados::Rados rados; + + using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl; + RGWChainedCacheImpl_bucket_info_entry *binfo_cache; + + tombstone_cache_t *obj_tombstone_cache; + + librados::IoCtx gc_pool_ctx; // .rgw.gc + librados::IoCtx lc_pool_ctx; // .rgw.lc + librados::IoCtx objexp_pool_ctx; + librados::IoCtx reshard_pool_ctx; + librados::IoCtx notif_pool_ctx; // .rgw.notif + + bool pools_initialized; + + RGWQuotaHandler *quota_handler; + + RGWCoroutinesManagerRegistry *cr_registry; + + RGWSyncModuleInstanceRef sync_module; + bool writeable_zone{false}; + + RGWIndexCompletionManager *index_completion_manager{nullptr}; + + bool use_cache{false}; + bool use_gc{true}; + bool use_datacache{false}; + + int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx); +public: + RGWRados(): timer(NULL), + gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false), + run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL), + data_notifier(NULL), 
meta_sync_processor_thread(NULL), + bucket_index_max_shards(0), + max_bucket_id(0), cct(NULL), + binfo_cache(NULL), obj_tombstone_cache(nullptr), + pools_initialized(false), + quota_handler(NULL), + cr_registry(NULL), + pctl(&ctl), + reshard(NULL) {} + + RGWRados& set_use_cache(bool status) { + use_cache = status; + return *this; + } + + RGWRados& set_use_gc(bool status) { + use_gc = status; + return *this; + } + + RGWRados& set_use_datacache(bool status) { + use_datacache = status; + return *this; + } + + bool get_use_datacache() { + return use_datacache; + } + + RGWLC *get_lc() { + return lc; + } + + RGWGC *get_gc() { + return gc; + } + + RGWRados& set_run_gc_thread(bool _use_gc_thread) { + use_gc_thread = _use_gc_thread; + return *this; + } + + RGWRados& set_run_lc_thread(bool _use_lc_thread) { + use_lc_thread = _use_lc_thread; + return *this; + } + + RGWRados& set_run_quota_threads(bool _run_quota_threads) { + quota_threads = _run_quota_threads; + return *this; + } + + RGWRados& set_run_sync_thread(bool _run_sync_thread) { + run_sync_thread = _run_sync_thread; + return *this; + } + + RGWRados& set_run_reshard_thread(bool _run_reshard_thread) { + run_reshard_thread = _run_reshard_thread; + return *this; + } + + librados::IoCtx* get_lc_pool_ctx() { + return &lc_pool_ctx; + } + + librados::IoCtx& get_notif_pool_ctx() { + return notif_pool_ctx; + } + + void set_context(CephContext *_cct) { + cct = _cct; + } + void set_store(rgw::sal::RadosStore* _driver) { + driver = _driver; + } + + RGWServices svc; + RGWCtl ctl; + + RGWCtl *pctl{nullptr}; + + /** + * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we + * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed. 
+ */ + std::string host_id; + + RGWReshard *reshard; + std::shared_ptr reshard_wait; + + virtual ~RGWRados() = default; + + tombstone_cache_t *get_tombstone_cache() { + return obj_tombstone_cache; + } + const RGWSyncModuleInstanceRef& get_sync_module() { + return sync_module; + } + RGWSyncTraceManager *get_sync_tracer() { + return sync_tracer; + } + + int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment); + void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size); + int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr); + int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr); + + uint32_t get_max_bucket_shards() { + return RGWSI_BucketIndex_RADOS::shards_max(); + } + + + int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref); + + int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx); + int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, std::list& oids, + bool *is_truncated); + int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, std::list& oids, + bool *is_truncated); + std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx); + + CephContext *ctx() { return cct; } + /** do all necessary setup of the storage device */ + int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) { + set_context(_cct); + return init_begin(dpp); + } + /** Initialize the RADOS instance and prepare to do other ops */ + int init_svc(bool raw, const DoutPrefixProvider *dpp); + int init_ctl(const DoutPrefixProvider *dpp); + virtual int 
init_rados(); + int init_begin(const DoutPrefixProvider *dpp); + int init_complete(const DoutPrefixProvider *dpp); + void finalize(); + + int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map& meta); + int update_service_map(const DoutPrefixProvider *dpp, std::map&& status); + + /// list logs + int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle); + int log_list_next(RGWAccessHandle handle, std::string *name); + + /// remove log + int log_remove(const DoutPrefixProvider *dpp, const std::string& name); + + /// show log + int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle); + int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry); + + // log bandwidth info + int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info); + int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map& usage); + int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch); + int clear_usage(const DoutPrefixProvider *dpp); + + int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool); + + void create_bucket_id(std::string *bucket_id); + + bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool); + bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj); + + int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, + const std::string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const std::string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + std::map& attrs, + RGWBucketInfo& bucket_info, + obj_version *pobjv, + obj_version *pep_objv, 
+ ceph::real_time creation_time, + rgw_bucket *master_bucket, + uint32_t *master_num_shards, + optional_yield y, + const DoutPrefixProvider *dpp, + bool exclusive = true); + + RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; } + + struct BucketShard { + RGWRados *store; + rgw_bucket bucket; + int shard_id; + RGWSI_RADOS::Obj bucket_obj; + + explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {} + int init(const rgw_bucket& _bucket, const rgw_obj& obj, + RGWBucketInfo* out, const DoutPrefixProvider *dpp); + int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj); + int init(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, int sid); + + friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) { + out << "BucketShard:{ bucket=" << bs.bucket << + ", shard_id=" << bs.shard_id << + ", bucket_ojb=" << bs.bucket_obj << "}"; + return out; + } + }; + + class Object { + RGWRados *store; + RGWBucketInfo bucket_info; + RGWObjectCtx& ctx; + rgw_obj obj; + + BucketShard bs; + + RGWObjState *state; + RGWObjManifest *manifest; + + bool versioning_disabled; + + bool bs_initialized; + + const rgw_placement_rule *pmeta_placement_rule; + + protected: + int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false); + void invalidate_state(); + + int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag, + const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y); + int complete_atomic_modification(const DoutPrefixProvider *dpp); + + public: + Object(RGWRados *_store, const RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info), + ctx(_ctx), obj(_obj), 
bs(store), + state(NULL), manifest(nullptr), versioning_disabled(false), + bs_initialized(false), + pmeta_placement_rule(nullptr) {} + + RGWRados *get_store() { return store; } + rgw_obj& get_obj() { return obj; } + RGWObjectCtx& get_ctx() { return ctx; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + //const std::string& get_instance() { return obj->get_instance(); } + //rgw::sal::Object* get_target() { return obj; } + int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y); + + int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) { + if (!bs_initialized) { + int r = + bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */, dpp); + if (r < 0) { + return r; + } + bs_initialized = true; + } + *pbs = &bs; + return 0; + } + + void set_versioning_disabled(bool status) { + versioning_disabled = status; + } + + bool versioning_enabled() { + return (!versioning_disabled && bucket_info.versioning_enabled()); + } + + void set_meta_placement_rule(const rgw_placement_rule *p) { + pmeta_placement_rule = p; + } + + const rgw_placement_rule& get_meta_placement_rule() { + return pmeta_placement_rule ? 
*pmeta_placement_rule : bucket_info.placement_rule; + } + + struct Read { + RGWRados::Object *source; + + struct GetObjState { + std::map io_ctxs; + rgw_pool cur_pool; + librados::IoCtx *cur_ioctx{nullptr}; + rgw_obj obj; + rgw_raw_obj head_obj; + } state; + + struct ConditionParams { + const ceph::real_time *mod_ptr; + const ceph::real_time *unmod_ptr; + bool high_precision_time; + uint32_t mod_zone_id; + uint64_t mod_pg_ver; + const char *if_match; + const char *if_nomatch; + + ConditionParams() : + mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0), + if_match(NULL), if_nomatch(NULL) {} + } conds; + + struct Params { + ceph::real_time *lastmod; + uint64_t *obj_size; + std::map *attrs; + rgw_obj *target_obj; + + Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr), + target_obj(nullptr) {} + } params; + + explicit Read(RGWRados::Object *_source) : source(_source) {} + + int prepare(optional_yield y, const DoutPrefixProvider *dpp); + static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); + int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp); + int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y); + int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y); + }; + + struct Write { + RGWRados::Object *target; + + struct MetaParams { + ceph::real_time *mtime; + std::map* rmattrs; + const bufferlist *data; + RGWObjManifest *manifest; + const std::string *ptag; + std::list *remove_objs; + ceph::real_time set_mtime; + rgw_user owner; + RGWObjCategory category; + int flags; + const char *if_match; + const char *if_nomatch; + std::optional olh_epoch; + ceph::real_time delete_at; + bool canceled; + const std::string *user_data; + rgw_zone_set *zones_trace; + bool modify_tail; + bool completeMultipart; + bool appendable; + + MetaParams() : mtime(NULL), rmattrs(NULL), 
data(NULL), manifest(NULL), ptag(NULL), + remove_objs(NULL), category(RGWObjCategory::Main), flags(0), + if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr), + modify_tail(false), completeMultipart(false), appendable(false) {} + } meta; + + explicit Write(RGWRados::Object *_target) : target(_target) {} + + int _do_write_meta(const DoutPrefixProvider *dpp, + uint64_t size, uint64_t accounted_size, + std::map& attrs, + bool modify_tail, bool assume_noent, + void *index_op, optional_yield y); + int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, + std::map& attrs, optional_yield y); + int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive); + const req_state* get_req_state() { + return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */ + } + }; + + struct Delete { + RGWRados::Object *target; + + struct DeleteParams { + rgw_user bucket_owner; + int versioning_status; // versioning flags defined in enum RGWBucketFlags + ACLOwner obj_owner; // needed for creation of deletion marker + uint64_t olh_epoch; + std::string marker_version_id; + uint32_t bilog_flags; + std::list *remove_objs; + ceph::real_time expiration_time; + ceph::real_time unmod_since; + ceph::real_time mtime; /* for setting delete marker mtime */ + bool high_precision_time; + rgw_zone_set *zones_trace; + bool abortmp; + uint64_t parts_accounted_size; + + DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {} + } params; + + struct DeleteResult { + bool delete_marker; + std::string version_id; + + DeleteResult() : delete_marker(false) {} + } result; + + explicit Delete(RGWRados::Object *_target) : target(_target) {} + + int delete_obj(optional_yield y, const DoutPrefixProvider *dpp); + }; + + struct Stat { + RGWRados::Object *source; + + struct Result { + rgw_obj obj; + 
std::optional manifest; + uint64_t size{0}; + struct timespec mtime {}; + std::map attrs; + } result; + + struct State { + librados::IoCtx io_ctx; + librados::AioCompletion *completion; + int ret; + + State() : completion(NULL), ret(0) {} + } state; + + + explicit Stat(RGWRados::Object *_source) : source(_source) {} + + int stat_async(const DoutPrefixProvider *dpp); + int wait(const DoutPrefixProvider *dpp); + int stat(); + private: + int finish(const DoutPrefixProvider *dpp); + }; + }; + + class Bucket { + RGWRados *store; + RGWBucketInfo bucket_info; + rgw_bucket& bucket; + int shard_id; + + public: + Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket), + shard_id(RGW_NO_SHARD) {} + RGWRados *get_store() { return store; } + rgw_bucket& get_bucket() { return bucket; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + + int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp); + + int get_shard_id() { return shard_id; } + void set_shard_id(int id) { + shard_id = id; + } + + class UpdateIndex { + RGWRados::Bucket *target; + std::string optag; + rgw_obj obj; + uint16_t bilog_flags{0}; + BucketShard bs; + bool bs_initialized{false}; + bool blind; + bool prepared{false}; + rgw_zone_set *zones_trace{nullptr}; + + int init_bs(const DoutPrefixProvider *dpp) { + int r = + bs.init(target->get_bucket(), obj, &target->bucket_info, dpp); + if (r < 0) { + return r; + } + bs_initialized = true; + return 0; + } + + void invalidate_bs() { + bs_initialized = false; + } + + int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function call); + public: + + UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj), + bs(target->get_store()) { + blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless); + } + + int get_bucket_shard(BucketShard 
**pbs, const DoutPrefixProvider *dpp) { + if (!bs_initialized) { + int r = init_bs(dpp); + if (r < 0) { + return r; + } + } + *pbs = &bs; + return 0; + } + + void set_bilog_flags(uint16_t flags) { + bilog_flags = flags; + } + + void set_zones_trace(rgw_zone_set *_zones_trace) { + zones_trace = _zones_trace; + } + + int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y); + int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size, + uint64_t accounted_size, ceph::real_time& ut, + const std::string& etag, const std::string& content_type, + const std::string& storage_class, + bufferlist *acl_bl, RGWObjCategory category, + std::list *remove_objs, + optional_yield y, + const std::string *user_data = nullptr, + bool appendable = false); + int complete_del(const DoutPrefixProvider *dpp, + int64_t poolid, uint64_t epoch, + ceph::real_time& removed_mtime, /* mtime of removed object */ + std::list *remove_objs, + optional_yield y); + int cancel(const DoutPrefixProvider *dpp, + std::list *remove_objs, + optional_yield y); + + const std::string *get_optag() { return &optag; } + + bool is_prepared() { return prepared; } + }; // class UpdateIndex + + class List { + protected: + // absolute maximum number of objects that + // list_objects_(un)ordered can return + static constexpr int64_t bucket_list_objects_absolute_max = 25000; + + RGWRados::Bucket *target; + rgw_obj_key next_marker; + + int list_objects_ordered(const DoutPrefixProvider *dpp, + int64_t max, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y); + int list_objects_unordered(const DoutPrefixProvider *dpp, + int64_t max, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y); + + public: + + struct Params { + std::string prefix; + std::string delim; + rgw_obj_key marker; + rgw_obj_key end_marker; + std::string ns; + bool enforce_ns; + RGWAccessListFilter* 
access_list_filter; + RGWBucketListNameFilter force_check_filter; + bool list_versions; + bool allow_unordered; + + Params() : + enforce_ns(true), + access_list_filter(nullptr), + list_versions(false), + allow_unordered(false) + {} + } params; + + explicit List(RGWRados::Bucket *_target) : target(_target) {} + + int list_objects(const DoutPrefixProvider *dpp, int64_t max, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y) { + if (params.allow_unordered) { + return list_objects_unordered(dpp, max, result, common_prefixes, + is_truncated, y); + } else { + return list_objects_ordered(dpp, max, result, common_prefixes, + is_truncated, y); + } + } + rgw_obj_key& get_next_marker() { + return next_marker; + } + }; // class List + }; // class Bucket + + int on_last_entry_in_listing(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::string& obj_prefix, + const std::string& obj_delim, + std::function handler); + + bool swift_versioning_enabled(const RGWBucketInfo& bucket_info) const; + + int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */ + const rgw_user& user, /* in */ + RGWBucketInfo& bucket_info, /* in */ + const rgw_obj& obj, /* in */ + const DoutPrefixProvider *dpp, /* in */ + optional_yield y); /* in */ + int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */ + const rgw_user& user, /* in */ + RGWBucketInfo& bucket_info, /* in */ + rgw_obj& obj, /* in/out */ + bool& restored, /* out */ + const DoutPrefixProvider *dpp); /* in */ + int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp, + RGWObjState *astate, + std::map& src_attrs, + RGWRados::Object::Read& read_op, + const rgw_user& user_id, + const rgw_obj& dest_obj, + ceph::real_time *mtime); + + enum AttrsMod { + ATTRSMOD_NONE = 0, + ATTRSMOD_REPLACE = 1, + ATTRSMOD_MERGE = 2 + }; + + D3nDataCache* d3n_data_cache{nullptr}; + + int rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, 
optional_yield y); + int reindex_obj(const RGWBucketInfo& dest_bucket_info, + const rgw_obj& obj, + const DoutPrefixProvider* dpp, + optional_yield y); + + int stat_remote_obj(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + const rgw_obj& src_obj, + const RGWBucketInfo *src_bucket_info, + real_time *src_mtime, + uint64_t *psize, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + std::map *pattrs, + std::map *pheaders, + std::string *version_id, + std::string *ptag, + std::string *petag); + + int fetch_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + const rgw_obj& dest_obj, + const rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo *src_bucket_info, + std::optional dest_placement, + ceph::real_time *src_mtime, + ceph::real_time *mtime, + const ceph::real_time *mod_ptr, + const ceph::real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + rgw::sal::Attrs& attrs, + RGWObjCategory category, + std::optional olh_epoch, + ceph::real_time delete_at, + std::string *ptag, + std::string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + RGWFetchObjFilter *filter, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace = nullptr, + std::optional* bytes_transferred = 0); + /** + * Copy an object. 
+ * dest_obj: the object to copy into + * src_obj: the object to copy from + * attrs: usage depends on attrs_mod parameter + * attrs_mod: the modification mode of the attrs, may have the following values: + * ATTRSMOD_NONE - the attributes of the source object will be + * copied without modifications, attrs parameter is ignored; + * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs + * parameter, source object attributes are not copied; + * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes + * are overwritten by values contained in attrs parameter. + * Returns: 0 on success, -ERR# otherwise. + */ + int copy_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + const rgw_obj& dest_obj, + const rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo& src_bucket_info, + const rgw_placement_rule& dest_placement, + ceph::real_time *src_mtime, + ceph::real_time *mtime, + const ceph::real_time *mod_ptr, + const ceph::real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + std::map& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + ceph::real_time delete_at, + std::string *version_id, + std::string *ptag, + std::string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + optional_yield y); + + int copy_obj_data(RGWObjectCtx& obj_ctx, + RGWBucketInfo& dest_bucket_info, + const rgw_placement_rule& dest_placement, + RGWRados::Object::Read& read_op, off_t end, + const rgw_obj& dest_obj, + ceph::real_time *mtime, + ceph::real_time set_mtime, + std::map& attrs, + uint64_t olh_epoch, + ceph::real_time delete_at, + std::string *petag, + const DoutPrefixProvider *dpp, + optional_yield y); + + int transition_obj(RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + const rgw_obj& obj, + const rgw_placement_rule& 
placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider *dpp, + optional_yield y); + + int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y); + + /** + * Delete a bucket. + * bucket: the name of the bucket to delete + * Returns 0 on success, -ERR# otherwise. + */ + int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true); + + void wakeup_meta_sync_shards(std::set& shard_ids); + + void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map >& entries); + + RGWMetaSyncStatusManager* get_meta_sync_manager(); + RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone); + + int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp); + int set_buckets_enabled(std::vector& buckets, bool enabled, const DoutPrefixProvider *dpp); + int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended); + + /** Delete an object.*/ + int delete_obj(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + int versioning_status, // versioning flags defined in enum RGWBucketFlags + uint16_t bilog_flags = 0, + const ceph::real_time& expiration_time = ceph::real_time(), + rgw_zone_set *zones_trace = nullptr); + + int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj); + + /** Remove an object from the bucket index */ + int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, + const DoutPrefixProvider *dpp, optional_yield y); + + /** + * Set an attr on an object. + * bucket: name of the bucket holding the object + * obj: name of the object to set the attr on + * name: the attr to set + * bl: the contents of the attr + * Returns: 0 on success, -ERR# otherwise. 
+ */ + int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl); + + int set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, + std::map& attrs, + std::map* rmattrs, + optional_yield y, + ceph::real_time set_mtime = ceph::real_clock::zero()); + + int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent = false); + int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) { + return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y); + } + + using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t, + off_t, bool, RGWObjState*, void*); + + int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info, + const rgw_obj& obj, off_t ofs, off_t end, + uint64_t max_chunk_size, iterate_obj_cb cb, void *arg, + optional_yield y); + + int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op); + + virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg); + + /** + * a simple object read without keeping state + */ + + int raw_obj_stat(const DoutPrefixProvider *dpp, + rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch, + std::map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker, optional_yield y); + + int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op); + int 
obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op); + + int guard_reshard(const DoutPrefixProvider *dpp, + BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + std::function call); + int block_while_resharding(RGWRados::BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + optional_yield y, + const DoutPrefixProvider *dpp); + + void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op); + void olh_cancel_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, const std::string& op_tag, optional_yield y); + int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag); + int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag); + int bucket_index_link_olh(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, RGWObjState& olh_state, + const rgw_obj& obj_instance, bool delete_marker, + const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, + ceph::real_time unmod_since, bool high_precision_time, + optional_yield y, + rgw_zone_set *zones_trace = nullptr, + bool log_data_change = false); + int bucket_index_unlink_instance(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw_obj& obj_instance, + const std::string& op_tag, const std::string& olh_tag, + uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr); + int bucket_index_read_olh_log(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver_marker, + std::map > *log, bool *is_truncated); + int bucket_index_trim_olh_log(const 
DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver); + int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const std::string& olh_tag, const rgw_obj& obj_instance); + int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw_obj& obj, + bufferlist& obj_tag, std::map >& log, + uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr); + int update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace = nullptr); + int clear_olh(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_obj& obj, + RGWBucketInfo& bucket_info, + const std::string& tag, + const uint64_t ver, + optional_yield y); + int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time, + optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false); + int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj); + int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, + uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr); + + void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map& pending_entries, std::map *rm_pending_entries); + int remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map& pending_attrs); + int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, const rgw_obj& olh_obj, 
rgw_obj *target); + int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh); + + void gen_rand_obj_instance_name(rgw_obj_key *target_key); + void gen_rand_obj_instance_name(rgw_obj *target); + + int update_containers_stats(std::map& m, const DoutPrefixProvider *dpp); + int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl); + +public: + void set_atomic(void *ctx, const rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_atomic(obj); + } + void set_prefetch_data(void *ctx, const rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_prefetch_data(obj); + } + void set_compressed(void *ctx, const rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_compressed(obj); + } + int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner); + int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver, + std::map& stats, std::string *max_marker, bool* syncstopped = NULL); + int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb); + + int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map *pattrs, const DoutPrefixProvider *dpp, optional_yield y); + /* xxx dang obj_ctx -> svc */ + int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map *pattrs, optional_yield y, const DoutPrefixProvider *dpp); + int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map *pattrs, optional_yield y, const DoutPrefixProvider *dpp); + + static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, 
std::string& bucket_entry); + + int get_bucket_info(RGWServices *svc, + const std::string& tenant_name, const std::string& bucket_name, + RGWBucketInfo& info, + ceph::real_time *pmtime, optional_yield y, + const DoutPrefixProvider *dpp, std::map *pattrs = NULL); + + // Returns 0 on successful refresh. Returns error code if there was + // an error or the version stored on the OSD is the same as that + // presented in the BucketInfo structure. + // + int try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + const DoutPrefixProvider *dpp, + std::map *pattrs = nullptr); + + int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv, + std::map *pattrs, bool create_entry_point, + const DoutPrefixProvider *dpp, optional_yield y); + + int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent, + RGWObjCategory category, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj, + ceph::real_time& removed_mtime, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj, + std::list *remove_objs, + uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& 
bucket_info, uint64_t timeout); + + using ent_map_t = + boost::container::flat_map; + + int cls_bucket_list_ordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + const int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + const std::string& delimiter, + const uint32_t num_entries, + const bool list_versions, + const uint16_t exp_factor, // 0 means ignore + ent_map_t& m, + bool* is_truncated, + bool* cls_filtered, + rgw_obj_index_key *last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter = {}); + int cls_bucket_list_unordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + uint32_t num_entries, + bool list_versions, + std::vector& ent_list, + bool *is_truncated, + rgw_obj_index_key *last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter = {}); + int cls_bucket_head(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, std::vector& headers, + std::map *bucket_instance_ids = NULL); + int cls_bucket_head_async(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio); + int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent); + int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh); + int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry); + void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& 
entry); + int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry); + int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry); + int bi_list(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + int shard_id, + const std::string& filter_obj, + const std::string& marker, + uint32_t max, + std::list *entries, + bool *is_truncated); + int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list *entries, bool *is_truncated); + int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max, + std::list *entries, bool *is_truncated); + int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs); + + int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info); + int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, std::string& read_iter, + std::map& usage, bool *is_truncated); + int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch, + uint64_t end_epoch); + int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid); + + int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id); + + int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id); + int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id); + + void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain); + std::tuple> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag); + void 
delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag); + int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op); + int gc_aio_operate(const std::string& oid, librados::AioCompletion *c, + librados::ObjectWriteOperation *op); + int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl); + + int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated, bool& processing_queue); + int process_gc(bool expired_only); + bool process_expire_objects(const DoutPrefixProvider *dpp); + int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y); + + int process_lc(const std::unique_ptr& optional_bucket); + int list_lc_progress(std::string& marker, uint32_t max_entries, + std::vector>& progress_map, + int& index); + + int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + std::map *existing_stats, + std::map *calculated_stats); + int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info); + + // Search the bucket for encrypted multipart uploads, and increase their mtime + // slightly to generate a bilog entry to trigger a resync to repair any + // corrupted replicas. 
See https://tracker.ceph.com/issues/46062 + int bucket_resync_encrypted_multipart(const DoutPrefixProvider* dpp, + optional_yield y, + rgw::sal::RadosStore* driver, + RGWBucketInfo& bucket_info, + const std::string& marker, + RGWFormatterFlusher& flusher); + + int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry); + int remove_objs_from_index(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::list& oid_list); + int move_rados_obj(const DoutPrefixProvider *dpp, + librados::IoCtx& src_ioctx, + const std::string& src_oid, const std::string& src_locator, + librados::IoCtx& dst_ioctx, + const std::string& dst_oid, const std::string& dst_locator); + int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key); + int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y); + + int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuota& quota, uint64_t obj_size, + optional_yield y, bool check_size_only = false); + + int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket, + uint64_t num_objs, const DoutPrefixProvider *dpp); + + int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards); + + uint64_t instance_id(); + + librados::Rados* get_rados_handle(); + + int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list& handles); + int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate, + std::list& handles, bool keep_index_consistent, + optional_yield y); + + private: + /** + * Check the actual on-disk state of the object specified + * by list_state, and fill in the time and size of object. 
+ * Then append any changes to suggested_updates for + * the rgw class' dir_suggest_changes function. + * + * Note that this can maul list_state; don't use it afterwards. Also + * it expects object to already be filled in from list_state; it only + * sets the size and mtime. + * + * Returns 0 on success, -ENOENT if the object doesn't exist on disk, + * and -errno on other failures. (-ENOENT is not a failure, and it + * will encode that info as a suggested update.) + */ + int check_disk_state(const DoutPrefixProvider *dpp, + librados::IoCtx io_ctx, + RGWBucketInfo& bucket_info, + rgw_bucket_dir_entry& list_state, + rgw_bucket_dir_entry& object, + bufferlist& suggested_updates, + optional_yield y); + + /** + * Init pool iteration + * pool: pool to use for the ctx initialization + * ctx: context object to use for the iteration + * Returns: 0 on success, -ERR# otherwise. + */ + int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx); + + /** + * Init pool iteration + * pool: pool to use + * cursor: position to start iteration + * ctx: context object to use for the iteration + * Returns: 0 on success, -ERR# otherwise. + */ + int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx); + + /** + * Get pool iteration position + * ctx: context object to use for the iteration + * Returns: std::string representation of position + */ + std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx); + + /** + * Iterate over the pool, returning object names; an optional filter may be applied + * ctx: iteration context, initialized with pool_iterate_begin() + * num: max number of objects to return + * objs: a vector that the results will be appended to + * is_truncated: if not NULL, will hold true iff the listing was truncated (more objects remain to be iterated) + * filter: if not NULL, will be used to filter returned objects + * Returns: 0 on success, -ERR# otherwise.
+ */ + int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, + std::vector& objs, + bool *is_truncated, RGWAccessListFilter *filter); + + uint64_t next_bucket_id(); + + /** + * This is broken out to facilitate unit testing. + */ + static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries, + uint32_t num_shards); +}; + + +struct get_obj_data { + RGWRados* rgwrados; + RGWGetDataCB* client_cb = nullptr; + rgw::Aio* aio; + uint64_t offset; // next offset to write to client + rgw::AioResultList completed; // completed read results, sorted by offset + optional_yield yield; + + get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio, + uint64_t offset, optional_yield yield) + : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {} + ~get_obj_data() { + if (rgwrados->get_use_datacache()) { + const std::lock_guard l(d3n_get_data.d3n_lock); + } + } + + D3nGetObjData d3n_get_data; + std::atomic_bool d3n_bypass_cache_write{false}; + + int flush(rgw::AioResultList&& results); + + void cancel() { + // wait for all completions to drain and ignore the results + aio->drain(); + } + + int drain() { + auto c = aio->wait(); + while (!c.empty()) { + int r = flush(std::move(c)); + if (r < 0) { + cancel(); + return r; + } + c = aio->wait(); + } + return flush(std::move(c)); + } +}; diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc new file mode 100644 index 000000000..2abf02908 --- /dev/null +++ b/src/rgw/driver/rados/rgw_reshard.cc @@ -0,0 +1,1419 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "rgw_zone.h" +#include "driver/rados/rgw_bucket.h" +#include "rgw_reshard.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/lock/cls_lock_client.h" +#include "common/errno.h" +#include "common/ceph_json.h" + +#include "common/dout.h" + 
+#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" +#include "services/svc_tier_rados.h" +#include "services/svc_bilog_rados.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +const string reshard_oid_prefix = "reshard."; +const string reshard_lock_name = "reshard_process"; +const string bucket_instance_lock_name = "bucket_instance_lock"; + +/* All primes up to 2000 used to attempt to make dynamic sharding use + * a prime numbers of shards. Note: this list also includes 1 for when + * 1 shard is the most appropriate, even though 1 is not prime. + */ +const std::initializer_list RGWBucketReshard::reshard_primes = { + 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, + 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, + 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, + 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, + 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, + 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, + 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, + 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, + 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, + 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, + 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, + 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021, + 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, + 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, + 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259, + 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, + 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, + 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, + 1499, 1511, 1523, 1531, 1543, 
1549, 1553, 1559, 1567, 1571, 1579, + 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, + 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741, + 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831, + 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913, + 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999 +}; + +class BucketReshardShard { + rgw::sal::RadosStore* store; + const RGWBucketInfo& bucket_info; + int shard_id; + RGWRados::BucketShard bs; + vector entries; + map stats; + deque& aio_completions; + uint64_t max_aio_completions; + uint64_t reshard_shard_batch_size; + + int wait_next_completion() { + librados::AioCompletion *c = aio_completions.front(); + aio_completions.pop_front(); + + c->wait_for_complete(); + + int ret = c->get_return_value(); + c->release(); + + if (ret < 0) { + derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; + } + + int get_completion(librados::AioCompletion **c) { + if (aio_completions.size() >= max_aio_completions) { + int ret = wait_next_completion(); + if (ret < 0) { + return ret; + } + } + + *c = librados::Rados::aio_create_completion(nullptr, nullptr); + aio_completions.push_back(*c); + + return 0; + } + +public: + BucketReshardShard(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info, + const rgw::bucket_index_layout_generation& index, + int shard_id, deque& _completions) : + store(_store), bucket_info(_bucket_info), shard_id(shard_id), + bs(store->getRados()), aio_completions(_completions) + { + bs.init(dpp, bucket_info, index, shard_id); + + max_aio_completions = + store->ctx()->_conf.get_val("rgw_reshard_max_aio"); + reshard_shard_batch_size = + store->ctx()->_conf.get_val("rgw_reshard_batch_size"); + } + + int get_shard_id() const { + return shard_id; + } + + int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, + const 
rgw_bucket_category_stats& entry_stats) { + entries.push_back(entry); + if (account) { + rgw_bucket_category_stats& target = stats[category]; + target.num_entries += entry_stats.num_entries; + target.total_size += entry_stats.total_size; + target.total_size_rounded += entry_stats.total_size_rounded; + target.actual_size += entry_stats.actual_size; + } + if (entries.size() >= reshard_shard_batch_size) { + int ret = flush(); + if (ret < 0) { + return ret; + } + } + + return 0; + } + + int flush() { + if (entries.size() == 0) { + return 0; + } + + librados::ObjectWriteOperation op; + for (auto& entry : entries) { + store->getRados()->bi_put(op, bs, entry); + } + cls_rgw_bucket_update_stats(op, false, stats); + + librados::AioCompletion *c; + int ret = get_completion(&c); + if (ret < 0) { + return ret; + } + ret = bs.bucket_obj.aio_operate(c, &op); + if (ret < 0) { + derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl; + return ret; + } + entries.clear(); + stats.clear(); + return 0; + } + + int wait_all_aio() { + int ret = 0; + while (!aio_completions.empty()) { + int r = wait_next_completion(); + if (r < 0) { + ret = r; + } + } + return ret; + } +}; // class BucketReshardShard + + +class BucketReshardManager { + rgw::sal::RadosStore *store; + deque completions; + vector target_shards; + +public: + BucketReshardManager(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *_store, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& target) + : store(_store) + { + const uint32_t num_shards = rgw::num_shards(target.layout.normal); + target_shards.reserve(num_shards); + for (uint32_t i = 0; i < num_shards; ++i) { + target_shards.emplace_back(dpp, store, bucket_info, target, i, completions); + } + } + + ~BucketReshardManager() { + for (auto& shard : target_shards) { + int ret = shard.wait_all_aio(); + if (ret < 0) { + ldout(store->ctx(), 20) << 
__func__ << + ": shard->wait_all_aio() returned ret=" << ret << dendl; + } + } + } + + int add_entry(int shard_index, + rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, + const rgw_bucket_category_stats& entry_stats) { + int ret = target_shards[shard_index].add_entry(entry, account, category, + entry_stats); + if (ret < 0) { + derr << "ERROR: target_shards.add_entry(" << entry.idx << + ") returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; + } + + int finish() { + int ret = 0; + for (auto& shard : target_shards) { + int r = shard.flush(); + if (r < 0) { + derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl; + ret = r; + } + } + for (auto& shard : target_shards) { + int r = shard.wait_all_aio(); + if (r < 0) { + derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl; + ret = r; + } + } + target_shards.clear(); + return ret; + } +}; // class BucketReshardManager + +RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store, + const RGWBucketInfo& _bucket_info, + const std::map& _bucket_attrs, + RGWBucketReshardLock* _outer_reshard_lock) : + store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs), + reshard_lock(store, bucket_info, true), + outer_reshard_lock(_outer_reshard_lock) +{ } + +// sets reshard status of bucket index shards for the current index layout +static int set_resharding_status(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + const RGWBucketInfo& bucket_info, + cls_rgw_reshard_status status) +{ + cls_rgw_bucket_instance_entry instance_entry; + instance_entry.set_status(status); + + int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: " + << cpp_strerror(-ret) << dendl; + return ret; 
+ } + return 0; +} + +static int remove_old_reshard_instance(rgw::sal::RadosStore* store, + const rgw_bucket& bucket, + const DoutPrefixProvider* dpp) +{ + RGWBucketInfo info; + int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr, + nullptr, null_yield, dpp); + if (r < 0) { + return r; + } + + // delete its shard objects (ignore errors) + store->svc()->bi->clean_index(dpp, info, info.layout.current_index); + // delete the bucket instance metadata + return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp); +} + +// initialize the new bucket index shard objects +static int init_target_index(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, + const DoutPrefixProvider* dpp) +{ + int ret = store->svc()->bi->init_index(dpp, bucket_info, index); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize " + "target index shard objects: " << cpp_strerror(ret) << dendl; + return ret; + } + + if (!bucket_info.datasync_flag_enabled()) { + // if bucket sync is disabled, disable it on each of the new shards too + auto log = rgw::log_layout_from_index(0, index); + ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable " + "bucket sync on the target index shard objects: " + << cpp_strerror(ret) << dendl; + store->svc()->bi->clean_index(dpp, bucket_info, index); + return ret; + } + } + + return ret; +} + +// initialize a target index layout, create its bucket index shard objects, and +// write the target layout to the bucket instance metadata +static int init_target_layout(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + ReshardFaultInjector& fault, + uint32_t new_num_shards, + const DoutPrefixProvider* dpp) +{ + auto prev = bucket_info.layout; // make a copy for cleanup + const auto current = 
prev.current_index; + + // initialize a new normal target index layout generation + rgw::bucket_index_layout_generation target; + target.layout.type = rgw::BucketIndexType::Normal; + target.layout.normal.num_shards = new_num_shards; + target.gen = current.gen + 1; + + if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) { + // backward-compatible cleanup of old reshards, where the target was in a + // different bucket instance + if (!bucket_info.new_bucket_instance_id.empty()) { + rgw_bucket new_bucket = bucket_info.bucket; + new_bucket.bucket_id = bucket_info.new_bucket_instance_id; + ldout(store->ctx(), 10) << __func__ << " removing target bucket instance " + "from a previous reshard attempt" << dendl; + // ignore errors + remove_old_reshard_instance(store, new_bucket, dpp); + } + bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING; + } + + if (bucket_info.layout.target_index) { + // a previous reshard failed or stalled, and its reshard lock dropped + ldpp_dout(dpp, 10) << __func__ << " removing existing target index " + "objects from a previous reshard attempt" << dendl; + // delete its existing shard objects (ignore errors) + store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index); + // don't reuse this same generation in the new target layout, in case + // something is still trying to operate on its shard objects + target.gen = bucket_info.layout.target_index->gen + 1; + } + + // create the index shard objects + int ret = init_target_index(store, bucket_info, target, dpp); + if (ret < 0) { + return ret; + } + + // retry in case of racing writes to the bucket instance metadata + static constexpr auto max_retries = 10; + int tries = 0; + do { + // update resharding state + bucket_info.layout.target_index = target; + bucket_info.layout.resharding = rgw::BucketReshardState::InProgress; + + if (ret = fault.check("set_target_layout"); + ret == 0) { // no fault injected, write the bucket instance metadata + 
// delete the bucket index shards associated with the target layout and remove
// it from the bucket instance metadata
//
// The target index is dereferenced unconditionally, so callers must only call
// this while bucket_info.layout.target_index is set.
//
// Returns 0 once the metadata has been written without a target index,
// -ECANCELED if a re-read after a racing write shows that the reshard was
// already canceled or replaced by another one, or another negative error code
// if the metadata could not be read or written. When the final write fails,
// the in-memory layout is restored from the most recent copy.
static int revert_target_layout(rgw::sal::RadosStore* store,
                                RGWBucketInfo& bucket_info,
                                std::map<std::string, bufferlist>& bucket_attrs,
                                ReshardFaultInjector& fault,
                                const DoutPrefixProvider* dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup

  // remove target index shard objects
  int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
  if (ret < 0) {
    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
        "target index with: " << cpp_strerror(ret) << dendl;
    ret = 0; // non-fatal error
  }

  // retry in case of racing writes to the bucket instance metadata
  static constexpr auto max_retries = 10;
  int tries = 0;
  do {
    // clear target_index and resharding state
    bucket_info.layout.target_index = std::nullopt;
    bucket_info.layout.resharding = rgw::BucketReshardState::None;

    // an injected fault takes the place of the metadata write's result
    if (ret = fault.check("revert_target_layout");
        ret == 0) { // no fault injected, revert the bucket instance metadata
      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
                                                        real_time(),
                                                        &bucket_attrs, dpp, null_yield);
    } else if (ret == -ECANCELED) {
      fault.clear(); // clear the fault so a retry can succeed
    }

    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      // (this overwrites bucket_info and bucket_attrs with the stored state)
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in
      if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "reshard cancel" << dendl;
        return -ECANCELED;
      }
      if (bucket_info.layout.current_index != prev.current_index ||
          bucket_info.layout.target_index != prev.target_index) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        return -ECANCELED;
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
        "target index layout in bucket info: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev); // restore in-memory layout
    return ret;
  }
  return 0;
} // revert_target_layout
const auto next_log_gen = layout.logs.empty() ? 1 : + layout.logs.back().gen + 1; + + if (!store->svc()->zone->need_to_log_data()) { + // if we're not syncing data, we can drop any existing logs + layout.logs.clear(); + } + + // use the new index layout as current + ceph_assert(layout.target_index); + layout.current_index = std::move(*layout.target_index); + layout.target_index = std::nullopt; + layout.resharding = rgw::BucketReshardState::None; + // add the in-index log layout + layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index)); + + int ret = fault.check("commit_target_layout"); + if (ret == 0) { // no fault injected, write the bucket instance metadata + ret = store->getRados()->put_bucket_instance_info( + bucket_info, false, real_time(), &bucket_attrs, dpp, null_yield); + } else if (ret == -ECANCELED) { + fault.clear(); // clear the fault so a retry can succeed + } + return ret; +} // commit_target_layout + +static int commit_reshard(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + ReshardFaultInjector& fault, + const DoutPrefixProvider *dpp) +{ + auto prev = bucket_info.layout; // make a copy for cleanup + + // retry in case of racing writes to the bucket instance metadata + static constexpr auto max_retries = 10; + int tries = 0; + int ret = 0; + do { + ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp); + if (ret == -ECANCELED) { + // racing write detected, read the latest bucket info and try again + int ret2 = store->getRados()->get_bucket_instance_info( + bucket_info.bucket, bucket_info, + nullptr, &bucket_attrs, null_yield, dpp); + if (ret2 < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read " + "bucket info: " << cpp_strerror(ret2) << dendl; + ret = ret2; + break; + } + + // check that we're still in the reshard state we started in + if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) { + ldpp_dout(dpp, 1) << "WARNING: " << 
__func__ << " raced with " + "reshard cancel" << dendl; + return -ECANCELED; // whatever canceled us already did the cleanup + } + if (bucket_info.layout.current_index != prev.current_index || + bucket_info.layout.target_index != prev.target_index) { + ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " + "another reshard" << dendl; + return -ECANCELED; // whatever canceled us already did the cleanup + } + + prev = bucket_info.layout; // update the copy + } + ++tries; + } while (ret == -ECANCELED && tries < max_retries); + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit " + "target index layout: " << cpp_strerror(ret) << dendl; + + bucket_info.layout = std::move(prev); // restore in-memory layout + + // unblock writes to the current index shard objects + int ret2 = set_resharding_status(dpp, store, bucket_info, + cls_rgw_reshard_status::NOT_RESHARDING); + if (ret2 < 0) { + ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock " + "writes to current index objects: " << cpp_strerror(ret2) << dendl; + // non-fatal error + } + return ret; + } + + if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() && + prev.current_index.layout.type == rgw::BucketIndexType::Normal) { + // write a datalog entry for each shard of the previous index. 
triggering + // sync on the old shards will force them to detect the end-of-log for that + // generation, and eventually transition to the next + // TODO: use a log layout to support types other than BucketLogType::InIndex + for (uint32_t shard_id = 0; shard_id < rgw::num_shards(prev.current_index.layout.normal); ++shard_id) { + // This null_yield can stay, for now, since we're in our own thread + ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id, + null_yield); + if (ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket=" + << bucket_info.bucket << ", shard_id=" << shard_id << "of generation=" + << prev.logs.back().gen << ")" << dendl; + } // datalog error is not fatal + } + } + + // check whether the old index objects are still needed for bilogs + const auto& logs = bucket_info.layout.logs; + auto log = std::find_if(logs.begin(), logs.end(), + [&prev] (const rgw::bucket_log_layout_generation& log) { + return log.layout.type == rgw::BucketLogType::InIndex + && log.layout.in_index.gen == prev.current_index.gen; + }); + if (log == logs.end()) { + // delete the index objects (ignore errors) + store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index); + } + return 0; +} // commit_reshard + +int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + const DoutPrefixProvider* dpp) +{ + ReshardFaultInjector no_fault; + return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp); +} + +int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp) +{ + int ret = reshard_lock.lock(dpp); + if (ret < 0) { + return ret; + } + + if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) { + ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl; + ret = -EINVAL; + } else { + ret = clear_resharding(store, bucket_info, bucket_attrs, dpp); + } + + reshard_lock.unlock(); + return ret; +} + 
+RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store, + const std::string& reshard_lock_oid, + bool _ephemeral) : + store(_store), + lock_oid(reshard_lock_oid), + ephemeral(_ephemeral), + internal_lock(reshard_lock_name) +{ + const int lock_dur_secs = store->ctx()->_conf.get_val( + "rgw_reshard_bucket_lock_duration"); + duration = std::chrono::seconds(lock_dur_secs); + +#define COOKIE_LEN 16 + char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1); + cookie_buf[COOKIE_LEN] = '\0'; + + internal_lock.set_cookie(cookie_buf); + internal_lock.set_duration(duration); +} + +int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) { + internal_lock.set_must_renew(false); + + int ret; + if (ephemeral) { + ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx, + lock_oid); + } else { + ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid); + } + + if (ret == -EBUSY) { + ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ << + " found lock on " << lock_oid << + " to be held by another RGW process; skipping for now" << dendl; + return ret; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ << + " failed to acquire lock on " << lock_oid << ": " << + cpp_strerror(-ret) << dendl; + return ret; + } + + reset_time(Clock::now()); + + return 0; +} + +void RGWBucketReshardLock::unlock() { + int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid); + if (ret < 0) { + ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ << + " failed to drop lock on " << lock_oid << " ret=" << ret << dendl; + } +} + +int RGWBucketReshardLock::renew(const Clock::time_point& now) { + internal_lock.set_must_renew(true); + int ret; + if (ephemeral) { + ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx, + lock_oid); + } else { + ret = 
internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid); + } + if (ret < 0) { /* expired or already locked by another processor */ + std::stringstream error_s; + if (-ENOENT == ret) { + error_s << "ENOENT (lock expired or never initially locked)"; + } else { + error_s << ret << " (" << cpp_strerror(-ret) << ")"; + } + ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " << + lock_oid << " with error " << error_s.str() << dendl; + return ret; + } + internal_lock.set_must_renew(false); + + reset_time(now); + ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " << + lock_oid << dendl; + + return 0; +} + + +int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current, + const rgw::bucket_index_layout_generation& target, + int max_entries, + bool verbose, + ostream *out, + Formatter *formatter, + const DoutPrefixProvider *dpp) +{ + if (out) { + (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl; + (*out) << "bucket name: " << bucket_info.bucket.name << std::endl; + } + + /* update bucket info -- in progress*/ + list entries; + + if (max_entries < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": can't reshard, negative max_entries" << dendl; + return -EINVAL; + } + + BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target); + + bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr); + + if (verbose_json_out) { + formatter->open_array_section("entries"); + } + + uint64_t total_entries = 0; + + if (!verbose_json_out && out) { + (*out) << "total entries:"; + } + + const uint32_t num_source_shards = rgw::num_shards(current.layout.normal); + string marker; + for (uint32_t i = 0; i < num_source_shards; ++i) { + bool is_truncated = true; + marker.clear(); + const std::string null_object_filter; // empty string since we're not filtering by object + while (is_truncated) { + entries.clear(); + int ret = store->getRados()->bi_list(dpp, bucket_info, i, 
null_object_filter, marker, max_entries, &entries, &is_truncated); + if (ret == -ENOENT) { + ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to find shard " + << i << ", skipping" << dendl; + // break out of the is_truncated loop and move on to the next shard + break; + } else if (ret < 0) { + derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + for (auto iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_cls_bi_entry& entry = *iter; + if (verbose_json_out) { + formatter->open_object_section("entry"); + + encode_json("shard_id", i, formatter); + encode_json("num_entry", total_entries, formatter); + encode_json("entry", entry, formatter); + } + total_entries++; + + marker = entry.idx; + + int target_shard_id; + cls_rgw_obj_key cls_key; + RGWObjCategory category; + rgw_bucket_category_stats stats; + bool account = entry.get_info(&cls_key, &category, &stats); + rgw_obj_key key(cls_key); + if (entry.type == BIIndexType::OLH && key.empty()) { + // bogus entry created by https://tracker.ceph.com/issues/46456 + // to fix, skip so it doesn't get include in the new bucket instance + total_entries--; + ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl; + continue; + } + rgw_obj obj(bucket_info.bucket, key); + RGWMPObj mp; + if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) { + // place the multipart .meta object on the same shard as its head object + obj.index_hash_source = mp.get_key(); + } + ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal, + obj.get_hash_object(), &target_shard_id); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl; + return ret; + } + + int shard_index = (target_shard_id > 0 ? 
target_shard_id : 0); + + ret = target_shards_mgr.add_entry(shard_index, entry, account, + category, stats); + if (ret < 0) { + return ret; + } + + Clock::time_point now = Clock::now(); + if (reshard_lock.should_renew(now)) { + // assume outer locks have timespans at least the size of ours, so + // can call inside conditional + if (outer_reshard_lock) { + ret = outer_reshard_lock->renew(now); + if (ret < 0) { + return ret; + } + } + ret = reshard_lock.renew(now); + if (ret < 0) { + ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl; + return ret; + } + } + if (verbose_json_out) { + formatter->close_section(); + formatter->flush(*out); + } else if (out && !(total_entries % 1000)) { + (*out) << " " << total_entries; + } + } // entries loop + } + } + + if (verbose_json_out) { + formatter->close_section(); + formatter->flush(*out); + } else if (out) { + (*out) << " " << total_entries << std::endl; + } + + int ret = target_shards_mgr.finish(); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl; + return -EIO; + } + return 0; +} // RGWBucketReshard::do_reshard + +int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list *status) +{ + return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status); +} + +int RGWBucketReshard::execute(int num_shards, + ReshardFaultInjector& fault, + int max_op_entries, + const DoutPrefixProvider *dpp, + bool verbose, ostream *out, + Formatter *formatter, + RGWReshard* reshard_log) +{ + // take a reshard lock on the bucket + int ret = reshard_lock.lock(dpp); + if (ret < 0) { + return ret; + } + // unlock when scope exits + auto unlock = make_scope_guard([this] { reshard_lock.unlock(); }); + + if (reshard_log) { + ret = reshard_log->update(dpp, bucket_info); + if (ret < 0) { + return ret; + } + } + + // prepare the target index and add its layout the bucket info + ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp); + if (ret < 0) { + return ret; + } 
+ + if (ret = fault.check("do_reshard"); + ret == 0) { // no fault injected, do the reshard + ret = do_reshard(bucket_info.layout.current_index, + *bucket_info.layout.target_index, + max_op_entries, verbose, out, formatter, dpp); + } + + if (ret < 0) { + cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp); + + ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \"" + << bucket_info.bucket.name << "\" canceled due to errors" << dendl; + return ret; + } + + ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp); + if (ret < 0) { + return ret; + } + + ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \"" + << bucket_info.bucket.name << "\" completed successfully" << dendl; + return 0; +} // execute + +bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket, + const RGWSI_Zone* zone_svc) +{ + return !zone_svc->need_to_log_data() || + bucket.layout.logs.size() < max_bilog_history; +} + + +RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out, + Formatter *_formatter) : + store(_store), instance_lock(bucket_instance_lock_name), + verbose(_verbose), out(_out), formatter(_formatter) +{ + num_logshards = store->ctx()->_conf.get_val("rgw_reshard_num_logs"); +} + +string RGWReshard::get_logshard_key(const string& tenant, + const string& bucket_name) +{ + return tenant + ":" + bucket_name; +} + +#define MAX_RESHARD_LOGSHARDS_PRIME 7877 + +void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid) +{ + string key = get_logshard_key(tenant, bucket_name); + + uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size()); + uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); + sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards; + + get_logshard_oid(int(sid), oid); +} + +int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry) +{ + if (!store->svc()->zone->can_reshard()) { + ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl; + 
return 0; + } + + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + librados::ObjectWriteOperation op; + cls_rgw_reshard_add(op, entry); + + int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; + return ret; + } + return 0; +} + +int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info) +{ + cls_rgw_reshard_entry entry; + entry.bucket_name = bucket_info.bucket.name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.tenant = bucket_info.owner.tenant; + + int ret = get(dpp, entry); + if (ret < 0) { + return ret; + } + + ret = add(dpp, entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " << + cpp_strerror(-ret) << dendl; + } + + return ret; +} + + +int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list& entries, bool *is_truncated) +{ + string logshard_oid; + + get_logshard_oid(logshard_num, &logshard_oid); + + int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated); + + if (ret == -ENOENT) { + // these shard objects aren't created until we actually write something to + // them, so treat ENOENT as a successful empty listing + *is_truncated = false; + ret = 0; + } else if (ret == -EACCES) { + ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool + << ". 
Fix the pool access permissions of your client" << dendl; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid=" + << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl; + } + + return ret; +} + +int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry) +{ + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << + " bucket=" << entry.bucket_name << dendl; + } + return ret; + } + + return 0; +} + +int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry) +{ + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + librados::ObjectWriteOperation op; + cls_rgw_reshard_remove(op, entry); + + int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; + return ret; + } + + return ret; +} + +int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry) +{ + int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl; + return ret; + } + + return 0; +} + +int RGWReshardWait::wait(optional_yield y) +{ + std::unique_lock lock(mutex); + + if (going_down) { + return -ECANCELED; + } + + if (y) { + auto& context = y.get_io_context(); + auto& 
yield = y.get_yield_context(); + + Waiter waiter(context); + waiters.push_back(waiter); + lock.unlock(); + + waiter.timer.expires_after(duration); + + boost::system::error_code ec; + waiter.timer.async_wait(yield[ec]); + + lock.lock(); + waiters.erase(waiters.iterator_to(waiter)); + return -ec.value(); + } + + cond.wait_for(lock, duration); + + if (going_down) { + return -ECANCELED; + } + + return 0; +} + +void RGWReshardWait::stop() +{ + std::scoped_lock lock(mutex); + going_down = true; + cond.notify_all(); + for (auto& waiter : waiters) { + // unblock any waiters with ECANCELED + waiter.timer.cancel(); + } +} + +int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry, + int max_entries, const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 20) << __func__ << " resharding " << + entry.bucket_name << dendl; + + rgw_bucket bucket; + RGWBucketInfo bucket_info; + std::map bucket_attrs; + + int ret = store->getRados()->get_bucket_info(store->svc(), + entry.tenant, + entry.bucket_name, + bucket_info, nullptr, + null_yield, dpp, + &bucket_attrs); + if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) { + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": Error in get_bucket_info for bucket " << entry.bucket_name << + ": " << cpp_strerror(-ret) << dendl; + if (ret != -ENOENT) { + // any error other than ENOENT will abort + return ret; + } + } else { + ldpp_dout(dpp, 0) << __func__ << + ": Bucket: " << entry.bucket_name << + " already resharded by someone, skipping " << dendl; + } + + // we've encountered a reshard queue entry for an apparently + // non-existent bucket; let's try to recover by cleaning up + ldpp_dout(dpp, 0) << __func__ << + ": removing reshard queue entry for a resharded or non-existent bucket" << + entry.bucket_name << dendl; + + ret = remove(dpp, entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": Error removing non-existent bucket " << + entry.bucket_name << " from resharding queue: " << + cpp_strerror(-ret) << dendl; 
+ return ret; + } + + // we cleaned up, move on to the next entry + return 0; + } + + if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) { + ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not " + "eligible for resharding until peer zones finish syncing one " + "or more of its old log generations" << dendl; + return remove(dpp, entry); + } + + RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr); + + ReshardFaultInjector f; // no fault injected + ret = br.execute(entry.new_num_shards, f, max_entries, dpp, + false, nullptr, nullptr, this); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": Error during resharding bucket " << entry.bucket_name << ":" << + cpp_strerror(-ret)<< dendl; + return ret; + } + + ldpp_dout(dpp, 20) << __func__ << + " removing reshard queue entry for bucket " << entry.bucket_name << + dendl; + + ret = remove(dpp, entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " << + entry.bucket_name << " from resharding queue: " << + cpp_strerror(-ret) << dendl; + return ret; + } + return 0; +} + +int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp) +{ + string marker; + bool truncated = true; + + constexpr uint32_t max_entries = 1000; + + string logshard_oid; + get_logshard_oid(logshard_num, &logshard_oid); + + RGWBucketReshardLock logshard_lock(store, logshard_oid, false); + + int ret = logshard_lock.lock(dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << + logshard_oid << ", ret = " << ret < entries; + ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated); + if (ret < 0) { + ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" << + logshard_oid << dendl; + continue; + } + + for(auto& entry: entries) { // logshard entries + process_entry(entry, max_entries, dpp); + if (ret < 0) { + return ret; + } + + Clock::time_point now = Clock::now(); + if 
(logshard_lock.should_renew(now)) { + ret = logshard_lock.renew(now); + if (ret < 0) { + return ret; + } + } + + entry.get_key(&marker); + } // entry for loop + } while (truncated); + + logshard_lock.unlock(); + return 0; +} + + +void RGWReshard::get_logshard_oid(int shard_num, string *logshard) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num); + + string objname(reshard_oid_prefix); + *logshard = objname + buf; +} + +int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp) +{ + int ret = 0; + + for (int i = 0; i < num_logshards; i++) { + string logshard; + get_logshard_oid(i, &logshard); + + ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl; + + ret = process_single_logshard(i, dpp); + + ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl; + } + + return 0; +} + +bool RGWReshard::going_down() +{ + return down_flag; +} + +void RGWReshard::start_processor() +{ + worker = new ReshardWorker(store->ctx(), this); + worker->create("rgw_reshard"); +} + +void RGWReshard::stop_processor() +{ + down_flag = true; + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = nullptr; +} + +void *RGWReshard::ReshardWorker::entry() { + do { + utime_t start = ceph_clock_now(); + reshard->process_all_logshards(this); + + if (reshard->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + int secs = cct->_conf.get_val("rgw_reshard_thread_interval"); + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + std::unique_lock locker{lock}; + cond.wait_for(locker, std::chrono::seconds(secs)); + } while (!reshard->going_down()); + + return NULL; +} + +void RGWReshard::ReshardWorker::stop() +{ + std::lock_guard l{lock}; + cond.notify_all(); +} + +CephContext *RGWReshard::ReshardWorker::get_cct() const +{ + return cct; +} + +unsigned RGWReshard::ReshardWorker::get_subsys() const +{ + return dout_subsys; +} + 
+std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const +{ + return out << "rgw reshard worker thread: "; +} diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h new file mode 100644 index 000000000..59819f3a5 --- /dev/null +++ b/src/rgw/driver/rados/rgw_reshard.h @@ -0,0 +1,274 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include "include/common_fwd.h" +#include "include/rados/librados.hpp" +#include "common/ceph_time.h" +#include "common/async/yield_context.h" +#include "cls/rgw/cls_rgw_types.h" +#include "cls/lock/cls_lock_client.h" + +#include "rgw_common.h" +#include "common/fault_injector.h" + + +class RGWReshard; +namespace rgw { namespace sal { + class RadosStore; +} } + +using ReshardFaultInjector = FaultInjector; + +class RGWBucketReshardLock { + using Clock = ceph::coarse_mono_clock; + + rgw::sal::RadosStore* store; + const std::string lock_oid; + const bool ephemeral; + rados::cls::lock::Lock internal_lock; + std::chrono::seconds duration; + + Clock::time_point start_time; + Clock::time_point renew_thresh; + + void reset_time(const Clock::time_point& now) { + start_time = now; + renew_thresh = start_time + duration / 2; + } + +public: + RGWBucketReshardLock(rgw::sal::RadosStore* _store, + const std::string& reshard_lock_oid, + bool _ephemeral); + RGWBucketReshardLock(rgw::sal::RadosStore* _store, + const RGWBucketInfo& bucket_info, + bool _ephemeral) : + RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral) + {} + + int lock(const DoutPrefixProvider *dpp); + void unlock(); + int renew(const Clock::time_point&); + + bool should_renew(const Clock::time_point& now) const { + return now >= renew_thresh; + } +}; // class RGWBucketReshardLock + +class RGWBucketReshard { + public: + using Clock = ceph::coarse_mono_clock; + + 
private: + rgw::sal::RadosStore *store; + RGWBucketInfo bucket_info; + std::map bucket_attrs; + + RGWBucketReshardLock reshard_lock; + RGWBucketReshardLock* outer_reshard_lock; + + // using an initializer_list as an array in contiguous memory + // allocated in at once + static const std::initializer_list reshard_primes; + + int do_reshard(const rgw::bucket_index_layout_generation& current, + const rgw::bucket_index_layout_generation& target, + int max_entries, + bool verbose, + std::ostream *os, + Formatter *formatter, + const DoutPrefixProvider *dpp); +public: + + // pass nullptr for the final parameter if no outer reshard lock to + // manage + RGWBucketReshard(rgw::sal::RadosStore* _store, + const RGWBucketInfo& _bucket_info, + const std::map& _bucket_attrs, + RGWBucketReshardLock* _outer_reshard_lock); + int execute(int num_shards, ReshardFaultInjector& f, + int max_op_entries, const DoutPrefixProvider *dpp, + bool verbose = false, std::ostream *out = nullptr, + ceph::Formatter *formatter = nullptr, + RGWReshard *reshard_log = nullptr); + int get_status(const DoutPrefixProvider *dpp, std::list *status); + int cancel(const DoutPrefixProvider* dpp); + + static int clear_resharding(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + const DoutPrefixProvider* dpp); + + static uint32_t get_max_prime_shards() { + return *std::crbegin(reshard_primes); + } + + // returns the prime in our list less than or equal to the + // parameter; the lowest value that can be returned is 1 + static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) { + auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(), + requested_shards); + if (it == reshard_primes.begin()) { + return 1; + } else { + return *(--it); + } + } + + // returns the prime in our list greater than or equal to the + // parameter; if we do not have such a prime, 0 is returned + static uint32_t get_prime_shards_greater_or_equal( + uint32_t 
requested_shards) + { + auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(), + requested_shards); + if (it == reshard_primes.end()) { + return 0; + } else { + return *it; + } + } + + // returns a preferred number of shards given a calculated number of + // shards based on max_dynamic_shards and the list of prime values + static uint32_t get_preferred_shards(uint32_t suggested_shards, + uint32_t max_dynamic_shards) { + + // use a prime if max is within our prime range, otherwise use + // specified max + const uint32_t absolute_max = + max_dynamic_shards >= get_max_prime_shards() ? + max_dynamic_shards : + get_prime_shards_less_or_equal(max_dynamic_shards); + + // if we can use a prime number, use it, otherwise use suggested; + // note get_prime_shards_greater_or_equal will return 0 if no prime in + // prime range + const uint32_t prime_ish_num_shards = + std::max(get_prime_shards_greater_or_equal(suggested_shards), + suggested_shards); + + // dynamic sharding cannot reshard more than defined maximum + const uint32_t final_num_shards = + std::min(prime_ish_num_shards, absolute_max); + + return final_num_shards; + } + + const std::map& get_bucket_attrs() const { + return bucket_attrs; + } + + // for multisite, the RGWBucketInfo keeps a history of old log generations + // until all peers are done with them. 
prevent this log history from growing + // too large by refusing to reshard the bucket until the old logs get trimmed + static constexpr size_t max_bilog_history = 4; + + static bool can_reshard(const RGWBucketInfo& bucket, + const RGWSI_Zone* zone_svc); +}; // RGWBucketReshard + + +class RGWReshard { +public: + using Clock = ceph::coarse_mono_clock; + +private: + rgw::sal::RadosStore* store; + std::string lock_name; + rados::cls::lock::Lock instance_lock; + int num_logshards; + + bool verbose; + std::ostream *out; + Formatter *formatter; + + void get_logshard_oid(int shard_num, std::string *shard); +protected: + class ReshardWorker : public Thread, public DoutPrefixProvider { + CephContext *cct; + RGWReshard *reshard; + ceph::mutex lock = ceph::make_mutex("ReshardWorker"); + ceph::condition_variable cond; + + public: + ReshardWorker(CephContext * const _cct, + RGWReshard * const _reshard) + : cct(_cct), + reshard(_reshard) {} + + void *entry() override; + void stop(); + + CephContext *get_cct() const override; + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + }; + + ReshardWorker *worker = nullptr; + std::atomic down_flag = { false }; + + std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name); + void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid); + +public: + RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr); + int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry); + int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info); + int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry); + int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry); + int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list& entries, bool *is_truncated); + int 
clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry); + + /* reshard thread */ + int process_entry(const cls_rgw_reshard_entry& entry, int max_entries, + const DoutPrefixProvider *dpp); + int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp); + int process_all_logshards(const DoutPrefixProvider *dpp); + bool going_down(); + void start_processor(); + void stop_processor(); +}; + +class RGWReshardWait { + public: + // the blocking wait uses std::condition_variable::wait_for(), which uses the + // std::chrono::steady_clock. use that for the async waits as well + using Clock = std::chrono::steady_clock; + private: + const ceph::timespan duration; + ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock"); + ceph::condition_variable cond; + + struct Waiter : boost::intrusive::list_base_hook<> { + using Executor = boost::asio::io_context::executor_type; + using Timer = boost::asio::basic_waitable_timer, Executor>; + Timer timer; + explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {} + }; + boost::intrusive::list waiters; + + bool going_down{false}; + +public: + RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5)) + : duration(duration) {} + ~RGWReshardWait() { + ceph_assert(going_down); + } + int wait(optional_yield y); + // unblock any threads waiting on reshard + void stop(); +}; diff --git a/src/rgw/driver/rados/rgw_rest_bucket.cc b/src/rgw/driver/rados/rgw_rest_bucket.cc new file mode 100644 index 000000000..ebe4e429c --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_bucket.cc @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_op.h" +#include "driver/rados/rgw_bucket.h" +#include "rgw_rest_bucket.h" +#include "rgw_sal.h" + +#include "include/str_list.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw 
+ +using namespace std; + +class RGWOp_Bucket_Info : public RGWRESTOp { + +public: + RGWOp_Bucket_Info() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_READ); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "get_bucket_info"; } +}; + +void RGWOp_Bucket_Info::execute(optional_yield y) +{ + RGWBucketAdminOpState op_state; + + bool fetch_stats; + + std::string bucket; + + string uid_str; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "stats", false, &fetch_stats); + + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_fetch_stats(fetch_stats); + + op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this); +} + +class RGWOp_Get_Policy : public RGWRESTOp { + +public: + RGWOp_Get_Policy() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_READ); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "get_policy"; } +}; + +void RGWOp_Get_Policy::execute(optional_yield y) +{ + RGWBucketAdminOpState op_state; + + std::string bucket; + std::string object; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "object", object, &object); + + op_state.set_bucket_name(bucket); + op_state.set_object(object); + + op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this); +} + +class RGWOp_Check_Bucket_Index : public RGWRESTOp { + +public: + RGWOp_Check_Bucket_Index() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "check_bucket_index"; } +}; + +void RGWOp_Check_Bucket_Index::execute(optional_yield y) +{ + std::string bucket; + + bool fix_index; + 
bool check_objects; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "fix", false, &fix_index); + RESTArgs::get_bool(s, "check-objects", false, &check_objects); + + op_state.set_bucket_name(bucket); + op_state.set_fix_index(fix_index); + op_state.set_check_objects(check_objects); + + op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s); +} + +class RGWOp_Bucket_Link : public RGWRESTOp { + +public: + RGWOp_Bucket_Link() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "link_bucket"; } +}; + +void RGWOp_Bucket_Link::execute(optional_yield y) +{ + std::string uid_str; + std::string bucket; + std::string bucket_id; + std::string new_bucket_name; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id); + RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name); + + rgw_user uid(uid_str); + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_bucket_id(bucket_id); + op_state.set_new_bucket_name(new_bucket_name); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWBucketAdminOp::link(driver, op_state, s); +} + +class RGWOp_Bucket_Unlink : public RGWRESTOp { + +public: + RGWOp_Bucket_Unlink() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "unlink_bucket"; } +}; + +void 
RGWOp_Bucket_Unlink::execute(optional_yield y) +{ + std::string uid_str; + std::string bucket; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWBucketAdminOp::unlink(driver, op_state, s); +} + +class RGWOp_Bucket_Remove : public RGWRESTOp { + +public: + RGWOp_Bucket_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_bucket"; } +}; + +void RGWOp_Bucket_Remove::execute(optional_yield y) +{ + std::string bucket_name; + bool delete_children; + std::unique_ptr bucket; + + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); + RESTArgs::get_bool(s, "purge-objects", false, &delete_children); + + /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to + * the master. This user is actually the OP caller, not the bucket owner. 
*/ + op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl; + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_BUCKET; + } + return; + } + + op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield); +} + +class RGWOp_Set_Bucket_Quota : public RGWRESTOp { + +public: + RGWOp_Set_Bucket_Quota() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "set_bucket_quota"; } +}; + +#define QUOTA_INPUT_MAX_LEN 1024 + +void RGWOp_Set_Bucket_Quota::execute(optional_yield y) +{ + bool uid_arg_existed = false; + std::string uid_str; + RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed); + if (! uid_arg_existed) { + op_ret = -EINVAL; + return; + } + rgw_user uid(uid_str); + bool bucket_arg_existed = false; + std::string bucket_name; + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed); + if (! 
bucket_arg_existed) { + op_ret = -EINVAL; + return; + } + + bool use_http_params; + + if (s->content_length > 0) { + use_http_params = false; + } else { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + use_http_params = (!encoding || strcmp(encoding, "chunked") != 0); + } + RGWQuotaInfo quota; + if (!use_http_params) { + bool empty; + op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty); + if (op_ret < 0) { + if (!empty) + return; + /* was probably chunked input, but no content provided, configure via http params */ + use_http_params = true; + } + } + if (use_http_params) { + std::unique_ptr bucket; + op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield); + if (op_ret < 0) { + return; + } + RGWQuotaInfo *old_quota = &bucket->get_info().quota; + int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size); + int64_t max_size_kb; + bool has_max_size_kb = false; + RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects); + RESTArgs::get_int64(s, "max-size", old_quota->max_size, "a.max_size); + RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb); + if (has_max_size_kb) + quota.max_size = max_size_kb * 1024; + RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled); + } + + RGWBucketAdminOpState op_state; + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket_name); + op_state.set_quota(quota); + + op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s); +} + +class RGWOp_Sync_Bucket : public RGWRESTOp { + +public: + RGWOp_Sync_Bucket() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "sync_bucket"; } +}; + +void RGWOp_Sync_Bucket::execute(optional_yield y) +{ + std::string bucket; + std::string tenant; + bool sync_bucket; + + RGWBucketAdminOpState op_state; + 
RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "tenant", tenant, &tenant); + RESTArgs::get_bool(s, "sync", true, &sync_bucket); + + op_state.set_bucket_name(bucket); + op_state.set_tenant(tenant); + op_state.set_sync_bucket(sync_bucket); + + op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s); +} + +class RGWOp_Object_Remove: public RGWRESTOp { + +public: + RGWOp_Object_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_object"; } +}; + +void RGWOp_Object_Remove::execute(optional_yield y) +{ + std::string bucket; + std::string object; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "object", object, &object); + + op_state.set_bucket_name(bucket); + op_state.set_object(object); + + op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s); +} + + +RGWOp *RGWHandler_Bucket::op_get() +{ + + if (s->info.args.sub_resource_exists("policy")) + return new RGWOp_Get_Policy; + + if (s->info.args.sub_resource_exists("index")) + return new RGWOp_Check_Bucket_Index; + + return new RGWOp_Bucket_Info; +} + +RGWOp *RGWHandler_Bucket::op_put() +{ + if (s->info.args.sub_resource_exists("quota")) + return new RGWOp_Set_Bucket_Quota; + + if (s->info.args.sub_resource_exists("sync")) + return new RGWOp_Sync_Bucket; + + return new RGWOp_Bucket_Link; +} + +RGWOp *RGWHandler_Bucket::op_post() +{ + return new RGWOp_Bucket_Unlink; +} + +RGWOp *RGWHandler_Bucket::op_delete() +{ + if (s->info.args.sub_resource_exists("object")) + return new RGWOp_Object_Remove; + + return new RGWOp_Bucket_Remove; +} diff --git a/src/rgw/driver/rados/rgw_rest_bucket.h b/src/rgw/driver/rados/rgw_rest_bucket.h new file mode 100644 index 000000000..00f0b6439 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_bucket.h @@ -0,0 +1,36 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" + + +class RGWHandler_Bucket : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_put() override; + RGWOp *op_post() override; + RGWOp *op_delete() override; +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Bucket() override = default; + + int read_permissions(RGWOp*, optional_yield y) override { + return 0; + } +}; + +class RGWRESTMgr_Bucket : public RGWRESTMgr { +public: + RGWRESTMgr_Bucket() = default; + ~RGWRESTMgr_Bucket() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Bucket(auth_registry); + } +}; diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc new file mode 100644 index 000000000..f4099807d --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_log.cc @@ -0,0 +1,1268 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/ceph_json.h" +#include "common/strtol.h" +#include "rgw_rest.h" +#include "rgw_op.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_log.h" +#include "rgw_client_io.h" +#include "rgw_sync.h" +#include "rgw_data_sync.h" +#include "rgw_common.h" +#include "rgw_zone.h" +#include "rgw_mdlog.h" +#include "rgw_datalog_notify.h" +#include "rgw_trim_bilog.h" + +#include "services/svc_zone.h" +#include "services/svc_mdlog.h" +#include "services/svc_bilog_rados.h" + +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define LOG_CLASS_LIST_MAX_ENTRIES (1000) +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWOp_MDLog_List::execute(optional_yield y) { + string period = s->info.args.get("period"); + string shard = s->info.args.get("id"); + string max_entries_str = s->info.args.get("max-entries"); + string marker = s->info.args.get("marker"), + err; + void *handle; + unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + return; + } + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (!max_entries_str.empty()) { + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl; + op_ret = -EINVAL; + return; + } + if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + } + } + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id" << 
dendl; + op_ret = -EINVAL; + return; + } + } + + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + + meta_log.init_list_entries(shard_id, {}, {}, marker, &handle); + + op_ret = meta_log.list_entries(this, handle, max_entries, entries, + &last_marker, &truncated); + + meta_log.complete_list_entries(handle); +} + +void RGWOp_MDLog_List::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + s->formatter->open_object_section("log_entries"); + s->formatter->dump_string("marker", last_marker); + s->formatter->dump_bool("truncated", truncated); + { + s->formatter->open_array_section("entries"); + for (list::iterator iter = entries.begin(); + iter != entries.end(); ++iter) { + cls_log_entry& entry = *iter; + static_cast(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter); + flusher.flush(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_MDLog_Info::execute(optional_yield y) { + num_objects = s->cct->_conf->rgw_md_log_max_shards; + period = static_cast(driver)->svc()->mdlog->read_oldest_log_period(y, s); + op_ret = period.get_error(); +} + +void RGWOp_MDLog_Info::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + s->formatter->open_object_section("mdlog"); + s->formatter->dump_unsigned("num_objects", num_objects); + if (period) { + s->formatter->dump_string("period", period.get_period().get_id()); + s->formatter->dump_unsigned("realm_epoch", period.get_epoch()); + } + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_MDLog_ShardInfo::execute(optional_yield y) { + string period = s->info.args.get("period"); + string shard = s->info.args.get("id"); + string err; + + unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + 
op_ret = -EINVAL; + return; + } + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id" << dendl; + op_ret = -EINVAL; + return; + } + } + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + + op_ret = meta_log.get_info(this, shard_id, &info); +} + +void RGWOp_MDLog_ShardInfo::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + encode_json("info", info, s->formatter); + flusher.flush(); +} + +void RGWOp_MDLog_Delete::execute(optional_yield y) { + string marker = s->info.args.get("marker"), + period = s->info.args.get("period"), + shard = s->info.args.get("id"), + err; + unsigned shard_id; + + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("start-marker")) { + ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("end-marker")) { + if (!s->info.args.exists("marker")) { + marker = s->info.args.get("end-marker"); + } else { + ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl; + op_ret = -EINVAL; + } + } + + op_ret = 0; + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (marker.empty()) { /* bounding end */ + op_ret = -EINVAL; + return; + } + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id" << dendl; + op_ret = -EINVAL; + return; + } 
+ } + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + + op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker); +} + +void RGWOp_MDLog_Lock::execute(optional_yield y) { + string period, shard_id_str, duration_str, locker_id, zone_id; + unsigned shard_id; + + op_ret = 0; + + period = s->info.args.get("period"); + shard_id_str = s->info.args.get("id"); + duration_str = s->info.args.get("length"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + } + + if (period.empty() || + shard_id_str.empty() || + (duration_str.empty()) || + locker_id.empty() || + zone_id.empty()) { + ldpp_dout(this, 5) << "Error invalid parameter list" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl; + op_ret = -EINVAL; + return; + } + + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + unsigned dur; + dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err); + if (!err.empty() || dur <= 0) { + ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl; + op_ret = -EINVAL; + return; + } + op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id, + locker_id); + if (op_ret == -EBUSY) + op_ret = -ERR_LOCKED; +} + +void RGWOp_MDLog_Unlock::execute(optional_yield y) { + string period, shard_id_str, locker_id, zone_id; + unsigned shard_id; + + op_ret = 0; + + period = s->info.args.get("period"); + shard_id_str = s->info.args.get("id"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (period.empty()) { + ldpp_dout(this, 5) << 
"Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + } + + if (period.empty() || + shard_id_str.empty() || + locker_id.empty() || + zone_id.empty()) { + ldpp_dout(this, 5) << "Error invalid parameter list" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl; + op_ret = -EINVAL; + return; + } + + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id); +} + +void RGWOp_MDLog_Notify::execute(optional_yield y) { +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + op_ret = r; + return; + } + + char* buf = data.c_str(); + ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl; + op_ret = r; + return; + } + + set updated_shards; + try { + decode_json_obj(updated_shards, &p); + } catch (JSONDecoder::err& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; + op_ret = -EINVAL; + return; + } + + if (driver->ctx()->_conf->subsys.should_gather()) { + for (set::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl; + } + } + + driver->wakeup_meta_sync_shards(updated_shards); + + op_ret = 0; +} + +void RGWOp_BILog_List::execute(optional_yield y) { + bool gen_specified = false; + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + marker = s->info.args.get("marker"), + max_entries_str = s->info.args.get("max-entries"), + bucket_instance 
= s->info.args.get("bucket-instance"), + gen_str = s->info.args.get("generation", &gen_specified), + format_version_str = s->info.args.get("format-ver"); + std::unique_ptr bucket; + rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); + + unsigned max_entries; + + if (bucket_name.empty() && bucket_instance.empty()) { + ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + std::optional gen; + if (gen_specified) { + gen = strict_strtoll(gen_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl; + op_ret = -EINVAL; + return; + } + } + + if (!format_version_str.empty()) { + format_ver = strict_strtoll(format_version_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl; + op_ret = -EINVAL; + return; + } + } + + int shard_id; + string bn; + op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); + if (op_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + b.name = bn; + b.bucket_id = bucket_instance; + } + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + + const auto& logs = bucket->get_info().layout.logs; + if (logs.empty()) { + ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl; + op_ret = -ENOENT; + return; + } + + auto log = std::prev(logs.end()); + if (gen) { + log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen)); + if (log == logs.end()) { + ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl; + op_ret = -ENOENT; + return; + } + } + if (auto next = std::next(log); next != logs.end()) { + next_log_layout = *next; // get the next log after the current latest + } + auto& log_layout = *log; // current 
log layout for log listing + + unsigned count = 0; + + + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + send_response(); + do { + list entries; + int ret = static_cast(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id, + marker, max_entries - count, + entries, &truncated); + if (ret < 0) { + ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl; + return; + } + + count += entries.size(); + + send_response(entries, marker); + } while (truncated && count < max_entries); + + send_response_end(); +} + +void RGWOp_BILog_List::send_response() { + if (sent_header) + return; + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + sent_header = true; + + if (op_ret < 0) + return; + + if (format_ver >= 2) { + s->formatter->open_object_section("result"); + } + + s->formatter->open_array_section("entries"); +} + +void RGWOp_BILog_List::send_response(list& entries, string& marker) +{ + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_bi_log_entry& entry = *iter; + encode_json("entry", entry, s->formatter); + + marker = entry.id; + flusher.flush(); + } +} + +void RGWOp_BILog_List::send_response_end() { + s->formatter->close_section(); + + if (format_ver >= 2) { + encode_json("truncated", truncated, s->formatter); + + if (next_log_layout) { + s->formatter->open_object_section("next_log"); + encode_json("generation", next_log_layout->gen, s->formatter); + encode_json("num_shards", rgw::num_shards(next_log_layout->layout.in_index.layout), s->formatter); + s->formatter->close_section(); // next_log + } + + s->formatter->close_section(); // result + } + + flusher.flush(); +} + +void RGWOp_BILog_Info::execute(optional_yield y) { + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + bucket_instance = s->info.args.get("bucket-instance"); + std::unique_ptr bucket; 
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); + + if (bucket_name.empty() && bucket_instance.empty()) { + ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl; + op_ret = -EINVAL; + return; + } + + int shard_id; + string bn; + op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); + if (op_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + b.name = bn; + b.bucket_id = bucket_instance; + } + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + + const auto& logs = bucket->get_info().layout.logs; + if (logs.empty()) { + ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl; + op_ret = -ENOENT; + return; + } + + map stats; + const auto& index = log_to_index_layout(logs.back()); + + int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped); + if (ret < 0 && ret != -ENOENT) { + op_ret = ret; + return; + } + + oldest_gen = logs.front().gen; + latest_gen = logs.back().gen; + + for (auto& log : logs) { + uint32_t num_shards = rgw::num_shards(log.layout.in_index.layout); + generations.push_back({log.gen, num_shards}); + } +} + +void RGWOp_BILog_Info::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + s->formatter->open_object_section("info"); + encode_json("bucket_ver", bucket_ver, s->formatter); + encode_json("master_ver", master_ver, s->formatter); + encode_json("max_marker", max_marker, s->formatter); + encode_json("syncstopped", syncstopped, s->formatter); + encode_json("oldest_gen", oldest_gen, s->formatter); + encode_json("latest_gen", latest_gen, s->formatter); + encode_json("generations", generations, s->formatter); + s->formatter->close_section(); + + flusher.flush(); +} + +void 
RGWOp_BILog_Delete::execute(optional_yield y) { + bool gen_specified = false; + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + start_marker = s->info.args.get("start-marker"), + end_marker = s->info.args.get("end-marker"), + bucket_instance = s->info.args.get("bucket-instance"), + gen_str = s->info.args.get("generation", &gen_specified); + + std::unique_ptr bucket; + rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); + + op_ret = 0; + if ((bucket_name.empty() && bucket_instance.empty()) || + end_marker.empty()) { + ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + uint64_t gen = 0; + if (gen_specified) { + gen = strict_strtoll(gen_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl; + op_ret = -EINVAL; + return; + } + } + + int shard_id; + string bn; + op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); + if (op_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + b.name = bn; + b.bucket_id = bucket_instance; + } + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + + op_ret = bilog_trim(this, static_cast(driver), + bucket->get_info(), gen, shard_id, + start_marker, end_marker); + if (op_ret < 0) { + ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl; + } + + return; +} + +void RGWOp_DATALog_List::execute(optional_yield y) { + string shard = s->info.args.get("id"); + + string max_entries_str = s->info.args.get("max-entries"), + marker = s->info.args.get("marker"), + err; + unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << 
"start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + } + + s->info.args.get_bool("extra-info", &extra_info, false); + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (!max_entries_str.empty()) { + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl; + op_ret = -EINVAL; + return; + } + if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + } + } + + // Note that last_marker is updated to be the marker of the last + // entry listed + op_ret = static_cast(driver)->svc()-> + datalog_rados->list_entries(this, shard_id, max_entries, entries, + marker, &last_marker, &truncated, y); +} + +void RGWOp_DATALog_List::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + s->formatter->open_object_section("log_entries"); + s->formatter->dump_string("marker", last_marker); + s->formatter->dump_bool("truncated", truncated); + { + s->formatter->open_array_section("entries"); + for (const auto& entry : entries) { + if (!extra_info) { + encode_json("entry", entry.entry, s->formatter); + } else { + encode_json("entry", entry, s->formatter); + } + flusher.flush(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + flusher.flush(); +} + + +void RGWOp_DATALog_Info::execute(optional_yield y) { + num_objects = s->cct->_conf->rgw_data_log_num_shards; + op_ret = 0; +} + +void RGWOp_DATALog_Info::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + s->formatter->open_object_section("num_objects"); + s->formatter->dump_unsigned("num_objects", num_objects); + s->formatter->close_section(); + flusher.flush(); +} + +void 
RGWOp_DATALog_ShardInfo::execute(optional_yield y) { + string shard = s->info.args.get("id"); + string err; + + unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + op_ret = static_cast(driver)->svc()-> + datalog_rados->get_info(this, shard_id, &info, y); +} + +void RGWOp_DATALog_ShardInfo::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + encode_json("info", info, s->formatter); + flusher.flush(); +} + +void RGWOp_DATALog_Notify::execute(optional_yield y) { + string source_zone = s->info.args.get("source-zone"); +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + op_ret = r; + return; + } + + char* buf = data.c_str(); + ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl; + op_ret = r; + return; + } + + bc::flat_map> updated_shards; + try { + auto decoder = rgw_data_notify_v1_decoder{updated_shards}; + decode_json_obj(decoder, &p); + } catch (JSONDecoder::err& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; + op_ret = -EINVAL; + return; + } + + if (driver->ctx()->_conf->subsys.should_gather()) { + for (bc::flat_map >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + bc::flat_set& entries = iter->second; + for (const auto& [key, gen] : entries) { + ldpp_dout(this, 20) << __func__ << "(): modified key=" << key + << " of gen=" << gen << dendl; + } + } + } + + driver->wakeup_data_sync_shards(this, source_zone, updated_shards); + + op_ret = 0; +} + +void RGWOp_DATALog_Notify2::execute(optional_yield y) { + 
string source_zone = s->info.args.get("source-zone"); +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + op_ret = r; + return; + } + + char* buf = data.c_str(); + ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl; + op_ret = r; + return; + } + + bc::flat_map > updated_shards; + try { + decode_json_obj(updated_shards, &p); + } catch (JSONDecoder::err& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; + op_ret = -EINVAL; + return; + } + + if (driver->ctx()->_conf->subsys.should_gather()) { + for (bc::flat_map >::iterator iter = + updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + bc::flat_set& entries = iter->second; + for (const auto& [key, gen] : entries) { + ldpp_dout(this, 20) << __func__ << "(): modified key=" << key << + " of generation=" << gen << dendl; + } + } + } + + driver->wakeup_data_sync_shards(this, source_zone, updated_shards); + + op_ret = 0; +} + +void RGWOp_DATALog_Delete::execute(optional_yield y) { + string marker = s->info.args.get("marker"), + shard = s->info.args.get("id"), + err; + unsigned shard_id; + + op_ret = 0; + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("start-marker")) { + ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("end-marker")) { + if (!s->info.args.exists("marker")) { + marker = s->info.args.get("end-marker"); + } else { + ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl; + op_ret = 
-EINVAL; + } + } + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + if (marker.empty()) { /* bounding end */ + op_ret = -EINVAL; + return; + } + + op_ret = static_cast(driver)->svc()-> + datalog_rados->trim_entries(this, shard_id, marker, y); +} + +// not in header to avoid pulling in rgw_sync.h +class RGWOp_MDLog_Status : public RGWRESTOp { + rgw_meta_sync_status status; +public: + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { return "get_metadata_log_status"; } +}; + +void RGWOp_MDLog_Status::execute(optional_yield y) +{ + auto sync = static_cast(driver)->getRados()->get_meta_sync_manager(); + if (sync == nullptr) { + ldpp_dout(this, 1) << "no sync manager" << dendl; + op_ret = -ENOENT; + return; + } + op_ret = sync->read_sync_status(this, &status); +} + +void RGWOp_MDLog_Status::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret >= 0) { + encode_json("status", status, s->formatter); + } + flusher.flush(); +} + +// not in header to avoid pulling in rgw_data_sync.h +class RGWOp_BILog_Status : public RGWRESTOp { + bilog_status_v2 status; + int version = 1; +public: + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { return "get_bucket_index_log_status"; } +}; + +void RGWOp_BILog_Status::execute(optional_yield y) +{ + const auto options = 
s->info.args.get("options"); + bool merge = (options == "merge"); + const auto source_zone = s->info.args.get("source-zone"); + const auto source_key = s->info.args.get("source-bucket"); + auto key = s->info.args.get("bucket"); + op_ret = s->info.args.get_int("version", &version, 1); + + if (key.empty()) { + key = source_key; + } + if (key.empty()) { + ldpp_dout(this, 4) << "no 'bucket' provided" << dendl; + op_ret = -EINVAL; + return; + } + + rgw_bucket b; + int shard_id{-1}; // unused + op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id); + if (op_ret < 0) { + ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl; + op_ret = -EINVAL; + return; + } + + // read the bucket instance info for num_shards + std::unique_ptr bucket; + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl; + return; + } + + rgw_bucket source_bucket; + + if (source_key.empty() || + source_key == key) { + source_bucket = bucket->get_key(); + } else { + op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl; + return; + } + } + + const auto& local_zone_id = driver->get_zone()->get_id(); + + if (!merge) { + rgw_sync_bucket_pipe pipe; + pipe.source.zone = source_zone; + pipe.source.bucket = source_bucket; + pipe.dest.zone = local_zone_id; + pipe.dest.bucket = bucket->get_key(); + + ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl; + + op_ret = rgw_read_bucket_full_sync_status( + this, + static_cast(driver), + pipe, + &status.sync_status, + s->yield); + if (op_ret < 0) { + ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl; + return; + } + 
status.inc_status.resize(status.sync_status.shards_done_with_gen.size()); + + op_ret = rgw_read_bucket_inc_sync_status( + this, + static_cast(driver), + pipe, + status.sync_status.incremental_gen, + &status.inc_status); + if (op_ret < 0) { + ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl; + } + return; + } + + rgw_zone_id source_zone_id(source_zone); + + RGWBucketSyncPolicyHandlerRef source_handler; + op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl; + return; + } + + auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id); + + std::vector current_status; + for (auto& entry : local_dests) { + auto pipe = entry.second; + + ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl; + + RGWBucketInfo *pinfo = &bucket->get_info(); + std::optional opt_dest_info; + + if (!pipe.dest.bucket) { + /* Uh oh, something went wrong */ + ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl; + op_ret = -EIO; + return; + } + + if (*pipe.dest.bucket != pinfo->bucket) { + opt_dest_info.emplace(); + std::unique_ptr dest_bucket; + op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 4) << "failed to read target bucket info (bucket=: " << cpp_strerror(op_ret) << dendl; + return; + } + + *opt_dest_info = dest_bucket->get_info(); + pinfo = &(*opt_dest_info); + pipe.dest.bucket = pinfo->bucket; + } + + op_ret = rgw_read_bucket_full_sync_status( + this, + static_cast(driver), + pipe, + &status.sync_status, + s->yield); + if (op_ret < 0) { + ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " 
returned ret=" << op_ret << dendl; + return; + } + + current_status.resize(status.sync_status.shards_done_with_gen.size()); + int r = rgw_read_bucket_inc_sync_status(this, static_cast(driver), + pipe, status.sync_status.incremental_gen, ¤t_status); + if (r < 0) { + ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl; + op_ret = r; + return; + } + + if (status.inc_status.empty()) { + status.inc_status = std::move(current_status); + } else { + if (current_status.size() != status.inc_status.size()) { + op_ret = -EINVAL; + ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets " + "syncing from the same source: status.size()= " + << status.inc_status.size() + << " current_status.size()=" + << current_status.size() << dendl; + return; + } + auto m = status.inc_status.begin(); + for (auto& cur_shard_status : current_status) { + auto& result_shard_status = *m++; + // always take the first marker, or any later marker that's smaller + if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) { + result_shard_status = std::move(cur_shard_status); + } + } + } + } +} + +void RGWOp_BILog_Status::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret >= 0) { + if (version < 2) { + encode_json("status", status.inc_status, s->formatter); + } else { + encode_json("status", status, s->formatter); + } + } + flusher.flush(); +} + +// not in header to avoid pulling in rgw_data_sync.h +class RGWOp_DATALog_Status : public RGWRESTOp { + rgw_data_sync_status status; +public: + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override ; + void send_response() override; + const char* name() const override { return "get_data_changes_log_status"; 
} +}; + +void RGWOp_DATALog_Status::execute(optional_yield y) +{ + const auto source_zone = s->info.args.get("source-zone"); + auto sync = driver->get_data_sync_manager(source_zone); + if (sync == nullptr) { + ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl; + op_ret = -ENOENT; + return; + } + op_ret = sync->read_sync_status(this, &status); +} + +void RGWOp_DATALog_Status::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret >= 0) { + encode_json("status", status, s->formatter); + } + flusher.flush(); +} + + +RGWOp *RGWHandler_Log::op_get() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (!exists) { + return NULL; + } + + if (type.compare("metadata") == 0) { + if (s->info.args.exists("id")) { + if (s->info.args.exists("info")) { + return new RGWOp_MDLog_ShardInfo; + } else { + return new RGWOp_MDLog_List; + } + } else if (s->info.args.exists("status")) { + return new RGWOp_MDLog_Status; + } else { + return new RGWOp_MDLog_Info; + } + } else if (type.compare("bucket-index") == 0) { + if (s->info.args.exists("info")) { + return new RGWOp_BILog_Info; + } else if (s->info.args.exists("status")) { + return new RGWOp_BILog_Status; + } else { + return new RGWOp_BILog_List; + } + } else if (type.compare("data") == 0) { + if (s->info.args.exists("id")) { + if (s->info.args.exists("info")) { + return new RGWOp_DATALog_ShardInfo; + } else { + return new RGWOp_DATALog_List; + } + } else if (s->info.args.exists("status")) { + return new RGWOp_DATALog_Status; + } else { + return new RGWOp_DATALog_Info; + } + } + return NULL; +} + +RGWOp *RGWHandler_Log::op_delete() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (!exists) { + return NULL; + } + + if (type.compare("metadata") == 0) + return new RGWOp_MDLog_Delete; + else if (type.compare("bucket-index") == 0) + return new RGWOp_BILog_Delete; + else if (type.compare("data") == 0) + return new 
RGWOp_DATALog_Delete; + return NULL; +} + +RGWOp *RGWHandler_Log::op_post() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (!exists) { + return NULL; + } + + if (type.compare("metadata") == 0) { + if (s->info.args.exists("lock")) + return new RGWOp_MDLog_Lock; + else if (s->info.args.exists("unlock")) + return new RGWOp_MDLog_Unlock; + else if (s->info.args.exists("notify")) + return new RGWOp_MDLog_Notify; + } else if (type.compare("data") == 0) { + if (s->info.args.exists("notify")) { + return new RGWOp_DATALog_Notify; + } else if (s->info.args.exists("notify2")) { + return new RGWOp_DATALog_Notify2; + } + } + return NULL; +} + diff --git a/src/rgw/driver/rados/rgw_rest_log.h b/src/rgw/driver/rados/rgw_rest_log.h new file mode 100644 index 000000000..02b1d133f --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_log.h @@ -0,0 +1,337 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_datalog.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_metadata.h" +#include "rgw_mdlog.h" +#include "rgw_data_sync.h" + +class RGWOp_BILog_List : public RGWRESTOp { + bool sent_header; + uint32_t format_ver{0}; + bool truncated{false}; + std::optional next_log_layout; + +public: + RGWOp_BILog_List() : sent_header(false) {} + ~RGWOp_BILog_List() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void send_response() override; + virtual void send_response(std::list& entries, std::string& marker); + virtual void send_response_end(); + void execute(optional_yield y) override; + const char* name() const override { + return "list_bucket_index_log"; + } +}; + +class RGWOp_BILog_Info : public RGWRESTOp { + std::string bucket_ver; + std::string master_ver; + std::string max_marker; + bool syncstopped; + uint64_t oldest_gen = 0; + uint64_t latest_gen = 0; + std::vector generations; + +public: + RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {} + ~RGWOp_BILog_Info() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void send_response() override; + void execute(optional_yield y) override; + const char* name() const override { + return "bucket_index_log_info"; + } +}; + +class RGWOp_BILog_Delete : public RGWRESTOp { +public: + RGWOp_BILog_Delete() {} + ~RGWOp_BILog_Delete() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "trim_bucket_index_log"; + } +}; + +class RGWOp_MDLog_List : public RGWRESTOp { + 
std::list entries; + std::string last_marker; + bool truncated; +public: + RGWOp_MDLog_List() : truncated(false) {} + ~RGWOp_MDLog_List() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "list_metadata_log"; + } +}; + +class RGWOp_MDLog_Info : public RGWRESTOp { + unsigned num_objects; + RGWPeriodHistory::Cursor period; +public: + RGWOp_MDLog_Info() : num_objects(0) {} + ~RGWOp_MDLog_Info() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "get_metadata_log_info"; + } +}; + +class RGWOp_MDLog_ShardInfo : public RGWRESTOp { + RGWMetadataLogInfo info; +public: + RGWOp_MDLog_ShardInfo() {} + ~RGWOp_MDLog_ShardInfo() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "get_metadata_log_shard_info"; + } +}; + +class RGWOp_MDLog_Lock : public RGWRESTOp { +public: + RGWOp_MDLog_Lock() {} + ~RGWOp_MDLog_Lock() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "lock_mdlog_object"; + } +}; + +class RGWOp_MDLog_Unlock : public RGWRESTOp { +public: + 
RGWOp_MDLog_Unlock() {} + ~RGWOp_MDLog_Unlock() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "unlock_mdlog_object"; + } +}; + +class RGWOp_MDLog_Notify : public RGWRESTOp { +public: + RGWOp_MDLog_Notify() {} + ~RGWOp_MDLog_Notify() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "mdlog_notify"; + } + RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; } +}; + +class RGWOp_MDLog_Delete : public RGWRESTOp { +public: + RGWOp_MDLog_Delete() {} + ~RGWOp_MDLog_Delete() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "trim_metadata_log"; + } +}; + +class RGWOp_DATALog_List : public RGWRESTOp { + std::vector entries; + std::string last_marker; + bool truncated; + bool extra_info; +public: + RGWOp_DATALog_List() : truncated(false), extra_info(false) {} + ~RGWOp_DATALog_List() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "list_data_changes_log"; + } +}; + +class RGWOp_DATALog_Info : public RGWRESTOp { + unsigned num_objects; +public: + RGWOp_DATALog_Info() : num_objects(0) {} + ~RGWOp_DATALog_Info() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return 
check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "get_data_changes_log_info"; + } +}; + +class RGWOp_DATALog_ShardInfo : public RGWRESTOp { + RGWDataChangesLogInfo info; +public: + RGWOp_DATALog_ShardInfo() {} + ~RGWOp_DATALog_ShardInfo() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "get_data_changes_log_shard_info"; + } +}; + +class RGWOp_DATALog_Notify : public RGWRESTOp { +public: + RGWOp_DATALog_Notify() {} + ~RGWOp_DATALog_Notify() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "datalog_notify"; + } + RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; } +}; + +class RGWOp_DATALog_Notify2 : public RGWRESTOp { + rgw_data_notify_entry data_notify; +public: + RGWOp_DATALog_Notify2() {} + ~RGWOp_DATALog_Notify2() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "datalog_notify2"; + } + RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; } +}; + +class RGWOp_DATALog_Delete : public RGWRESTOp { +public: + RGWOp_DATALog_Delete() {} + ~RGWOp_DATALog_Delete() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "trim_data_changes_log"; + } +}; + +class 
RGWHandler_Log : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + + int read_permissions(RGWOp*, optional_yield) override { + return 0; + } +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Log() override = default; +}; + +class RGWRESTMgr_Log : public RGWRESTMgr { +public: + RGWRESTMgr_Log() = default; + ~RGWRESTMgr_Log() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state* const, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefixs) override { + return new RGWHandler_Log(auth_registry); + } +}; diff --git a/src/rgw/driver/rados/rgw_rest_pubsub.h b/src/rgw/driver/rados/rgw_rest_pubsub.h new file mode 100644 index 000000000..27bde7a95 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_pubsub.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include "rgw_rest_s3.h" + +// s3 compliant notification handler factory +class RGWHandler_REST_PSNotifs_S3 : public RGWHandler_REST_S3 { +protected: + int init_permissions(RGWOp* op, optional_yield y) override {return 0;} + int read_permissions(RGWOp* op, optional_yield y) override {return 0;} + bool supports_quota() override {return false;} + RGWOp* op_get() override; + RGWOp* op_put() override; + RGWOp* op_delete() override; +public: + using RGWHandler_REST_S3::RGWHandler_REST_S3; + virtual ~RGWHandler_REST_PSNotifs_S3() = default; + // following are used to generate the operations when invoked by another REST handler + static RGWOp* create_get_op(); + static RGWOp* create_put_op(); + static RGWOp* create_delete_op(); +}; + +// AWS compliant topics handler factory +class RGWHandler_REST_PSTopic_AWS : public RGWHandler_REST { + const rgw::auth::StrategyRegistry& auth_registry; +protected: + RGWOp* op_post() override; +public: + RGWHandler_REST_PSTopic_AWS(const 
rgw::auth::StrategyRegistry& _auth_registry) : + auth_registry(_auth_registry) {} + virtual ~RGWHandler_REST_PSTopic_AWS() = default; + int postauth_init(optional_yield) override { return 0; } + int authorize(const DoutPrefixProvider* dpp, optional_yield y) override; + static bool action_exists(const req_state* s); +}; + diff --git a/src/rgw/driver/rados/rgw_rest_realm.cc b/src/rgw/driver/rados/rgw_rest_realm.cc new file mode 100644 index 000000000..79640a2a1 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_realm.cc @@ -0,0 +1,376 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" +#include "rgw_rest_realm.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_config.h" +#include "rgw_zone.h" +#include "rgw_sal_rados.h" + +#include "services/svc_zone.h" +#include "services/svc_mdlog.h" + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +// reject 'period push' if we would have to fetch too many intermediate periods +static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64; + +// base period op, shared between Get and Post +class RGWOp_Period_Base : public RGWRESTOp { + protected: + RGWPeriod period; + std::ostringstream error_stream; + public: + int verify_permission(optional_yield) override { return 0; } + void send_response() override; +}; + +// reply with the period object on success +void RGWOp_Period_Base::send_response() +{ + set_req_state_err(s, op_ret, error_stream.str()); + dump_errno(s); + + if (op_ret < 0) { + if (!s->err.message.empty()) { + ldpp_dout(this, 4) << "Request failed with " << op_ret + << ": " << s->err.message << dendl; + } + end_header(s); + return; + } + + encode_json("period", period, s->formatter); + end_header(s, NULL, "application/json", s->formatter->get_len()); + flusher.flush(); +} + +// GET /admin/realm/period +class RGWOp_Period_Get : public RGWOp_Period_Base { + public: + void execute(optional_yield y) 
override; + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + const char* name() const override { return "get_period"; } +}; + +void RGWOp_Period_Get::execute(optional_yield y) +{ + string realm_id, realm_name, period_id; + epoch_t epoch = 0; + RESTArgs::get_string(s, "realm_id", realm_id, &realm_id); + RESTArgs::get_string(s, "realm_name", realm_name, &realm_name); + RESTArgs::get_string(s, "period_id", period_id, &period_id); + RESTArgs::get_uint32(s, "epoch", 0, &epoch); + + period.set_id(period_id); + period.set_epoch(epoch); + + op_ret = period.init(this, driver->ctx(), static_cast(driver)->svc()->sysobj, realm_id, y, realm_name); + if (op_ret < 0) + ldpp_dout(this, 5) << "failed to read period" << dendl; +} + +// POST /admin/realm/period +class RGWOp_Period_Post : public RGWOp_Period_Base { + public: + void execute(optional_yield y) override; + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_WRITE); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + const char* name() const override { return "post_period"; } + RGWOpType get_type() override { return RGW_OP_PERIOD_POST; } +}; + +void RGWOp_Period_Post::execute(optional_yield y) +{ + auto cct = driver->ctx(); + + // initialize the period without reading from rados + period.init(this, cct, static_cast(driver)->svc()->sysobj, y, false); + + // decode the period from input + const auto max_size = cct->_conf->rgw_max_put_param_size; + bool empty; + op_ret = get_json_input(cct, s, period, max_size, &empty); + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to decode period" << dendl; + return; + } + + // require period.realm_id to match our realm + if (period.get_realm() != static_cast(driver)->svc()->zone->get_realm().get_id()) { + error_stream << "period with realm id " 
<< period.get_realm() + << " doesn't match current realm " << static_cast(driver)->svc()->zone->get_realm().get_id() << std::endl; + op_ret = -EINVAL; + return; + } + + // load the realm and current period from rados; there may be a more recent + // period that we haven't restarted with yet. we also don't want to modify + // the objects in use by RGWRados + RGWRealm realm(period.get_realm()); + op_ret = realm.init(this, cct, static_cast(driver)->svc()->sysobj, y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to read current realm: " + << cpp_strerror(-op_ret) << dendl; + return; + } + + RGWPeriod current_period; + op_ret = current_period.init(this, cct, static_cast(driver)->svc()->sysobj, realm.get_id(), y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to read current period: " + << cpp_strerror(-op_ret) << dendl; + return; + } + + // if period id is empty, handle as 'period commit' + if (period.get_id().empty()) { + op_ret = period.commit(this, driver, realm, current_period, error_stream, y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "master zone failed to commit period" << dendl; + } + return; + } + + // if it's not period commit, nobody is allowed to push to the master zone + if (period.get_master_zone() == static_cast(driver)->svc()->zone->get_zone_params().get_id()) { + ldpp_dout(this, 10) << "master zone rejecting period id=" + << period.get_id() << " epoch=" << period.get_epoch() << dendl; + op_ret = -EINVAL; // XXX: error code + return; + } + + // write the period to rados + op_ret = period.store_info(this, false, y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to store period " << period.get_id() << dendl; + return; + } + // set as latest epoch + op_ret = period.update_latest_epoch(this, period.get_epoch(), y); + if (op_ret == -EEXIST) { + // already have this epoch (or a more recent one) + ldpp_dout(this, 4) << "already have epoch >= " << period.get_epoch() + << " for period " << period.get_id() << dendl; + op_ret = 0; + 
return; + } + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to set latest epoch" << dendl; + return; + } + + auto period_history = static_cast(driver)->svc()->mdlog->get_period_history(); + + // decide whether we can set_current_period() or set_latest_epoch() + if (period.get_id() != current_period.get_id()) { + auto current_epoch = current_period.get_realm_epoch(); + // discard periods in the past + if (period.get_realm_epoch() < current_epoch) { + ldpp_dout(this, 10) << "discarding period " << period.get_id() + << " with realm epoch " << period.get_realm_epoch() + << " older than current epoch " << current_epoch << dendl; + // return success to ack that we have this period + return; + } + // discard periods too far in the future + if (period.get_realm_epoch() > current_epoch + PERIOD_HISTORY_FETCH_MAX) { + ldpp_dout(this, -1) << "discarding period " << period.get_id() + << " with realm epoch " << period.get_realm_epoch() << " too far in " + "the future from current epoch " << current_epoch << dendl; + op_ret = -ENOENT; // XXX: error code + return; + } + // attach a copy of the period into the period history + auto cursor = period_history->attach(this, RGWPeriod{period}, y); + if (!cursor) { + // we're missing some history between the new period and current_period + op_ret = cursor.get_error(); + ldpp_dout(this, -1) << "failed to collect the periods between current period " + << current_period.get_id() << " (realm epoch " << current_epoch + << ") and the new period " << period.get_id() + << " (realm epoch " << period.get_realm_epoch() + << "): " << cpp_strerror(-op_ret) << dendl; + return; + } + if (cursor.has_next()) { + // don't switch if we have a newer period in our history + ldpp_dout(this, 4) << "attached period " << period.get_id() + << " to history, but the history contains newer periods" << dendl; + return; + } + // set as current period + op_ret = realm.set_current_period(this, period, y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to 
update realm's current period" << dendl; + return; + } + ldpp_dout(this, 4) << "period " << period.get_id() + << " is newer than current period " << current_period.get_id() + << ", updating realm's current period and notifying zone" << dendl; + realm.notify_new_period(this, period, y); + return; + } + // reflect the period into our local objects + op_ret = period.reflect(this, y); + if (op_ret < 0) { + ldpp_dout(this, -1) << "failed to update local objects: " + << cpp_strerror(-op_ret) << dendl; + return; + } + ldpp_dout(this, 4) << "period epoch " << period.get_epoch() + << " is newer than current epoch " << current_period.get_epoch() + << ", updating period's latest epoch and notifying zone" << dendl; + realm.notify_new_period(this, period, y); + // update the period history + period_history->insert(RGWPeriod{period}); +} + +class RGWHandler_Period : public RGWHandler_Auth_S3 { + protected: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + + RGWOp *op_get() override { return new RGWOp_Period_Get; } + RGWOp *op_post() override { return new RGWOp_Period_Post; } +}; + +class RGWRESTMgr_Period : public RGWRESTMgr { + public: + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Period(auth_registry); + } +}; + + +// GET /admin/realm +class RGWOp_Realm_Get : public RGWRESTOp { + std::unique_ptr realm; +public: + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { return "get_realm"; } +}; + +void RGWOp_Realm_Get::execute(optional_yield y) +{ + string id; + RESTArgs::get_string(s, "id", id, &id); + string name; + RESTArgs::get_string(s, "name", name, &name); + + // read realm + 
realm.reset(new RGWRealm(id, name)); + op_ret = realm->init(this, g_ceph_context, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y); // NOTE(review): cast target restored after extraction stripped it - confirm against rgw_sal_rados.h + if (op_ret < 0) + ldpp_dout(this, -1) << "failed to read realm id=" << id + << " name=" << name << dendl; +} + +void RGWOp_Realm_Get::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + + if (op_ret < 0) { + end_header(s); + return; + } + + encode_json("realm", *realm, s->formatter); + end_header(s, NULL, "application/json", s->formatter->get_len()); + flusher.flush(); +} + +// GET /admin/realm?list +class RGWOp_Realm_List : public RGWRESTOp { + std::string default_id; + std::list<std::string> realms; // NOTE(review): element type restored after extraction stripped it - presumably realm names; confirm against list_realms() +public: + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { return "list_realms"; } +}; + +void RGWOp_Realm_List::execute(optional_yield y) +{ + { + // read default realm + RGWRealm realm(driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj); + [[maybe_unused]] int ret = realm.read_default_id(this, default_id, y); + } + op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->list_realms(this, realms); + if (op_ret < 0) + ldpp_dout(this, -1) << "failed to list realms" << dendl; +} + +void RGWOp_Realm_List::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + + if (op_ret < 0) { + end_header(s); + return; + } + + s->formatter->open_object_section("realms_list"); + encode_json("default_info", default_id, s->formatter); + encode_json("realms", realms, s->formatter); + s->formatter->close_section(); + end_header(s, NULL, "application/json", s->formatter->get_len()); + flusher.flush(); +} + +class RGWHandler_Realm : public RGWHandler_Auth_S3 { +protected: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + RGWOp *op_get() override { + if (s->info.args.sub_resource_exists("list")) + return 
new RGWOp_Realm_List; + return new RGWOp_Realm_Get; + } +}; + +RGWRESTMgr_Realm::RGWRESTMgr_Realm() +{ + // add the /admin/realm/period resource + register_resource("period", new RGWRESTMgr_Period); +} + +RGWHandler_REST* +RGWRESTMgr_Realm::get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) +{ + return new RGWHandler_Realm(auth_registry); +} diff --git a/src/rgw/driver/rados/rgw_rest_realm.h b/src/rgw/driver/rados/rgw_rest_realm.h new file mode 100644 index 000000000..a0d1dc1c9 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_realm.h @@ -0,0 +1,16 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest.h" + +class RGWRESTMgr_Realm : public RGWRESTMgr { +public: + RGWRESTMgr_Realm(); + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override; +}; diff --git a/src/rgw/driver/rados/rgw_rest_user.cc b/src/rgw/driver/rados/rgw_rest_user.cc new file mode 100644 index 000000000..361ceb0f7 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_user.cc @@ -0,0 +1,1137 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/ceph_json.h" + +#include "rgw_op.h" +#include "rgw_user.h" +#include "rgw_rest_user.h" +#include "rgw_sal.h" + +#include "include/str_list.h" +#include "include/ceph_assert.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int fetch_access_keys_from_master(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState &op_state, req_state *s, optional_yield y) { + bufferlist data; + JSONParser jp; + RGWUserInfo ui; + int op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, &jp, 
s->info, y); + if (op_ret < 0) { + ldpp_dout(dpp, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return op_ret; + } + ui.decode_json(&jp); + op_state.op_access_keys = std::move(ui.access_keys); + + return 0; +} + +class RGWOp_User_List : public RGWRESTOp { + +public: + RGWOp_User_List() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_READ); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "list_user"; } +}; + +void RGWOp_User_List::execute(optional_yield y) +{ + RGWUserAdminOpState op_state(driver); + + uint32_t max_entries; + std::string marker; + RESTArgs::get_uint32(s, "max-entries", 1000, &max_entries); + RESTArgs::get_string(s, "marker", marker, &marker); + + op_state.max_entries = max_entries; + op_state.marker = marker; + op_ret = RGWUserAdminOp_User::list(this, driver, op_state, flusher); +} + +class RGWOp_User_Info : public RGWRESTOp { + +public: + RGWOp_User_Info() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_READ); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "get_user_info"; } +}; + +void RGWOp_User_Info::execute(optional_yield y) +{ + RGWUserAdminOpState op_state(driver); + + std::string uid_str, access_key_str; + bool fetch_stats; + bool sync_stats; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "access-key", access_key_str, &access_key_str); + + // if uid was not supplied in rest argument, error out now, otherwise we'll + // end up initializing anonymous user, for which keys.init will eventually + // return -EACESS + if (uid_str.empty() && access_key_str.empty()){ + op_ret=-EINVAL; + return; + } + + rgw_user uid(uid_str); + + RESTArgs::get_bool(s, "stats", false, &fetch_stats); + + RESTArgs::get_bool(s, "sync", false, &sync_stats); + + op_state.set_user_id(uid); + 
op_state.set_access_key(access_key_str); + op_state.set_fetch_stats(fetch_stats); + op_state.set_sync_stats(sync_stats); + + op_ret = RGWUserAdminOp_User::info(s, driver, op_state, flusher, y); +} + +class RGWOp_User_Create : public RGWRESTOp { + +public: + RGWOp_User_Create() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "create_user"; } +}; + +void RGWOp_User_Create::execute(optional_yield y) +{ + std::string uid_str; + std::string display_name; + std::string email; + std::string access_key; + std::string secret_key; + std::string key_type_str; + std::string caps; + std::string tenant_name; + std::string op_mask_str; + std::string default_placement_str; + std::string placement_tags_str; + + bool gen_key; + bool suspended; + bool system; + bool exclusive; + + int32_t max_buckets; + const int32_t default_max_buckets = + s->cct->_conf.get_val<int64_t>("rgw_user_max_buckets"); // NOTE(review): template argument restored after extraction stripped it; int64_t assumed for an integer option - confirm against the option schema + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "display-name", display_name, &display_name); + RESTArgs::get_string(s, "email", email, &email); + RESTArgs::get_string(s, "access-key", access_key, &access_key); + RESTArgs::get_string(s, "secret-key", secret_key, &secret_key); + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + RESTArgs::get_string(s, "user-caps", caps, &caps); + RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name); + RESTArgs::get_bool(s, "generate-key", true, &gen_key); + RESTArgs::get_bool(s, "suspended", false, &suspended); + RESTArgs::get_int32(s, "max-buckets", default_max_buckets, &max_buckets); + RESTArgs::get_bool(s, "system", false, &system); + RESTArgs::get_bool(s, "exclusive", false, &exclusive); + RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str); + RESTArgs::get_string(s, 
"default-placement", default_placement_str, &default_placement_str); + RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str); + + if (!s->user->get_info().system && system) { + ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl; + op_ret = -EINVAL; + return; + } + + if (!tenant_name.empty()) { + uid.tenant = tenant_name; + } + + // TODO: validate required args are passed in. (for eg. uid and display_name here) + op_state.set_user_id(uid); + op_state.set_display_name(display_name); + op_state.set_user_email(email); + op_state.set_caps(caps); + op_state.set_access_key(access_key); + op_state.set_secret_key(secret_key); + + if (!op_mask_str.empty()) { + uint32_t op_mask; + int ret = rgw_parse_op_type_list(op_mask_str, &op_mask); + if (ret < 0) { + ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl; + op_ret = -EINVAL; + return; + } + op_state.set_op_mask(op_mask); + } + + if (!key_type_str.empty()) { + int32_t key_type = KEY_TYPE_UNDEFINED; + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + key_type = KEY_TYPE_S3; + + op_state.set_key_type(key_type); + } + + if (max_buckets != default_max_buckets) { + if (max_buckets < 0) { + max_buckets = -1; + } + op_state.set_max_buckets(max_buckets); + } + if (s->info.args.exists("suspended")) + op_state.set_suspension(suspended); + + if (s->info.args.exists("system")) + op_state.set_system(system); + + if (s->info.args.exists("exclusive")) + op_state.set_exclusive(exclusive); + + if (!default_placement_str.empty()) { + rgw_placement_rule target_rule; + target_rule.from_str(default_placement_str); + if (!driver->valid_placement(target_rule)) { + ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl; + op_ret = -EINVAL; + return; + } + op_state.set_default_placement(target_rule); + } + + if (!placement_tags_str.empty()) { + list placement_tags_list; + 
get_str_list(placement_tags_str, ",", placement_tags_list); + op_state.set_placement_tags(placement_tags_list); + } + + if(!(driver->is_meta_master())) { + op_ret = fetch_access_keys_from_master(this, driver, op_state, s, y); + + if(op_ret < 0) { + return; + } else { + // set_generate_key() is not set if keys have already been fetched from master zone + gen_key = false; + } + } + + if (gen_key) { + op_state.set_generate_key(); + } + + op_ret = RGWUserAdminOp_User::create(s, driver, op_state, flusher, y); +} + +class RGWOp_User_Modify : public RGWRESTOp { + +public: + RGWOp_User_Modify() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "modify_user"; } +}; + +void RGWOp_User_Modify::execute(optional_yield y) +{ + std::string uid_str; + std::string display_name; + std::string email; + std::string access_key; + std::string secret_key; + std::string key_type_str; + std::string op_mask_str; + std::string default_placement_str; + std::string placement_tags_str; + + bool gen_key; + bool suspended; + bool system; + bool email_set; + bool quota_set; + int32_t max_buckets; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "display-name", display_name, &display_name); + RESTArgs::get_string(s, "email", email, &email, &email_set); + RESTArgs::get_string(s, "access-key", access_key, &access_key); + RESTArgs::get_string(s, "secret-key", secret_key, &secret_key); + RESTArgs::get_bool(s, "generate-key", false, &gen_key); + RESTArgs::get_bool(s, "suspended", false, &suspended); + RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, &quota_set); // quota_set gates the later set_max_buckets() call; presumably set when "max-buckets" was supplied, like email_set - TODO confirm + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + + RESTArgs::get_bool(s, "system", false, &system); + RESTArgs::get_string(s, "op-mask", op_mask_str, 
&op_mask_str); + RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str); + RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str); + + if (!s->user->get_info().system && system) { + ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl; + op_ret = -EINVAL; + return; + } + + op_state.set_user_id(uid); + op_state.set_display_name(display_name); + + if (email_set) + op_state.set_user_email(email); + + op_state.set_access_key(access_key); + op_state.set_secret_key(secret_key); + + if (quota_set) { + if (max_buckets < 0 ) { + max_buckets = -1; + } + op_state.set_max_buckets(max_buckets); + } + + if (!key_type_str.empty()) { + int32_t key_type = KEY_TYPE_UNDEFINED; + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + key_type = KEY_TYPE_S3; + + op_state.set_key_type(key_type); + } + + if (!op_mask_str.empty()) { + uint32_t op_mask; + if (rgw_parse_op_type_list(op_mask_str, &op_mask) < 0) { + ldpp_dout(this, 0) << "failed to parse op_mask" << dendl; + op_ret = -EINVAL; + return; + } + op_state.set_op_mask(op_mask); + } + + if (s->info.args.exists("suspended")) + op_state.set_suspension(suspended); + + if (s->info.args.exists("system")) + op_state.set_system(system); + + if (!op_mask_str.empty()) { + uint32_t op_mask; + int ret = rgw_parse_op_type_list(op_mask_str, &op_mask); + if (ret < 0) { + ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl; + op_ret = -EINVAL; + return; + } + op_state.set_op_mask(op_mask); + } + + if (!default_placement_str.empty()) { + rgw_placement_rule target_rule; + target_rule.from_str(default_placement_str); + if (!driver->valid_placement(target_rule)) { + ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl; + op_ret = -EINVAL; + return; + } + op_state.set_default_placement(target_rule); + } + + if (!placement_tags_str.empty()) { + list 
placement_tags_list; + get_str_list(placement_tags_str, ",", placement_tags_list); + op_state.set_placement_tags(placement_tags_list); + } + + if(!(driver->is_meta_master())) { + op_ret = fetch_access_keys_from_master(this, driver, op_state, s, y); + + if(op_ret < 0) { + return; + } else { + // set_generate_key() is not set if keys have already been fetched from master zone + gen_key = false; + } + } + + if (gen_key) { + op_state.set_generate_key(); + } + + op_ret = RGWUserAdminOp_User::modify(s, driver, op_state, flusher, y); +} + +class RGWOp_User_Remove : public RGWRESTOp { + +public: + RGWOp_User_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_user"; } +}; + +void RGWOp_User_Remove::execute(optional_yield y) +{ + std::string uid_str; + bool purge_data; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_bool(s, "purge-data", false, &purge_data); + + // FIXME: no double checking + if (!uid.empty()) + op_state.set_user_id(uid); + + op_state.set_purge_data(purge_data); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWUserAdminOp_User::remove(s, driver, op_state, flusher, s->yield); +} + +class RGWOp_Subuser_Create : public RGWRESTOp { + +public: + RGWOp_Subuser_Create() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "create_subuser"; } +}; + +void RGWOp_Subuser_Create::execute(optional_yield y) +{ + std::string uid_str; + std::string subuser; + 
std::string secret_key; + std::string access_key; + std::string perm_str; + std::string key_type_str; + + bool gen_subuser = false; // FIXME placeholder + bool gen_secret; + bool gen_access; + + uint32_t perm_mask = 0; + int32_t key_type = KEY_TYPE_SWIFT; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "subuser", subuser, &subuser); + RESTArgs::get_string(s, "access-key", access_key, &access_key); + RESTArgs::get_string(s, "secret-key", secret_key, &secret_key); + RESTArgs::get_string(s, "access", perm_str, &perm_str); + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + RESTArgs::get_bool(s, "generate-secret", false, &gen_secret); + RESTArgs::get_bool(s, "gen-access-key", false, &gen_access); + + perm_mask = rgw_str_to_perm(perm_str.c_str()); + op_state.set_perm(perm_mask); + + op_state.set_user_id(uid); + op_state.set_subuser(subuser); + op_state.set_access_key(access_key); + op_state.set_secret_key(secret_key); + op_state.set_generate_subuser(gen_subuser); + + if (gen_access) + op_state.set_gen_access(); + + if (gen_secret) + op_state.set_gen_secret(); + + if (!key_type_str.empty()) { + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + key_type = KEY_TYPE_S3; + } + op_state.set_key_type(key_type); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWUserAdminOp_Subuser::create(s, driver, op_state, flusher, y); +} + +class RGWOp_Subuser_Modify : public RGWRESTOp { + +public: + RGWOp_Subuser_Modify() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const 
override { return "modify_subuser"; } +}; + +void RGWOp_Subuser_Modify::execute(optional_yield y) +{ + std::string uid_str; + std::string subuser; + std::string secret_key; + std::string key_type_str; + std::string perm_str; + + RGWUserAdminOpState op_state(driver); + + uint32_t perm_mask; + int32_t key_type = KEY_TYPE_SWIFT; + + bool gen_secret; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "subuser", subuser, &subuser); + RESTArgs::get_string(s, "secret-key", secret_key, &secret_key); + RESTArgs::get_string(s, "access", perm_str, &perm_str); + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + RESTArgs::get_bool(s, "generate-secret", false, &gen_secret); + + perm_mask = rgw_str_to_perm(perm_str.c_str()); + op_state.set_perm(perm_mask); + + op_state.set_user_id(uid); + op_state.set_subuser(subuser); + + if (!secret_key.empty()) + op_state.set_secret_key(secret_key); + + if (gen_secret) + op_state.set_gen_secret(); + + if (!key_type_str.empty()) { + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + key_type = KEY_TYPE_S3; + } + op_state.set_key_type(key_type); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWUserAdminOp_Subuser::modify(s, driver, op_state, flusher, y); +} + +class RGWOp_Subuser_Remove : public RGWRESTOp { + +public: + RGWOp_Subuser_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_subuser"; } +}; + +void RGWOp_Subuser_Remove::execute(optional_yield y) +{ + std::string uid_str; + std::string subuser; + bool purge_keys; + + 
RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "subuser", subuser, &subuser); + RESTArgs::get_bool(s, "purge-keys", true, &purge_keys); + + op_state.set_user_id(uid); + op_state.set_subuser(subuser); + + if (purge_keys) + op_state.set_purge_keys(); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWUserAdminOp_Subuser::remove(s, driver, op_state, flusher, y); +} + +class RGWOp_Key_Create : public RGWRESTOp { + +public: + RGWOp_Key_Create() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "create_access_key"; } +}; + +void RGWOp_Key_Create::execute(optional_yield y) +{ + std::string uid_str; + std::string subuser; + std::string access_key; + std::string secret_key; + std::string key_type_str; + + bool gen_key; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "subuser", subuser, &subuser); + RESTArgs::get_string(s, "access-key", access_key, &access_key); + RESTArgs::get_string(s, "secret-key", secret_key, &secret_key); + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + RESTArgs::get_bool(s, "generate-key", true, &gen_key); + + op_state.set_user_id(uid); + op_state.set_subuser(subuser); + op_state.set_access_key(access_key); + op_state.set_secret_key(secret_key); + + if (gen_key) + op_state.set_generate_key(); + + if (!key_type_str.empty()) { + int32_t key_type = KEY_TYPE_UNDEFINED; + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + 
key_type = KEY_TYPE_S3; + + op_state.set_key_type(key_type); + } + + op_ret = RGWUserAdminOp_Key::create(s, driver, op_state, flusher, y); +} + +class RGWOp_Key_Remove : public RGWRESTOp { + +public: + RGWOp_Key_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_access_key"; } +}; + +void RGWOp_Key_Remove::execute(optional_yield y) +{ + std::string uid_str; + std::string subuser; + std::string access_key; + std::string key_type_str; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "subuser", subuser, &subuser); + RESTArgs::get_string(s, "access-key", access_key, &access_key); + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + + op_state.set_user_id(uid); + op_state.set_subuser(subuser); + op_state.set_access_key(access_key); + + if (!key_type_str.empty()) { + int32_t key_type = KEY_TYPE_UNDEFINED; + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + key_type = KEY_TYPE_S3; + + op_state.set_key_type(key_type); + } + + op_ret = RGWUserAdminOp_Key::remove(s, driver, op_state, flusher, y); +} + +class RGWOp_Caps_Add : public RGWRESTOp { + +public: + RGWOp_Caps_Add() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "add_user_caps"; } +}; + +void RGWOp_Caps_Add::execute(optional_yield y) +{ + std::string uid_str; + std::string caps; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "user-caps", caps, &caps); + + op_state.set_user_id(uid); + op_state.set_caps(caps); + + 
bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWUserAdminOp_Caps::add(s, driver, op_state, flusher, y); +} + +class RGWOp_Caps_Remove : public RGWRESTOp { + +public: + RGWOp_Caps_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_user_caps"; } +}; + +void RGWOp_Caps_Remove::execute(optional_yield y) +{ + std::string uid_str; + std::string caps; + + RGWUserAdminOpState op_state(driver); + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "user-caps", caps, &caps); + + op_state.set_user_id(uid); + op_state.set_caps(caps); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWUserAdminOp_Caps::remove(s, driver, op_state, flusher, y); +} + +struct UserQuotas { + RGWQuota quota; + + UserQuotas() {} + + explicit UserQuotas(RGWUserInfo& info){ + quota.bucket_quota = info.quota.bucket_quota; + quota.user_quota = info.quota.user_quota; + } + + void dump(Formatter *f) const { + encode_json("bucket_quota", quota.bucket_quota, f); + encode_json("user_quota", quota.user_quota, f); + } + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj); + JSONDecoder::decode_json("user_quota", quota.user_quota, obj); + } +}; + +class RGWOp_Quota_Info : public RGWRESTOp { + +public: + RGWOp_Quota_Info() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_READ); + } + + void 
execute(optional_yield y) override; + + const char* name() const override { return "get_quota_info"; } +}; + + +void RGWOp_Quota_Info::execute(optional_yield y) +{ + RGWUserAdminOpState op_state(driver); + + std::string uid_str; + std::string quota_type; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "quota-type", quota_type, "a_type); + + if (uid_str.empty()) { + op_ret = -EINVAL; + return; + } + + rgw_user uid(uid_str); + + bool show_all = quota_type.empty(); + bool show_bucket = show_all || (quota_type == "bucket"); + bool show_user = show_all || (quota_type == "user"); + + if (!(show_all || show_bucket || show_user)) { + op_ret = -EINVAL; + return; + } + + op_state.set_user_id(uid); + + RGWUser user; + op_ret = user.init(s, driver, op_state, y); + if (op_ret < 0) + return; + + if (!op_state.has_existing_user()) { + op_ret = -ERR_NO_SUCH_USER; + return; + } + + RGWUserInfo info; + string err_msg; + op_ret = user.info(info, &err_msg); + if (op_ret < 0) + return; + + flusher.start(0); + if (show_all) { + UserQuotas quotas(info); + encode_json("quota", quotas, s->formatter); + } else if (show_user) { + encode_json("user_quota", info.quota.user_quota, s->formatter); + } else { + encode_json("bucket_quota", info.quota.bucket_quota, s->formatter); + } + + flusher.flush(); +} + +class RGWOp_Quota_Set : public RGWRESTOp { + +public: + RGWOp_Quota_Set() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "set_quota_info"; } +}; + +/** + * set quota + * + * two different ways to set the quota info: as json struct in the message body or via http params. 
+ *
+ * as json:
+ *
+ * PUT /admin/user?quota&uid=<uid>[&quota-type=<type>]
+ *
+ * whereas quota-type is optional and is either user, or bucket
+ *
+ * if quota-type is not specified then we expect to get a structure that contains both quotas,
+ * otherwise we'll only get the relevant configuration.
+ *
+ * E.g., if quota type not specified:
+ * {
+ *    "user_quota" : {
+ *      "max_size_kb" : 4096,
+ *      "max_objects" : -1,
+ *      "enabled" : false
+ *    },
+ *    "bucket_quota" : {
+ *      "max_size_kb" : 1024,
+ *      "max_objects" : -1,
+ *      "enabled" : true
+ *    }
+ * }
+ *
+ *
+ * or if quota type is specified:
+ * {
+ *   "max_size_kb" : 4096,
+ *   "max_objects" : -1,
+ *   "enabled" : false
+ * }
+ *
+ * Another option is not to pass any body and set the following http params:
+ *
+ *
+ * max-size-kb=<size>
+ * max-objects=<max objects>
+ * enabled[={true,false}]
+ *
+ * all params are optional and default to the current settings. With this type of configuration the
+ * quota-type param is mandatory.
+ *
+ */
+
+void RGWOp_Quota_Set::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str;
+  std::string quota_type;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+  if (uid_str.empty()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  bool set_all = quota_type.empty();
+  bool set_bucket = set_all || (quota_type == "bucket");
+  bool set_user = set_all || (quota_type == "user");
+
+  if (!(set_all || set_bucket || set_user)) {
+    ldpp_dout(this, 20) << "invalid quota type" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  bool use_http_params;
+
+  // a body with a known length means JSON input; with no length, only a
+  // chunked transfer encoding can still carry a body
+  if (s->content_length > 0) {
+    use_http_params = false;
+  } else {
+    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+  }
+
+  if (use_http_params && set_all) {
+    ldpp_dout(this, 20) << "quota type was not specified, can't set all quotas via http headers" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+
+  RGWUser user;
+  op_ret = user.init(s, driver, op_state, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "failed initializing user info: " << op_ret << dendl;
+    return;
+  }
+
+  if (!op_state.has_existing_user()) {
+    op_ret = -ERR_NO_SUCH_USER;
+    return;
+  }
+
+  // upper bound passed to get_json_input() for the request body
+  constexpr int quota_input_max_len = 1024;
+  if (set_all) {
+    UserQuotas quotas;
+
+    if ((op_ret = get_json_input(driver->ctx(), s, quotas, quota_input_max_len, nullptr)) < 0) {
+      ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+      return;
+    }
+
+    op_state.set_user_quota(quotas.quota.user_quota);
+    op_state.set_bucket_quota(quotas.quota.bucket_quota);
+  } else {
+    RGWQuotaInfo quota;
+
+    if (!use_http_params) {
+      // initialised so the check below never reads an indeterminate value,
+      // whatever get_json_input() does with the out-parameter on failure
+      bool empty = false;
+      op_ret = get_json_input(driver->ctx(), s, quota, quota_input_max_len, &empty);
+      if (op_ret < 0) {
+        ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+        if (!empty)
+          return;
+
+        /* was probably chunked input, but no content provided, configure via http params */
+        use_http_params = true;
+      }
+    }
+
+    if (use_http_params) {
+      RGWUserInfo info;
+      string err_msg;
+      op_ret = user.info(info, &err_msg);
+      if (op_ret < 0) {
+        ldpp_dout(this, 20) << "failed to get user info: " << op_ret << dendl;
+        return;
+      }
+      // the stored quota supplies the default for every param not sent
+      RGWQuotaInfo *old_quota;
+      if (set_user) {
+        old_quota = &info.quota.user_quota;
+      } else {
+        old_quota = &info.quota.bucket_quota;
+      }
+
+      RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+      RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+      int64_t max_size_kb = 0;
+      bool has_max_size_kb = false;
+      RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
+      // max-size-kb, when present, takes precedence over max-size
+      if (has_max_size_kb) {
+        quota.max_size = max_size_kb * 1024;
+      }
+      RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+    }
+
+    if (set_user) {
+      op_state.set_user_quota(quota);
+    } else {
+      op_state.set_bucket_quota(quota);
+    }
+  }
+
+  string err;
+  op_ret = user.modify(s, op_state, y, &err);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "failed updating user info: " << op_ret << ": " << err << dendl;
+    return;
+  }
+}
+
+// GET: the "quota" and "list" sub-resources pick their own op; anything else is user info.
+RGWOp *RGWHandler_User::op_get()
+{
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Info;
+
+  if (s->info.args.sub_resource_exists("list"))
+    return new RGWOp_User_List;
+
+  return new RGWOp_User_Info;
+}
+
+// PUT: "subuser", "key", "caps" and "quota" are checked in that order; the fallback creates a user.
+RGWOp *RGWHandler_User::op_put()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Create;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Create;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Add;
+
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Set;
+
+  return new RGWOp_User_Create;
+}
+
+// POST: modifies a subuser when "subuser" is present, otherwise the user.
+RGWOp *RGWHandler_User::op_post()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Modify;
+
+  return new RGWOp_User_Modify;
+}
+
+// DELETE: "subuser", "key" and "caps" are checked in that order; the fallback removes the user.
+RGWOp *RGWHandler_User::op_delete()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Remove;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Remove;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Remove;
+
+  return new RGWOp_User_Remove;
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_user.h b/src/rgw/driver/rados/rgw_rest_user.h
new file mode 100644
index 000000000..ee585be45
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_user.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+class RGWHandler_User : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_post() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_User() override = default;
+
+  int read_permissions(RGWOp*, optional_yield) override {  // nothing to load; always reports success
+    return 0;
+  }
+};
+
+class RGWRESTMgr_User : public RGWRESTMgr {
+public:
+  RGWRESTMgr_User() = default;
+  ~RGWRESTMgr_User() override = default;
+
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+			       req_state*,
+			       const rgw::auth::StrategyRegistry& auth_registry,
+			       const std::string&) override {
+    return new RGWHandler_User(auth_registry);  // a fresh handler per call
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
new file mode 100644
index 000000000..9acdb79d3
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -0,0 +1,3846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/Clock.h"
+#include "common/errno.h"
+
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_bucket.h"
+#include "rgw_multi.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_tracer.h"
+
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_service.h"
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_rest_admin.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_rest_log.h"
+#include "rgw_rest_config.h"
+#include "rgw_rest_ratelimit.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_user.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_zone.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_quota.h"
+#include "services/svc_config_key.h"
+#include "services/svc_zone_utils.h" +#include "services/svc_role_rados.h" +#include "services/svc_user.h" +#include "cls/rgw/cls_rgw_client.h" + +#include "rgw_pubsub.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static string mp_ns = RGW_OBJ_NS_MULTIPART; + +namespace rgw::sal { + +// default number of entries to list with each bucket listing call +// (use marker to bridge between calls) +static constexpr size_t listing_max_entries = 1000; +static std::string pubsub_oid_prefix = "pubsub."; + +static int decode_policy(CephContext* cct, + bufferlist& bl, + RGWAccessControlPolicy* policy) +{ + auto iter = bl.cbegin(); + try { + policy->decode(iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (cct->_conf->subsys.should_gather()) { + ldout(cct, 15) << __func__ << " Read AccessControlPolicy"; + RGWAccessControlPolicy_S3* s3policy = static_cast(policy); + s3policy->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + +static int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider* dpp, + RadosStore* store, + User* user, + Attrs& bucket_attrs, + RGWAccessControlPolicy* policy, + optional_yield y) +{ + auto aiter = bucket_attrs.find(RGW_ATTR_ACL); + + if (aiter != bucket_attrs.end()) { + int ret = decode_policy(store->ctx(), aiter->second, policy); + if (ret < 0) + return ret; + } else { + ldout(store->ctx(), 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl; + /* object exists, but policy is broken */ + int r = user->load_user(dpp, y); + if (r < 0) + return r; + + policy->create_default(user->get_id(), user->get_display_name()); + } + return 0; +} + +static int drain_aio(std::list& handles) +{ + int ret = 0; + while (!handles.empty()) { + librados::AioCompletion* handle = handles.front(); + handles.pop_front(); + handle->wait_for_complete(); + int r = handle->get_return_value(); + handle->release(); + if (r < 
0) { + ret = r; + } + } + return ret; +} + +int RadosCompletions::drain() +{ + return drain_aio(handles); +} + +int RadosUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, + const std::string& end_marker, uint64_t max, bool need_stats, + BucketList &buckets, optional_yield y) +{ + RGWUserBuckets ulist; + bool is_truncated = false; + int ret; + + buckets.clear(); + ret = store->ctl()->user->list_buckets(dpp, info.user_id, marker, end_marker, max, + need_stats, &ulist, &is_truncated, y); + if (ret < 0) + return ret; + + buckets.set_truncated(is_truncated); + for (const auto& ent : ulist.get_buckets()) { + buckets.add(std::unique_ptr(new RadosBucket(this->store, ent.second, this))); + } + + return 0; +} + +int RadosUser::create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + std::unique_ptr* bucket_out, + optional_yield y) +{ + int ret; + bufferlist in_data; + RGWBucketInfo master_info; + rgw_bucket* pmaster_bucket; + uint32_t* pmaster_num_shards; + real_time creation_time; + std::unique_ptr bucket; + obj_version objv,* pobjv = NULL; + + /* If it exists, look it up; otherwise create it */ + ret = store->get_bucket(dpp, this, b, &bucket, y); + if (ret < 0 && ret != -ENOENT) + return ret; + + if (ret != -ENOENT) { + RGWAccessControlPolicy old_policy(store->ctx()); + *existed = true; + if (swift_ver_location.empty()) { + swift_ver_location = bucket->get_info().swift_ver_location; + } + placement_rule.inherit_from(bucket->get_info().placement_rule); + + // don't allow changes to the acl policy + int r = rgw_op_get_bucket_policy_from_attr(dpp, store, this, bucket->get_attrs(), + &old_policy, y); + if (r >= 0 
&& old_policy != policy) { + bucket_out->swap(bucket); + return -EEXIST; + } + } else { + bucket = std::unique_ptr(new RadosBucket(store, b, this)); + *existed = false; + bucket->set_attrs(attrs); + } + + if (!store->svc()->zone->is_meta_master()) { + JSONParser jp; + ret = store->forward_request_to_master(dpp, this, NULL, in_data, &jp, req_info, y); + if (ret < 0) { + return ret; + } + + JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp); + JSONDecoder::decode_json("object_ver", objv, &jp); + JSONDecoder::decode_json("bucket_info", master_info, &jp); + ldpp_dout(dpp, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl; + std::time_t ctime = ceph::real_clock::to_time_t(master_info.creation_time); + ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl; + pmaster_bucket= &master_info.bucket; + creation_time = master_info.creation_time; + pmaster_num_shards = &master_info.layout.current_index.layout.normal.num_shards; + pobjv = &objv; + if (master_info.obj_lock_enabled()) { + info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED; + } + } else { + pmaster_bucket = NULL; + pmaster_num_shards = NULL; + if (obj_lock_enabled) + info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED; + } + + std::string zid = zonegroup_id; + if (zid.empty()) { + zid = store->svc()->zone->get_zonegroup().get_id(); + } + + if (*existed) { + rgw_placement_rule selected_placement_rule; + ret = store->svc()->zone->select_bucket_placement(dpp, this->get_info(), + zid, placement_rule, + &selected_placement_rule, nullptr, y); + if (selected_placement_rule != info.placement_rule) { + ret = -EEXIST; + bucket_out->swap(bucket); + return ret; + } + } else { + + ret = store->getRados()->create_bucket(this->get_info(), bucket->get_key(), + zid, placement_rule, swift_ver_location, pquota_info, + attrs, info, pobjv, &ep_objv, creation_time, + pmaster_bucket, pmaster_num_shards, y, dpp, + exclusive); + if (ret == 
-EEXIST) { + *existed = true; + /* bucket already existed, might have raced with another bucket creation, + * or might be partial bucket creation that never completed. Read existing + * bucket info, verify that the reported bucket owner is the current user. + * If all is ok then update the user's list of buckets. Otherwise inform + * client about a name conflict. + */ + if (info.owner.compare(this->get_id()) != 0) { + return -EEXIST; + } + ret = 0; + } else if (ret != 0) { + return ret; + } + } + + bucket->set_version(ep_objv); + bucket->get_info() = info; + + RadosBucket* rbucket = static_cast(bucket.get()); + ret = rbucket->link(dpp, this, y, false); + if (ret && !*existed && ret != -EEXIST) { + /* if it exists (or previously existed), don't remove it! */ + ret = rbucket->unlink(dpp, this, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << ret + << dendl; + } + } else if (ret == -EEXIST || (ret == 0 && *existed)) { + ret = -ERR_BUCKET_EXISTS; + } + + bucket_out->swap(bucket); + + return ret; +} + +int RadosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y) +{ + return store->ctl()->user->get_attrs_by_uid(dpp, get_id(), &attrs, y, &objv_tracker); +} + +int RadosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) +{ + for(auto& it : new_attrs) { + attrs[it.first] = it.second; + } + return store_user(dpp, y, false); +} + +int RadosUser::read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time* last_stats_sync, + ceph::real_time* last_stats_update) +{ + return store->ctl()->user->read_stats(dpp, get_id(), stats, y, last_stats_sync, last_stats_update); +} + +int RadosUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) +{ + return store->svc()->user->read_stats_async(dpp, get_id(), cb); +} + +int RadosUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) +{ + return 
store->svc()->user->complete_flush_stats(dpp, get_id(), y); +} + +int RadosUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, + RGWUsageIter& usage_iter, + map& usage) +{ + std::string bucket_name; + return store->getRados()->read_usage(dpp, get_id(), bucket_name, start_epoch, + end_epoch, max_entries, is_truncated, + usage_iter, usage); +} + +int RadosUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) +{ + std::string bucket_name; + + return store->getRados()->trim_usage(dpp, get_id(), bucket_name, start_epoch, end_epoch); +} + +int RadosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y) +{ + return store->ctl()->user->get_info_by_uid(dpp, info.user_id, &info, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker).set_attrs(&attrs)); +} + +int RadosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info) +{ + return store->ctl()->user->store_info(dpp, info, y, + RGWUserCtl::PutParams().set_objv_tracker(&objv_tracker) + .set_exclusive(exclusive) + .set_attrs(&attrs) + .set_old_info(old_info)); +} + +int RadosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) +{ + return store->ctl()->user->remove_info(dpp, info, y, + RGWUserCtl::RemoveParams().set_objv_tracker(&objv_tracker)); +} + +int RadosUser::verify_mfa(const std::string& mfa_str, bool* verified, + const DoutPrefixProvider* dpp, optional_yield y) +{ + vector params; + get_str_vec(mfa_str, " ", params); + + if (params.size() != 2) { + ldpp_dout(dpp, 5) << "NOTICE: invalid mfa string provided: " << mfa_str << dendl; + return -EINVAL; + } + + string& serial = params[0]; + string& pin = params[1]; + + auto i = info.mfa_ids.find(serial); + if (i == info.mfa_ids.end()) { + ldpp_dout(dpp, 5) << "NOTICE: user does not have mfa device with serial=" << serial << dendl; + return -EACCES; + } + + int ret = 
store->svc()->cls->mfa.check_mfa(dpp, info.user_id, serial, pin, y); + if (ret < 0) { + ldpp_dout(dpp, 20) << "NOTICE: failed to check MFA, serial=" << serial << dendl; + return -EACCES; + } + + *verified = true; + + return 0; +} + +RadosBucket::~RadosBucket() {} + +int RadosBucket::remove_bucket(const DoutPrefixProvider* dpp, + bool delete_children, + bool forward_to_master, + req_info* req_info, + optional_yield y) +{ + int ret; + + // Refresh info + ret = load_bucket(dpp, y); + if (ret < 0) { + return ret; + } + + ListParams params; + params.list_versions = true; + params.allow_unordered = true; + + ListResults results; + + do { + results.objs.clear(); + + ret = list(dpp, params, 1000, results, y); + if (ret < 0) { + return ret; + } + + if (!results.objs.empty() && !delete_children) { + ldpp_dout(dpp, -1) << "ERROR: could not remove non-empty bucket " << info.bucket.name << + dendl; + return -ENOTEMPTY; + } + + for (const auto& obj : results.objs) { + rgw_obj_key key(obj.key); + /* xxx dang */ + ret = rgw_remove_object(dpp, store, this, key); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + } + } while(results.is_truncated); + + ret = abort_multiparts(dpp, store->ctx()); + if (ret < 0) { + return ret; + } + + // remove lifecycle config, if any (XXX note could be made generic) + (void) store->getRados()->get_lc()->remove_bucket_config( + this, get_attrs()); + + ret = store->ctl()->bucket->sync_user_stats(dpp, info.owner, info, y, nullptr); + if (ret < 0) { + ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. 
ret=" << ret << dendl; + } + + RGWObjVersionTracker ot; + + // if we deleted children above we will force delete, as any that + // remain is detrius from a prior bug + ret = store->getRados()->delete_bucket(info, ot, y, dpp, !delete_children); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " << + info.bucket.name << dendl; + return ret; + } + + // if bucket has notification definitions associated with it + // they should be removed (note that any pending notifications on the bucket are still going to be sent) + const RGWPubSub ps(store, info.owner.tenant); + const RGWPubSub::Bucket ps_bucket(ps, this); + const auto ps_ret = ps_bucket.remove_notifications(dpp, y); + if (ps_ret < 0 && ps_ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: unable to remove notifications from bucket. ret=" << ps_ret << dendl; + } + + ret = store->ctl()->bucket->unlink_bucket(info.owner, info.bucket, y, dpp, false); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: unable to remove user bucket information" << dendl; + } + + if (forward_to_master) { + bufferlist in_data; + ret = store->forward_request_to_master(dpp, owner, &ot.read_version, in_data, nullptr, *req_info, y); + if (ret < 0) { + if (ret == -ENOENT) { + /* adjust error, we want to return with NoSuchBucket and not + * NoSuchKey */ + ret = -ERR_NO_SUCH_BUCKET; + } + return ret; + } + } + + return ret; +} + +int RadosBucket::remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) +{ + int ret; + map stats; + map common_prefixes; + RGWObjectCtx obj_ctx(store); + CephContext *cct = store->ctx(); + + string bucket_ver, master_ver; + + ret = load_bucket(dpp, y); + if (ret < 0) + return ret; + + const auto& index = info.get_current_index(); + ret = read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL); + if (ret < 0) + return ret; + + ret = abort_multiparts(dpp, cct); + if (ret < 0) { + return ret; + } + + 
rgw::sal::Bucket::ListParams params; + rgw::sal::Bucket::ListResults results; + + params.list_versions = true; + params.allow_unordered = true; + + std::list handles; + + int max_aio = concurrent_max; + results.is_truncated = true; + + while (results.is_truncated) { + ret = list(dpp, params, listing_max_entries, results, y); + if (ret < 0) + return ret; + + std::vector::iterator it = results.objs.begin(); + for (; it != results.objs.end(); ++it) { + RGWObjState *astate = NULL; + RGWObjManifest *amanifest = nullptr; + rgw_obj obj{get_key(), it->key}; + + ret = store->getRados()->get_obj_state(dpp, &obj_ctx, get_info(), + obj, &astate, &amanifest, + false, y); + if (ret == -ENOENT) { + ldpp_dout(dpp, 1) << "WARNING: cannot find obj state for obj " << obj << dendl; + continue; + } + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: get obj state returned with error " << ret << dendl; + return ret; + } + + if (amanifest) { + RGWObjManifest& manifest = *amanifest; + RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp); + const rgw_obj head_obj = manifest.get_obj(); + rgw_raw_obj raw_head_obj; + store->get_raw_obj(manifest.get_head_placement_rule(), head_obj, &raw_head_obj); + + for (; miter != manifest.obj_end(dpp) && max_aio--; ++miter) { + if (!max_aio) { + ret = drain_aio(handles); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + max_aio = concurrent_max; + } + + rgw_raw_obj last_obj = miter.get_location().get_raw_obj(store->getRados()); + if (last_obj == raw_head_obj) { + // have the head obj deleted at the end + continue; + } + + ret = store->getRados()->delete_raw_obj_aio(dpp, last_obj, handles); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl; + return ret; + } + } // for all shadow objs + + ret = store->getRados()->delete_obj_aio(dpp, head_obj, get_info(), astate, + handles, keep_index_consistent, y); + if (ret < 0) { 
+ ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl; + return ret; + } + } + + if (!max_aio) { + ret = drain_aio(handles); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + max_aio = concurrent_max; + } + obj_ctx.invalidate(obj); + } // for all RGW objects in results + } // while is_truncated + + ret = drain_aio(handles); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + + sync_user_stats(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl; + } + + RGWObjVersionTracker objv_tracker; + + // this function can only be run if caller wanted children to be + // deleted, so we can ignore the check for children as any that + // remain are detritus from a prior bug + ret = remove_bucket(dpp, true, false, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " << this << dendl; + return ret; + } + + return ret; +} + +int RadosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats) +{ + int ret; + + RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(); + RGWObjVersionTracker ep_ot; + if (info.bucket.bucket_id.empty()) { + ret = store->ctl()->bucket->read_bucket_info(info.bucket, &info, y, dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(&mtime) + .set_attrs(&attrs) + .set_bectx_params(bectx_params), + &ep_ot); + } else { + ret = store->ctl()->bucket->read_bucket_instance_info(info.bucket, &info, y, dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(&mtime) + .set_attrs(&attrs) + .set_bectx_params(bectx_params)); + } + if (ret != 0) { + return ret; + } + + bucket_version = ep_ot.read_version; + + if (get_stats) { + ret = store->ctl()->bucket->read_bucket_stats(info.bucket, &ent, y, dpp); 
+ } + + return ret; +} + +int RadosBucket::read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, std::string* master_ver, + std::map& stats, + std::string* max_marker, bool* syncstopped) +{ + return store->getRados()->get_bucket_stats(dpp, info, idx_layout, shard_id, bucket_ver, master_ver, stats, max_marker, syncstopped); +} + +int RadosBucket::read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB* ctx) +{ + return store->getRados()->get_bucket_stats_async(dpp, get_info(), idx_layout, shard_id, ctx); +} + +int RadosBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) +{ + return store->ctl()->bucket->sync_user_stats(dpp, owner->get_id(), info, y, &ent); +} + +int RadosBucket::update_container_stats(const DoutPrefixProvider* dpp) +{ + int ret; + map m; + + m[info.bucket.name] = ent; + ret = store->getRados()->update_containers_stats(m, dpp); + if (!ret) + return -EEXIST; + if (ret < 0) + return ret; + + map::iterator iter = m.find(info.bucket.name); + if (iter == m.end()) + return -EINVAL; + + ent.count = iter->second.count; + ent.size = iter->second.size; + ent.size_rounded = iter->second.size_rounded; + ent.creation_time = iter->second.creation_time; + ent.placement_rule = std::move(iter->second.placement_rule); + + info.creation_time = ent.creation_time; + info.placement_rule = ent.placement_rule; + + return 0; +} + +int RadosBucket::check_bucket_shards(const DoutPrefixProvider* dpp) +{ + return store->getRados()->check_bucket_shards(info, info.bucket, get_count(), dpp); +} + +int RadosBucket::link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint, RGWObjVersionTracker* objv) +{ + RGWBucketEntryPoint ep; + ep.bucket = info.bucket; + ep.owner = new_user->get_id(); + ep.creation_time = get_creation_time(); + ep.linked = true; + Attrs ep_attrs; + 
rgw_ep_info ep_data{ep, ep_attrs}; + + int r = store->ctl()->bucket->link_bucket(new_user->get_id(), info.bucket, + get_creation_time(), y, dpp, update_entrypoint, + &ep_data); + if (r < 0) + return r; + + if (objv) + *objv = ep_data.ep_objv; + + return r; +} + +int RadosBucket::unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint) +{ + return store->ctl()->bucket->unlink_bucket(new_user->get_id(), info.bucket, y, dpp, update_entrypoint); +} + +int RadosBucket::chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) +{ + std::string obj_marker; + int r; + + if (!owner) { + ldpp_dout(dpp, 0) << __func__ << " Cannot chown without an owner " << dendl; + return -EINVAL; + } + + r = this->unlink(dpp, owner, y); + if (r < 0) { + return r; + } + + return this->link(dpp, &new_user, y); +} + +int RadosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time _mtime) +{ + mtime = _mtime; + return store->getRados()->put_bucket_instance_info(info, exclusive, mtime, &attrs, dpp, null_yield); +} + +/* Make sure to call get_bucket_info() if you need it first */ +bool RadosBucket::is_owner(User* user) +{ + return (info.owner.compare(user->get_id()) == 0); +} + +int RadosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y) +{ + return store->getRados()->check_bucket_empty(dpp, info, y); +} + +int RadosBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, + optional_yield y, bool check_size_only) +{ + return store->getRados()->check_quota(dpp, info.owner, get_key(), + quota, obj_size, y, check_size_only); +} + +int RadosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) +{ + for(auto& it : new_attrs) { + attrs[it.first] = it.second; + } + return store->ctl()->bucket->set_bucket_instance_attrs(get_info(), + new_attrs, &get_info().objv_tracker, y, dpp); +} + +int RadosBucket::try_refresh_info(const 
DoutPrefixProvider* dpp, ceph::real_time* pmtime) +{ + return store->getRados()->try_refresh_bucket_info(info, pmtime, dpp, &attrs); +} + +int RadosBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, + RGWUsageIter& usage_iter, + map& usage) +{ + return store->getRados()->read_usage(dpp, owner->get_id(), get_name(), start_epoch, + end_epoch, max_entries, is_truncated, + usage_iter, usage); +} + +int RadosBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) +{ + return store->getRados()->trim_usage(dpp, owner->get_id(), get_name(), start_epoch, end_epoch); +} + +int RadosBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) +{ + return store->getRados()->remove_objs_from_index(dpp, info, objs_to_unlink); +} + +int RadosBucket::check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) +{ + return store->getRados()->bucket_check_index(dpp, info, &existing_stats, &calculated_stats); +} + +int RadosBucket::rebuild_index(const DoutPrefixProvider *dpp) +{ + return store->getRados()->bucket_rebuild_index(dpp, info); +} + +int RadosBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) +{ + return store->getRados()->cls_obj_set_bucket_tag_timeout(dpp, info, timeout); +} + +int RadosBucket::purge_instance(const DoutPrefixProvider* dpp) +{ + int max_shards = (info.layout.current_index.layout.normal.num_shards > 0 ? info.layout.current_index.layout.normal.num_shards : 1); + for (int i = 0; i < max_shards; i++) { + RGWRados::BucketShard bs(store->getRados()); + int shard_id = (info.layout.current_index.layout.normal.num_shards > 0 ? 
i : -1); + int ret = bs.init(dpp, info, info.layout.current_index, shard_id); + if (ret < 0) { + cerr << "ERROR: bs.init(bucket=" << info.bucket << ", shard=" << shard_id + << "): " << cpp_strerror(-ret) << std::endl; + return ret; + } + ret = store->getRados()->bi_remove(dpp, bs); + if (ret < 0) { + cerr << "ERROR: failed to remove bucket index object: " + << cpp_strerror(-ret) << std::endl; + return ret; + } + } + return 0; +} + +int RadosBucket::set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy &acl, optional_yield y) +{ + bufferlist aclbl; + + acls = acl; + acl.encode(aclbl); + map& attrs = get_attrs(); + + attrs[RGW_ATTR_ACL] = aclbl; + info.owner = acl.get_owner().get_id(); + + int r = store->ctl()->bucket->store_bucket_instance_info(info.bucket, + info, y, dpp, + RGWBucketCtl::BucketInstance::PutParams().set_attrs(&attrs)); + if (r < 0) { + cerr << "ERROR: failed to set bucket owner: " << cpp_strerror(-r) << std::endl; + return r; + } + + return 0; +} + +std::unique_ptr RadosBucket::get_object(const rgw_obj_key& k) +{ + return std::make_unique(this->store, k, this); +} + +int RadosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, ListResults& results, optional_yield y) +{ + RGWRados::Bucket target(store->getRados(), get_info()); + if (params.shard_id >= 0) { + target.set_shard_id(params.shard_id); + } + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = params.prefix; + list_op.params.delim = params.delim; + list_op.params.marker = params.marker; + list_op.params.ns = params.ns; + list_op.params.end_marker = params.end_marker; + list_op.params.ns = params.ns; + list_op.params.enforce_ns = params.enforce_ns; + list_op.params.access_list_filter = params.access_list_filter; + list_op.params.force_check_filter = params.force_check_filter; + list_op.params.list_versions = params.list_versions; + list_op.params.allow_unordered = params.allow_unordered; + + int ret = list_op.list_objects(dpp, max, &results.objs, 
&results.common_prefixes, &results.is_truncated, y); + if (ret >= 0) { + results.next_marker = list_op.get_next_marker(); + params.marker = results.next_marker; + } + + return ret; +} + +std::unique_ptr RadosBucket::get_multipart_upload( + const std::string& oid, + std::optional upload_id, + ACLOwner owner, ceph::real_time mtime) +{ + return std::make_unique(this->store, this, oid, upload_id, + std::move(owner), mtime); +} + +int RadosBucket::list_multiparts(const DoutPrefixProvider *dpp, + const string& prefix, + string& marker, + const string& delim, + const int& max_uploads, + vector>& uploads, + map *common_prefixes, + bool *is_truncated) +{ + rgw::sal::Bucket::ListParams params; + rgw::sal::Bucket::ListResults results; + MultipartMetaFilter mp_filter; + + params.prefix = prefix; + params.delim = delim; + params.marker = marker; + params.ns = RGW_OBJ_NS_MULTIPART; + params.access_list_filter = &mp_filter; + + int ret = list(dpp, params, max_uploads, results, null_yield); + + if (ret < 0) + return ret; + + if (!results.objs.empty()) { + for (const rgw_bucket_dir_entry& dentry : results.objs) { + rgw_obj_key key(dentry.key); + ACLOwner owner(rgw_user(dentry.meta.owner)); + owner.set_name(dentry.meta.owner_display_name); + uploads.push_back(this->get_multipart_upload(key.name, + std::nullopt, std::move(owner), dentry.meta.mtime)); + } + } + if (common_prefixes) { + *common_prefixes = std::move(results.common_prefixes); + } + *is_truncated = results.is_truncated; + marker = params.marker.name; + + return 0; +} + +int RadosBucket::abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) +{ + constexpr int max = 1000; + int ret, num_deleted = 0; + vector> uploads; + string marker; + bool is_truncated; + + const std::string empty_delim; + const std::string empty_prefix; + + do { + ret = list_multiparts(dpp, empty_prefix, marker, empty_delim, + max, uploads, nullptr, &is_truncated); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR : calling 
list_bucket_multiparts; ret=" << ret << + "; bucket=\"" << this << "\"" << dendl; + return ret; + } + ldpp_dout(dpp, 20) << __func__ << + " INFO: aborting and cleaning up multipart upload(s); bucket=\"" << + this << "\"; uploads.size()=" << uploads.size() << + "; is_truncated=" << is_truncated << dendl; + + if (!uploads.empty()) { + for (const auto& upload : uploads) { + ret = upload->abort(dpp, cct); + if (ret < 0) { + // we're doing a best-effort; if something cannot be found, + // log it and keep moving forward + if (ret != -ENOENT && ret != -ERR_NO_SUCH_UPLOAD) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR : failed to abort and clean-up multipart upload \"" << + upload->get_meta() << "\"" << dendl; + return ret; + } else { + ldpp_dout(dpp, 10) << __func__ << + " NOTE : unable to find part(s) of " + "aborted multipart upload of \"" << upload->get_meta() << + "\" for cleaning up" << dendl; + } + } + num_deleted++; + } + if (num_deleted) { + ldpp_dout(dpp, 0) << __func__ << + " WARNING : aborted " << num_deleted << + " incomplete multipart uploads" << dendl; + } + } + } while (is_truncated); + + return 0; +} + +std::string RadosBucket::topics_oid() const { + return pubsub_oid_prefix + get_tenant() + ".bucket." + get_name() + "/" + get_marker(); +} + +int RadosBucket::read_topics(rgw_pubsub_bucket_topics& notifications, + RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) +{ + bufferlist bl; + const int ret = rgw_get_system_obj(store->svc()->sysobj, + store->svc()->zone->get_zone_params().log_pool, + topics_oid(), + bl, + objv_tracker, + nullptr, y, dpp, nullptr); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(notifications, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 20) << " failed to decode bucket notifications from oid: " << topics_oid() << ". for bucket: " + << get_name() << ". 
error: " << err.what() << dendl; + return -EIO; + } + + return 0; +} + +int RadosBucket::write_topics(const rgw_pubsub_bucket_topics& notifications, + RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) { + bufferlist bl; + encode(notifications, bl); + + return rgw_put_system_obj(dpp, store->svc()->sysobj, + store->svc()->zone->get_zone_params().log_pool, + topics_oid(), + bl, false, objv_tracker, real_time(), y); +} + +int RadosBucket::remove_topics(RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) { + return rgw_delete_system_obj(dpp, store->svc()->sysobj, + store->svc()->zone->get_zone_params().log_pool, + topics_oid(), + objv_tracker, y); +} + +std::unique_ptr RadosStore::get_user(const rgw_user &u) +{ + return std::make_unique(this, u); +} + +std::string RadosStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) +{ + return getRados()->get_cluster_fsid(dpp, y); +} + +int RadosStore::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr* user) +{ + RGWUserInfo uinfo; + User* u; + RGWObjVersionTracker objv_tracker; + + int r = ctl()->user->get_info_by_access_key(dpp, key, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker)); + if (r < 0) + return r; + + u = new RadosUser(this, uinfo); + if (!u) + return -ENOMEM; + + u->get_version_tracker() = objv_tracker; + + user->reset(u); + return 0; +} + +int RadosStore::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr* user) +{ + RGWUserInfo uinfo; + User* u; + RGWObjVersionTracker objv_tracker; + + int r = ctl()->user->get_info_by_email(dpp, email, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker)); + if (r < 0) + return r; + + u = new RadosUser(this, uinfo); + if (!u) + return -ENOMEM; + + u->get_version_tracker() = objv_tracker; + + user->reset(u); + return 0; +} + +int 
RadosStore::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) +{ + RGWUserInfo uinfo; + User* u; + RGWObjVersionTracker objv_tracker; + + int r = ctl()->user->get_info_by_swift(dpp, user_str, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker)); + if (r < 0) + return r; + + u = new RadosUser(this, uinfo); + if (!u) + return -ENOMEM; + + u->get_version_tracker() = objv_tracker; + + user->reset(u); + return 0; +} + +std::unique_ptr RadosStore::get_object(const rgw_obj_key& k) +{ + return std::make_unique(this, k); +} + +int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) +{ + int ret; + Bucket* bp; + + bp = new RadosBucket(this, b, u); + ret = bp->load_bucket(dpp, y); + if (ret < 0) { + delete bp; + return ret; + } + + bucket->reset(bp); + return 0; +} + +int RadosStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) +{ + Bucket* bp; + + bp = new RadosBucket(this, i, u); + /* Don't need to fetch the bucket info, use the provided one */ + + bucket->reset(bp); + return 0; +} + +int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr* bucket, optional_yield y) +{ + rgw_bucket b; + + b.tenant = tenant; + b.name = name; + + return get_bucket(dpp, u, b, bucket, y); +} + +bool RadosStore::is_meta_master() +{ + return svc()->zone->is_meta_master(); +} + +int RadosStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv, + bufferlist& in_data, + JSONParser* jp, req_info& info, + optional_yield y) +{ + if (is_meta_master()) { + /* We're master, don't forward */ + return 0; + } + + if (!svc()->zone->get_master_conn()) { + ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl; + 
bufferlist response; + std::string uid_str = user->get_id().to_str(); +#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response + int ret = svc()->zone->get_master_conn()->forward(dpp, rgw_user(uid_str), info, + objv, MAX_REST_RESPONSE, + &in_data, &response, y); + if (ret < 0) + return ret; + + ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl; + if (jp && !jp->parse(response.c_str(), response.length())) { + ldpp_dout(dpp, 0) << "failed parsing response from master zonegroup" << dendl; + return -EINVAL; + } + + return 0; +} + +int RadosStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) +{ + if (is_meta_master()) { + /* We're master, don't forward */ + return 0; + } + + if (!svc()->zone->get_master_conn()) { + ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl; + bufferlist response; +#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response + int ret = svc()->zone->get_master_conn()->forward_iam_request(dpp, key, info, + objv, MAX_REST_RESPONSE, + &in_data, &response, y); + if (ret < 0) + return ret; + + ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl; + + std::string r = response.c_str(); + std::string str_to_search = """; + std::string str_to_replace = "\""; + boost::replace_all(r, str_to_search, str_to_replace); + ldpp_dout(dpp, 20) << "r: " << r.c_str() << dendl; + + if (parser && !parser->parse(r.c_str(), r.length(), 1)) { + ldpp_dout(dpp, 0) << "ERROR: failed to parse response from master zonegroup" << dendl; + return -EIO; + } + + return 0; +} + +std::string RadosStore::zone_unique_id(uint64_t unique_num) +{ + return svc()->zone_utils->unique_id(unique_num); +} + +std::string RadosStore::zone_unique_trans_id(const uint64_t unique_num) +{ + 
return svc()->zone_utils->unique_trans_id(unique_num); +} + +int RadosStore::get_zonegroup(const std::string& id, + std::unique_ptr* zonegroup) +{ + ZoneGroup* zg; + RGWZoneGroup rzg; + int r = svc()->zone->get_zonegroup(id, rzg); + if (r < 0) + return r; + + zg = new RadosZoneGroup(this, rzg); + if (!zg) + return -ENOMEM; + + zonegroup->reset(zg); + return 0; +} + +int RadosStore::list_all_zones(const DoutPrefixProvider* dpp, std::list& zone_ids) +{ + return svc()->zone->list_zones(dpp, zone_ids); +} + +int RadosStore::cluster_stat(RGWClusterStat& stats) +{ + rados_cluster_stat_t rados_stats; + int ret; + + ret = rados->get_rados_handle()->cluster_stat(rados_stats); + if (ret < 0) + return ret; + + stats.kb = rados_stats.kb; + stats.kb_used = rados_stats.kb_used; + stats.kb_avail = rados_stats.kb_avail; + stats.num_objects = rados_stats.num_objects; + + return ret; +} + +std::unique_ptr RadosStore::get_lifecycle(void) +{ + return std::make_unique(this); +} + +std::unique_ptr RadosStore::get_completions(void) +{ + return std::make_unique(); +} + +std::unique_ptr RadosStore::get_notification( + rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name) +{ + return std::make_unique(s, this, obj, src_obj, s, event_type, y, object_name); +} + +std::unique_ptr RadosStore::get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) +{ + return std::make_unique(dpp, this, obj, src_obj, event_type, _bucket, _user_id, _user_tenant, _req_id, y); +} + +std::string RadosStore::topics_oid(const std::string& tenant) const { + return pubsub_oid_prefix + tenant; +} + +int RadosStore::read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, 
const DoutPrefixProvider *dpp) { + bufferlist bl; + const int ret = rgw_get_system_obj(svc()->sysobj, + svc()->zone->get_zone_params().log_pool, + topics_oid(tenant), + bl, + objv_tracker, + nullptr, y, dpp, nullptr); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(topics, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 20) << " failed to decode topics from oid: " << topics_oid(tenant) << + ". error: " << err.what() << dendl; + return -EIO; + } + + return 0; +} + +int RadosStore::write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) { + bufferlist bl; + encode(topics, bl); + + return rgw_put_system_obj(dpp, svc()->sysobj, + svc()->zone->get_zone_params().log_pool, + topics_oid(tenant), + bl, false, objv_tracker, real_time(), y); +} + +int RadosStore::remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) { + return rgw_delete_system_obj(dpp, svc()->sysobj, + svc()->zone->get_zone_params().log_pool, + topics_oid(tenant), + objv_tracker, y); +} + +int RadosStore::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj) +{ + return rados->delete_raw_obj(dpp, obj); +} + +int RadosStore::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio) +{ + RadosCompletions* raio = static_cast(aio); + + return rados->delete_raw_obj_aio(dpp, obj, raio->handles); +} + +void RadosStore::get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj) +{ + rados->obj_to_raw(placement_rule, obj, raw_obj); +} + +int RadosStore::get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size) +{ + return rados->get_max_chunk_size(obj.pool, chunk_size, dpp); +} + +int RadosStore::initialize(CephContext *cct, const DoutPrefixProvider *dpp) +{ + std::unique_ptr zg 
= + std::make_unique(this, svc()->zone->get_zonegroup()); + zone = make_unique(this, std::move(zg)); + return 0; +} + +int RadosStore::log_usage(const DoutPrefixProvider *dpp, map& usage_info) +{ + return rados->log_usage(dpp, usage_info); +} + +int RadosStore::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) +{ + rgw_raw_obj obj(svc()->zone->get_zone_params().log_pool, oid); + + int ret = rados->append_async(dpp, obj, bl.length(), bl); + if (ret == -ENOENT) { + ret = rados->create_pool(dpp, svc()->zone->get_zone_params().log_pool); + if (ret < 0) + return ret; + // retry + ret = rados->append_async(dpp, obj, bl.length(), bl); + } + + return ret; +} + +int RadosStore::register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, + const map& meta) +{ + return rados->register_to_service_map(dpp, daemon_type, meta); +} + +void RadosStore::get_quota(RGWQuota& quota) +{ + quota.bucket_quota = svc()->quota->get_bucket_quota(); + quota.user_quota = svc()->quota->get_user_quota(); +} + +void RadosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) +{ + bucket_ratelimit = svc()->zone->get_current_period().get_config().bucket_ratelimit; + user_ratelimit = svc()->zone->get_current_period().get_config().user_ratelimit; + anon_ratelimit = svc()->zone->get_current_period().get_config().anon_ratelimit; +} + +int RadosStore::set_buckets_enabled(const DoutPrefixProvider* dpp, vector& buckets, bool enabled) +{ + return rados->set_buckets_enabled(buckets, enabled, dpp); +} + +int RadosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) +{ + return ctl()->bucket->get_sync_policy_handler(zone, bucket, phandler, y, dpp); +} + +RGWDataSyncStatusManager* RadosStore::get_data_sync_manager(const rgw_zone_id& source_zone) +{ + return 
rados->get_data_sync_manager(source_zone); +} + +int RadosStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, + RGWUsageIter& usage_iter, + map& usage) +{ + rgw_user uid; + std::string bucket_name; + + return rados->read_usage(dpp, uid, bucket_name, start_epoch, end_epoch, max_entries, + is_truncated, usage_iter, usage); +} + +int RadosStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) +{ + rgw_user uid; + std::string bucket_name; + + return rados->trim_usage(dpp, uid, bucket_name, start_epoch, end_epoch); +} + +int RadosStore::get_config_key_val(std::string name, bufferlist* bl) +{ + return svc()->config_key->get(name, true, bl); +} + +int RadosStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) +{ + return ctl()->meta.mgr->list_keys_init(dpp, section, marker, phandle); +} + +int RadosStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list& keys, bool* truncated) +{ + return ctl()->meta.mgr->list_keys_next(dpp, handle, max, keys, truncated); +} + +void RadosStore::meta_list_keys_complete(void* handle) +{ + ctl()->meta.mgr->list_keys_complete(handle); +} + +std::string RadosStore::meta_get_marker(void* handle) +{ + return ctl()->meta.mgr->get_marker(handle); +} + +int RadosStore::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) +{ + return ctl()->meta.mgr->remove(metadata_key, y, dpp); +} + +void RadosStore::finalize(void) +{ + if (rados) + rados->finalize(); +} + +void RadosStore::register_admin_apis(RGWRESTMgr* mgr) +{ + mgr->register_resource("user", new RGWRESTMgr_User); + mgr->register_resource("bucket", new RGWRESTMgr_Bucket); + /*Registering resource for /admin/metadata */ + mgr->register_resource("metadata", new RGWRESTMgr_Metadata); + mgr->register_resource("log", new RGWRESTMgr_Log); 
+ /* XXX These may become global when cbodley is done with his zone work */ + mgr->register_resource("config", new RGWRESTMgr_Config); + mgr->register_resource("realm", new RGWRESTMgr_Realm); + mgr->register_resource("ratelimit", new RGWRESTMgr_Ratelimit); +} + +std::unique_ptr RadosStore::get_lua_manager() +{ + return std::make_unique(this); +} + +std::unique_ptr RadosStore::get_role(std::string name, + std::string tenant, + std::string path, + std::string trust_policy, + std::string max_session_duration_str, + std::multimap tags) +{ + return std::make_unique(this, name, tenant, path, trust_policy, max_session_duration_str, tags); +} + +std::unique_ptr RadosStore::get_role(std::string id) +{ + return std::make_unique(this, id); +} + +std::unique_ptr RadosStore::get_role(const RGWRoleInfo& info) +{ + return std::make_unique(this, info); +} + +int RadosStore::get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + vector>& roles) +{ + auto pool = svc()->zone->get_zone_params().roles_pool; + std::string prefix; + + // List all roles if path prefix is empty + if (! 
path_prefix.empty()) { + prefix = tenant + RGWRole::role_path_oid_prefix + path_prefix; + } else { + prefix = tenant + RGWRole::role_path_oid_prefix; + } + + //Get the filtered objects + list result; + bool is_truncated; + RGWListRawObjsCtx ctx; + do { + list oids; + int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: " + << prefix << ": " << cpp_strerror(-r) << dendl; + return r; + } + for (const auto& iter : oids) { + result.push_back(iter.substr(RGWRole::role_path_oid_prefix.size())); + } + } while (is_truncated); + + for (const auto& it : result) { + //Find the role oid prefix from the end + size_t pos = it.rfind(RGWRole::role_oid_prefix); + if (pos == std::string::npos) { + continue; + } + // Split the result into path and info_oid + id + std::string path = it.substr(0, pos); + + /*Make sure that prefix is part of path (False results could've been returned) + because of the role info oid + id appended to the path)*/ + if(path_prefix.empty() || path.find(path_prefix) != std::string::npos) { + //Get id from info oid prefix + id + std::string id = it.substr(pos + RGWRole::role_oid_prefix.length()); + + std::unique_ptr role = get_role(id); + int ret = role->read_info(dpp, y); + if (ret < 0) { + return ret; + } + roles.push_back(std::move(role)); + } + } + + return 0; +} + +std::unique_ptr RadosStore::get_oidc_provider() +{ + return std::make_unique(this); +} + +int RadosStore::get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + vector>& providers) +{ + std::string prefix = tenant + RGWOIDCProvider::oidc_url_oid_prefix; + auto pool = svc()->zone->get_zone_params().oidc_pool; + + //Get the filtered objects + list result; + bool is_truncated; + RGWListRawObjsCtx ctx; + do { + list oids; + int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: listing 
filtered objects failed: OIDC pool: " + << pool.name << ": " << prefix << ": " << cpp_strerror(-r) << dendl; + return r; + } + for (const auto& iter : oids) { + std::unique_ptr provider = get_oidc_provider(); + bufferlist bl; + + r = rgw_get_system_obj(svc()->sysobj, pool, iter, bl, nullptr, nullptr, null_yield, dpp); + if (r < 0) { + return r; + } + + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(*provider, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: " + << pool.name << ": " << iter << dendl; + return -EIO; + } + + providers.push_back(std::move(provider)); + } + } while (is_truncated); + + return 0; +} + +std::unique_ptr RadosStore::get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) +{ + RGWBucketInfo& bucket_info = obj->get_bucket()->get_info(); + RGWObjectCtx& obj_ctx = static_cast(obj)->get_ctx(); + auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y); + return std::make_unique(dpp, y, + bucket_info, obj_ctx, obj->get_obj(), + this, std::move(aio), owner, + ptail_placement_rule, + unique_tag, position, + cur_accounted_size); +} + +std::unique_ptr RadosStore::get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) +{ + RGWBucketInfo& bucket_info = obj->get_bucket()->get_info(); + RGWObjectCtx& obj_ctx = static_cast(obj)->get_ctx(); + auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y); + return std::make_unique(dpp, y, + bucket_info, obj_ctx, obj->get_obj(), + this, std::move(aio), owner, + ptail_placement_rule, + olh_epoch, unique_tag); +} + +const 
std::string& RadosStore::get_compression_type(const rgw_placement_rule& rule) +{ + return svc()->zone->get_zone_params().get_compression_type(rule); +} + +bool RadosStore::valid_placement(const rgw_placement_rule& rule) +{ + return svc()->zone->get_zone_params().valid_placement(rule); +} + +int RadosStore::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx* ioctx) +{ + return rados->get_obj_head_ioctx(dpp, bucket_info, obj, ioctx); +} + +RadosObject::~RadosObject() +{ + if (rados_ctx_owned) + delete rados_ctx; +} + +int RadosObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh) +{ + int ret = store->getRados()->get_obj_state(dpp, rados_ctx, bucket->get_info(), get_obj(), pstate, &manifest, follow_olh, y); + if (ret < 0) { + return ret; + } + + /* Don't overwrite obj, atomic, or prefetch */ + rgw_obj obj = get_obj(); + bool is_atomic = state.is_atomic; + bool prefetch_data = state.prefetch_data; + + state = **pstate; + + state.obj = obj; + state.is_atomic = is_atomic; + state.prefetch_data = prefetch_data; + return ret; +} + +int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj) +{ + read_op.params.attrs = &state.attrset; + read_op.params.target_obj = target_obj; + read_op.params.obj_size = &state.size; + read_op.params.lastmod = &state.mtime; + + return read_op.prepare(y, dpp); +} + +int RadosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) +{ + Attrs empty; + return store->getRados()->set_attrs(dpp, rados_ctx, + bucket->get_info(), + get_obj(), + setattrs ? *setattrs : empty, + delattrs ? 
delattrs : nullptr, + y); +} + +int RadosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj) +{ + RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); + RGWRados::Object::Read read_op(&op_target); + + return read_attrs(dpp, read_op, y, target_obj); +} + +int RadosObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) +{ + rgw_obj target = get_obj(); + rgw_obj save = get_obj(); + int r = get_obj_attrs(y, dpp, &target); + if (r < 0) { + return r; + } + + /* Temporarily set target */ + state.obj = target; + set_atomic(); + state.attrset[attr_name] = attr_val; + r = set_obj_attrs(dpp, &state.attrset, nullptr, y); + /* Restore target */ + state.obj = save; + + return r; +} + +int RadosObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) +{ + Attrs rmattr; + bufferlist bl; + + set_atomic(); + rmattr[attr_name] = bl; + return set_obj_attrs(dpp, nullptr, &rmattr, y); +} + +bool RadosObject::is_expired() { + auto iter = state.attrset.find(RGW_ATTR_DELETE_AT); + if (iter == state.attrset.end()) { + return false; + } + utime_t delete_at; + try { + auto bufit = iter->second.cbegin(); + decode(delete_at, bufit); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl; + return false; + } + + return delete_at <= ceph_clock_now() && !delete_at.is_zero(); +} + +void RadosObject::gen_rand_obj_instance_name() +{ + store->getRados()->gen_rand_obj_instance_name(&state.obj.key); +} + +void RadosObject::raw_obj_to_obj(const rgw_raw_obj& raw_obj) +{ + rgw_obj tobj = get_obj(); + RGWSI_Tier_RADOS::raw_obj_to_obj(get_bucket()->get_key(), raw_obj, &tobj); + set_key(tobj.key); +} + +void RadosObject::get_raw_obj(rgw_raw_obj* raw_obj) +{ + store->getRados()->obj_to_raw((bucket->get_info()).placement_rule, get_obj(), 
raw_obj); +} + +int RadosObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool* pmore, optional_yield y) +{ + rgw_raw_obj raw_obj; + get_raw_obj(&raw_obj); + auto sysobj = store->svc()->sysobj->get_obj(raw_obj); + + return sysobj.omap().get_vals(dpp, marker, count, m, pmore, y); +} + +int RadosObject::omap_get_all(const DoutPrefixProvider *dpp, std::map *m, + optional_yield y) +{ + rgw_raw_obj raw_obj; + get_raw_obj(&raw_obj); + auto sysobj = store->svc()->sysobj->get_obj(raw_obj); + + return sysobj.omap().get_all(dpp, m, y); +} + +int RadosObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) +{ + int ret; + rgw_raw_obj head_obj; + librados::IoCtx cur_ioctx; + rgw_obj obj = get_obj(); + + store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &head_obj); + ret = store->get_obj_head_ioctx(dpp, bucket->get_info(), obj, &cur_ioctx); + if (ret < 0) { + return ret; + } + + return cur_ioctx.omap_get_vals_by_keys(oid, keys, vals); +} + +int RadosObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) +{ + rgw_raw_obj raw_meta_obj; + rgw_obj obj = get_obj(); + + store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &raw_meta_obj); + + auto sysobj = store->svc()->sysobj->get_obj(raw_meta_obj); + + return sysobj.omap().set_must_exist(must_exist).set(dpp, key, val, y); +} + +int RadosObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) +{ + int r = get_obj_attrs(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read object attrs " << get_name() << cpp_strerror(-r) << dendl; + return r; + } + + const auto& aiter = get_attrs().find(RGW_ATTR_ACL); + if (aiter == get_attrs().end()) { + ldpp_dout(dpp, 0) << "ERROR: no acls found for object " << get_name() << dendl; + return -EINVAL; + } + + bufferlist& bl = 
aiter->second; + RGWAccessControlPolicy policy(store->ctx()); + ACLOwner owner; + auto bliter = bl.cbegin(); + try { + policy.decode(bliter); + owner = policy.get_owner(); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: decode policy failed" << err.what() + << dendl; + return -EIO; + } + + //Get the ACL from the policy + RGWAccessControlList& acl = policy.get_acl(); + + //Remove grant that is set to old owner + acl.remove_canon_user_grant(owner.get_id()); + + //Create a grant and add grant + ACLGrant grant; + grant.set_canon(new_user.get_id(), new_user.get_display_name(), RGW_PERM_FULL_CONTROL); + acl.add_grant(&grant); + + //Update the ACL owner to the new user + owner.set_id(new_user.get_id()); + owner.set_name(new_user.get_display_name()); + policy.set_owner(owner); + + bl.clear(); + encode(policy, bl); + + set_atomic(); + map attrs; + attrs[RGW_ATTR_ACL] = bl; + r = set_obj_attrs(dpp, &attrs, nullptr, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: modify attr failed " << cpp_strerror(-r) << dendl; + return r; + } + + return 0; +} + +std::unique_ptr RadosObject::get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name) +{ + return std::make_unique(dpp, store, this, lock_name); +} + +int RadosObject::transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) +{ + return store->getRados()->transition_obj(*rados_ctx, bucket->get_info(), get_obj(), placement_rule, mtime, olh_epoch, dpp, y); +} + +int RadosObject::transition_to_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_bucket_dir_entry& o, + std::set& cloud_targets, + CephContext* cct, + bool update_object, + const DoutPrefixProvider* dpp, + optional_yield y) +{ + /* init */ + rgw::sal::RadosPlacementTier* rtier = static_cast(tier); + string id = "cloudid"; + string endpoint = rtier->get_rt().t.s3.endpoint; + RGWAccessKey key = 
rtier->get_rt().t.s3.key; + string region = rtier->get_rt().t.s3.region; + HostStyle host_style = rtier->get_rt().t.s3.host_style; + string bucket_name = rtier->get_rt().t.s3.target_path; + const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup(); + + if (bucket_name.empty()) { + bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() + + "-cloud-bucket"; + boost::algorithm::to_lower(bucket_name); + } + + /* Create RGW REST connection */ + S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style); + + RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(), + this, conn, bucket_name, + rtier->get_rt().t.s3.target_storage_class); + tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings; + tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size; + tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold; + tier_ctx.storage_class = tier->get_storage_class(); + + ldpp_dout(dpp, 0) << "Transitioning object(" << o.key << ") to the cloud endpoint(" << endpoint << ")" << dendl; + + /* Transition object to cloud end point */ + int ret = rgw_cloud_tier_transfer_object(tier_ctx, cloud_targets); + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to transfer object(" << o.key << ") to the cloud endpoint(" << endpoint << ") ret=" << ret << dendl; + return ret; + } + + if (update_object) { + real_time read_mtime; + + std::unique_ptr read_op(get_read_op()); + read_op->params.lastmod = &read_mtime; + + ret = read_op->prepare(y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << ret << dendl; + return ret; + } + + if (read_mtime != tier_ctx.o.meta.mtime) { + /* raced */ + ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << -ECANCELED << dendl; + return -ECANCELED; + } + + rgw_placement_rule target_placement; + 
target_placement.inherit_from(tier_ctx.bucket_info.placement_rule); + target_placement.storage_class = tier->get_storage_class(); + + ret = write_cloud_tier(dpp, y, tier_ctx.o.versioned_epoch, + tier, tier_ctx.is_multipart_upload, + target_placement, tier_ctx.obj); + + } + + return ret; +} + +int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp, + optional_yield y, + uint64_t olh_epoch, + PlacementTier* tier, + bool is_multipart_upload, + rgw_placement_rule& target_placement, + Object* head_obj) +{ + rgw::sal::RadosPlacementTier* rtier = static_cast(tier); + map attrs = get_attrs(); + RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); + RGWRados::Object::Write obj_op(&op_target); + + obj_op.meta.modify_tail = true; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.category = RGWObjCategory::CloudTiered; + obj_op.meta.delete_at = real_time(); + bufferlist blo; + obj_op.meta.data = &blo; + obj_op.meta.if_match = NULL; + obj_op.meta.if_nomatch = NULL; + obj_op.meta.user_data = NULL; + obj_op.meta.zones_trace = NULL; + obj_op.meta.delete_at = real_time(); + obj_op.meta.olh_epoch = olh_epoch; + + RGWObjManifest *pmanifest; + RGWObjManifest manifest; + + pmanifest = &manifest; + RGWObjTier tier_config; + tier_config.name = tier->get_storage_class(); + tier_config.tier_placement = rtier->get_rt(); + tier_config.is_multipart_upload = is_multipart_upload; + + pmanifest->set_tier_type("cloud-s3"); + pmanifest->set_tier_config(tier_config); + + /* check if its necessary */ + pmanifest->set_head(target_placement, head_obj->get_obj(), 0); + pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket); + pmanifest->set_obj_size(0); + obj_op.meta.manifest = pmanifest; + + /* update storage class */ + bufferlist bl; + bl.append(tier->get_storage_class()); + attrs[RGW_ATTR_STORAGE_CLASS] = bl; + + attrs.erase(RGW_ATTR_ID_TAG); + attrs.erase(RGW_ATTR_TAIL_TAG); + + return obj_op.write_meta(dpp, 0, 0, attrs, y); +} + 
+int RadosObject::get_max_chunk_size(const DoutPrefixProvider* dpp, rgw_placement_rule placement_rule, uint64_t* max_chunk_size, uint64_t* alignment) +{ + return store->getRados()->get_max_chunk_size(placement_rule, get_obj(), max_chunk_size, dpp, alignment); +} + +void RadosObject::get_max_aligned_size(uint64_t size, uint64_t alignment, + uint64_t* max_size) +{ + store->getRados()->get_max_aligned_size(size, alignment, max_size); +} + +bool RadosObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) +{ + rgw_obj obj; + rgw_pool p1, p2; + + obj = get_obj(); + + if (r1 == r2) + return true; + + if (!store->getRados()->get_obj_data_pool(r1, obj, &p1)) { + return false; + } + if (!store->getRados()->get_obj_data_pool(r2, obj, &p2)) { + return false; + } + + return p1 == p2; +} + +int RadosObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) +{ + int ret; + RGWObjManifest *amanifest{nullptr}; + rgw_raw_obj head_obj; + + RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); + RGWRados::Object::Read parent_op(&op_target); + uint64_t obj_size; + + parent_op.params.obj_size = &obj_size; + parent_op.params.attrs = &get_attrs(); + + ret = parent_op.prepare(y, dpp); + if (ret < 0) { + return ret; + } + + head_obj = parent_op.state.head_obj; + + ret = op_target.get_manifest(dpp, &amanifest, y); + if (ret < 0) { + return ret; + } + + ::encode_json("head", head_obj, f); + ::encode_json("manifest", *amanifest, f); + f->open_array_section("data_location"); + for (auto miter = amanifest->obj_begin(dpp); miter != amanifest->obj_end(dpp); ++miter) { + f->open_object_section("obj"); + rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store->getRados()); + uint64_t ofs = miter.get_ofs(); + uint64_t left = amanifest->get_obj_size() - ofs; + ::encode_json("ofs", miter.get_ofs(), f); + ::encode_json("loc", raw_loc, f); + ::encode_json("loc_ofs", miter.location_ofs(), f); + uint64_t loc_size = 
miter.get_stripe_size(); + if (loc_size > left) { + loc_size = left; + } + ::encode_json("loc_size", loc_size, f); + f->close_section(); + } + f->close_section(); + + return 0; +} + +std::unique_ptr RadosObject::get_read_op() +{ + return std::make_unique(this, rados_ctx); +} + +RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx) : + source(_source), + rctx(_rctx), + op_target(_source->store->getRados(), + _source->get_bucket()->get_info(), + *static_cast(rctx), + _source->get_obj()), + parent_op(&op_target) +{ } + +int RadosObject::RadosReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp) +{ + uint64_t obj_size; + + parent_op.conds.mod_ptr = params.mod_ptr; + parent_op.conds.unmod_ptr = params.unmod_ptr; + parent_op.conds.high_precision_time = params.high_precision_time; + parent_op.conds.mod_zone_id = params.mod_zone_id; + parent_op.conds.mod_pg_ver = params.mod_pg_ver; + parent_op.conds.if_match = params.if_match; + parent_op.conds.if_nomatch = params.if_nomatch; + parent_op.params.lastmod = params.lastmod; + parent_op.params.target_obj = params.target_obj; + parent_op.params.obj_size = &obj_size; + parent_op.params.attrs = &source->get_attrs(); + + int ret = parent_op.prepare(y, dpp); + if (ret < 0) + return ret; + + source->set_key(parent_op.state.obj.key); + source->set_obj_size(obj_size); + + return ret; +} + +int RadosObject::RadosReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp) +{ + return parent_op.read(ofs, end, bl, y, dpp); +} + +int RadosObject::RadosReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) +{ + return parent_op.get_attr(dpp, name, dest, y); +} + +std::unique_ptr RadosObject::get_delete_op() +{ + return std::make_unique(this); +} + +RadosObject::RadosDeleteOp::RadosDeleteOp(RadosObject *_source) : + source(_source), + op_target(_source->store->getRados(), + _source->get_bucket()->get_info(), + 
_source->get_ctx(), + _source->get_obj()), + parent_op(&op_target) +{ } + +int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y) +{ + parent_op.params.bucket_owner = params.bucket_owner.get_id(); + parent_op.params.versioning_status = params.versioning_status; + parent_op.params.obj_owner = params.obj_owner; + parent_op.params.olh_epoch = params.olh_epoch; + parent_op.params.marker_version_id = params.marker_version_id; + parent_op.params.bilog_flags = params.bilog_flags; + parent_op.params.remove_objs = params.remove_objs; + parent_op.params.expiration_time = params.expiration_time; + parent_op.params.unmod_since = params.unmod_since; + parent_op.params.mtime = params.mtime; + parent_op.params.high_precision_time = params.high_precision_time; + parent_op.params.zones_trace = params.zones_trace; + parent_op.params.abortmp = params.abortmp; + parent_op.params.parts_accounted_size = params.parts_accounted_size; + + int ret = parent_op.delete_obj(y, dpp); + if (ret < 0) + return ret; + + result.delete_marker = parent_op.result.delete_marker; + result.version_id = parent_op.result.version_id; + + return ret; +} + +int RadosObject::delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + bool prevent_versioning) +{ + RGWRados::Object del_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); + RGWRados::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = bucket->get_info().owner; + del_op.params.versioning_status = prevent_versioning ? 
0 : bucket->get_info().versioning_status(); + + return del_op.delete_obj(y, dpp); +} + +int RadosObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, + Completions* aio, bool keep_index_consistent, + optional_yield y) +{ + RadosCompletions* raio = static_cast(aio); + + return store->getRados()->delete_obj_aio(dpp, get_obj(), bucket->get_info(), astate, + raio->handles, keep_index_consistent, y); +} + +int RadosObject::copy_object(User* user, + req_info* info, + const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, + rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, + ceph::real_time* mtime, + const ceph::real_time* mod_ptr, + const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, + const char* if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + Attrs& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, + std::string* tag, + std::string* etag, + void (*progress_cb)(off_t, void *), + void* progress_data, + const DoutPrefixProvider* dpp, + optional_yield y) +{ + return store->getRados()->copy_obj(*rados_ctx, + user->get_id(), + info, + source_zone, + dest_object->get_obj(), + get_obj(), + dest_bucket->get_info(), + src_bucket->get_info(), + dest_placement, + src_mtime, + mtime, + mod_ptr, + unmod_ptr, + high_precision_time, + if_match, + if_nomatch, + static_cast(attrs_mod), + copy_if_newer, + attrs, + category, + olh_epoch, + (delete_at ? 
*delete_at : real_time()), + version_id, + tag, + etag, + progress_cb, + progress_data, + dpp, + y); +} + +int RadosObject::RadosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y) +{ + return parent_op.iterate(dpp, ofs, end, cb, y); +} + +int RadosObject::swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) +{ + rgw_obj obj = get_obj(); + return store->getRados()->swift_versioning_restore(*rados_ctx, + bucket->get_owner()->get_id(), + bucket->get_info(), + obj, + restored, + dpp); +} + +int RadosObject::swift_versioning_copy(const DoutPrefixProvider* dpp, optional_yield y) +{ + return store->getRados()->swift_versioning_copy(*rados_ctx, + bucket->get_info().owner, + bucket->get_info(), + get_obj(), + dpp, + y); +} + +int RadosMultipartUpload::cleanup_part_history(const DoutPrefixProvider* dpp, + optional_yield y, + RadosMultipartPart *part, + list& remove_objs) +{ + cls_rgw_obj_chain chain; + for (auto& ppfx : part->get_past_prefixes()) { + rgw_obj past_obj; + past_obj.init_ns(bucket->get_key(), ppfx + "." + std::to_string(part->info.num), mp_ns); + rgw_obj_index_key past_key; + past_obj.key.get_index_key(&past_key); + // Remove past upload part objects from index, too. 
+ remove_objs.push_back(past_key); + + RGWObjManifest manifest = part->get_manifest(); + manifest.set_prefix(ppfx); + RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp); + for (; miter != manifest.obj_end(dpp); ++miter) { + rgw_raw_obj raw_part_obj = miter.get_location().get_raw_obj(store->getRados()); + cls_rgw_obj_key part_key(raw_part_obj.oid); + chain.push_obj(raw_part_obj.pool.to_str(), part_key, raw_part_obj.loc); + } + } + if (store->getRados()->get_gc() == nullptr) { + // Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified) + store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id()); + } else { + // use upload id as tag and do it synchronously + auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id()); + if (ret < 0 && leftover_chain) { + ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl; + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + // Delete objects inline if send chain to gc fails + store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id()); + } + } + return 0; +} + + +int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct) +{ + std::unique_ptr meta_obj = get_meta_obj(); + meta_obj->set_in_extra_data(true); + meta_obj->set_hash_source(mp_obj.get_key()); + cls_rgw_obj_chain chain; + list remove_objs; + bool truncated; + int marker = 0; + int ret; + uint64_t parts_accounted_size = 0; + + do { + ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " << + ret << dendl; + return (ret == -ENOENT) ? 
-ERR_NO_SUCH_UPLOAD : ret; + } + + for (auto part_it = parts.begin(); + part_it != parts.end(); + ++part_it) { + RadosMultipartPart* obj_part = dynamic_cast(part_it->second.get()); + if (obj_part->info.manifest.empty()) { + std::unique_ptr obj = bucket->get_object( + rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART)); + obj->set_hash_source(mp_obj.get_key()); + ret = obj->delete_object(dpp, null_yield); + if (ret < 0 && ret != -ENOENT) + return ret; + } else { + auto target = meta_obj->get_obj(); + store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain); + RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp); + if (oiter != obj_part->info.manifest.obj_end(dpp)) { + std::unique_ptr head = bucket->get_object(rgw_obj_key()); + rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados()); + dynamic_cast(head.get())->raw_obj_to_obj(raw_head); + + rgw_obj_index_key key; + head->get_key().get_index_key(&key); + remove_objs.push_back(key); + + cleanup_part_history(dpp, null_yield, obj_part, remove_objs); + } + } + parts_accounted_size += obj_part->info.accounted_size; + } + } while (truncated); + + if (store->getRados()->get_gc() == nullptr) { + //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified) + store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id()); + } else { + /* use upload id as tag and do it synchronously */ + auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id()); + if (ret < 0 && leftover_chain) { + ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl; + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + //Delete objects inline if send chain to gc fails + store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id()); + } + } + + std::unique_ptr del_op = meta_obj->get_delete_op(); + del_op->params.bucket_owner = bucket->get_acl_owner(); 
+ del_op->params.versioning_status = 0; + if (!remove_objs.empty()) { + del_op->params.remove_objs = &remove_objs; + } + + del_op->params.abortmp = true; + del_op->params.parts_accounted_size = parts_accounted_size; + + // and also remove the metadata obj + ret = del_op->delete_obj(dpp, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " << + ret << dendl; + } + return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret; +} + +std::unique_ptr RadosMultipartUpload::get_meta_obj() +{ + return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns)); +} + +int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) +{ + int ret; + std::string oid = mp_obj.get_key(); + RGWObjectCtx obj_ctx(store); + + do { + char buf[33]; + string tmp_obj_name; + std::unique_ptr obj; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */ + upload_id.append(buf); + + mp_obj.init(oid, upload_id); + tmp_obj_name = mp_obj.get_meta(); + + obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns)); + // the meta object will be indexed with 0 size, we c + obj->set_in_extra_data(true); + obj->set_hash_source(oid); + + RGWRados::Object op_target(store->getRados(), + obj->get_bucket()->get_info(), + obj_ctx, obj->get_obj()); + RGWRados::Object::Write obj_op(&op_target); + + op_target.set_versioning_disabled(true); /* no versioning for multipart meta */ + obj_op.meta.owner = owner.get_id(); + obj_op.meta.category = RGWObjCategory::MultiMeta; + obj_op.meta.flags = PUT_OBJ_CREATE_EXCL; + obj_op.meta.mtime = &mtime; + + multipart_upload_info upload_info; + upload_info.dest_placement = dest_placement; + + bufferlist bl; + encode(upload_info, bl); + obj_op.meta.data = &bl; + + ret = obj_op.write_meta(dpp, bl.length(), 0, attrs, y); + } while (ret == -EEXIST); + + return ret; 
+} + +int RadosMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct, + int num_parts, int marker, + int *next_marker, bool *truncated, + bool assume_unsorted) +{ + map parts_map; + map::iterator iter; + + std::unique_ptr obj = bucket->get_object( + rgw_obj_key(get_meta(), std::string(), RGW_OBJ_NS_MULTIPART)); + obj->set_in_extra_data(true); + + bool sorted_omap = is_v2_upload_id(get_upload_id()) && !assume_unsorted; + + parts.clear(); + + int ret; + if (sorted_omap) { + string p; + p = "part."; + char buf[32]; + + snprintf(buf, sizeof(buf), "%08d", marker); + p.append(buf); + + ret = obj->omap_get_vals(dpp, p, num_parts + 1, &parts_map, + nullptr, null_yield); + } else { + ret = obj->omap_get_all(dpp, &parts_map, null_yield); + } + if (ret < 0) { + return ret; + } + + int i; + int last_num = 0; + + uint32_t expected_next = marker + 1; + + for (i = 0, iter = parts_map.begin(); + (i < num_parts || !sorted_omap) && iter != parts_map.end(); + ++iter, ++i) { + bufferlist& bl = iter->second; + auto bli = bl.cbegin(); + std::unique_ptr part = std::make_unique(); + try { + decode(part->info, bli); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not part info, caught buffer::error" << + dendl; + return -EIO; + } + if (sorted_omap) { + if (part->info.num != expected_next) { + /* ouch, we expected a specific part num here, but we got a + * different one. Either a part is missing, or it could be a + * case of mixed rgw versions working on the same upload, + * where one gateway doesn't support correctly sorted omap + * keys for multipart upload just assume data is unsorted. 
+ */ + return list_parts(dpp, cct, num_parts, marker, next_marker, truncated, true); + } + expected_next++; + } + if (sorted_omap || + (int)part->info.num > marker) { + last_num = part->info.num; + parts[part->info.num] = std::move(part); + } + } + + if (sorted_omap) { + if (truncated) { + *truncated = (iter != parts_map.end()); + } + } else { + /* rebuild a map with only num_parts entries */ + std::map> new_parts; + std::map>::iterator piter; + for (i = 0, piter = parts.begin(); + i < num_parts && piter != parts.end(); + ++i, ++piter) { + last_num = piter->first; + new_parts[piter->first] = std::move(piter->second); + } + + if (truncated) { + *truncated = (piter != parts.end()); + } + + parts.swap(new_parts); + } + + if (next_marker) { + *next_marker = last_num; + } + + return 0; +} + +int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp, + optional_yield y, CephContext* cct, + map& part_etags, + list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& ofs, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) +{ + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + std::string etag; + bufferlist etag_bl; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + bool truncated; + int ret; + + int total_parts = 0; + int handled_parts = 0; + int max_parts = 1000; + int marker = 0; + uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size; + auto etags_iter = part_etags.begin(); + rgw::sal::Attrs attrs = target_obj->get_attrs(); + + do { + ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + if (ret < 0) + return ret; + + total_parts += parts.size(); + if (!truncated && total_parts != (int)part_etags.size()) { + ldpp_dout(dpp, 0) << "NOTICE: total parts 
mismatch: have: " << total_parts + << " expected: " << part_etags.size() << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + + for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) { + RadosMultipartPart* part = dynamic_cast(obj_iter->second.get()); + uint64_t part_size = part->get_size(); + if (handled_parts < (int)part_etags.size() - 1 && + part_size < min_part_size) { + ret = -ERR_TOO_SMALL; + return ret; + } + + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + if (etags_iter->first != (int)obj_iter->first) { + ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: " + << etags_iter->first << " next uploaded: " + << obj_iter->first << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + string part_etag = rgw_string_unquote(etags_iter->second); + if (part_etag.compare(part->get_etag()) != 0) { + ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first + << " etag: " << etags_iter->second << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + + hex_to_buf(part->get_etag().c_str(), petag, + CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + + RGWUploadPartInfo& obj_part = part->info; + + /* update manifest for part */ + string oid = mp_obj.get_part(part->info.num); + rgw_obj src_obj; + src_obj.init_ns(bucket->get_key(), oid, mp_ns); + + if (obj_part.manifest.empty()) { + ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj=" + << src_obj << dendl; + ret = -ERR_INVALID_PART; + return ret; + } else { + manifest.append(dpp, obj_part.manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params()); + auto manifest_prefix = part->info.manifest.get_prefix(); + if (not manifest_prefix.empty()) { + // It has an explicit prefix. Override the default one. + src_obj.init_ns(bucket->get_key(), manifest_prefix + "." 
+ std::to_string(part->info.num), mp_ns); + } + } + + bool part_compressed = (obj_part.cs_info.compression_type != "none"); + if ((handled_parts > 0) && + ((part_compressed != compressed) || + (cs_info.compression_type != obj_part.cs_info.compression_type))) { + ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload (" + << cs_info.compression_type << ">>" << obj_part.cs_info.compression_type << ")" << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + + if (part_compressed) { + int64_t new_ofs; // offset in compression data for new part + if (cs_info.blocks.size() > 0) + new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len; + else + new_ofs = 0; + for (const auto& block : obj_part.cs_info.blocks) { + compression_block cb; + cb.old_ofs = block.old_ofs + cs_info.orig_size; + cb.new_ofs = new_ofs; + cb.len = block.len; + cs_info.blocks.push_back(cb); + new_ofs = cb.new_ofs + cb.len; + } + if (!compressed) + cs_info.compression_type = obj_part.cs_info.compression_type; + cs_info.orig_size += obj_part.cs_info.orig_size; + compressed = true; + } + + rgw_obj_index_key remove_key; + src_obj.key.get_index_key(&remove_key); + + remove_objs.push_back(remove_key); + + cleanup_part_history(dpp, y, part, remove_objs); + + ofs += obj_part.size; + accounted_size += obj_part.accounted_size; + } + } while (truncated); + hash.Final((unsigned char *)final_etag); + + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], + sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)part_etags.size()); + etag = final_etag_str; + ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl; + + etag_bl.append(etag); + + attrs[RGW_ATTR_ETAG] = etag_bl; + + if (compressed) { + // write compression attribute to full object + bufferlist tmp; + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + } + + target_obj->set_atomic(); + + 
RGWRados::Object op_target(store->getRados(), + target_obj->get_bucket()->get_info(), + dynamic_cast(target_obj)->get_ctx(), + target_obj->get_obj()); + RGWRados::Object::Write obj_op(&op_target); + + obj_op.meta.manifest = &manifest; + obj_op.meta.remove_objs = &remove_objs; + + obj_op.meta.ptag = &tag; /* use req_id as operation tag */ + obj_op.meta.owner = owner.get_id(); + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.modify_tail = true; + obj_op.meta.completeMultipart = true; + obj_op.meta.olh_epoch = olh_epoch; + + ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs, y); + if (ret < 0) + return ret; + + return ret; +} + +int RadosMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) +{ + if (!rule && !attrs) { + return 0; + } + + if (rule) { + if (!placement.empty()) { + *rule = &placement; + if (!attrs) { + /* Don't need attrs, done */ + return 0; + } + } else { + *rule = nullptr; + } + } + + /* We need either attributes or placement, so we need a read */ + std::unique_ptr meta_obj; + meta_obj = get_meta_obj(); + meta_obj->set_in_extra_data(true); + + multipart_upload_info upload_info; + bufferlist headbl; + + /* Read the obj head which contains the multipart_upload_info */ + std::unique_ptr read_op = meta_obj->get_read_op(); + meta_obj->set_prefetch_data(); + + int ret = read_op->prepare(y, dpp); + if (ret < 0) { + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + extract_span_context(meta_obj->get_attrs(), trace_ctx); + + if (attrs) { + /* Attrs are filled in by prepare */ + *attrs = meta_obj->get_attrs(); + if (!rule || *rule != nullptr) { + /* placement was cached; don't actually read */ + return 0; + } + } + + /* Now read the placement from the head */ + ret = read_op->read(0, store->ctx()->_conf->rgw_max_chunk_size, headbl, y, dpp); + if (ret < 0) { + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + if 
(headbl.length() <= 0) { + return -ERR_NO_SUCH_UPLOAD; + } + + /* Decode multipart_upload_info */ + auto hiter = headbl.cbegin(); + try { + decode(upload_info, hiter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl; + return -EIO; + } + placement = upload_info.dest_placement; + *rule = &placement; + + return 0; +} + +std::unique_ptr RadosMultipartUpload::get_writer( + const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) +{ + RGWBucketInfo& bucket_info = obj->get_bucket()->get_info(); + RGWObjectCtx& obj_ctx = static_cast(obj)->get_ctx(); + auto aio = rgw::make_throttle(store->ctx()->_conf->rgw_put_obj_min_window_size, y); + return std::make_unique(dpp, y, get_upload_id(), + bucket_info, obj_ctx, + obj->get_obj(), store, std::move(aio), owner, + ptail_placement_rule, part_num, part_num_str); +} + +MPRadosSerializer::MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name) : + lock(lock_name) +{ + rgw_pool meta_pool; + rgw_raw_obj raw_obj; + + obj->get_raw_obj(&raw_obj); + oid = raw_obj.oid; + store->getRados()->get_obj_data_pool(obj->get_bucket()->get_placement_rule(), + obj->get_obj(), &meta_pool); + store->getRados()->open_pool_ctx(dpp, meta_pool, ioctx, true, true); +} + +int MPRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) +{ + op.assert_exists(); + lock.set_duration(dur); + lock.lock_exclusive(&op); + int ret = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (! 
ret) { + locked = true; + } + return ret; +} + +LCRadosSerializer::LCRadosSerializer(RadosStore* store, const std::string& _oid, const std::string& lock_name, const std::string& cookie) : + StoreLCSerializer(_oid), + lock(lock_name) +{ + ioctx = &store->getRados()->lc_pool_ctx; + lock.set_cookie(cookie); +} + +int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) +{ + lock.set_duration(dur); + return lock.lock_exclusive(ioctx, oid); +} + +int RadosLifecycle::get_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) +{ + cls_rgw_lc_entry cls_entry; + int ret = cls_rgw_lc_get_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker, cls_entry); + if (ret) + return ret; + + LCEntry* e; + e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status); + if (!e) + return -ENOMEM; + + entry->reset(e); + return 0; +} + +int RadosLifecycle::get_next_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) +{ + cls_rgw_lc_entry cls_entry; + int ret = cls_rgw_lc_get_next_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker, + cls_entry); + + if (ret) + return ret; + + LCEntry* e; + e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status); + if (!e) + return -ENOMEM; + + entry->reset(e); + return 0; +} + +int RadosLifecycle::set_entry(const std::string& oid, LCEntry& entry) +{ + cls_rgw_lc_entry cls_entry; + + cls_entry.bucket = entry.get_bucket(); + cls_entry.start_time = entry.get_start_time(); + cls_entry.status = entry.get_status(); + + return cls_rgw_lc_set_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry); +} + +int RadosLifecycle::list_entries(const std::string& oid, const std::string& marker, + uint32_t max_entries, std::vector>& entries) +{ + entries.clear(); + + vector cls_entries; + int ret = cls_rgw_lc_list(*store->getRados()->get_lc_pool_ctx(), oid, marker, max_entries, cls_entries); + + if (ret < 0) + return ret; + + 
for (auto& entry : cls_entries) { + entries.push_back(std::make_unique(entry.bucket, oid, + entry.start_time, entry.status)); + } + + return ret; +} + +int RadosLifecycle::rm_entry(const std::string& oid, LCEntry& entry) +{ + cls_rgw_lc_entry cls_entry; + + cls_entry.bucket = entry.get_bucket(); + cls_entry.start_time = entry.get_start_time(); + cls_entry.status = entry.get_status(); + + return cls_rgw_lc_rm_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry); +} + +int RadosLifecycle::get_head(const std::string& oid, std::unique_ptr* head) +{ + cls_rgw_lc_obj_head cls_head; + int ret = cls_rgw_lc_get_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head); + if (ret) + return ret; + + LCHead* h; + h = new StoreLCHead(cls_head.start_date, cls_head.shard_rollover_date, cls_head.marker); + if (!h) + return -ENOMEM; + + head->reset(h); + return 0; +} + +int RadosLifecycle::put_head(const std::string& oid, LCHead& head) +{ + cls_rgw_lc_obj_head cls_head; + + cls_head.marker = head.get_marker(); + cls_head.start_date = head.get_start_date(); + cls_head.shard_rollover_date = head.get_shard_rollover_date(); + + return cls_rgw_lc_put_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head); +} + +std::unique_ptr RadosLifecycle::get_serializer(const std::string& lock_name, + const std::string& oid, + const std::string& cookie) +{ + return std::make_unique(store, oid, lock_name, cookie); +} + +int RadosNotification::publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags) +{ + return rgw::notify::publish_reserve(dpp, event_type, res, obj_tags); +} + +int RadosNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, const std::string& etag, const std::string& version) +{ + return rgw::notify::publish_commit(obj, size, mtime, etag, version, event_type, res, dpp); +} + +int RadosAtomicWriter::prepare(optional_yield y) +{ + return processor.prepare(y); +} + +int RadosAtomicWriter::process(bufferlist&& 
data, uint64_t offset) +{ + return processor.process(std::move(data), offset); +} + +int RadosAtomicWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) +{ + return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, + if_match, if_nomatch, user_data, zones_trace, canceled, y); +} + +int RadosAppendWriter::prepare(optional_yield y) +{ + return processor.prepare(y); +} + +int RadosAppendWriter::process(bufferlist&& data, uint64_t offset) +{ + return processor.process(std::move(data), offset); +} + +int RadosAppendWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) +{ + return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, + if_match, if_nomatch, user_data, zones_trace, canceled, y); +} + +int RadosMultipartWriter::prepare(optional_yield y) +{ + return processor.prepare(y); +} + +int RadosMultipartWriter::process(bufferlist&& data, uint64_t offset) +{ + return processor.process(std::move(data), offset); +} + +int RadosMultipartWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) +{ + return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, + if_match, if_nomatch, user_data, zones_trace, canceled, y); +} + +const std::string& 
RadosZoneGroup::get_endpoint() const +{ + if (!group.endpoints.empty()) { + return group.endpoints.front(); + } else { + // use zonegroup's master zone endpoints + auto z = group.zones.find(group.master_zone); + if (z != group.zones.end() && !z->second.endpoints.empty()) { + return z->second.endpoints.front(); + } + } + return empty; +} + +bool RadosZoneGroup::placement_target_exists(std::string& target) const +{ + return !!group.placement_targets.count(target); +} + +void RadosZoneGroup::get_placement_target_names(std::set& names) const +{ + for (const auto& target : group.placement_targets) { + names.emplace(target.second.name); + } +} + +int RadosZoneGroup::get_placement_tier(const rgw_placement_rule& rule, + std::unique_ptr* tier) +{ + std::map::const_iterator titer; + titer = group.placement_targets.find(rule.name); + if (titer == group.placement_targets.end()) { + return -ENOENT; + } + + const auto& target_rule = titer->second; + std::map::const_iterator ttier; + ttier = target_rule.tier_targets.find(rule.storage_class); + if (ttier == target_rule.tier_targets.end()) { + // not found + return -ENOENT; + } + + PlacementTier* t; + t = new RadosPlacementTier(store, ttier->second); + if (!t) + return -ENOMEM; + + tier->reset(t); + return 0; +} + +int RadosZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr* zone) +{ + RGWZone* rz = store->svc()->zone->find_zone(id); + if (!rz) + return -ENOENT; + + Zone* z = new RadosZone(store, clone(), *rz); + zone->reset(z); + return 0; +} + +int RadosZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr* zone) +{ + rgw_zone_id id; + int ret = store->svc()->zone->find_zone_id_by_name(name, &id); + if (ret < 0) + return ret; + + RGWZone* rz = store->svc()->zone->find_zone(id.id); + if (!rz) + return -ENOENT; + + Zone* z = new RadosZone(store, clone(), *rz); + zone->reset(z); + return 0; +} + +int RadosZoneGroup::list_zones(std::list& zone_ids) +{ + for (const auto& entry : group.zones) + { + 
zone_ids.push_back(entry.second.id); + } + return 0; +} + +std::unique_ptr RadosZone::clone() +{ + if (local_zone) + return std::make_unique(store, group->clone()); + + return std::make_unique(store, group->clone(), rgw_zone); +} + +const std::string& RadosZone::get_id() +{ + if (local_zone) + return store->svc()->zone->zone_id().id; + + return rgw_zone.id; +} + +const std::string& RadosZone::get_name() const +{ + if (local_zone) + return store->svc()->zone->zone_name(); + + return rgw_zone.name; +} + +bool RadosZone::is_writeable() +{ + if (local_zone) + return store->svc()->zone->zone_is_writeable(); + + return !rgw_zone.read_only; +} + +/** + * Report this zone's redirect target through the @p endpoint out-parameter. + * + * The local zone delegates to the zone service; any other zone copies its + * redirect_zone value into *endpoint and returns true. + * NOTE(review): redirect_zone looks like a zone identifier rather than an + * endpoint URL -- confirm what callers expect here. + */ +bool RadosZone::get_redirect_endpoint(std::string* endpoint) +{ + if (local_zone) + return store->svc()->zone->get_redirect_zone_endpoint(endpoint); + + // Store through the pointer. Assigning to the pointer itself only changed + // the local parameter, so the caller never received the value. + if (endpoint) + *endpoint = rgw_zone.redirect_zone; + return true; +} + +bool RadosZone::has_zonegroup_api(const std::string& api) const +{ + return store->svc()->zone->has_zonegroup_api(api); +} + +const std::string& RadosZone::get_current_period_id() +{ + return store->svc()->zone->get_current_period_id(); +} + +const RGWAccessKey& RadosZone::get_system_key() +{ + return store->svc()->zone->get_zone_params().system_key; +} + +const std::string& RadosZone::get_realm_name() +{ + return store->svc()->zone->get_realm().get_name(); +} + +const std::string& RadosZone::get_realm_id() +{ + return store->svc()->zone->get_realm().get_id(); +} + +/** + * Return the tier type of this zone. + * + * The local zone asks the zone service; any other zone answers from its own + * rgw_zone record. The returned view refers to storage owned by that record. + */ +const std::string_view RadosZone::get_tier_type() +{ + if (local_zone) + return store->svc()->zone->get_zone().tier_type; + + // Return the tier type, matching the local-zone branch above; this used to + // hand back the zone id by mistake. + return rgw_zone.tier_type; +} + +RGWBucketSyncPolicyHandlerRef RadosZone::get_sync_policy_handler() +{ + return store->svc()->zone->get_sync_policy_handler(get_id()); +} + +RadosLuaManager::RadosLuaManager(RadosStore* _s) : + store(_s), + pool((store->svc() && store->svc()->zone) ? 
store->svc()->zone->get_zone_params().log_pool : rgw_pool()) +{ } + +int RadosLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) +{ + if (pool.empty()) { + ldpp_dout(dpp, 10) << "WARNING: missing pool when reading lua script " << dendl; + return 0; + } + bufferlist bl; + + int r = rgw_get_system_obj(store->svc()->sysobj, pool, key, bl, nullptr, nullptr, y, dpp); + if (r < 0) { + return r; + } + + auto iter = bl.cbegin(); + try { + ceph::decode(script, iter); + } catch (buffer::error& err) { + return -EIO; + } + + return 0; +} + +int RadosLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) +{ + if (pool.empty()) { + ldpp_dout(dpp, 10) << "WARNING: missing pool when writing lua script " << dendl; + return 0; + } + bufferlist bl; + ceph::encode(script, bl); + + int r = rgw_put_system_obj(dpp, store->svc()->sysobj, pool, key, bl, false, nullptr, real_time(), y); + if (r < 0) { + return r; + } + + return 0; +} + +int RadosLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) +{ + if (pool.empty()) { + ldpp_dout(dpp, 10) << "WARNING: missing pool when deleting lua script " << dendl; + return 0; + } + int r = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, key, nullptr, y); + if (r < 0 && r != -ENOENT) { + return r; + } + + return 0; +} + +const std::string PACKAGE_LIST_OBJECT_NAME = "lua_package_allowlist"; + +int RadosLuaManager::add_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name) +{ + // add package to list + const bufferlist empty_bl; + std::map new_package{{package_name, empty_bl}}; + librados::ObjectWriteOperation op; + op.omap_set(new_package); + auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()), + PACKAGE_LIST_OBJECT_NAME, &op, y); + + if (ret < 0) { + return ret; + } + return 0; +} + +int 
RadosLuaManager::remove_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name) +{ + librados::ObjectWriteOperation op; + size_t pos = package_name.find(" "); + if (pos != package_name.npos) { + // remove specfic version of the the package + op.omap_rm_keys(std::set({package_name})); + auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()), + PACKAGE_LIST_OBJECT_NAME, &op, y); + if (ret < 0) { + return ret; + } + return 0; + } + // otherwise, remove any existing versions of the package + rgw::lua::packages_t packages; + auto ret = list_packages(dpp, y, packages); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + for(const auto& package : packages) { + const std::string package_no_version = package.substr(0, package.find(" ")); + if (package_no_version.compare(package_name) == 0) { + op.omap_rm_keys(std::set({package})); + ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()), + PACKAGE_LIST_OBJECT_NAME, &op, y); + if (ret < 0) { + return ret; + } + } + } + return 0; +} + +int RadosLuaManager::list_packages(const DoutPrefixProvider *dpp, optional_yield y, rgw::lua::packages_t& packages) +{ + constexpr auto max_chunk = 1024U; + std::string start_after; + bool more = true; + int rval; + while (more) { + librados::ObjectReadOperation op; + rgw::lua::packages_t packages_chunk; + op.omap_get_keys2(start_after, max_chunk, &packages_chunk, &more, &rval); + const auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()), + PACKAGE_LIST_OBJECT_NAME, &op, nullptr, y); + + if (ret < 0) { + return ret; + } + + packages.merge(packages_chunk); + } + + return 0; +} + +int RadosOIDCProvider::store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) +{ + auto sysobj = store->svc()->sysobj; + std::string oid = tenant + get_url_oid_prefix() + url; + + bufferlist bl; + using ceph::encode; + encode(*this, bl); + return rgw_put_system_obj(dpp, sysobj, 
store->svc()->zone->get_zone_params().oidc_pool, oid, bl, exclusive, nullptr, real_time(), y); +} + +int RadosOIDCProvider::read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) +{ + auto sysobj = store->svc()->sysobj; + auto& pool = store->svc()->zone->get_zone_params().oidc_pool; + std::string oid = tenant + get_url_oid_prefix() + url; + bufferlist bl; + + int ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr, null_yield, dpp); + if (ret < 0) { + return ret; + } + + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(*this, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: " << pool.name << + ": " << url << dendl; + return -EIO; + } + + return 0; +} + +int RadosOIDCProvider::delete_obj(const DoutPrefixProvider *dpp, optional_yield y) +{ + auto& pool = store->svc()->zone->get_zone_params().oidc_pool; + + std::string url, tenant; + auto ret = get_tenant_url_from_arn(tenant, url); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl; + return -EINVAL; + } + + if (this->tenant != tenant) { + ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", " + << tenant << ": " << dendl; + return -EINVAL; + } + + // Delete url + std::string oid = tenant + get_url_oid_prefix() + url; + ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: deleting oidc url from pool: " << pool.name << ": " + << provider_url << ": " << cpp_strerror(-ret) << dendl; + } + + return ret; +} + +int RadosRole::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + using ceph::encode; + std::string oid; + + oid = info.id; + + bufferlist bl; + encode(this->info, bl); + + if (!this->info.tags.empty()) { + bufferlist bl_tags; + encode(this->info.tags, bl_tags); + map attrs; + attrs.emplace("tagging", bl_tags); + + 
RGWSI_MBSObj_PutParams params(bl, &attrs, info.mtime, exclusive); + std::unique_ptr ctx(store->svc()->role->svc.meta_be->alloc_ctx()); + ctx->init(store->svc()->role->get_be_handler()); + return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp); + } else { + RGWSI_MBSObj_PutParams params(bl, nullptr, info.mtime, exclusive); + std::unique_ptr ctx(store->svc()->role->svc.meta_be->alloc_ctx()); + ctx->init(store->svc()->role->get_be_handler()); + return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp); + } +} + +int RadosRole::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + auto sysobj = store->svc()->sysobj; + RGWNameToId nameToId; + nameToId.obj_id = info.id; + + std::string oid = info.tenant + get_names_oid_prefix() + info.name; + + bufferlist bl; + using ceph::encode; + encode(nameToId, bl); + + return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y); +} + +int RadosRole::store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + auto sysobj = store->svc()->sysobj; + std::string oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id; + + bufferlist bl; + + return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y); +} + +int RadosRole::read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) +{ + auto sysobj = store->svc()->sysobj; + std::string oid = info.tenant + get_names_oid_prefix() + role_name; + bufferlist bl; + + int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, y, dpp); + if (ret < 0) { + return ret; + } + + RGWNameToId nameToId; + try { + auto iter = bl.cbegin(); + using 
ceph::decode; + decode(nameToId, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode role from Role pool: " << role_name << dendl; + return -EIO; + } + role_id = nameToId.obj_id; + return 0; +} + +int RadosRole::read_name(const DoutPrefixProvider *dpp, optional_yield y) +{ + auto sysobj = store->svc()->sysobj; + std::string oid = info.tenant + get_names_oid_prefix() + info.name; + bufferlist bl; + + int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed reading role name from Role pool: " << info.name << + ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + RGWNameToId nameToId; + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(nameToId, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode role name from Role pool: " << info.name << dendl; + return -EIO; + } + info.id = nameToId.obj_id; + return 0; +} + +int RadosRole::read_info(const DoutPrefixProvider *dpp, optional_yield y) +{ + std::string oid; + + oid = info.id; + ldpp_dout(dpp, 20) << "INFO: oid in read_info is: " << oid << dendl; + + bufferlist bl; + + RGWSI_MBSObj_GetParams params(&bl, &info.attrs, &info.mtime); + std::unique_ptr ctx(store->svc()->role->svc.meta_be->alloc_ctx()); + ctx->init(store->svc()->role->get_be_handler()); + int ret = store->svc()->role->svc.meta_be->get(ctx.get(), oid, params, &info.objv_tracker, y, dpp, true); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed reading role info from Role pool: " << info.id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(this->info, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode role info from Role pool: " << info.id << dendl; + return -EIO; + } + + auto it = info.attrs.find("tagging"); + if (it != info.attrs.end()) 
{ + bufferlist bl_tags = it->second; + try { + using ceph::decode; + auto iter = bl_tags.cbegin(); + decode(info.tags, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode attrs" << info.id << dendl; + return -EIO; + } + } + + return 0; +} + +int RadosRole::create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y) +{ + int ret; + + if (! validate_input(dpp)) { + return -EINVAL; + } + + if (!role_id.empty()) { + info.id = role_id; + } + + /* check to see the name is not used */ + ret = read_id(dpp, info.name, info.tenant, info.id, y); + if (exclusive && ret == 0) { + ldpp_dout(dpp, 0) << "ERROR: name " << info.name << " already in use for role id " + << info.id << dendl; + return -EEXIST; + } else if ( ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading role id " << info.id << ": " + << cpp_strerror(-ret) << dendl; + return ret; + } + + if (info.id.empty()) { + /* create unique id */ + uuid_d new_uuid; + char uuid_str[37]; + new_uuid.generate_random(); + new_uuid.print(uuid_str); + info.id = uuid_str; + } + + //arn + info.arn = role_arn_prefix + info.tenant + ":role" + info.path + info.name; + + // Creation time + real_clock::time_point t = real_clock::now(); + + struct timeval tv; + real_clock::to_timeval(t, tv); + + char buf[30]; + struct tm result; + gmtime_r(&tv.tv_sec, &result); + strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result); + sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000); + info.creation_date.assign(buf, strlen(buf)); + + auto& pool = store->svc()->zone->get_zone_params().roles_pool; + ret = store_info(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing role info in Role pool: " + << info.id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = store_name(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing role name in Role pool: " + << info.name << ": " << cpp_strerror(-ret) << dendl; + + 
//Delete the role info that was stored in the previous call + std::string oid = get_info_oid_prefix() + info.id; + int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (info_ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: " + << info.id << ": " << cpp_strerror(-info_ret) << dendl; + } + return ret; + } + + ret = store_path(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing role path in Role pool: " + << info.path << ": " << cpp_strerror(-ret) << dendl; + //Delete the role info that was stored in the previous call + std::string oid = get_info_oid_prefix() + info.id; + int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (info_ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: " + << info.id << ": " << cpp_strerror(-info_ret) << dendl; + } + //Delete role name that was stored in previous call + oid = info.tenant + get_names_oid_prefix() + info.name; + int name_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (name_ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: cleanup of role name from Role pool: " + << info.name << ": " << cpp_strerror(-name_ret) << dendl; + } + return ret; + } + return 0; +} + +int RadosRole::delete_obj(const DoutPrefixProvider *dpp, optional_yield y) +{ + auto& pool = store->svc()->zone->get_zone_params().roles_pool; + + int ret = read_name(dpp, y); + if (ret < 0) { + return ret; + } + + ret = read_info(dpp, y); + if (ret < 0) { + return ret; + } + + if (! 
info.perm_policy_map.empty()) { + return -ERR_DELETE_CONFLICT; + } + + // Delete id + std::string oid = get_info_oid_prefix() + info.id; + ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: deleting role id from Role pool: " + << info.id << ": " << cpp_strerror(-ret) << dendl; + } + + // Delete name + oid = info.tenant + get_names_oid_prefix() + info.name; + ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: deleting role name from Role pool: " + << info.name << ": " << cpp_strerror(-ret) << dendl; + } + + // Delete path + oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id; + ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: deleting role path from Role pool: " + << info.path << ": " << cpp_strerror(-ret) << dendl; + } + return ret; +} + +} // namespace rgw::sal + +extern "C" { + +void* newRadosStore(void) +{ + rgw::sal::RadosStore* store = new rgw::sal::RadosStore(); + if (store) { + RGWRados* rados = new RGWRados(); + + if (!rados) { + delete store; store = nullptr; + } else { + store->setRados(rados); + rados->set_store(store); + } + } + + return store; +} + +} diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h new file mode 100644 index 000000000..4d2dc9709 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sal_rados.h @@ -0,0 +1,978 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_sal_store.h" +#include "rgw_rados.h" +#include "rgw_notify.h" +#include "rgw_oidc_provider.h" +#include "rgw_role.h" +#include "rgw_multi.h" +#include "rgw_putobj_processor.h" +#include "services/svc_tier_rados.h" +#include "cls/lock/cls_lock_client.h" + +namespace rgw { namespace sal { + +class RadosMultipartUpload; + +class RadosCompletions : public Completions { + public: + std::list handles; + RadosCompletions() {} + ~RadosCompletions() = default; + virtual int drain() override; +}; + +class RadosPlacementTier: public StorePlacementTier { + RadosStore* store; + RGWZoneGroupPlacementTier tier; +public: + RadosPlacementTier(RadosStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {} + virtual ~RadosPlacementTier() = default; + + virtual const std::string& get_tier_type() { return tier.tier_type; } + virtual const std::string& get_storage_class() { return tier.storage_class; } + virtual bool retain_head_object() { return tier.retain_head_object; } + RGWZoneGroupPlacementTier& get_rt() { return tier; } +}; + +class RadosZoneGroup : public StoreZoneGroup { + RadosStore* store; + const RGWZoneGroup group; + std::string empty; +public: + RadosZoneGroup(RadosStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {} + virtual ~RadosZoneGroup() = default; + + virtual const std::string& get_id() const override { return group.get_id(); }; + virtual const std::string& get_name() const override { return group.get_name(); }; + virtual int equals(const std::string& other_zonegroup) const override { + return group.equals(other_zonegroup); + }; + /** Get the endpoint from zonegroup, or from master zone if not set */ + virtual const std::string& get_endpoint() const override; + virtual bool placement_target_exists(std::string& target) const override; + virtual bool is_master_zonegroup() const override { + return group.is_master_zonegroup(); + }; + virtual const std::string& get_api_name() 
const override { return group.api_name; }; + virtual void get_placement_target_names(std::set& names) const override; + virtual const std::string& get_default_placement_name() const override { + return group.default_placement.name; }; + virtual int get_hostnames(std::list& names) const override { + names = group.hostnames; + return 0; + }; + virtual int get_s3website_hostnames(std::list& names) const override { + names = group.hostnames_s3website; + return 0; + }; + virtual int get_zone_count() const override { + return group.zones.size(); + } + virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr* tier); + virtual int get_zone_by_id(const std::string& id, std::unique_ptr* zone) override; + virtual int get_zone_by_name(const std::string& name, std::unique_ptr* zone) override; + virtual int list_zones(std::list& zone_ids) override; + bool supports(std::string_view feature) const override { + return group.supports(feature); + } + virtual std::unique_ptr clone() override { + return std::make_unique(store, group); + } + const RGWZoneGroup& get_group() const { return group; } +}; + +class RadosZone : public StoreZone { + protected: + RadosStore* store; + std::unique_ptr group; + RGWZone rgw_zone; + bool local_zone{false}; + public: + RadosZone(RadosStore* _store, std::unique_ptr _zg) : store(_store), group(std::move(_zg)), local_zone(true) {} + RadosZone(RadosStore* _store, std::unique_ptr _zg, RGWZone& z) : store(_store), group(std::move(_zg)), rgw_zone(z) {} + ~RadosZone() = default; + + virtual std::unique_ptr clone() override; + virtual ZoneGroup& get_zonegroup() override { return *(group.get()); } + virtual const std::string& get_id() override; + virtual const std::string& get_name() const override; + virtual bool is_writeable() override; + virtual bool get_redirect_endpoint(std::string* endpoint) override; + virtual bool has_zonegroup_api(const std::string& api) const override; + virtual const std::string& get_current_period_id() 
override; + virtual const RGWAccessKey& get_system_key() override; + virtual const std::string& get_realm_name() override; + virtual const std::string& get_realm_id() override; + virtual const std::string_view get_tier_type() override; + virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override; +}; + +class RadosStore : public StoreDriver { + private: + RGWRados* rados; + RGWUserCtl* user_ctl; + std::unique_ptr zone; + std::string topics_oid(const std::string& tenant) const; + + public: + RadosStore() + : rados(nullptr) { + } + ~RadosStore() { + delete rados; + } + + virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override; + virtual const std::string get_name() const override { + return "rados"; + } + virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual std::unique_ptr get_user(const rgw_user& u) override; + virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr* user) override; + virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr* user) override; + virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) override; + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) override; + virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) override; + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr* bucket, optional_yield y) override; + virtual bool is_meta_master() override; + virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv, + bufferlist& in_data, JSONParser* jp, req_info& 
info, + optional_yield y) override; + virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) override; + virtual Zone* get_zone() { return zone.get(); } + virtual std::string zone_unique_id(uint64_t unique_num) override; + virtual std::string zone_unique_trans_id(const uint64_t unique_num) override; + virtual int get_zonegroup(const std::string& id, std::unique_ptr* zonegroup) override; + virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list& zone_ids) override; + virtual int cluster_stat(RGWClusterStat& stats) override; + virtual std::unique_ptr get_lifecycle(void) override; + virtual std::unique_ptr get_completions(void) override; + virtual std::unique_ptr get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) override; + virtual std::unique_ptr get_notification( + const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, + rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, + std::string& _req_id, optional_yield y) override; + int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override; + int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override; + int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override; + virtual RGWLC* get_rgwlc(void) override { return rados->get_lc(); } + virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return rados->get_cr_registry(); } + + virtual int 
log_usage(const DoutPrefixProvider *dpp, std::map& usage_info) override; + virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override; + virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, + const std::map& meta) override; + virtual void get_quota(RGWQuota& quota) override; + virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override; + virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector& buckets, bool enabled) override; + virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) override; + virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override; + virtual void wakeup_meta_sync_shards(std::set& shard_ids) override { rados->wakeup_meta_sync_shards(shard_ids); } + virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map>& shard_ids) override { rados->wakeup_data_sync_shards(dpp, source_zone, shard_ids); } + virtual int clear_usage(const DoutPrefixProvider *dpp) override { return rados->clear_usage(dpp); } + virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, + RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + virtual int get_config_key_val(std::string name, bufferlist* bl) override; + virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override; + virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list& keys, bool* truncated) 
override; + virtual void meta_list_keys_complete(void* handle) override; + virtual std::string meta_get_marker(void* handle) override; + virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) override; + virtual const RGWSyncModuleInstanceRef& get_sync_module() { return rados->get_sync_module(); } + virtual std::string get_host_id() { return rados->host_id; } + virtual std::unique_ptr get_lua_manager() override; + virtual std::unique_ptr get_role(std::string name, + std::string tenant, + std::string path="", + std::string trust_policy="", + std::string max_session_duration_str="", + std::multimap tags={}) override; + virtual std::unique_ptr get_role(std::string id) override; + virtual std::unique_ptr get_role(const RGWRoleInfo& info) override; + virtual int get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) override; + virtual std::unique_ptr get_oidc_provider() override; + virtual int get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + std::vector>& providers) override; + virtual std::unique_ptr get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) override; + virtual std::unique_ptr get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) override; + virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override; + virtual bool valid_placement(const rgw_placement_rule& rule) override; + + virtual void finalize(void) override; + + virtual CephContext* ctx(void) override { return rados->ctx(); } + + virtual void 
register_admin_apis(RGWRESTMgr* mgr) override; + + /* Unique to RadosStore */ + int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + librados::IoCtx* ioctx); + int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj); + int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio); + void get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj); + int get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size); + + void setRados(RGWRados * st) { rados = st; } + RGWRados* getRados(void) { return rados; } + + RGWServices* svc() { return &rados->svc; } + const RGWServices* svc() const { return &rados->svc; } + RGWCtl* ctl() { return &rados->ctl; } + const RGWCtl* ctl() const { return &rados->ctl; } + + void setUserCtl(RGWUserCtl *_ctl) { user_ctl = _ctl; } +}; + +class RadosUser : public StoreUser { + private: + RadosStore* store; + + public: + RadosUser(RadosStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { } + RadosUser(RadosStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { } + RadosUser(RadosStore *_st) : store(_st) { } + RadosUser(RadosUser& _o) = default; + + virtual std::unique_ptr clone() override { + return std::unique_ptr(new RadosUser(*this)); + } + int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, const std::string& end_marker, + uint64_t max, bool need_stats, BucketList& buckets, + optional_yield y) override; + virtual int create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + 
std::unique_ptr* bucket, + optional_yield y) override; + virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time* last_stats_sync = nullptr, + ceph::real_time* last_stats_update = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override; + virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + + virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override; + virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override; + + friend class RadosBucket; +}; + +class RadosObject : public StoreObject { + private: + RadosStore* store; + RGWAccessControlPolicy acls; + RGWObjManifest *manifest{nullptr}; + RGWObjectCtx* rados_ctx; + bool rados_ctx_owned; + + public: + + struct RadosReadOp : public ReadOp { + private: + RadosObject* source; + RGWObjectCtx* rctx; + RGWRados::Object op_target; + RGWRados::Object::Read parent_op; + + public: + RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx); + + virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override; + + /* + * Both `read` and `iterate` read up through index 
`end` + * *inclusive*. The number of bytes that could be returned is + * `end - ofs + 1`. + */ + virtual int read(int64_t ofs, int64_t end, + bufferlist& bl, optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual int iterate(const DoutPrefixProvider* dpp, + int64_t ofs, int64_t end, + RGWGetDataCB* cb, optional_yield y) override; + + virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override; + }; + + struct RadosDeleteOp : public DeleteOp { + private: + RadosObject* source; + RGWRados::Object op_target; + RGWRados::Object::Delete parent_op; + + public: + RadosDeleteOp(RadosObject* _source); + + virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override; + }; + + RadosObject(RadosStore *_st, const rgw_obj_key& _k) + : StoreObject(_k), + store(_st), + acls(), + rados_ctx(new RGWObjectCtx(dynamic_cast(store))), + rados_ctx_owned(true) { + } + RadosObject(RadosStore *_st, const rgw_obj_key& _k, Bucket* _b) + : StoreObject(_k, _b), + store(_st), + acls(), + rados_ctx(new RGWObjectCtx(dynamic_cast(store))) , + rados_ctx_owned(true) { + } + RadosObject(RadosObject& _o) : StoreObject(_o) { + store = _o.store; + acls = _o.acls; + manifest = _o.manifest; + rados_ctx = _o.rados_ctx; + rados_ctx_owned = false; + } + + virtual ~RadosObject(); + + virtual void invalidate() override { + StoreObject::invalidate(); + rados_ctx->invalidate(get_obj()); + } + virtual int delete_object(const DoutPrefixProvider* dpp, + optional_yield y, bool prevent_versioning) override; + virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio, + bool keep_index_consistent, optional_yield y) override; + virtual int copy_object(User* user, + req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, 
ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, std::string* tag, std::string* etag, + void (*progress_cb)(off_t, void *), void* progress_data, + const DoutPrefixProvider* dpp, optional_yield y) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } + virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } + virtual void set_atomic() override { + rados_ctx->set_atomic(state.obj); + StoreObject::set_atomic(); + } + virtual void set_prefetch_data() override { + rados_ctx->set_prefetch_data(state.obj); + StoreObject::set_prefetch_data(); + } + virtual void set_compressed() override { + rados_ctx->set_compressed(state.obj); + StoreObject::set_compressed(); + } + + virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override; + virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; + virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; + virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override; + virtual bool is_expired() override; + virtual void gen_rand_obj_instance_name() override; + void get_raw_obj(rgw_raw_obj* raw_obj); + virtual std::unique_ptr clone() override { + return std::unique_ptr(new RadosObject(*this)); + } + virtual std::unique_ptr get_serializer(const DoutPrefixProvider *dpp, + const std::string& lock_name) override; + virtual int 
transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int transition_to_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_bucket_dir_entry& o, + std::set& cloud_targets, + CephContext* cct, + bool update_object, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; + virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override; + + /* Swift versioning */ + virtual int swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) override; + virtual int swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) override; + + /* OPs */ + virtual std::unique_ptr get_read_op() override; + virtual std::unique_ptr get_delete_op() override; + + /* OMAP */ + virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool* pmore, optional_yield y) override; + virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map *m, + optional_yield y) override; + virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) override; + virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) override; + virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override; + + /* Internal to RadosStore */ + int get_max_chunk_size(const DoutPrefixProvider* dpp, + rgw_placement_rule placement_rule, + uint64_t* max_chunk_size, + uint64_t* alignment = nullptr); + void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t* max_size); + void raw_obj_to_obj(const rgw_raw_obj& raw_obj); + int write_cloud_tier(const DoutPrefixProvider* 
dpp, + optional_yield y, + uint64_t olh_epoch, + rgw::sal::PlacementTier* tier, + bool is_multipart_upload, + rgw_placement_rule& target_placement, + Object* head_obj); + RGWObjManifest* get_manifest() { return manifest; } + RGWObjectCtx& get_ctx() { return *rados_ctx; } + + private: + int read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr); +}; + +class RadosBucket : public StoreBucket { + private: + RadosStore* store; + RGWAccessControlPolicy acls; + std::string topics_oid() const; + + public: + RadosBucket(RadosStore *_st) + : store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, User* _u) + : StoreBucket(_u), + store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, const rgw_bucket& _b) + : StoreBucket(_b), + store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, const RGWBucketEnt& _e) + : StoreBucket(_e), + store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, const RGWBucketInfo& _i) + : StoreBucket(_i), + store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, const rgw_bucket& _b, User* _u) + : StoreBucket(_b, _u), + store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, const RGWBucketEnt& _e, User* _u) + : StoreBucket(_e, _u), + store(_st), + acls() { + } + + RadosBucket(RadosStore *_st, const RGWBucketInfo& _i, User* _u) + : StoreBucket(_i, _u), + store(_st), + acls() { + } + + virtual ~RadosBucket(); + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) override; + virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override; + virtual int remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } + 
virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) override; + virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, std::string* master_ver, + std::map& stats, + std::string* max_marker = nullptr, + bool* syncstopped = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB* ctx) override; + virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int update_container_stats(const DoutPrefixProvider* dpp) override; + virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override; + virtual int chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) override; + virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) override; + virtual bool is_owner(User* user) override; + virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs, optional_yield y) override; + virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) override; + 
virtual int check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) override; + virtual int rebuild_index(const DoutPrefixProvider *dpp) override; + virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override; + virtual int purge_instance(const DoutPrefixProvider* dpp) override; + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + virtual std::unique_ptr get_multipart_upload( + const std::string& oid, + std::optional upload_id=std::nullopt, + ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override; + virtual int list_multiparts(const DoutPrefixProvider *dpp, + const std::string& prefix, + std::string& marker, + const std::string& delim, + const int& max_uploads, + std::vector>& uploads, + std::map *common_prefixes, + bool *is_truncated) override; + virtual int abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) override; + int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override; + int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override; + int remove_topics(RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override; + + private: + int link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true, RGWObjVersionTracker* objv = nullptr); + int unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true); + friend class RadosUser; +}; + +class RadosMultipartPart : public StoreMultipartPart { +protected: + RGWUploadPartInfo info; + +public: + RadosMultipartPart() = default; + virtual ~RadosMultipartPart() = default; + + virtual uint32_t get_num() { return info.num; } + virtual uint64_t get_size() { return info.accounted_size; } + 
virtual const std::string& get_etag() { return info.etag; } + virtual ceph::real_time& get_mtime() { return info.modified; } + + /* For RadosStore code */ + RGWObjManifest& get_manifest() { return info.manifest; } + const std::set& get_past_prefixes() const { return info.past_prefixes; } + + friend class RadosMultipartUpload; +}; + +class RadosMultipartUpload : public StoreMultipartUpload { + RadosStore* store; + RGWMPObj mp_obj; + ACLOwner owner; + ceph::real_time mtime; + rgw_placement_rule placement; + RGWObjManifest manifest; + +public: + RadosMultipartUpload(RadosStore* _store, Bucket* _bucket, const std::string& oid, + std::optional upload_id, ACLOwner owner, + ceph::real_time _mtime) + : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id), + owner(owner), mtime(_mtime) {} + virtual ~RadosMultipartUpload() = default; + + virtual const std::string& get_meta() const override { return mp_obj.get_meta(); } + virtual const std::string& get_key() const override { return mp_obj.get_key(); } + virtual const std::string& get_upload_id() const override { return mp_obj.get_upload_id(); } + virtual const ACLOwner& get_owner() const override { return owner; } + virtual ceph::real_time& get_mtime() override { return mtime; } + virtual std::unique_ptr get_meta_obj() override; + virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override; + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int num_parts, int marker, + int* next_marker, bool* truncated, + bool assume_unsorted = false) override; + virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override; + virtual int complete(const DoutPrefixProvider* dpp, + optional_yield y, CephContext* cct, + std::map& part_etags, + std::list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& ofs, + std::string& tag, ACLOwner& owner, + uint64_t 
olh_epoch, + rgw::sal::Object* target_obj) override; + virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override; + virtual std::unique_ptr get_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) override; +protected: + int cleanup_part_history(const DoutPrefixProvider* dpp, + optional_yield y, + RadosMultipartPart* part, + std::list& remove_objs); +}; + +class MPRadosSerializer : public StoreMPSerializer { + librados::IoCtx ioctx; + rados::cls::lock::Lock lock; + librados::ObjectWriteOperation op; + +public: + MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name); + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override; + virtual int unlock() override { + return lock.unlock(&ioctx, oid); + } +}; + +class LCRadosSerializer : public StoreLCSerializer { + librados::IoCtx* ioctx; + rados::cls::lock::Lock lock; + +public: + LCRadosSerializer(RadosStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie); + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override; + virtual int unlock() override { + return lock.unlock(ioctx, oid); + } +}; + +class RadosLifecycle : public StoreLifecycle { + RadosStore* store; + +public: + RadosLifecycle(RadosStore* _st) : store(_st) {} + + using StoreLifecycle::get_entry; + virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr* entry) override; + virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr* entry) override; + virtual int set_entry(const std::string& oid, LCEntry& entry) override; + virtual int list_entries(const std::string& oid, const 
std::string& marker, + uint32_t max_entries, + std::vector>& entries) override; + virtual int rm_entry(const std::string& oid, LCEntry& entry) override; + virtual int get_head(const std::string& oid, std::unique_ptr* head) override; + virtual int put_head(const std::string& oid, LCHead& head) override; + virtual std::unique_ptr get_serializer(const std::string& lock_name, + const std::string& oid, + const std::string& cookie) override; +}; + +class RadosNotification : public StoreNotification { + RadosStore* store; + /* XXX it feels incorrect to me that rgw::notify::reservation_t is + * currently RADOS-specific; instead, I think notification types such as + * reservation_t should be generally visible, whereas the internal + * notification behavior should be made portable (e.g., notification + * to non-RADOS message sinks) */ + rgw::notify::reservation_t res; + + public: + RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, req_state* _s, rgw::notify::EventType _type, optional_yield y, const std::string* object_name) : + StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _s, _obj, _src_obj, object_name, y) { } + + RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, rgw::notify::EventType _type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) : + StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _obj, _src_obj, _bucket, _user_id, _user_tenant, _req_id, y) {} + + ~RadosNotification() = default; + + rgw::notify::reservation_t& get_reservation(void) { + return res; + } + + virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override; + virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, const std::string& etag, const std::string& version) override; +}; + +class 
/**
 * StoreWriter that feeds object data through a
 * rgw::putobj::AppendObjectProcessor.
 *
 * NOTE(review): the template arguments of std::unique_ptr and std::map are
 * missing in this copy of the declaration -- restore them from the upstream
 * header before building.
 */
class RadosAppendWriter : public StoreWriter {
protected:
  rgw::sal::RadosStore* store;
  // declared before `processor`: members are initialized in declaration
  // order and the processor is constructed from &*aio
  std::unique_ptr aio;
  RGWObjectCtx& obj_ctx;
  rgw::putobj::AppendObjectProcessor processor;

public:
  // Takes ownership of _aio; `position` and `cur_accounted_size` are
  // forwarded unchanged to the AppendObjectProcessor.
  RadosAppendWriter(const DoutPrefixProvider *dpp,
                    optional_yield y,
                    RGWBucketInfo& bucket_info,
                    RGWObjectCtx& obj_ctx,
                    const rgw_obj& obj,
                    RadosStore* _store, std::unique_ptr _aio,
                    const rgw_user& owner,
                    const rgw_placement_rule *ptail_placement_rule,
                    const std::string& unique_tag,
                    uint64_t position,
                    uint64_t *cur_accounted_size) :
                        StoreWriter(dpp, y),
                        store(_store),
                        aio(std::move(_aio)),
                        // the parameter shadows the member here: this binds the
                        // member reference to the caller's RGWObjectCtx
                        obj_ctx(obj_ctx),
                        processor(&*aio, store->getRados(), bucket_info,
                                  ptail_placement_rule, owner, obj_ctx,
                                  obj, unique_tag, position,
                                  cur_accounted_size, dpp, y)
  {}
  ~RadosAppendWriter() = default;

  // prepare to start processing object data
  virtual int prepare(optional_yield y) override;

  // Process a bufferlist
  virtual int process(bufferlist&& data, uint64_t offset) override;

  // complete the operation and make its result visible to clients
  virtual int complete(size_t accounted_size, const std::string& etag,
                       ceph::real_time *mtime, ceph::real_time set_mtime,
                       std::map& attrs,
                       ceph::real_time delete_at,
                       const char *if_match, const char *if_nomatch,
                       const std::string *user_data,
                       rgw_zone_set *zones_trace, bool *canceled,
                       optional_yield y) override;
};
ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; +}; + +class RadosLuaManager : public StoreLuaManager { + RadosStore* const store; + rgw_pool pool; + +public: + RadosLuaManager(RadosStore* _s); + virtual ~RadosLuaManager() = default; + + virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script); + virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script); + virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key); + virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name); + virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name); + virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages); +}; + +class RadosOIDCProvider : public RGWOIDCProvider { + RadosStore* store; +public: + RadosOIDCProvider(RadosStore* _store) : store(_store) {} + ~RadosOIDCProvider() = default; + + virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override; + virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override; + virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override; + void encode(bufferlist& bl) const { + RGWOIDCProvider::encode(bl); + } + void decode(bufferlist::const_iterator& bl) { + RGWOIDCProvider::decode(bl); + } +}; + +class RadosRole : public RGWRole { + RadosStore* store; +public: + RadosRole(RadosStore* _store, std::string name, + std::string tenant, + std::string path, + std::string trust_policy, + std::string max_session_duration, + std::multimap 
tags) : RGWRole(name, tenant, path, trust_policy, max_session_duration, tags), store(_store) {} + RadosRole(RadosStore* _store, std::string id) : RGWRole(id), store(_store) {} + RadosRole(RadosStore* _store, const RGWRoleInfo& info) : RGWRole(info), store(_store) {} + RadosRole(RadosStore* _store) : store(_store) {} + ~RadosRole() = default; + + virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override; + virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override; + virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override; + virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) override; + virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y) override; + virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override; +}; +}} // namespace rgw::sal + +WRITE_CLASS_ENCODER(rgw::sal::RadosOIDCProvider) diff --git a/src/rgw/driver/rados/rgw_service.cc b/src/rgw/driver/rados/rgw_service.cc new file mode 100644 index 000000000..4fcb1ebde --- /dev/null +++ b/src/rgw/driver/rados/rgw_service.cc @@ -0,0 +1,476 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_service.h" + +#include "services/svc_finisher.h" +#include "services/svc_bi_rados.h" +#include "services/svc_bilog_rados.h" +#include "services/svc_bucket_sobj.h" +#include "services/svc_bucket_sync_sobj.h" +#include "services/svc_cls.h" +#include "services/svc_config_key_rados.h" +#include "services/svc_mdlog.h" +#include "services/svc_meta.h" +#include "services/svc_meta_be.h" +#include 
"services/svc_meta_be_sobj.h" +#include "services/svc_meta_be_otp.h" +#include "services/svc_notify.h" +#include "services/svc_otp.h" +#include "services/svc_rados.h" +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_quota.h" +#include "services/svc_sync_modules.h" +#include "services/svc_sys_obj.h" +#include "services/svc_sys_obj_cache.h" +#include "services/svc_sys_obj_core.h" +#include "services/svc_user_rados.h" +#include "services/svc_role_rados.h" + +#include "common/errno.h" + +#include "rgw_bucket.h" +#include "rgw_datalog.h" +#include "rgw_metadata.h" +#include "rgw_otp.h" +#include "rgw_user.h" +#include "rgw_role.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWServices_Def::RGWServices_Def() = default; +RGWServices_Def::~RGWServices_Def() +{ + shutdown(); +} + +int RGWServices_Def::init(CephContext *cct, + bool have_cache, + bool raw, + bool run_sync, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + finisher = std::make_unique(cct); + bucket_sobj = std::make_unique(cct); + bucket_sync_sobj = std::make_unique(cct); + bi_rados = std::make_unique(cct); + bilog_rados = std::make_unique(cct); + cls = std::make_unique(cct); + config_key_rados = std::make_unique(cct); + datalog_rados = std::make_unique(cct); + mdlog = std::make_unique(cct, run_sync); + meta = std::make_unique(cct); + meta_be_sobj = std::make_unique(cct); + meta_be_otp = std::make_unique(cct); + notify = std::make_unique(cct); + otp = std::make_unique(cct); + rados = std::make_unique(cct); + zone = std::make_unique(cct); + zone_utils = std::make_unique(cct); + quota = std::make_unique(cct); + sync_modules = std::make_unique(cct); + sysobj = std::make_unique(cct); + sysobj_core = std::make_unique(cct); + user_rados = std::make_unique(cct); + role_rados = std::make_unique(cct); + + if (have_cache) { + sysobj_cache = std::make_unique(dpp, cct); + } + + vector meta_bes{meta_be_sobj.get(), meta_be_otp.get()}; + + 
finisher->init(); + bi_rados->init(zone.get(), rados.get(), bilog_rados.get(), datalog_rados.get()); + bilog_rados->init(bi_rados.get()); + bucket_sobj->init(zone.get(), sysobj.get(), sysobj_cache.get(), + bi_rados.get(), meta.get(), meta_be_sobj.get(), + sync_modules.get(), bucket_sync_sobj.get()); + bucket_sync_sobj->init(zone.get(), + sysobj.get(), + sysobj_cache.get(), + bucket_sobj.get()); + cls->init(zone.get(), rados.get()); + config_key_rados->init(rados.get()); + mdlog->init(rados.get(), zone.get(), sysobj.get(), cls.get()); + meta->init(sysobj.get(), mdlog.get(), meta_bes); + meta_be_sobj->init(sysobj.get(), mdlog.get()); + meta_be_otp->init(sysobj.get(), mdlog.get(), cls.get()); + notify->init(zone.get(), rados.get(), finisher.get()); + otp->init(zone.get(), meta.get(), meta_be_otp.get()); + rados->init(); + zone->init(sysobj.get(), rados.get(), sync_modules.get(), bucket_sync_sobj.get()); + zone_utils->init(rados.get(), zone.get()); + quota->init(zone.get()); + sync_modules->init(zone.get()); + sysobj_core->core_init(rados.get(), zone.get()); + if (have_cache) { + sysobj_cache->init(rados.get(), zone.get(), notify.get()); + sysobj->init(rados.get(), sysobj_cache.get()); + } else { + sysobj->init(rados.get(), sysobj_core.get()); + } + user_rados->init(rados.get(), zone.get(), sysobj.get(), sysobj_cache.get(), + meta.get(), meta_be_sobj.get(), sync_modules.get()); + role_rados->init(zone.get(), meta.get(), meta_be_sobj.get(), sysobj.get()); + + can_shutdown = true; + + int r = finisher->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start finisher service (" << cpp_strerror(-r) << dendl; + return r; + } + + if (!raw) { + r = notify->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start notify service (" << cpp_strerror(-r) << dendl; + return r; + } + } + + r = rados->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start rados service (" << cpp_strerror(-r) << dendl; + return r; + } + + 
if (!raw) { + r = zone->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start zone service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = datalog_rados->start(dpp, &zone->get_zone(), + zone->get_zone_params(), + rados->get_rados_handle()); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start datalog_rados service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = mdlog->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start mdlog service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = sync_modules->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start sync modules service (" << cpp_strerror(-r) << dendl; + return r; + } + } + + r = cls->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start cls service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = config_key_rados->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start config_key service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = zone_utils->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start zone_utils service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = quota->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start quota service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = sysobj_core->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_core service (" << cpp_strerror(-r) << dendl; + return r; + } + + if (have_cache) { + r = sysobj_cache->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_cache service (" << cpp_strerror(-r) << dendl; + return r; + } + } + + r = sysobj->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj service (" << cpp_strerror(-r) << dendl; + return r; + } + + if (!raw) { + r = meta_be_sobj->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to 
start meta_be_sobj service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = meta->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start meta service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = bucket_sobj->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start bucket service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = bucket_sync_sobj->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start bucket_sync service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = user_rados->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start user_rados service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = otp->start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start otp service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = role_rados->start(y, dpp); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start role_rados service (" << cpp_strerror(-r) << dendl; + return r; + } + + } + + /* cache or core services will be started by sysobj */ + + return 0; +} + +void RGWServices_Def::shutdown() +{ + if (!can_shutdown) { + return; + } + + if (has_shutdown) { + return; + } + + role_rados->shutdown(); + datalog_rados.reset(); + user_rados->shutdown(); + sync_modules->shutdown(); + otp->shutdown(); + notify->shutdown(); + meta_be_otp->shutdown(); + meta_be_sobj->shutdown(); + meta->shutdown(); + mdlog->shutdown(); + config_key_rados->shutdown(); + cls->shutdown(); + bilog_rados->shutdown(); + bi_rados->shutdown(); + bucket_sync_sobj->shutdown(); + bucket_sobj->shutdown(); + finisher->shutdown(); + + sysobj->shutdown(); + sysobj_core->shutdown(); + notify->shutdown(); + if (sysobj_cache) { + sysobj_cache->shutdown(); + } + quota->shutdown(); + zone_utils->shutdown(); + zone->shutdown(); + rados->shutdown(); + + has_shutdown = true; + +} + + +int RGWServices::do_init(CephContext *_cct, bool have_cache, bool raw, bool 
run_sync, optional_yield y, const DoutPrefixProvider *dpp) +{ + cct = _cct; + + int r = _svc.init(cct, have_cache, raw, run_sync, y, dpp); + if (r < 0) { + return r; + } + + finisher = _svc.finisher.get(); + bi_rados = _svc.bi_rados.get(); + bi = bi_rados; + bilog_rados = _svc.bilog_rados.get(); + bucket_sobj = _svc.bucket_sobj.get(); + bucket = bucket_sobj; + bucket_sync_sobj = _svc.bucket_sync_sobj.get(); + bucket_sync = bucket_sync_sobj; + cls = _svc.cls.get(); + config_key_rados = _svc.config_key_rados.get(); + config_key = config_key_rados; + datalog_rados = _svc.datalog_rados.get(); + mdlog = _svc.mdlog.get(); + meta = _svc.meta.get(); + meta_be_sobj = _svc.meta_be_sobj.get(); + meta_be_otp = _svc.meta_be_otp.get(); + notify = _svc.notify.get(); + otp = _svc.otp.get(); + rados = _svc.rados.get(); + zone = _svc.zone.get(); + zone_utils = _svc.zone_utils.get(); + quota = _svc.quota.get(); + sync_modules = _svc.sync_modules.get(); + sysobj = _svc.sysobj.get(); + cache = _svc.sysobj_cache.get(); + core = _svc.sysobj_core.get(); + user = _svc.user_rados.get(); + role = _svc.role_rados.get(); + + return 0; +} + +RGWServiceInstance::~RGWServiceInstance() {} + +int RGWServiceInstance::start(optional_yield y, const DoutPrefixProvider *dpp) +{ + if (start_state != StateInit) { + return 0; + } + + start_state = StateStarting;; /* setting started prior to do_start() on purpose so that circular + references can call start() on each other */ + + int r = do_start(y, dpp); + if (r < 0) { + return r; + } + + start_state = StateStarted; + + return 0; +} + +RGWCtlDef::RGWCtlDef() {} +RGWCtlDef::~RGWCtlDef() {} +RGWCtlDef::_meta::_meta() {} +RGWCtlDef::_meta::~_meta() {} + + +int RGWCtlDef::init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp) +{ + meta.mgr.reset(new RGWMetadataManager(svc.meta)); + + meta.user.reset(RGWUserMetaHandlerAllocator::alloc(svc.user)); + + auto sync_module = svc.sync_modules->get_sync_module(); + if (sync_module) { + 
meta.bucket.reset(sync_module->alloc_bucket_meta_handler()); + meta.bucket_instance.reset(sync_module->alloc_bucket_instance_meta_handler(driver)); + } else { + meta.bucket.reset(RGWBucketMetaHandlerAllocator::alloc()); + meta.bucket_instance.reset(RGWBucketInstanceMetaHandlerAllocator::alloc(driver)); + } + + meta.otp.reset(RGWOTPMetaHandlerAllocator::alloc()); + meta.role = std::make_unique(driver, svc.role); + + user.reset(new RGWUserCtl(svc.zone, svc.user, (RGWUserMetadataHandler *)meta.user.get())); + bucket.reset(new RGWBucketCtl(svc.zone, + svc.bucket, + svc.bucket_sync, + svc.bi, svc.user)); + otp.reset(new RGWOTPCtl(svc.zone, svc.otp)); + + RGWBucketMetadataHandlerBase *bucket_meta_handler = static_cast(meta.bucket.get()); + RGWBucketInstanceMetadataHandlerBase *bi_meta_handler = static_cast(meta.bucket_instance.get()); + + bucket_meta_handler->init(svc.bucket, bucket.get()); + bi_meta_handler->init(svc.zone, svc.bucket, svc.bi); + + RGWOTPMetadataHandlerBase *otp_handler = static_cast(meta.otp.get()); + otp_handler->init(svc.zone, svc.meta_be_otp, svc.otp); + + user->init(bucket.get()); + bucket->init(user.get(), + (RGWBucketMetadataHandler *)bucket_meta_handler, + (RGWBucketInstanceMetadataHandler *)bi_meta_handler, + svc.datalog_rados, + dpp); + + otp->init((RGWOTPMetadataHandler *)meta.otp.get()); + + return 0; +} + +int RGWCtl::init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp) +{ + svc = _svc; + cct = svc->cct; + + int r = _ctl.init(*svc, driver, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start init ctls (" << cpp_strerror(-r) << dendl; + return r; + } + + meta.mgr = _ctl.meta.mgr.get(); + meta.user = _ctl.meta.user.get(); + meta.bucket = _ctl.meta.bucket.get(); + meta.bucket_instance = _ctl.meta.bucket_instance.get(); + meta.otp = _ctl.meta.otp.get(); + meta.role = _ctl.meta.role.get(); + + user = _ctl.user.get(); + bucket = _ctl.bucket.get(); + otp = _ctl.otp.get(); + + r = 
meta.user->attach(meta.mgr); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start init meta.user ctl (" << cpp_strerror(-r) << dendl; + return r; + } + + r = meta.bucket->attach(meta.mgr); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start init meta.bucket ctl (" << cpp_strerror(-r) << dendl; + return r; + } + + r = meta.bucket_instance->attach(meta.mgr); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start init meta.bucket_instance ctl (" << cpp_strerror(-r) << dendl; + return r; + } + + r = meta.otp->attach(meta.mgr); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl; + return r; + } + + r = meta.role->attach(meta.mgr); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl; + return r; + } + return 0; +} + diff --git a/src/rgw/driver/rados/rgw_service.h b/src/rgw/driver/rados/rgw_service.h new file mode 100644 index 000000000..4c0b8d842 --- /dev/null +++ b/src/rgw/driver/rados/rgw_service.h @@ -0,0 +1,215 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "common/async/yield_context.h" + +#include "rgw_common.h" + +struct RGWServices_Def; + +class RGWServiceInstance +{ + friend struct RGWServices_Def; + +protected: + CephContext *cct; + + enum StartState { + StateInit = 0, + StateStarting = 1, + StateStarted = 2, + } start_state{StateInit}; + + virtual void shutdown() {} + virtual int do_start(optional_yield, const DoutPrefixProvider *dpp) { + return 0; + } +public: + RGWServiceInstance(CephContext *_cct) : cct(_cct) {} + virtual ~RGWServiceInstance(); + + int start(optional_yield y, const DoutPrefixProvider *dpp); + bool is_started() { + return (start_state == StateStarted); + } + + CephContext *ctx() { + return cct; + } +}; + +class RGWSI_Finisher; +class RGWSI_Bucket; +class RGWSI_Bucket_SObj; +class RGWSI_Bucket_Sync; +class 
RGWSI_Bucket_Sync_SObj; +class RGWSI_BucketIndex; +class RGWSI_BucketIndex_RADOS; +class RGWSI_BILog_RADOS; +class RGWSI_Cls; +class RGWSI_ConfigKey; +class RGWSI_ConfigKey_RADOS; +class RGWSI_MDLog; +class RGWSI_Meta; +class RGWSI_MetaBackend; +class RGWSI_MetaBackend_SObj; +class RGWSI_MetaBackend_OTP; +class RGWSI_Notify; +class RGWSI_OTP; +class RGWSI_RADOS; +class RGWSI_Zone; +class RGWSI_ZoneUtils; +class RGWSI_Quota; +class RGWSI_SyncModules; +class RGWSI_SysObj; +class RGWSI_SysObj_Core; +class RGWSI_SysObj_Cache; +class RGWSI_User; +class RGWSI_User_RADOS; +class RGWDataChangesLog; +class RGWSI_Role_RADOS; + +struct RGWServices_Def +{ + bool can_shutdown{false}; + bool has_shutdown{false}; + + std::unique_ptr finisher; + std::unique_ptr bucket_sobj; + std::unique_ptr bucket_sync_sobj; + std::unique_ptr bi_rados; + std::unique_ptr bilog_rados; + std::unique_ptr cls; + std::unique_ptr config_key_rados; + std::unique_ptr mdlog; + std::unique_ptr meta; + std::unique_ptr meta_be_sobj; + std::unique_ptr meta_be_otp; + std::unique_ptr notify; + std::unique_ptr otp; + std::unique_ptr rados; + std::unique_ptr zone; + std::unique_ptr zone_utils; + std::unique_ptr quota; + std::unique_ptr sync_modules; + std::unique_ptr sysobj; + std::unique_ptr sysobj_core; + std::unique_ptr sysobj_cache; + std::unique_ptr user_rados; + std::unique_ptr datalog_rados; + std::unique_ptr role_rados; + + RGWServices_Def(); + ~RGWServices_Def(); + + int init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp); + void shutdown(); +}; + + +struct RGWServices +{ + RGWServices_Def _svc; + + CephContext *cct; + + RGWSI_Finisher *finisher{nullptr}; + RGWSI_Bucket *bucket{nullptr}; + RGWSI_Bucket_SObj *bucket_sobj{nullptr}; + RGWSI_Bucket_Sync *bucket_sync{nullptr}; + RGWSI_Bucket_Sync_SObj *bucket_sync_sobj{nullptr}; + RGWSI_BucketIndex *bi{nullptr}; + RGWSI_BucketIndex_RADOS *bi_rados{nullptr}; + RGWSI_BILog_RADOS 
*bilog_rados{nullptr}; + RGWSI_Cls *cls{nullptr}; + RGWSI_ConfigKey_RADOS *config_key_rados{nullptr}; + RGWSI_ConfigKey *config_key{nullptr}; + RGWDataChangesLog *datalog_rados{nullptr}; + RGWSI_MDLog *mdlog{nullptr}; + RGWSI_Meta *meta{nullptr}; + RGWSI_MetaBackend *meta_be_sobj{nullptr}; + RGWSI_MetaBackend *meta_be_otp{nullptr}; + RGWSI_Notify *notify{nullptr}; + RGWSI_OTP *otp{nullptr}; + RGWSI_RADOS *rados{nullptr}; + RGWSI_Zone *zone{nullptr}; + RGWSI_ZoneUtils *zone_utils{nullptr}; + RGWSI_Quota *quota{nullptr}; + RGWSI_SyncModules *sync_modules{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + RGWSI_SysObj_Cache *cache{nullptr}; + RGWSI_SysObj_Core *core{nullptr}; + RGWSI_User *user{nullptr}; + RGWSI_Role_RADOS *role{nullptr}; + + int do_init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp); + + int init(CephContext *cct, bool have_cache, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp) { + return do_init(cct, have_cache, false, run_sync, y, dpp); + } + + int init_raw(CephContext *cct, bool have_cache, optional_yield y, const DoutPrefixProvider *dpp) { + return do_init(cct, have_cache, true, false, y, dpp); + } + void shutdown() { + _svc.shutdown(); + } +}; + +class RGWMetadataManager; +class RGWMetadataHandler; +class RGWUserCtl; +class RGWBucketCtl; +class RGWOTPCtl; + +struct RGWCtlDef { + struct _meta { + std::unique_ptr mgr; + std::unique_ptr bucket; + std::unique_ptr bucket_instance; + std::unique_ptr user; + std::unique_ptr otp; + std::unique_ptr role; + + _meta(); + ~_meta(); + } meta; + + std::unique_ptr user; + std::unique_ptr bucket; + std::unique_ptr otp; + + RGWCtlDef(); + ~RGWCtlDef(); + + int init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp); +}; + +struct RGWCtl { + CephContext *cct{nullptr}; + RGWServices *svc{nullptr}; + + RGWCtlDef _ctl; + + struct _meta { + RGWMetadataManager *mgr{nullptr}; + + RGWMetadataHandler *bucket{nullptr}; 
+ RGWMetadataHandler *bucket_instance{nullptr}; + RGWMetadataHandler *user{nullptr}; + RGWMetadataHandler *otp{nullptr}; + RGWMetadataHandler *role{nullptr}; + } meta; + + RGWUserCtl *user{nullptr}; + RGWBucketCtl *bucket{nullptr}; + RGWOTPCtl *otp{nullptr}; + + int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp); +}; diff --git a/src/rgw/driver/rados/rgw_sync.cc b/src/rgw/driver/rados/rgw_sync.cc new file mode 100644 index 000000000..d0ec90796 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync.cc @@ -0,0 +1,2568 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_sync.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" + +#include "services/svc_zone.h" +#include "services/svc_mdlog.h" +#include "services/svc_cls.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "meta sync: ") + +using namespace std; + +static string mdlog_sync_status_oid = "mdlog.sync-status"; +static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard"; +static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index"; + +RGWContinuousLeaseCR::~RGWContinuousLeaseCR() {} + +RGWSyncErrorLogger::RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) { + for (int i = 0; i < num_shards; i++) { + oids.push_back(get_shard_oid(oid_prefix, i)); + } +} +string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) { + char buf[oid_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id); + return string(buf); +} + +RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const DoutPrefixProvider *dpp, const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) { + cls_log_entry entry; + + rgw_sync_error_info 
info(source_zone, error_code, message); + bufferlist bl; + encode(info, bl); + store->svc()->cls->timelog.prepare_entry(entry, real_clock::now(), section, name, bl); + + uint32_t shard_id = ++counter % num_shards; + + + return new RGWRadosTimelogAddCR(dpp, store, oids[shard_id], entry); +} + +void RGWSyncBackoff::update_wait_time() +{ + if (cur_wait == 0) { + cur_wait = 1; + } else { + cur_wait = (cur_wait << 1); + } + if (cur_wait >= max_secs) { + cur_wait = max_secs; + } +} + +void RGWSyncBackoff::backoff_sleep() +{ + update_wait_time(); + sleep(cur_wait); +} + +void RGWSyncBackoff::backoff(RGWCoroutine *op) +{ + update_wait_time(); + op->wait(utime_t(cur_wait, 0)); +} + +int RGWBackoffControlCR::operate(const DoutPrefixProvider *dpp) { + reenter(this) { + // retry the operation until it succeeds + while (true) { + yield { + std::lock_guard l{lock}; + cr = alloc_cr(); + cr->get(); + call(cr); + } + { + std::lock_guard l{lock}; + cr->put(); + cr = NULL; + } + if (retcode >= 0) { + break; + } + if (retcode != -EBUSY && retcode != -EAGAIN) { + ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl; + if (exit_on_error) { + return set_cr_error(retcode); + } + } + if (reset_backoff) { + backoff.reset(); + } + yield backoff.backoff(this); + } + + // run an optional finisher + yield call(alloc_finisher_cr()); + if (retcode < 0) { + ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +void rgw_mdlog_info::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("num_objects", num_shards, obj); + JSONDecoder::decode_json("period", period, obj); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +void rgw_mdlog_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("section", section, obj); + JSONDecoder::decode_json("name", name, obj); + utime_t ut; + 
/*
 * Drive the spawn/collect loop.
 *
 * Phase 1 keeps calling spawn_next() while it returns true; each time the
 * number of running children reaches max_concurrent, it waits for a child
 * before spawning more. Phase 2 drains the children that are still running.
 * Every collected result goes through handle_result(); a negative value is
 * remembered in `status` (the most recent one wins) and reported once
 * everything has been collected.
 */
int RGWShardCollectCR::operate(const DoutPrefixProvider *dpp) {
  reenter(this) {
    while (spawn_next()) {
      current_running++;

      // at the concurrency limit: wait for a child before spawning more
      if (current_running >= max_concurrent) {
        // only read after collect_next() has returned true
        int child_ret;
        yield wait_for_child();
        if (collect_next(&child_ret)) {
          current_running--;
          // handle_result() may turn a tolerated error into success
          child_ret = handle_result(child_ret);
          if (child_ret < 0) {
            status = child_ret;
          }
        }
      }
    }
    // nothing left to spawn: drain the children still in flight
    while (current_running > 0) {
      int child_ret;
      yield wait_for_child();
      if (collect_next(&child_ret)) {
        current_running--;
        child_ret = handle_result(child_ret);
        if (child_ret < 0) {
          status = child_ret;
        }
      }
    }
    if (status < 0) {
      return set_cr_error(status);
    }
    return set_cr_done();
  }
  return 0;
}
*result; + + map::iterator iter; +#define READ_MDLOG_MAX_CONCURRENT 10 + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to list remote mdlog shard: " << cpp_strerror(r) << dendl; + } + return r; + } +public: + RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env, + const std::string& period, map& _shards, + int _max_entries_per_shard, + map *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT), + sync_env(_sync_env), period(period), + max_entries_per_shard(_max_entries_per_shard), + result(_result) { + shards.swap(_shards); + iter = shards.begin(); + } + bool spawn_next() override; +}; + +int RGWRemoteMetaLog::read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info) +{ + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { NULL, NULL } }; + + int ret = conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl; + + return 0; +} + +int RGWRemoteMetaLog::read_master_log_shards_info(const DoutPrefixProvider *dpp, const string &master_period, map *shards_info) +{ + if (store->svc()->zone->is_meta_master()) { + return 0; + } + + rgw_mdlog_info log_info; + int ret = read_log_info(dpp, &log_info); + if (ret < 0) { + return ret; + } + + return run(dpp, new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info)); +} + +int RGWRemoteMetaLog::read_master_log_shards_next(const DoutPrefixProvider *dpp, const string& period, map shard_markers, map *result) +{ + if (store->svc()->zone->is_meta_master()) { + return 0; + } + + return run(dpp, new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result)); +} + +int RGWRemoteMetaLog::init() +{ + conn = store->svc()->zone->get_master_conn(); + + int ret = 
http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + + error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS); + + init_sync_env(&sync_env); + + tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta"); + + return 0; +} + +#define CLONE_MAX_ENTRIES 100 + +int RGWMetaSyncStatusManager::init(const DoutPrefixProvider *dpp) +{ + if (store->svc()->zone->is_meta_master()) { + return 0; + } + + if (!store->svc()->zone->get_master_conn()) { + ldpp_dout(dpp, -1) << "no REST connection to master zone" << dendl; + return -EIO; + } + + int r = rgw_init_ioctx(dpp, store->getRados()->get_rados_handle(), store->svc()->zone->get_zone_params().log_pool, ioctx, true); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to open log pool (" << store->svc()->zone->get_zone_params().log_pool << " ret=" << r << dendl; + return r; + } + + r = master_log.init(); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to init remote log, r=" << r << dendl; + return r; + } + + RGWMetaSyncEnv& sync_env = master_log.get_sync_env(); + + rgw_meta_sync_status sync_status; + r = read_sync_status(dpp, &sync_status); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: failed to read sync status, r=" << r << dendl; + return r; + } + + int num_shards = sync_status.sync_info.num_shards; + + for (int i = 0; i < num_shards; i++) { + shard_objs[i] = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.shard_obj_name(i)); + } + + std::unique_lock wl{ts_to_shard_lock}; + for (int i = 0; i < num_shards; i++) { + clone_markers.push_back(string()); + utime_shard ut; + ut.shard_id = i; + ts_to_shard[ut] = i; + } + + return 0; +} + +void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn, + RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager, + 
RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) { + dpp = _dpp; + cct = _cct; + store = _store; + conn = _conn; + async_rados = _async_rados; + http_manager = _http_manager; + error_logger = _error_logger; + sync_tracer = _sync_tracer; +} + +string RGWMetaSyncEnv::status_oid() +{ + return mdlog_sync_status_oid; +} + +string RGWMetaSyncEnv::shard_obj_name(int shard_id) +{ + char buf[mdlog_sync_status_shard_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id); + + return string(buf); +} + +class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + RGWMetadataLog *mdlog; + int shard_id; + int max_entries; + +protected: + int _send_request(const DoutPrefixProvider *dpp) override { + real_time from_time; + real_time end_time; + + void *handle; + + mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle); + + int ret = mdlog->list_entries(dpp, handle, max_entries, entries, &marker, &truncated); + + mdlog->complete_list_entries(handle); + + return ret; + } +public: + string marker; + list entries; + bool truncated; + + RGWAsyncReadMDLogEntries(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + RGWMetadataLog* mdlog, int _shard_id, + std::string _marker, int _max_entries) + : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(_store), mdlog(mdlog), + shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {} +}; + +class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + RGWMetadataLog *const mdlog; + int shard_id; + string marker; + string *pmarker; + int max_entries; + list *entries; + bool *truncated; + + RGWAsyncReadMDLogEntries *req{nullptr}; + +public: + RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog, + int _shard_id, string*_marker, int _max_entries, + list *_entries, 
bool *_truncated) + : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog), + shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries), + entries(_entries), truncated(_truncated) {} + + ~RGWReadMDLogEntriesCR() override { + if (req) { + req->finish(); + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + marker = *pmarker; + req = new RGWAsyncReadMDLogEntries(dpp, this, stack->create_completion_notifier(), + sync_env->store, mdlog, shard_id, marker, + max_entries); + sync_env->async_rados->queue(req); + return 0; + } + + int request_complete() override { + *pmarker = std::move(req->marker); + *entries = std::move(req->entries); + *truncated = req->truncated; + return req->get_ret_status(); + } +}; + + +class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine { + RGWMetaSyncEnv *env; + RGWRESTReadResource *http_op; + + const std::string& period; + int shard_id; + RGWMetadataLogInfo *shard_info; + +public: + RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period, + int _shard_id, RGWMetadataLogInfo *_shard_info) + : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL), + period(period), shard_id(_shard_id), shard_info(_shard_info) {} + + int operate(const DoutPrefixProvider *dpp) override { + auto store = env->store; + RGWRESTConn *conn = store->svc()->zone->get_master_conn(); + reenter(this) { + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "metadata" }, + { "id", buf }, + { "period", period.c_str() }, + { "info" , NULL }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, + env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + 
return set_cr_error(ret); + } + + return io_block(0); + } + yield { + int ret = http_op->wait(shard_info, null_yield); + http_op->put(); + if (ret < 0) { + return set_cr_error(ret); + } + return set_cr_done(); + } + } + return 0; + } +}; + +RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env, + const std::string& period, + int shard_id, + RGWMetadataLogInfo* info) +{ + return new RGWReadRemoteMDLogShardInfoCR(env, period, shard_id, info); +} + +class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + RGWRESTReadResource *http_op; + + const std::string& period; + int shard_id; + string marker; + uint32_t max_entries; + rgw_mdlog_shard_data *result; + +public: + RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period, + int _shard_id, const string& _marker, uint32_t _max_entries, + rgw_mdlog_shard_data *_result) + : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL), + period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {} + + int send_request(const DoutPrefixProvider *dpp) override { + RGWRESTConn *conn = sync_env->conn; + + char buf[32]; + snprintf(buf, sizeof(buf), "%d", shard_id); + + char max_entries_buf[32]; + snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries); + + const char *marker_key = (marker.empty() ? 
"" : "marker"); + + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "id", buf }, + { "period", period.c_str() }, + { "max-entries", max_entries_buf }, + { marker_key, marker.c_str() }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager); + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return ret; + } + + return 0; + } + + int request_complete() override { + int ret = http_op->wait(result, null_yield); + http_op->put(); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl; + return ret; + } + return 0; + } +}; + +RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env, + const std::string& period, + int shard_id, + const std::string& marker, + uint32_t max_entries, + rgw_mdlog_shard_data *result) +{ + return new RGWListRemoteMDLogShardCR(env, period, shard_id, marker, + max_entries, result); +} + +bool RGWReadRemoteMDLogInfoCR::spawn_next() { + if (shard_id >= num_shards) { + return false; + } + spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false); + shard_id++; + return true; +} + +bool RGWListRemoteMDLogCR::spawn_next() { + if (iter == shards.end()) { + return false; + } + + spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false); + ++iter; + return true; +} + +class RGWInitSyncStatusCoroutine : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + rgw_meta_sync_info status; + vector shards_info; + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; +public: + RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env, + const 
rgw_meta_sync_info &status) + : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env), + status(status), shards_info(status.num_shards), + lease_cr(nullptr), lease_stack(nullptr) {} + + ~RGWInitSyncStatusCoroutine() override { + if (lease_cr) { + lease_cr->abort(); + } + } + + int operate(const DoutPrefixProvider *dpp) override { + int ret; + reenter(this) { + yield { + set_status("acquiring sync lock"); + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + rgw::sal::RadosStore* store = sync_env->store; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()), + lock_name, lock_duration, this, nullptr)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + ldpp_dout(dpp, 5) << "failed to take lease" << dendl; + set_status("lease lock failed, early abort"); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + yield { + set_status("writing sync status"); + rgw::sal::RadosStore* store = sync_env->store; + call(new RGWSimpleRadosWriteCR(dpp, store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()), + status)); + } + + if (retcode < 0) { + set_status("failed to write sync status"); + ldpp_dout(dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl; + yield lease_cr->go_down(); + return set_cr_error(retcode); + } + /* fetch current position in logs */ + set_status("fetching remote log position"); + yield { + for (int i = 0; i < (int)status.num_shards; i++) { + spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i, + &shards_info[i]), false); + } + } + + drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */ + + yield { + set_status("updating sync status"); + for (int i = 0; i < (int)status.num_shards; i++) { + 
rgw_meta_sync_marker marker; + RGWMetadataLogInfo& info = shards_info[i]; + marker.next_step_marker = info.marker; + marker.timestamp = info.last_update; + rgw::sal::RadosStore* store = sync_env->store; + spawn(new RGWSimpleRadosWriteCR(dpp, + store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)), + marker), true); + } + } + yield { + set_status("changing sync state: build full sync maps"); + status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps; + rgw::sal::RadosStore* store = sync_env->store; + call(new RGWSimpleRadosWriteCR(dpp, store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()), + status)); + } + set_status("drop lock lease"); + yield lease_cr->go_down(); + while (collect(&ret, NULL)) { + if (ret < 0) { + return set_cr_error(ret); + } + yield; + } + drain_all(); + return set_cr_done(); + } + return 0; + } +}; + +class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + RGWMetaSyncEnv *env; + const int num_shards; + int shard_id{0}; + map& markers; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to read metadata sync markers: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards, + map& markers) + : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS), + env(env), num_shards(num_shards), markers(markers) + {} + bool spawn_next() override; +}; + +bool RGWReadSyncStatusMarkersCR::spawn_next() +{ + if (shard_id >= num_shards) { + return false; + } + using CR = RGWSimpleRadosReadCR; + rgw_raw_obj obj{env->store->svc()->zone->get_zone_params().log_pool, + env->shard_obj_name(shard_id)}; + spawn(new CR(env->dpp, env->store, obj, &markers[shard_id]), false); + shard_id++; + return true; +} + +class RGWReadSyncStatusCoroutine : public RGWCoroutine { 
+ RGWMetaSyncEnv *sync_env; + rgw_meta_sync_status *sync_status; + +public: + RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env, + rgw_meta_sync_status *_status) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status) + {} + int operate(const DoutPrefixProvider *dpp) override; +}; + +int RGWReadSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // read sync info + using ReadInfoCR = RGWSimpleRadosReadCR; + yield { + bool empty_on_enoent = false; // fail on ENOENT + rgw_raw_obj obj{sync_env->store->svc()->zone->get_zone_params().log_pool, + sync_env->status_oid()}; + call(new ReadInfoCR(dpp, sync_env->store, obj, + &sync_status->sync_info, empty_on_enoent)); + } + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to read sync status info with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + // read shard markers + using ReadMarkersCR = RGWReadSyncStatusMarkersCR; + yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards, + sync_status->sync_markers)); + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to read sync status markers with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +class RGWFetchAllMetaCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + int num_shards; + + + int ret_status; + + list sections; + list::iterator sections_iter; + + struct meta_list_result { + list keys; + string marker; + uint64_t count{0}; + bool truncated{false}; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("keys", keys, obj); + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("count", count, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + } + } result; + list::iterator iter; + + std::unique_ptr entries_index; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + bool lost_lock; + bool failed; + + string marker; + + map& 
markers; + + RGWSyncTraceNodeRef tn; + +public: + RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards, + map& _markers, + RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + num_shards(_num_shards), + ret_status(0), lease_cr(nullptr), lease_stack(nullptr), + lost_lock(false), failed(false), markers(_markers) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta"); + } + + ~RGWFetchAllMetaCR() override { + } + + void append_section_from_set(set& all_sections, const string& name) { + set::iterator iter = all_sections.find(name); + if (iter != all_sections.end()) { + sections.emplace_back(std::move(*iter)); + all_sections.erase(iter); + } + } + /* + * meta sync should go in the following order: user, bucket.instance, bucket + * then whatever other sections exist (if any) + */ + void rearrange_sections() { + set all_sections; + std::move(sections.begin(), sections.end(), + std::inserter(all_sections, all_sections.end())); + sections.clear(); + + append_section_from_set(all_sections, "user"); + append_section_from_set(all_sections, "bucket.instance"); + append_section_from_set(all_sections, "bucket"); + append_section_from_set(all_sections, "roles"); + + std::move(all_sections.begin(), all_sections.end(), + std::back_inserter(sections)); + } + + int operate(const DoutPrefixProvider *dpp) override { + RGWRESTConn *conn = sync_env->conn; + + reenter(this) { + yield { + set_status(string("acquiring lock (") + sync_env->status_oid() + ")"); + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, + sync_env->store, + rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()), + lock_name, lock_duration, this, nullptr)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + ldpp_dout(dpp, 5) << "failed to take 
lease" << dendl; + set_status("lease lock failed, early abort"); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards, + sync_env->store->svc()->zone->get_zone_params().log_pool, + mdlog_sync_full_sync_index_prefix)); + yield { + call(new RGWReadRESTResourceCR >(cct, conn, sync_env->http_manager, + "/admin/metadata", NULL, &sections)); + } + if (get_ret_status() < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl; + yield entries_index->finish(); + yield lease_cr->go_down(); + drain_all(); + return set_cr_error(get_ret_status()); + } + rearrange_sections(); + sections_iter = sections.begin(); + for (; sections_iter != sections.end(); ++sections_iter) { + do { + yield { +#define META_FULL_SYNC_CHUNK_SIZE "1000" + string entrypoint = string("/admin/metadata/") + *sections_iter; + rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE }, + { "marker", result.marker.c_str() }, + { NULL, NULL } }; + result.keys.clear(); + call(new RGWReadRESTResourceCR(cct, conn, sync_env->http_manager, + entrypoint, pairs, &result)); + } + ret_status = get_ret_status(); + if (ret_status == -ENOENT) { + set_retcode(0); /* reset coroutine status so that we don't return it */ + ret_status = 0; + } + if (ret_status < 0) { + tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter)); + yield entries_index->finish(); + yield lease_cr->go_down(); + drain_all(); + return set_cr_error(ret_status); + } + iter = result.keys.begin(); + for (; iter != result.keys.end(); ++iter) { + if (!lease_cr->is_locked()) { + lost_lock = true; + tn->log(1, "lease is lost, abort"); + break; + } + yield; // allow entries_index consumer to make progress + + tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter)); + string s = *sections_iter + ":" + *iter; + int shard_id; +
rgw::sal::RadosStore* store = sync_env->store; + int ret = store->ctl()->meta.mgr->get_shard_id(*sections_iter, *iter, &shard_id); + if (ret < 0) { + tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter)); + ret_status = ret; + break; + } + if (!entries_index->append(s, shard_id)) { + break; + } + } + } while (result.truncated); + } + yield { + if (!entries_index->finish()) { + failed = true; + } + } + if (!failed) { + for (map::iterator iter = markers.begin(); iter != markers.end(); ++iter) { + int shard_id = (int)iter->first; + rgw_meta_sync_marker& marker = iter->second; + marker.total_entries = entries_index->get_total_entries(shard_id); + spawn(new RGWSimpleRadosWriteCR(dpp, sync_env->store, + rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)), + marker), true); + } + } + + drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */ + + yield lease_cr->go_down(); + + int ret; + while (collect(&ret, NULL)) { + if (ret < 0) { + return set_cr_error(ret); + } + yield; + } + drain_all(); + if (failed) { + yield return set_cr_error(-EIO); + } + if (lost_lock) { + yield return set_cr_error(-EBUSY); + } + + if (ret_status < 0) { + yield return set_cr_error(ret_status); + } + + yield return set_cr_done(); + } + return 0; + } +}; + +static string full_sync_index_shard_oid(int shard_id) +{ + char buf[mdlog_sync_full_sync_index_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id); + return string(buf); +} + +class RGWReadRemoteMetadataCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + RGWRESTReadResource *http_op; + + string section; + string key; + + bufferlist *pbl; + + RGWSyncTraceNodeRef tn; + +public: + RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env, + const string& _section, const string& _key, bufferlist *_pbl, + const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), 
sync_env(_sync_env), + http_op(NULL), + section(_section), + key(_key), + pbl(_pbl) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta", + section + ":" + key); + } + + int operate(const DoutPrefixProvider *dpp) override { + RGWRESTConn *conn = sync_env->conn; + reenter(this) { + yield { + string key_encode; + url_encode(key, key_encode); + rgw_http_param_pair pairs[] = { { "key" , key.c_str()}, + { NULL, NULL } }; + + string p = string("/admin/metadata/") + section + "/" + key_encode; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + int ret = http_op->wait(pbl, null_yield); + http_op->put(); + if (ret < 0) { + return set_cr_error(ret); + } + return set_cr_done(); + } + } + return 0; + } +}; + +class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + string raw_key; + bufferlist bl; + const DoutPrefixProvider *dpp; +protected: + int _send_request(const DoutPrefixProvider *dpp) override { + int ret = store->ctl()->meta.mgr->put(raw_key, bl, null_yield, dpp, RGWMDLogSyncType::APPLY_ALWAYS, true); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl; + return ret; + } + return 0; + } +public: + RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + const string& _raw_key, + bufferlist& _bl, + const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store), + raw_key(_raw_key), bl(_bl), dpp(dpp) {} +}; + + +class RGWMetaStoreEntryCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + string raw_key; + bufferlist 
bl; + + RGWAsyncMetaStoreEntry *req; + +public: + RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env, + const string& _raw_key, + bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), + raw_key(_raw_key), bl(_bl), req(NULL) { + } + + ~RGWMetaStoreEntryCR() override { + if (req) { + req->finish(); + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(), + sync_env->store, raw_key, bl, dpp); + sync_env->async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest { + rgw::sal::RadosStore* store; + string raw_key; + const DoutPrefixProvider *dpp; +protected: + int _send_request(const DoutPrefixProvider *dpp) override { + int ret = store->ctl()->meta.mgr->remove(raw_key, null_yield, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl; + return ret; + } + return 0; + } +public: + RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store, + const string& _raw_key, const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store), + raw_key(_raw_key), dpp(dpp) {} +}; + + +class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + string raw_key; + + RGWAsyncMetaRemoveEntry *req; + +public: + RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env, + const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), + raw_key(_raw_key), req(NULL) { + } + + ~RGWMetaRemoveEntryCR() override { + if (req) { + req->finish(); + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(), + sync_env->store, raw_key, dpp); + sync_env->async_rados->queue(req); + return 0; + } + + int request_complete() override { + 
int r = req->get_ret_status(); + if (r == -ENOENT) { + r = 0; + } + return r; + } +}; + +#define META_SYNC_UPDATE_MARKER_WINDOW 10 + + +int RGWLastCallerWinsCR::operate(const DoutPrefixProvider *dpp) { + RGWCoroutine *call_cr; + reenter(this) { + while (cr) { + call_cr = cr; + cr = nullptr; + yield call(call_cr); + /* cr might have been modified at this point */ + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: RGWLastCallerWinsCR() failed: retcode=" << retcode << dendl; + return set_cr_error(retcode); + } + } + return set_cr_done(); + } + return 0; +} + +class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack { + RGWMetaSyncEnv *sync_env; + + string marker_oid; + rgw_meta_sync_marker sync_marker; + + RGWSyncTraceNodeRef tn; + +public: + RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env, + const string& _marker_oid, + const rgw_meta_sync_marker& _marker, + RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW), + sync_env(_sync_env), + marker_oid(_marker_oid), + sync_marker(_marker), + tn(_tn){} + + RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override { + sync_marker.marker = new_marker; + if (index_pos > 0) { + sync_marker.pos = index_pos; + } + + if (!real_clock::is_zero(timestamp)) { + sync_marker.timestamp = timestamp; + } + + ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl; + tn->log(20, SSTR("new marker=" << new_marker)); + rgw::sal::RadosStore* store = sync_env->store; + return new RGWSimpleRadosWriteCR(sync_env->dpp, store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, marker_oid), + sync_marker); + } + + RGWOrderCallCR *allocate_order_control_cr() override { + return new RGWLastCallerWinsCR(sync_env->cct); + } +}; + +RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env, + const string& 
_raw_key, const string& _entry_marker, + const RGWMDLogStatus& _op_status, + RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + raw_key(_raw_key), entry_marker(_entry_marker), + op_status(_op_status), + pos(0), sync_status(0), + marker_tracker(_marker_tracker), tries(0) { + error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0); + tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key); +} + +int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) { + reenter(this) { +#define NUM_TRANSIENT_ERROR_RETRIES 10 + + if (error_injection && + rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) { + return set_cr_error(-EIO); + } + + if (op_status != MDLOG_STATUS_COMPLETE) { + tn->log(20, "skipping pending operation"); + yield call(marker_tracker->finish(entry_marker)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + tn->set_flag(RGW_SNS_FLAG_ACTIVE); + for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) { + yield { + pos = raw_key.find(':'); + section = raw_key.substr(0, pos); + key = raw_key.substr(pos + 1); + tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? 
"" : " (retry)"))); + call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn)); + } + + sync_status = retcode; + + if (sync_status == -ENOENT) { + break; + } + + if (sync_status < 0) { + if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) { + ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata entry: " << section << ":" << key << ", will retry" << dendl; + continue; + } + + tn->log(10, SSTR("failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status)); + log_error() << "failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl; + yield call(sync_env->error_logger->log_error_cr(dpp, sync_env->conn->get_remote_id(), section, key, -sync_status, + string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status))); + return set_cr_error(sync_status); + } + + break; + } + + retcode = 0; + for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) { + if (sync_status != -ENOENT) { + tn->log(10, SSTR("storing local metadata entry: " << section << ":" << key)); + yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl)); + } else { + tn->log(10, SSTR("removing local metadata entry:" << section << ":" << key)); + yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key)); + if (retcode == -ENOENT) { + retcode = 0; + break; + } + } + if ((retcode < 0) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + ldpp_dout(dpp, 20) << *this << ": failed to store metadata entry: " << section << ":" << key << ", got retcode=" << retcode << ", will retry" << dendl; + continue; + } + break; + } + + sync_status = retcode; + + if (sync_status == 0 && marker_tracker) { + /* update marker */ + yield call(marker_tracker->finish(entry_marker)); + sync_status = retcode; + } + if (sync_status < 0) { + tn->log(10, SSTR("failed, status=" << sync_status)); + return set_cr_error(sync_status); + } + tn->log(10, "success"); + return set_cr_done(); + } + return 0; 
+} + +class RGWCloneMetaLogCoroutine : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + RGWMetadataLog *mdlog; + + const std::string& period; + int shard_id; + string marker; + bool truncated = false; + string *new_marker; + + int max_entries = CLONE_MAX_ENTRIES; + + RGWRESTReadResource *http_op = nullptr; + boost::intrusive_ptr completion; + + RGWMetadataLogInfo shard_info; + rgw_mdlog_shard_data data; + +public: + RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog, + const std::string& period, int _id, + const string& _marker, string *_new_marker) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog), + period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) { + if (new_marker) { + *new_marker = marker; + } + } + ~RGWCloneMetaLogCoroutine() override { + if (http_op) { + http_op->put(); + } + if (completion) { + completion->cancel(); + } + } + + int operate(const DoutPrefixProvider *dpp) override; + + int state_init(); + int state_read_shard_status(); + int state_read_shard_status_complete(); + int state_send_rest_request(const DoutPrefixProvider *dpp); + int state_receive_rest_response(); + int state_store_mdlog_entries(); + int state_store_mdlog_entries_complete(); +}; + +class RGWMetaSyncShardCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + const rgw_pool& pool; + const std::string& period; //< currently syncing period id + const epoch_t realm_epoch; //< realm_epoch of period + RGWMetadataLog* mdlog; //< log of syncing period + uint32_t shard_id; + rgw_meta_sync_marker& sync_marker; + boost::optional temp_marker; //< for pending updates + string marker; + string max_marker; + const std::string& period_marker; //< max marker stored in next period + + RGWRadosGetOmapKeysCR::ResultPtr omapkeys; + std::set entries; + std::set::iterator iter; + + string oid; + + RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr; + + list log_entries; + list::iterator log_iter; + bool truncated = false; + + string 
mdlog_marker; + string raw_key; + rgw_mdlog_entry mdlog_entry; + + ceph::mutex inc_lock = ceph::make_mutex("RGWMetaSyncShardCR::inc_lock"); + ceph::condition_variable inc_cond; + + boost::asio::coroutine incremental_cr; + boost::asio::coroutine full_cr; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + + bool lost_lock = false; + + bool *reset_backoff; + + // hold a reference to the cr stack while it's in the map + using StackRef = boost::intrusive_ptr; + map stack_to_pos; + map pos_to_prev; + + bool can_adjust_marker = false; + bool done_with_period = false; + + int total_entries = 0; + + RGWSyncTraceNodeRef tn; +public: + RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool, + const std::string& period, epoch_t realm_epoch, + RGWMetadataLog* mdlog, uint32_t _shard_id, + rgw_meta_sync_marker& _marker, + const std::string& period_marker, bool *_reset_backoff, + RGWSyncTraceNodeRef& _tn) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool), + period(period), realm_epoch(realm_epoch), mdlog(mdlog), + shard_id(_shard_id), sync_marker(_marker), + period_marker(period_marker), + reset_backoff(_reset_backoff), tn(_tn) { + *reset_backoff = false; + } + + ~RGWMetaSyncShardCR() override { + delete marker_tracker; + if (lease_cr) { + lease_cr->abort(); + } + } + + void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) { + delete marker_tracker; + marker_tracker = mt; + } + + int operate(const DoutPrefixProvider *dpp) override { + int r; + while (true) { + switch (sync_marker.state) { + case rgw_meta_sync_marker::FullSync: + r = full_sync(); + if (r < 0) { + ldpp_dout(dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl; + return set_cr_error(r); + } + return 0; + case rgw_meta_sync_marker::IncrementalSync: + r = incremental_sync(); + if (r < 0) { + ldpp_dout(dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl; + return set_cr_error(r); + } + return 0; + } + } + /* 
unreachable */ + return 0; + } + + void collect_children() + { + int child_ret; + RGWCoroutinesStack *child; + while (collect_next(&child_ret, &child)) { + auto iter = stack_to_pos.find(child); + if (iter == stack_to_pos.end()) { + /* some other stack that we don't care about */ + continue; + } + + string& pos = iter->second; + + if (child_ret < 0) { + ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl; + // on any error code from RGWMetaSyncSingleEntryCR, we do not advance + // the sync status marker past this entry, and set + // can_adjust_marker=false to exit out of RGWMetaSyncShardCR. + // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the + // previous marker and retry + can_adjust_marker = false; + } + + map::iterator prev_iter = pos_to_prev.find(pos); + ceph_assert(prev_iter != pos_to_prev.end()); + + if (pos_to_prev.size() == 1) { + if (can_adjust_marker) { + sync_marker.marker = pos; + } + pos_to_prev.erase(prev_iter); + } else { + ceph_assert(pos_to_prev.size() > 1); + pos_to_prev.erase(prev_iter); + prev_iter = pos_to_prev.begin(); + if (can_adjust_marker) { + sync_marker.marker = prev_iter->second; + } + } + + ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl; + stack_to_pos.erase(iter); + } + } + + int full_sync() { +#define OMAP_GET_MAX_ENTRIES 100 + int max_entries = OMAP_GET_MAX_ENTRIES; + reenter(&full_cr) { + set_status("full_sync"); + tn->log(10, "start full sync"); + oid = full_sync_index_shard_oid(shard_id); + can_adjust_marker = true; + /* grab lock */ + yield { + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + rgw::sal::RadosStore* store = sync_env->store; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + lock_name, lock_duration, this, nullptr)); + 
lease_stack.reset(spawn(lease_cr.get(), false)); + lost_lock = false; + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + drain_all(); + tn->log(5, "failed to take lease"); + return lease_cr->get_ret_status(); + } + set_sleeping(true); + yield; + } + tn->log(10, "took lease"); + + /* lock succeeded, a retry now should avoid previous backoff status */ + *reset_backoff = true; + + /* prepare marker tracker */ + set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env, + sync_env->shard_obj_name(shard_id), + sync_marker, tn)); + + marker = sync_marker.marker; + + total_entries = sync_marker.pos; + + /* sync! */ + do { + if (!lease_cr->is_locked()) { + tn->log(1, "lease is lost, abort"); + lost_lock = true; + break; + } + omapkeys = std::make_shared(); + yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid), + marker, max_entries, omapkeys)); + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl; + tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode)); + yield lease_cr->go_down(); + drain_all(); + return retcode; + } + entries = std::move(omapkeys->entries); + tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync")); + if (entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + iter = entries.begin(); + for (; iter != entries.end(); ++iter) { + marker = *iter; + tn->log(20, SSTR("full sync: " << marker)); + total_entries++; + if (!marker_tracker->start(marker, total_entries, real_time())) { + tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". 
Duplicate entry?")); + } else { + // fetch remote and write locally + yield { + RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false); + // stack_to_pos holds a reference to the stack + stack_to_pos[stack] = marker; + pos_to_prev[marker] = marker; + } + // limit spawn window + while (num_spawned() > static_cast(cct->_conf->rgw_meta_sync_spawn_window)) { + yield wait_for_child(); + collect_children(); + } + } + } + collect_children(); + } while (omapkeys->more && can_adjust_marker); + + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + + while (num_spawned() > 1) { + yield wait_for_child(); + collect_children(); + } + + if (!lost_lock) { + /* update marker to reflect we're done with full sync */ + if (can_adjust_marker) { + // apply updates to a temporary marker, or operate() will send us + // to incremental_sync() after we yield + temp_marker = sync_marker; + temp_marker->state = rgw_meta_sync_marker::IncrementalSync; + temp_marker->marker = std::move(temp_marker->next_step_marker); + temp_marker->next_step_marker.clear(); + temp_marker->realm_epoch = realm_epoch; + ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl; + + using WriteMarkerCR = RGWSimpleRadosWriteCR; + yield call(new WriteMarkerCR(sync_env->dpp, sync_env->store, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + *temp_marker)); + } + + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl; + yield lease_cr->go_down(); + drain_all(); + return retcode; + } + // clean up full sync index + yield { + auto oid = full_sync_index_shard_oid(shard_id); + call(new RGWRadosRemoveCR(sync_env->store, {pool, oid})); + } + } + + /* + * if we reached here, it means that lost_lock is true, otherwise the state + * change in the previous block will prevent us from 
reaching here + */ + + yield lease_cr->go_down(); + + lease_cr.reset(); + + drain_all(); + + if (!can_adjust_marker) { + return -EAGAIN; + } + + if (lost_lock) { + return -EBUSY; + } + + tn->log(10, "full sync complete"); + + // apply the sync marker update + ceph_assert(temp_marker); + sync_marker = std::move(*temp_marker); + temp_marker = boost::none; + // must not yield after this point! + } + return 0; + } + + + int incremental_sync() { + reenter(&incremental_cr) { + set_status("incremental_sync"); + tn->log(10, "start incremental sync"); + can_adjust_marker = true; + /* grab lock */ + if (!lease_cr) { /* could have had a lease_cr lock from previous state */ + yield { + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + rgw::sal::RadosStore* store = sync_env->store; + lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + lock_name, lock_duration, this, nullptr)); + lease_stack.reset(spawn(lease_cr.get(), false)); + lost_lock = false; + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + drain_all(); + tn->log(5, "failed to take lease"); + return lease_cr->get_ret_status(); + } + set_sleeping(true); + yield; + } + } + tn->log(10, "took lease"); + // if the period has advanced, we can't use the existing marker + if (sync_marker.realm_epoch < realm_epoch) { + ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker + << " from old realm_epoch=" << sync_marker.realm_epoch + << " (now " << realm_epoch << ')' << dendl; + sync_marker.realm_epoch = realm_epoch; + sync_marker.marker.clear(); + } + mdlog_marker = sync_marker.marker; + set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env, + sync_env->shard_obj_name(shard_id), + sync_marker, tn)); + + /* + * mdlog_marker: the remote sync marker positiion + * sync_marker: the local sync marker position + * max_marker: the max mdlog position that we fetched + * marker: 
the current position we try to sync + * period_marker: the last marker before the next period begins (optional) + */ + marker = max_marker = sync_marker.marker; + /* inc sync */ + do { + if (!lease_cr->is_locked()) { + lost_lock = true; + tn->log(1, "lease is lost, abort"); + break; + } +#define INCREMENTAL_MAX_ENTRIES 100 + ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << " truncated=" << truncated << dendl; + if (!period_marker.empty() && period_marker <= mdlog_marker) { + tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker)); + done_with_period = true; + break; + } + if (mdlog_marker <= max_marker || !truncated) { + /* we're at the tip, try to bring more entries */ + ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl; + yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog, + period, shard_id, + mdlog_marker, &mdlog_marker)); + } + if (retcode < 0) { + tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode)); + yield lease_cr->go_down(); + drain_all(); + *reset_backoff = false; // back off and try again later + return retcode; + } + truncated = true; + *reset_backoff = true; /* if we got to this point, all systems function */ + if (mdlog_marker > max_marker) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker)); + marker = max_marker; + yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id, + &max_marker, INCREMENTAL_MAX_ENTRIES, + &log_entries, &truncated)); + if (retcode < 0) { + tn->log(10, SSTR("failed to list mdlog entries, retcode=" << 
retcode)); + yield lease_cr->go_down(); + drain_all(); + *reset_backoff = false; // back off and try again later + return retcode; + } + for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) { + if (!period_marker.empty() && period_marker <= log_iter->id) { + done_with_period = true; + if (period_marker < log_iter->id) { + tn->log(10, SSTR("found key=" << log_iter->id + << " past period_marker=" << period_marker)); + break; + } + ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl; + // sync this entry, then return control to RGWMetaSyncCR + } + if (!mdlog_entry.convert_from(*log_iter)) { + tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry")); + continue; + } + tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp)); + if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" 
<< dendl; + } else { + raw_key = log_iter->section + ":" + log_iter->name; + yield { + RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false); + ceph_assert(stack); + // stack_to_pos holds a reference to the stack + stack_to_pos[stack] = log_iter->id; + pos_to_prev[log_iter->id] = marker; + } + // limit spawn window + while (num_spawned() > static_cast(cct->_conf->rgw_meta_sync_spawn_window)) { + yield wait_for_child(); + collect_children(); + } + } + marker = log_iter->id; + } + } + collect_children(); + ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl; + if (done_with_period) { + // return control to RGWMetaSyncCR and advance to the next period + tn->log(10, SSTR(*this << ": done with period")); + break; + } + if (mdlog_marker == max_marker && can_adjust_marker) { + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + yield wait(utime_t(cct->_conf->rgw_meta_sync_poll_interval, 0)); + } + } while (can_adjust_marker); + + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + + while (num_spawned() > 1) { + yield wait_for_child(); + collect_children(); + } + + yield lease_cr->go_down(); + + drain_all(); + + if (lost_lock) { + return -EBUSY; + } + + if (!can_adjust_marker) { + return -EAGAIN; + } + + return set_cr_done(); + } + /* TODO */ + return 0; + } +}; + +class RGWMetaSyncShardControlCR : public RGWBackoffControlCR +{ + RGWMetaSyncEnv *sync_env; + + const rgw_pool& pool; + const std::string& period; + epoch_t realm_epoch; + RGWMetadataLog* mdlog; + uint32_t shard_id; + rgw_meta_sync_marker sync_marker; + const std::string period_marker; + + RGWSyncTraceNodeRef tn; + + static constexpr bool exit_on_error = false; // retry on all errors +public: + RGWMetaSyncShardControlCR(RGWMetaSyncEnv 
*_sync_env, const rgw_pool& _pool, + const std::string& period, epoch_t realm_epoch, + RGWMetadataLog* mdlog, uint32_t _shard_id, + const rgw_meta_sync_marker& _marker, + std::string&& period_marker, + RGWSyncTraceNodeRef& _tn_parent) + : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env), + pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog), + shard_id(_shard_id), sync_marker(_marker), + period_marker(std::move(period_marker)) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", + std::to_string(shard_id)); + } + + RGWCoroutine *alloc_cr() override { + return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog, + shard_id, sync_marker, period_marker, backoff_ptr(), tn); + } + + RGWCoroutine *alloc_finisher_cr() override { + rgw::sal::RadosStore* store = sync_env->store; + return new RGWSimpleRadosReadCR(sync_env->dpp, store, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + &sync_marker); + } +}; + +class RGWMetaSyncCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + const rgw_pool& pool; + RGWPeriodHistory::Cursor cursor; //< sync position in period history + RGWPeriodHistory::Cursor next; //< next period in history + rgw_meta_sync_status sync_status; + RGWSyncTraceNodeRef tn; + + std::mutex mutex; //< protect access to shard_crs + + // TODO: it should be enough to hold a reference on the stack only, as calling + // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has + // already completed + using ControlCRRef = boost::intrusive_ptr; + using StackRef = boost::intrusive_ptr; + using RefPair = std::pair; + map shard_crs; + int ret{0}; + +public: + RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor, + const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + pool(sync_env->store->svc()->zone->get_zone_params().log_pool), + cursor(cursor), sync_status(_sync_status), tn(_tn) {} + 
+ ~RGWMetaSyncCR() { + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + // loop through one period at a time + tn->log(1, "start"); + for (;;) { + if (cursor == sync_env->store->svc()->mdlog->get_period_history()->get_current()) { + next = RGWPeriodHistory::Cursor{}; + if (cursor) { + ldpp_dout(dpp, 10) << "RGWMetaSyncCR on current period=" + << cursor.get_period().get_id() << dendl; + } else { + ldpp_dout(dpp, 10) << "RGWMetaSyncCR with no period" << dendl; + } + } else { + next = cursor; + next.next(); + ldpp_dout(dpp, 10) << "RGWMetaSyncCR on period=" + << cursor.get_period().get_id() << ", next=" + << next.get_period().get_id() << dendl; + } + + yield { + // get the mdlog for the current period (may be empty) + auto& period_id = sync_status.sync_info.period; + auto realm_epoch = sync_status.sync_info.realm_epoch; + auto mdlog = sync_env->store->svc()->mdlog->get_log(period_id); + + tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id)); + + // prevent wakeup() from accessing shard_crs while we're spawning them + std::lock_guard lock(mutex); + + // sync this period on each shard + for (const auto& m : sync_status.sync_markers) { + uint32_t shard_id = m.first; + auto& marker = m.second; + + std::string period_marker; + if (next) { + // read the maximum marker from the next period's sync status + period_marker = next.get_period().get_sync_status()[shard_id]; + if (period_marker.empty()) { + // no metadata changes have occurred on this shard, skip it + ldpp_dout(dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id + << " with empty period marker" << dendl; + continue; + } + } + + using ShardCR = RGWMetaSyncShardControlCR; + auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch, + mdlog, shard_id, marker, + std::move(period_marker), tn); + auto stack = spawn(cr, false); + shard_crs[shard_id] = RefPair{cr, stack}; + } + } + // wait for each shard to complete + while (ret == 0 && num_spawned() > 0) { + 
yield wait_for_child(); + collect(&ret, nullptr); + } + drain_all(); + { + // drop shard cr refs under lock + std::lock_guard lock(mutex); + shard_crs.clear(); + } + if (ret < 0) { + return set_cr_error(ret); + } + // advance to the next period + ceph_assert(next); + cursor = next; + + // write the updated sync info + sync_status.sync_info.period = cursor.get_period().get_id(); + sync_status.sync_info.realm_epoch = cursor.get_epoch(); + yield call(new RGWSimpleRadosWriteCR(dpp, sync_env->store, + rgw_raw_obj(pool, sync_env->status_oid()), + sync_status.sync_info)); + } + } + return 0; + } + + void wakeup(int shard_id) { + std::lock_guard lock(mutex); + auto iter = shard_crs.find(shard_id); + if (iter == shard_crs.end()) { + return; + } + iter->second.first->wakeup(); + } +}; + +void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) { + env->dpp = dpp; + env->cct = store->ctx(); + env->store = store; + env->conn = conn; + env->async_rados = async_rados; + env->http_manager = &http_manager; + env->error_logger = error_logger; + env->sync_tracer = store->getRados()->get_sync_tracer(); +} + +int RGWRemoteMetaLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status) +{ + if (store->svc()->zone->is_meta_master()) { + return 0; + } + // cannot run concurrently with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(store->ctx(), store->getRados()->get_cr_registry()); + RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWMetaSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + tn->log(20, "read sync status"); + ret = crs.run(dpp, new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status)); + http_manager.stop(); + return ret; +} + +int RGWRemoteMetaLog::init_sync_status(const DoutPrefixProvider *dpp) +{ + if 
(store->svc()->zone->is_meta_master()) { + return 0; + } + + rgw_mdlog_info mdlog_info; + int r = read_log_info(dpp, &mdlog_info); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl; + return r; + } + + rgw_meta_sync_info sync_info; + sync_info.num_shards = mdlog_info.num_shards; + auto cursor = store->svc()->mdlog->get_period_history()->get_current(); + if (cursor) { + sync_info.period = cursor.get_period().get_id(); + sync_info.realm_epoch = cursor.get_epoch(); + } + + return run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_info)); +} + +int RGWRemoteMetaLog::store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info) +{ + tn->log(20, "store sync info"); + return run(dpp, new RGWSimpleRadosWriteCR(dpp, store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.status_oid()), + sync_info)); +} + +// return a cursor to the period at our sync position +static RGWPeriodHistory::Cursor get_period_at(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + const rgw_meta_sync_info& info, + optional_yield y) +{ + if (info.period.empty()) { + // return an empty cursor with error=0 + return RGWPeriodHistory::Cursor{}; + } + + // look for an existing period in our history + auto cursor = store->svc()->mdlog->get_period_history()->lookup(info.realm_epoch); + if (cursor) { + // verify that the period ids match + auto& existing = cursor.get_period().get_id(); + if (existing != info.period) { + ldpp_dout(dpp, -1) << "ERROR: sync status period=" << info.period + << " does not match period=" << existing + << " in history at realm epoch=" << info.realm_epoch << dendl; + return RGWPeriodHistory::Cursor{-EEXIST}; + } + return cursor; + } + + // read the period from rados or pull it from the master + RGWPeriod period; + int r = store->svc()->mdlog->pull_period(dpp, info.period, period, y); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to read period id " + << 
info.period << ": " << cpp_strerror(r) << dendl; + return RGWPeriodHistory::Cursor{r}; + } + // attach the period to our history + cursor = store->svc()->mdlog->get_period_history()->attach(dpp, std::move(period), y); + if (!cursor) { + r = cursor.get_error(); + ldpp_dout(dpp, -1) << "ERROR: failed to read period history back to " + << info.period << ": " << cpp_strerror(r) << dendl; + } + return cursor; +} + +int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y) +{ + if (store->svc()->zone->is_meta_master()) { + return 0; + } + + int r = 0; + + // get shard count and oldest log period from master + rgw_mdlog_info mdlog_info; + for (;;) { + if (going_down) { + ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl; + return 0; + } + r = read_log_info(dpp, &mdlog_info); + if (r == -EIO || r == -ENOENT) { + // keep retrying if master isn't alive or hasn't initialized the log + ldpp_dout(dpp, 10) << __func__ << "(): waiting for master.." << dendl; + backoff.backoff_sleep(); + continue; + } + backoff.reset(); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl; + return r; + } + break; + } + + rgw_meta_sync_status sync_status; + do { + if (going_down) { + ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl; + return 0; + } + r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status)); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl; + return r; + } + + if (!mdlog_info.period.empty()) { + // restart sync if the remote has a period, but: + // a) our status does not, or + // b) our sync period comes before the remote's oldest log period + if (sync_status.sync_info.period.empty() || + sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) { + sync_status.sync_info.state = rgw_meta_sync_info::StateInit; + string reason; + if (sync_status.sync_info.period.empty()) { + reason = "period is empty"; + } else { + reason 
= SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch); + } + tn->log(1, "initialize sync (reason: " + reason + ")"); + ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch + << " in sync status comes before remote's oldest mdlog epoch=" + << mdlog_info.realm_epoch << ", restarting sync" << dendl; + } + } + + if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) { + ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl; + sync_status.sync_info.num_shards = mdlog_info.num_shards; + auto cursor = store->svc()->mdlog->get_period_history()->get_current(); + if (cursor) { + // run full sync, then start incremental from the current period/epoch + sync_status.sync_info.period = cursor.get_period().get_id(); + sync_status.sync_info.realm_epoch = cursor.get_epoch(); + } + r = run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info)); + if (r == -EBUSY) { + backoff.backoff_sleep(); + continue; + } + backoff.reset(); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl; + return r; + } + } + } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit); + + auto num_shards = sync_status.sync_info.num_shards; + if (num_shards != mdlog_info.num_shards) { + ldpp_dout(dpp, -1) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl; + return -EINVAL; + } + + RGWPeriodHistory::Cursor cursor; + do { + r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status)); + if (r < 0 && r != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r)); + return r; + } + + switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) { + case rgw_meta_sync_info::StateBuildingFullSyncMaps: + tn->log(20, "building full sync maps"); + r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn)); + if (r 
== -EBUSY || r == -EIO) { + backoff.backoff_sleep(); + continue; + } + backoff.reset(); + if (r < 0) { + tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")")); + return r; + } + + sync_status.sync_info.state = rgw_meta_sync_info::StateSync; + r = store_sync_info(dpp, sync_status.sync_info); + if (r < 0) { + tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")")); + return r; + } + /* fall through */ + case rgw_meta_sync_info::StateSync: + tn->log(20, "sync"); + // find our position in the period history (if any) + cursor = get_period_at(dpp, store, sync_status.sync_info, y); + r = cursor.get_error(); + if (r < 0) { + return r; + } + meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn); + r = run(dpp, meta_sync_cr); + if (r < 0) { + tn->log(0, "ERROR: failed to fetch all metadata keys"); + return r; + } + break; + default: + tn->log(0, "ERROR: bad sync state!"); + return -EIO; + } + } while (!going_down); + + return 0; +} + +void RGWRemoteMetaLog::wakeup(int shard_id) +{ + if (!meta_sync_cr) { + return; + } + meta_sync_cr->wakeup(shard_id); +} + +int RGWCloneMetaLogCoroutine::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + do { + yield { + ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl; + return state_init(); + } + yield { + ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl; + return state_read_shard_status(); + } + yield { + ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl; + return state_read_shard_status_complete(); + } + yield { + ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl; + return state_send_rest_request(dpp); + } + yield { + ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl; + return state_receive_rest_response(); + } + yield { + ldpp_dout(dpp, 
20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl; + return state_store_mdlog_entries(); + } + } while (truncated); + yield { + ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl; + return state_store_mdlog_entries_complete(); + } + } + + return 0; +} + +int RGWCloneMetaLogCoroutine::state_init() +{ + data = rgw_mdlog_shard_data(); + + return 0; +} + +int RGWCloneMetaLogCoroutine::state_read_shard_status() +{ + const bool add_ref = false; // default constructs with refs=1 + + completion.reset(new RGWMetadataLogInfoCompletion( + [this](int ret, const cls_log_header& header) { + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with " + << cpp_strerror(ret) << dendl; + } + } else { + shard_info.marker = header.max_marker; + shard_info.last_update = header.max_time.to_real_time(); + } + // wake up parent stack + io_complete(); + }), add_ref); + + int ret = mdlog->get_info_async(sync_env->dpp, shard_id, completion.get()); + if (ret < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl; + return set_cr_error(ret); + } + + return io_block(0); +} + +int RGWCloneMetaLogCoroutine::state_read_shard_status_complete() +{ + completion.reset(); + + ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl; + + marker = shard_info.marker; + + return 0; +} + +int RGWCloneMetaLogCoroutine::state_send_rest_request(const DoutPrefixProvider *dpp) +{ + RGWRESTConn *conn = sync_env->conn; + + char buf[32]; + snprintf(buf, sizeof(buf), "%d", shard_id); + + char max_entries_buf[32]; + snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries); + + const char *marker_key = (marker.empty() ? 
"" : "marker"); + + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "id", buf }, + { "period", period.c_str() }, + { "max-entries", max_entries_buf }, + { marker_key, marker.c_str() }, + { NULL, NULL } }; + + http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + http_op = NULL; + return set_cr_error(ret); + } + + return io_block(0); +} + +int RGWCloneMetaLogCoroutine::state_receive_rest_response() +{ + int ret = http_op->wait(&data, null_yield); + if (ret < 0) { + error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl; + ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl; + http_op->put(); + http_op = NULL; + return set_cr_error(ret); + } + http_op->put(); + http_op = NULL; + + ldpp_dout(sync_env->dpp, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl; + + truncated = ((int)data.entries.size() == max_entries); + + if (data.entries.empty()) { + if (new_marker) { + *new_marker = marker; + } + return set_cr_done(); + } + + if (new_marker) { + *new_marker = data.entries.back().id; + } + + return 0; +} + + +int RGWCloneMetaLogCoroutine::state_store_mdlog_entries() +{ + list dest_entries; + + vector::iterator iter; + for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) { + rgw_mdlog_entry& entry = *iter; + ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl; + + cls_log_entry dest_entry; + dest_entry.id = entry.id; + dest_entry.section = entry.section; + dest_entry.name = entry.name; + dest_entry.timestamp = utime_t(entry.timestamp); + + encode(entry.log_data, 
dest_entry.data); + + dest_entries.push_back(dest_entry); + + marker = entry.id; + } + + RGWAioCompletionNotifier *cn = stack->create_completion_notifier(); + + int ret = mdlog->store_entries_in_shard(sync_env->dpp, dest_entries, shard_id, cn->completion()); + if (ret < 0) { + cn->put(); + ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl; + return set_cr_error(ret); + } + return io_block(0); +} + +int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete() +{ + return set_cr_done(); +} + +void rgw_meta_sync_info::decode_json(JSONObj *obj) +{ + string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "init") { + state = StateInit; + } else if (s == "building-full-sync-maps") { + state = StateBuildingFullSyncMaps; + } else if (s == "sync") { + state = StateSync; + } + JSONDecoder::decode_json("num_shards", num_shards, obj); + JSONDecoder::decode_json("period", period, obj); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +void rgw_meta_sync_info::dump(Formatter *f) const +{ + string s; + switch ((SyncState)state) { + case StateInit: + s = "init"; + break; + case StateBuildingFullSyncMaps: + s = "building-full-sync-maps"; + break; + case StateSync: + s = "sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("num_shards", num_shards, f); + encode_json("period", period, f); + encode_json("realm_epoch", realm_epoch, f); +} + + +void rgw_meta_sync_marker::decode_json(JSONObj *obj) +{ + int s; + JSONDecoder::decode_json("state", s, obj); + state = s; + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("next_step_marker", next_step_marker, obj); + JSONDecoder::decode_json("total_entries", total_entries, obj); + JSONDecoder::decode_json("pos", pos, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); + JSONDecoder::decode_json("realm_epoch", realm_epoch, 
obj); +} + +void rgw_meta_sync_marker::dump(Formatter *f) const +{ + encode_json("state", (int)state, f); + encode_json("marker", marker, f); + encode_json("next_step_marker", next_step_marker, f); + encode_json("total_entries", total_entries, f); + encode_json("pos", pos, f); + encode_json("timestamp", utime_t(timestamp), f); + encode_json("realm_epoch", realm_epoch, f); +} + +void rgw_meta_sync_status::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("info", sync_info, obj); + JSONDecoder::decode_json("markers", sync_markers, obj); +} + +void rgw_meta_sync_status::dump(Formatter *f) const { + encode_json("info", sync_info, f); + encode_json("markers", sync_markers, f); +} + +void rgw_sync_error_info::dump(Formatter *f) const { + encode_json("source_zone", source_zone, f); + encode_json("error_code", error_code, f); + encode_json("message", message, f); +} + diff --git a/src/rgw/driver/rados/rgw_sync.h b/src/rgw/driver/rados/rgw_sync.h new file mode 100644 index 000000000..e6c255cc6 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync.h @@ -0,0 +1,547 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "include/stringify.h" + +#include "rgw_coroutine.h" +#include "rgw_http_client.h" +#include "rgw_metadata.h" +#include "rgw_meta_sync_status.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "rgw_sync_trace.h" +#include "rgw_mdlog.h" + +#define ERROR_LOGGER_SHARDS 32 +#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log" + +struct rgw_mdlog_info { + uint32_t num_shards; + std::string period; //< period id of the master's oldest metadata log + epoch_t realm_epoch; //< realm epoch of oldest metadata log + + rgw_mdlog_info() : num_shards(0), realm_epoch(0) {} + + void decode_json(JSONObj *obj); +}; + + +struct rgw_mdlog_entry { + std::string id; + std::string section; + std::string name; + ceph::real_time timestamp; + RGWMetadataLogData log_data; + + 
void decode_json(JSONObj *obj); + + bool convert_from(cls_log_entry& le) { + id = le.id; + section = le.section; + name = le.name; + timestamp = le.timestamp.to_real_time(); + try { + auto iter = le.data.cbegin(); + decode(log_data, iter); + } catch (buffer::error& err) { + return false; + } + return true; + } +}; + +struct rgw_mdlog_shard_data { + std::string marker; + bool truncated; + std::vector entries; + + void decode_json(JSONObj *obj); +}; + +class RGWAsyncRadosProcessor; +class RGWMetaSyncStatusManager; +class RGWMetaSyncCR; +class RGWRESTConn; +class RGWSyncTraceManager; + +class RGWSyncErrorLogger { + rgw::sal::RadosStore* store; + + std::vector oids; + int num_shards; + + std::atomic counter = { 0 }; +public: + RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const std::string &oid_prefix, int _num_shards); + RGWCoroutine *log_error_cr(const DoutPrefixProvider *dpp, const std::string& source_zone, const std::string& section, const std::string& name, uint32_t error_code, const std::string& message); + + static std::string get_shard_oid(const std::string& oid_prefix, int shard_id); +}; + +struct rgw_sync_error_info { + std::string source_zone; + uint32_t error_code; + std::string message; + + rgw_sync_error_info() : error_code(0) {} + rgw_sync_error_info(const std::string& _source_zone, uint32_t _error_code, const std::string& _message) : source_zone(_source_zone), error_code(_error_code), message(_message) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(source_zone, bl); + encode(error_code, bl); + encode(message, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(source_zone, bl); + decode(error_code, bl); + decode(message, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_sync_error_info) + +#define DEFAULT_BACKOFF_MAX 30 + +class RGWSyncBackoff { + int cur_wait; + int max_secs; + + void update_wait_time(); +public: + 
explicit RGWSyncBackoff(int _max_secs = DEFAULT_BACKOFF_MAX) : cur_wait(0), max_secs(_max_secs) {} + + void backoff_sleep(); + void reset() { + cur_wait = 0; + } + + void backoff(RGWCoroutine *op); +}; + +class RGWBackoffControlCR : public RGWCoroutine +{ + RGWCoroutine *cr; + ceph::mutex lock; + + RGWSyncBackoff backoff; + bool reset_backoff; + + bool exit_on_error; + +protected: + bool *backoff_ptr() { + return &reset_backoff; + } + + ceph::mutex& cr_lock() { + return lock; + } + + RGWCoroutine *get_cr() { + return cr; + } + +public: + RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error) + : RGWCoroutine(_cct), + cr(nullptr), + lock(ceph::make_mutex("RGWBackoffControlCR::lock:" + stringify(this))), + reset_backoff(false), exit_on_error(_exit_on_error) { + } + + ~RGWBackoffControlCR() override { + if (cr) { + cr->put(); + } + } + + virtual RGWCoroutine *alloc_cr() = 0; + virtual RGWCoroutine *alloc_finisher_cr() { return nullptr; } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +struct RGWMetaSyncEnv { + const DoutPrefixProvider *dpp{nullptr}; // null-initialized like every other pointer member of this struct + CephContext *cct{nullptr}; + rgw::sal::RadosStore* store{nullptr}; + RGWRESTConn *conn{nullptr}; + RGWAsyncRadosProcessor *async_rados{nullptr}; + RGWHTTPManager *http_manager{nullptr}; + RGWSyncErrorLogger *error_logger{nullptr}; + RGWSyncTraceManager *sync_tracer{nullptr}; + + RGWMetaSyncEnv() {} + + void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn, + RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager, + RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer); + + std::string shard_obj_name(int shard_id); + std::string status_oid(); +}; + +class RGWRemoteMetaLog : public RGWCoroutinesManager { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + RGWRESTConn *conn; + RGWAsyncRadosProcessor *async_rados; + + RGWHTTPManager http_manager; + RGWMetaSyncStatusManager *status_manager; + RGWSyncErrorLogger 
*error_logger{nullptr}; + RGWSyncTraceManager *sync_tracer{nullptr}; + + RGWMetaSyncCR *meta_sync_cr{nullptr}; + + RGWSyncBackoff backoff; + + RGWMetaSyncEnv sync_env; + + void init_sync_env(RGWMetaSyncEnv *env); + int store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info); + + std::atomic going_down = { false }; + + RGWSyncTraceNodeRef tn; + +public: + RGWRemoteMetaLog(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store, + RGWAsyncRadosProcessor *async_rados, + RGWMetaSyncStatusManager *_sm) + : RGWCoroutinesManager(_store->ctx(), _store->getRados()->get_cr_registry()), + dpp(dpp), store(_store), conn(NULL), async_rados(async_rados), + http_manager(store->ctx(), completion_mgr), + status_manager(_sm) {} + + virtual ~RGWRemoteMetaLog() override; + + int init(); + void finish(); + + int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info); + int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map *shards_info); + int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map shard_markers, std::map *result); + int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status); + int init_sync_status(const DoutPrefixProvider *dpp); + int run_sync(const DoutPrefixProvider *dpp, optional_yield y); + + void wakeup(int shard_id); + + RGWMetaSyncEnv& get_sync_env() { + return sync_env; + } +}; + +class RGWMetaSyncStatusManager : public DoutPrefixProvider { + rgw::sal::RadosStore* store; + librados::IoCtx ioctx; + + RGWRemoteMetaLog master_log; + + std::map shard_objs; + + struct utime_shard { + real_time ts; + int shard_id; + + utime_shard() : shard_id(-1) {} + + bool operator<(const utime_shard& rhs) const { + if (ts == rhs.ts) { + return shard_id < rhs.shard_id; + } + return ts < rhs.ts; + } + }; + + ceph::shared_mutex ts_to_shard_lock = ceph::make_shared_mutex("ts_to_shard_lock"); + std::map ts_to_shard; + 
std::vector clone_markers; + +public: + RGWMetaSyncStatusManager(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados) + : store(_store), master_log(this, store, async_rados, this) + {} + + virtual ~RGWMetaSyncStatusManager() override; + + int init(const DoutPrefixProvider *dpp); + + int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status) { + return master_log.read_sync_status(dpp, sync_status); + } + int init_sync_status(const DoutPrefixProvider *dpp) { return master_log.init_sync_status(dpp); } + int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info) { + return master_log.read_log_info(dpp, log_info); + } + int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map *shards_info) { + return master_log.read_master_log_shards_info(dpp, master_period, shards_info); + } + int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map shard_markers, std::map *result) { + return master_log.read_master_log_shards_next(dpp, period, shard_markers, result); + } + + int run(const DoutPrefixProvider *dpp, optional_yield y) { return master_log.run_sync(dpp, y); } + + + // implements DoutPrefixProvider + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + + void wakeup(int shard_id) { return master_log.wakeup(shard_id); } + void stop() { + master_log.finish(); + } +}; + +class RGWOrderCallCR : public RGWCoroutine +{ +public: + RGWOrderCallCR(CephContext *cct) : RGWCoroutine(cct) {} + + virtual void call_cr(RGWCoroutine *_cr) = 0; +}; + +class RGWLastCallerWinsCR : public RGWOrderCallCR +{ + RGWCoroutine *cr{nullptr}; + +public: + explicit RGWLastCallerWinsCR(CephContext *cct) : RGWOrderCallCR(cct) {} + ~RGWLastCallerWinsCR() { + if (cr) { + cr->put(); + } + } + + int operate(const DoutPrefixProvider *dpp) override; + + 
void call_cr(RGWCoroutine *_cr) override { + if (cr) { + cr->put(); + } + cr = _cr; + } +}; + +template +class RGWSyncShardMarkerTrack { + struct marker_entry { + uint64_t pos; + real_time timestamp; + + marker_entry() : pos(0) {} + marker_entry(uint64_t _p, const real_time& _ts) : pos(_p), timestamp(_ts) {} + }; + typename std::map pending; + + std::map finish_markers; + + int window_size; + int updates_since_flush; + + RGWOrderCallCR *order_cr{nullptr}; + +protected: + typename std::set need_retry_set; + + virtual RGWCoroutine *store_marker(const T& new_marker, uint64_t index_pos, const real_time& timestamp) = 0; + virtual RGWOrderCallCR *allocate_order_control_cr() = 0; + virtual void handle_finish(const T& marker) { } + +public: + RGWSyncShardMarkerTrack(int _window_size) : window_size(_window_size), updates_since_flush(0) {} + virtual ~RGWSyncShardMarkerTrack() { + if (order_cr) { + order_cr->put(); + } + } + + bool start(const T& pos, int index_pos, const real_time& timestamp) { + if (pending.find(pos) != pending.end()) { + return false; + } + pending[pos] = marker_entry(index_pos, timestamp); + return true; + } + + void try_update_high_marker(const T& pos, int index_pos, const real_time& timestamp) { + finish_markers[pos] = marker_entry(index_pos, timestamp); + } + + RGWCoroutine *finish(const T& pos) { + if (pending.empty()) { + /* can happen, due to a bug that ended up with multiple objects with the same name and version + * -- which can happen when versioning is enabled and the version is 'null'. 
+ */ + return NULL; + } + + typename std::map::iterator iter = pending.begin(); + + bool is_first = (pos == iter->first); + + typename std::map::iterator pos_iter = pending.find(pos); + if (pos_iter == pending.end()) { + /* see pending.empty() comment */ + return NULL; + } + + finish_markers[pos] = pos_iter->second; + + pending.erase(pos); + + handle_finish(pos); + + updates_since_flush++; + + if (is_first && (updates_since_flush >= window_size || pending.empty())) { + return flush(); + } + return NULL; + } + + RGWCoroutine *flush() { + if (finish_markers.empty()) { + return NULL; + } + + typename std::map::iterator i; + + if (pending.empty()) { + i = finish_markers.end(); + } else { + i = finish_markers.lower_bound(pending.begin()->first); + } + if (i == finish_markers.begin()) { + return NULL; + } + updates_since_flush = 0; + + auto last = i; + --i; + const T& high_marker = i->first; + marker_entry& high_entry = i->second; + RGWCoroutine *cr = order(store_marker(high_marker, high_entry.pos, high_entry.timestamp)); + finish_markers.erase(finish_markers.begin(), last); + return cr; + } + + /* + * a key needs retry if it was processing when another marker that points + * to the same bucket shards arrives. Instead of processing it, we mark + * it as need_retry so that when we finish processing the original, we + * retry the processing on the same bucket shard, in case there are more + * entries to process. This closes a race that can happen. 
+ */ + bool need_retry(const K& key) { + return (need_retry_set.find(key) != need_retry_set.end()); + } + + void set_need_retry(const K& key) { + need_retry_set.insert(key); + } + + void reset_need_retry(const K& key) { + need_retry_set.erase(key); + } + + RGWCoroutine *order(RGWCoroutine *cr) { + /* either returns a new order-control cr (from allocate_order_control_cr()), or updates the existing one, in which case it returns + * nothing and the existing one will call the cr + */ + if (order_cr && order_cr->is_done()) { + order_cr->put(); + order_cr = nullptr; + } + if (!order_cr) { + order_cr = allocate_order_control_cr(); + order_cr->get(); + order_cr->call_cr(cr); + return order_cr; + } + order_cr->call_cr(cr); + return nullptr; /* don't call it a second time */ + } +}; + +class RGWMetaSyncShardMarkerTrack; + +class RGWMetaSyncSingleEntryCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + std::string raw_key; + std::string entry_marker; + RGWMDLogStatus op_status; + + ssize_t pos; + std::string section; + std::string key; + + int sync_status; + + bufferlist md_bl; + + RGWMetaSyncShardMarkerTrack *marker_tracker; + + int tries; + + bool error_injection; + + RGWSyncTraceNodeRef tn; + +public: + RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env, + const std::string& _raw_key, const std::string& _entry_marker, + const RGWMDLogStatus& _op_status, + RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent); + + int operate(const DoutPrefixProvider *dpp) override; +}; + +class RGWShardCollectCR : public RGWCoroutine { + int current_running = 0; + protected: + int max_concurrent; + int status = 0; + + // called with the result of each child. error codes can be ignored by + // returning 0. if handle_result() returns a negative value, it's + // treated as an error and stored in 'status'. 
the last such error is + // reported to the caller with set_cr_error() + virtual int handle_result(int r) = 0; + public: + RGWShardCollectCR(CephContext *_cct, int _max_concurrent) + : RGWCoroutine(_cct), max_concurrent(_max_concurrent) + {} + + virtual bool spawn_next() = 0; + int operate(const DoutPrefixProvider *dpp) override; +}; + +// factory functions for meta sync coroutines needed in mdlog trimming + +RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env, + const std::string& period, + int shard_id, + RGWMetadataLogInfo* info); + +RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env, + const std::string& period, + int shard_id, + const std::string& marker, + uint32_t max_entries, + rgw_mdlog_shard_data *result); + diff --git a/src/rgw/driver/rados/rgw_sync_counters.cc b/src/rgw/driver/rados/rgw_sync_counters.cc new file mode 100644 index 000000000..1d23d58dc --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_counters.cc @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/ceph_context.h" +#include "rgw_sync_counters.h" + +namespace sync_counters { + +PerfCountersRef build(CephContext *cct, const std::string& name) +{ + PerfCountersBuilder b(cct, name, l_first, l_last); + + // share these counters with ceph-mgr + b.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + b.add_u64_avg(l_fetch, "fetch_bytes", "Number of object bytes replicated"); + b.add_u64_counter(l_fetch_not_modified, "fetch_not_modified", "Number of objects already replicated"); + b.add_u64_counter(l_fetch_err, "fetch_errors", "Number of object replication errors"); + + b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests"); + b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors"); + + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; + cct->get_perfcounters_collection()->add(logger.get()); 
+ return logger; +} + +} // namespace sync_counters diff --git a/src/rgw/driver/rados/rgw_sync_counters.h b/src/rgw/driver/rados/rgw_sync_counters.h new file mode 100644 index 000000000..df3acc680 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_counters.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/perf_counters_collection.h" + +namespace sync_counters { + +enum { + l_first = 805000, + + l_fetch, + l_fetch_not_modified, + l_fetch_err, + + l_poll, + l_poll_err, + + l_last, +}; + +PerfCountersRef build(CephContext *cct, const std::string& name); + +} // namespace sync_counters diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.cc b/src/rgw/driver/rados/rgw_sync_error_repo.cc new file mode 100644 index 000000000..44305b60b --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_error_repo.cc @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "rgw_sync_error_repo.h" +#include "rgw_coroutine.h" +#include "rgw_sal.h" +#include "services/svc_rados.h" +#include "cls/cmpomap/client.h" + +namespace rgw::error_repo { + +// prefix for the binary encoding of keys. 
this particular value is not +// valid as the first byte of a utf8 code point, so we use this to +// differentiate the binary encoding from existing string keys for +// backward-compatibility +constexpr uint8_t binary_key_prefix = 0x80; + +struct key_type { + rgw_bucket_shard bs; + std::optional gen; +}; + +void encode(const key_type& k, bufferlist& bl, uint64_t f=0) +{ + ENCODE_START(1, 1, bl); + encode(k.bs, bl); + encode(k.gen, bl); + ENCODE_FINISH(bl); +} + +void decode(key_type& k, bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(k.bs, bl); + decode(k.gen, bl); + DECODE_FINISH(bl); +} + +std::string encode_key(const rgw_bucket_shard& bs, + std::optional gen) +{ + using ceph::encode; + const auto key = key_type{bs, gen}; + bufferlist bl; + encode(binary_key_prefix, bl); + encode(key, bl); + return bl.to_str(); +} + +int decode_key(std::string encoded, + rgw_bucket_shard& bs, + std::optional& gen) +{ + using ceph::decode; + key_type key; + const auto bl = bufferlist::static_from_string(encoded); + auto p = bl.cbegin(); + try { + uint8_t prefix; + decode(prefix, p); + if (prefix != binary_key_prefix) { + return -EINVAL; + } + decode(key, p); + } catch (const buffer::error&) { + return -EIO; + } + if (!p.end()) { + return -EIO; // buffer contained unexpected bytes + } + bs = std::move(key.bs); + gen = key.gen; + return 0; +} + +ceph::real_time decode_value(const bufferlist& bl) +{ + uint64_t value; + try { + using ceph::decode; + decode(value, bl); + } catch (const buffer::error&) { + value = 0; // empty buffer = 0 + } + return ceph::real_clock::zero() + ceph::timespan(value); +} + +int write(librados::ObjectWriteOperation& op, + const std::string& key, + ceph::real_time timestamp) +{ + // overwrite the existing timestamp if value is greater + const uint64_t value = timestamp.time_since_epoch().count(); + using namespace ::cls::cmpomap; + const bufferlist zero = u64_buffer(0); // compare against 0 for missing keys + return cmp_set_vals(op, 
Mode::U64, Op::GT, {{key, u64_buffer(value)}}, zero); +} + +int remove(librados::ObjectWriteOperation& op, + const std::string& key, + ceph::real_time timestamp) +{ + // remove the omap key if value >= existing + const uint64_t value = timestamp.time_since_epoch().count(); + using namespace ::cls::cmpomap; + return cmp_rm_keys(op, Mode::U64, Op::GTE, {{key, u64_buffer(value)}}); +} + +class RGWErrorRepoWriteCR : public RGWSimpleCoroutine { + RGWSI_RADOS::Obj obj; + std::string key; + ceph::real_time timestamp; + + boost::intrusive_ptr cn; + public: + RGWErrorRepoWriteCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj, + const std::string& key, ceph::real_time timestamp) + : RGWSimpleCoroutine(rados->ctx()), + obj(rados->obj(raw_obj)), + key(key), timestamp(timestamp) + {} + + int send_request(const DoutPrefixProvider *dpp) override { + librados::ObjectWriteOperation op; + int r = write(op, key, timestamp); + if (r < 0) { + return r; + } + r = obj.open(dpp); + if (r < 0) { + return r; + } + + cn = stack->create_completion_notifier(); + return obj.aio_operate(cn->completion(), &op); + } + + int request_complete() override { + return cn->completion()->get_return_value(); + } +}; + +RGWCoroutine* write_cr(RGWSI_RADOS* rados, + const rgw_raw_obj& obj, + const std::string& key, + ceph::real_time timestamp) +{ + return new RGWErrorRepoWriteCR(rados, obj, key, timestamp); +} + + +class RGWErrorRepoRemoveCR : public RGWSimpleCoroutine { + RGWSI_RADOS::Obj obj; + std::string key; + ceph::real_time timestamp; + + boost::intrusive_ptr cn; + public: + RGWErrorRepoRemoveCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj, + const std::string& key, ceph::real_time timestamp) + : RGWSimpleCoroutine(rados->ctx()), + obj(rados->obj(raw_obj)), + key(key), timestamp(timestamp) + {} + + int send_request(const DoutPrefixProvider *dpp) override { + librados::ObjectWriteOperation op; + int r = remove(op, key, timestamp); + if (r < 0) { + return r; + } + r = obj.open(dpp); + if (r < 0) { + 
return r; + } + + cn = stack->create_completion_notifier(); + return obj.aio_operate(cn->completion(), &op); + } + + int request_complete() override { + return cn->completion()->get_return_value(); + } +}; + +RGWCoroutine* remove_cr(RGWSI_RADOS* rados, + const rgw_raw_obj& obj, + const std::string& key, + ceph::real_time timestamp) +{ + return new RGWErrorRepoRemoveCR(rados, obj, key, timestamp); +} + +} // namespace rgw::error_repo diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.h b/src/rgw/driver/rados/rgw_sync_error_repo.h new file mode 100644 index 000000000..60525d281 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_error_repo.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#pragma once + +#include +#include "include/rados/librados_fwd.hpp" +#include "include/buffer_fwd.h" +#include "common/ceph_time.h" + +class RGWSI_RADOS; +class RGWCoroutine; +struct rgw_raw_obj; +struct rgw_bucket_shard; + +namespace rgw::error_repo { + +// binary-encode a bucket/shard/gen and return it as a string +std::string encode_key(const rgw_bucket_shard& bs, + std::optional gen); + +// try to decode a key. 
returns -EINVAL if not in binary format +int decode_key(std::string encoded, + rgw_bucket_shard& bs, + std::optional& gen); + +// decode a timestamp as a uint64_t for CMPXATTR_MODE_U64 +ceph::real_time decode_value(const ceph::bufferlist& bl); + +// write an omap key iff the given timestamp is newer +int write(librados::ObjectWriteOperation& op, + const std::string& key, + ceph::real_time timestamp); +RGWCoroutine* write_cr(RGWSI_RADOS* rados, + const rgw_raw_obj& obj, + const std::string& key, + ceph::real_time timestamp); + +// remove an omap key iff there isn't a newer timestamp +int remove(librados::ObjectWriteOperation& op, + const std::string& key, + ceph::real_time timestamp); +RGWCoroutine* remove_cr(RGWSI_RADOS* rados, + const rgw_raw_obj& obj, + const std::string& key, + ceph::real_time timestamp); + +} // namespace rgw::error_repo diff --git a/src/rgw/driver/rados/rgw_sync_module.cc b/src/rgw/driver/rados/rgw_sync_module.cc new file mode 100644 index 000000000..5a1e70be3 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module.cc @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_bucket.h" + +#include "rgw_sync_module_log.h" +#include "rgw_sync_module_es.h" +#include "rgw_sync_module_aws.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +RGWMetadataHandler *RGWSyncModuleInstance::alloc_bucket_meta_handler() +{ + return RGWBucketMetaHandlerAllocator::alloc(); +} + +RGWBucketInstanceMetadataHandlerBase* RGWSyncModuleInstance::alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver) +{ + return RGWBucketInstanceMetaHandlerAllocator::alloc(driver); +} + +RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + 
src_bucket(_src_bucket), key(_key) { +} + +RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc, + rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + src_bucket(_src_bucket), key(_key) { +} + +int RGWCallStatRemoteObjCR::operate(const DoutPrefixProvider *dpp) { + reenter(this) { + yield { + call(new RGWStatRemoteObjCR(sync_env->async_rados, sync_env->driver, + sc->source_zone, + src_bucket, key, &mtime, &size, &etag, &attrs, &headers)); + } + if (retcode < 0) { + ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() returned " << retcode << dendl; + return set_cr_error(retcode); + } + ldpp_dout(dpp, 20) << "stat of remote obj: z=" << sc->source_zone + << " b=" << src_bucket << " k=" << key + << " size=" << size << " mtime=" << mtime << dendl; + yield { + RGWStatRemoteObjCBCR *cb = allocate_callback(); + if (cb) { + cb->set_result(mtime, size, etag, std::move(attrs), std::move(headers)); + call(cb); + } + } + if (retcode < 0) { + ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() callback returned " << retcode << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager) +{ + RGWSyncModuleRef default_module(std::make_shared()); + modules_manager->register_module("rgw", default_module, true); + + RGWSyncModuleRef archive_module(std::make_shared()); + modules_manager->register_module("archive", archive_module); + + RGWSyncModuleRef log_module(std::make_shared()); + modules_manager->register_module("log", log_module); + + RGWSyncModuleRef es_module(std::make_shared()); + modules_manager->register_module("elasticsearch", es_module); + + RGWSyncModuleRef aws_module(std::make_shared()); + modules_manager->register_module("cloud", aws_module); +} diff --git a/src/rgw/driver/rados/rgw_sync_module.h b/src/rgw/driver/rados/rgw_sync_module.h new file mode 100644 index 000000000..38abb3d1a --- /dev/null +++ 
b/src/rgw/driver/rados/rgw_sync_module.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" +#include "rgw_coroutine.h" + +class RGWBucketInfo; +class RGWRemoteDataLog; +struct RGWDataSyncCtx; +struct RGWDataSyncEnv; +struct rgw_bucket_entry_owner; +struct rgw_obj_key; +struct rgw_bucket_sync_pipe; + + +class RGWDataSyncModule { +public: + RGWDataSyncModule() {} + virtual ~RGWDataSyncModule() {} + + virtual void init(RGWDataSyncCtx *sync_env, uint64_t instance_id) {} + + virtual RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) { + return nullptr; + } + + virtual RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) { + return nullptr; + } + virtual RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, + rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + std::optional versioned_epoch, + const rgw_zone_set_entry& my_trace_entry, + rgw_zone_set *zones_trace) = 0; + virtual RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime, + bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0; + virtual RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0; +}; + +class RGWRESTMgr; +class RGWMetadataHandler; +class RGWBucketInstanceMetadataHandlerBase; + +class RGWSyncModuleInstance { +public: + RGWSyncModuleInstance() {} + virtual ~RGWSyncModuleInstance() {} + virtual RGWDataSyncModule *get_data_handler() = 0; + virtual RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) { + return orig; + } + virtual bool supports_user_writes() { + return false; + } + virtual 
RGWMetadataHandler *alloc_bucket_meta_handler(); + virtual RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver); + + // indication whether the sync module start with full sync (default behavior) + // incremental sync would follow anyway + virtual bool should_full_sync() const { + return true; + } +}; + +typedef std::shared_ptr RGWSyncModuleInstanceRef; + +class JSONFormattable; + +class RGWSyncModule { + +public: + RGWSyncModule() {} + virtual ~RGWSyncModule() {} + + virtual bool supports_writes() { + return false; + } + virtual bool supports_data_export() = 0; + virtual int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) = 0; +}; + +typedef std::shared_ptr RGWSyncModuleRef; + + +class RGWSyncModulesManager { + ceph::mutex lock = ceph::make_mutex("RGWSyncModulesManager"); + + std::map modules; +public: + RGWSyncModulesManager() = default; + + void register_module(const std::string& name, RGWSyncModuleRef& module, bool is_default = false) { + std::lock_guard l{lock}; + modules[name] = module; + if (is_default) { + modules[std::string()] = module; + } + } + + bool get_module(const std::string& name, RGWSyncModuleRef *module) { + std::lock_guard l{lock}; + auto iter = modules.find(name); + if (iter == modules.end()) { + return false; + } + if (module != nullptr) { + *module = iter->second; + } + return true; + } + + + bool supports_data_export(const std::string& name) { + RGWSyncModuleRef module; + if (!get_module(name, &module)) { + return false; + } + + return module->supports_data_export(); + } + + int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const std::string& name, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + RGWSyncModuleRef module; + if (!get_module(name, &module)) { + return -ENOENT; + } + + return module.get()->create_instance(dpp, cct, config, instance); + } + + std::vector 
get_registered_module_names() const { + std::vector names; + for (auto& i: modules) { + if (!i.first.empty()) { + names.push_back(i.first); + } + } + return names; + } +}; + +class RGWStatRemoteObjCBCR : public RGWCoroutine { +protected: + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + rgw_bucket src_bucket; + rgw_obj_key key; + + ceph::real_time mtime; + uint64_t size = 0; + std::string etag; + std::map attrs; + std::map headers; +public: + RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket& _src_bucket, rgw_obj_key& _key); + ~RGWStatRemoteObjCBCR() override {} + + void set_result(ceph::real_time& _mtime, + uint64_t _size, + const std::string& _etag, + std::map&& _attrs, + std::map&& _headers) { + mtime = _mtime; + size = _size; + etag = _etag; + attrs = std::move(_attrs); + headers = std::move(_headers); + } +}; + +class RGWCallStatRemoteObjCR : public RGWCoroutine { + ceph::real_time mtime; + uint64_t size{0}; + std::string etag; + std::map attrs; + std::map headers; + +protected: + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + + rgw_bucket src_bucket; + rgw_obj_key key; + +public: + RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc, + rgw_bucket& _src_bucket, rgw_obj_key& _key); + + ~RGWCallStatRemoteObjCR() override {} + + int operate(const DoutPrefixProvider *dpp) override; + + virtual RGWStatRemoteObjCBCR *allocate_callback() { + return nullptr; + } +}; + +void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager); diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.cc b/src/rgw/driver/rados/rgw_sync_module_aws.cc new file mode 100644 index 000000000..cefcd9dd1 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_aws.cc @@ -0,0 +1,1823 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" + +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_aws.h" +#include 
"rgw_cr_rados.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rest.h" +#include "rgw_acl.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include + +#define dout_subsys ceph_subsys_rgw + + +#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024) + +using namespace std; + +static string default_target_path = "rgw-${zonegroup}-${sid}/${bucket}"; + +static string get_key_oid(const rgw_obj_key& key) +{ + string oid = key.name; + if (!key.instance.empty() && + !key.have_null_instance()) { + oid += string(":") + key.instance; + } + return oid; +} + +static string obj_to_aws_path(const rgw_obj& obj) +{ + return obj.bucket.name + "/" + get_key_oid(obj.key); +} + +/* + + json configuration definition: + + { + "connection": { + "access_key": <access_key>, + "secret": <secret>, + "endpoint": <endpoint>, + "host_style": <path | virtual>, + }, + "acls": [ { "type": <id | email | uri>, + "source_id": <source_id>, + "dest_id": <dest_id> } ... ], # optional, acl mappings, no mappings if does not exist + "target_path": <target_path>, # override default + + + # anything below here is for non trivial configuration + # can be used in conjunction with the above + + "default": { + "connection": { + "access_key": <access_key>, + "secret": <secret>, + "endpoint": <endpoint>, + "host_style": <path | virtual>, + }, + "acls": [ # list of source uids and how they map into destination uids in the dest objects acls + { + "type" : <id | email | uri>, # optional, default is id + "source_id": <source_id>, + "dest_id": <dest_id> + } ... ] + "target_path": "rgwx-${sid}/${bucket}" # how a bucket name is mapped to destination path, + # final object name will be target_path + "/" + obj + }, + "connections": [ + { + "id": <connection_id>, + "access_key": <access_key>, + "secret": <secret>, + "endpoint": <endpoint>, + } ... ], + "acl_profiles": [ + { + "id": <acls_id>, # acl mappings + "acls": [ { + "type": <id | email | uri>, + "source_id": <source_id>, + "dest_id": <dest_id> + } ... 
] + } + ], + "profiles": [ + { + "source_bucket": , # can specify either specific bucket name (foo), or prefix (foo*) + "target_path": , # (override default) + "connection_id": , # optional, if empty references default connection + "acls_id": , # optional, if empty references default mappings + } ... ], + } + +target path optional variables: + +(evaluated at init) +sid: sync instance id, randomly generated by sync process on first sync initalization +zonegroup: zonegroup name +zonegroup_id: zonegroup name +zone: zone name +zone_id: zone name + +(evaluated when syncing) +bucket: bucket name +owner: bucket owner + +*/ + +struct ACLMapping { + ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER}; + string source_id; + string dest_id; + + ACLMapping() = default; + + ACLMapping(ACLGranteeTypeEnum t, + const string& s, + const string& d) : type(t), + source_id(s), + dest_id(d) {} + + void init(const JSONFormattable& config) { + const string& t = config["type"]; + + if (t == "email") { + type = ACL_TYPE_EMAIL_USER; + } else if (t == "uri") { + type = ACL_TYPE_GROUP; + } else { + type = ACL_TYPE_CANON_USER; + } + + source_id = config["source_id"]; + dest_id = config["dest_id"]; + } + + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection os(jf, "acl_mapping"); + string s; + switch (type) { + case ACL_TYPE_EMAIL_USER: + s = "email"; + break; + case ACL_TYPE_GROUP: + s = "uri"; + break; + default: + s = "id"; + break; + } + encode_json("type", s, &jf); + encode_json("source_id", source_id, &jf); + encode_json("dest_id", dest_id, &jf); + } +}; + +struct ACLMappings { + map acl_mappings; + + void init(const JSONFormattable& config) { + for (auto& c : config.array()) { + ACLMapping m; + m.init(c); + + acl_mappings.emplace(std::make_pair(m.source_id, m)); + } + } + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ArraySection os(jf, "acls"); + + for (auto& i : acl_mappings) { + i.second.dump_conf(cct, jf); + } + } +}; + +struct 
AWSSyncConfig_ACLProfiles { + /* named ACL mapping sets, keyed by the id field of each configured profile */ map > acl_profiles; + + void init(const JSONFormattable& config) { + for (auto& c : config.array()) { + const string& profile_id = c["id"]; + + std::shared_ptr ap{new ACLMappings}; + ap->init(c["acls"]); + + /* a later entry with the same id replaces the earlier one */ acl_profiles[profile_id] = ap; + } + } + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ArraySection section(jf, "acl_profiles"); + + for (auto& p : acl_profiles) { + Formatter::ObjectSection section(jf, "profile"); + encode_json("id", p.first, &jf); + p.second->dump_conf(cct, jf); + } + } + + /* Copies the mappings stored under profile_id into *result. Returns false and leaves *result untouched when the id is not configured. */ bool find(const string& profile_id, ACLMappings *result) const { + auto iter = acl_profiles.find(profile_id); + if (iter == acl_profiles.end()) { + return false; + } + *result = *iter->second; + return true; + } +}; + +/* Settings of one remote endpoint. init() records in the has_* flags which of endpoint, key and host_style were present in the config. */ struct AWSSyncConfig_Connection { + string connection_id; + string endpoint; + RGWAccessKey key; + std::optional region; + HostStyle host_style{PathStyle}; + + bool has_endpoint{false}; + bool has_key{false}; + bool has_host_style{false}; + + void init(const JSONFormattable& config) { + has_endpoint = config.exists("endpoint"); + /* either half of the key pair counts as a configured key */ has_key = config.exists("access_key") || config.exists("secret"); + has_host_style = config.exists("host_style"); + + connection_id = config["id"]; + endpoint = config["endpoint"]; + + key = RGWAccessKey(config["access_key"], config["secret"]); + + if (config.exists("region")) { + region = config["region"]; + } else { + region.reset(); + } + + /* only the exact string virtual selects VirtualStyle; every other value means PathStyle */ string host_style_str = config["host_style"]; + if (host_style_str != "virtual") { + host_style = PathStyle; + } else { + host_style = VirtualStyle; + } + } + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection section(jf, "connection"); + encode_json("id", connection_id, &jf); + encode_json("endpoint", endpoint, &jf); + string s = (host_style == PathStyle ?
"path" : "virtual"); + encode_json("region", region, &jf); + encode_json("host_style", s, &jf); + + { + Formatter::ObjectSection os(jf, "key"); + encode_json("access_key", key.id, &jf); + string secret = (key.key.empty() ? "" : "******"); + encode_json("secret", secret, &jf); + } + } +}; + +static int conf_to_uint64(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, const string& key, uint64_t *pval) +{ + string sval; + if (config.find(key, &sval)) { + string err; + uint64_t val = strict_strtoll(sval.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(dpp, 0) << "ERROR: could not parse configurable value for cloud sync module: " << key << ": " << sval << dendl; + return -EINVAL; + } + *pval = val; + } + return 0; +} + +struct AWSSyncConfig_S3 { + uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE}; + uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE}; + + int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) { + int r = conf_to_uint64(dpp, cct, config, "multipart_sync_threshold", &multipart_sync_threshold); + if (r < 0) { + return r; + } + + r = conf_to_uint64(dpp, cct, config, "multipart_min_part_size", &multipart_min_part_size); + if (r < 0) { + return r; + } +#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024) + if (multipart_min_part_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) { + multipart_min_part_size = MULTIPART_MIN_POSSIBLE_PART_SIZE; + } + return 0; + } + + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection section(jf, "s3"); + encode_json("multipart_sync_threshold", multipart_sync_threshold, &jf); + encode_json("multipart_min_part_size", multipart_min_part_size, &jf); + } +}; + +struct AWSSyncConfig_Profile { + string source_bucket; + bool prefix{false}; + string target_path; + string connection_id; + string acls_id; + + std::shared_ptr conn_conf; + std::shared_ptr acls; + + std::shared_ptr conn; + + void init(const 
JSONFormattable& config) { + source_bucket = config["source_bucket"]; + + /* a trailing '*' marks a prefix match and is stripped from the stored name */ + prefix = (!source_bucket.empty() && source_bucket[source_bucket.size() - 1] == '*'); + + if (prefix) { + source_bucket = source_bucket.substr(0, source_bucket.size() - 1); + } + + target_path = config["target_path"]; + connection_id = config["connection_id"]; + acls_id = config["acls_id"]; + + if (config.exists("connection")) { + conn_conf = make_shared(); + conn_conf->init(config["connection"]); + } + + if (config.exists("acls")) { + acls = make_shared(); + acls->init(config["acls"]); + } + } + + void dump_conf(CephContext *cct, JSONFormatter& jf, const char *section = "config") const { + Formatter::ObjectSection config(jf, section); + string sb{source_bucket}; + if (prefix) { + sb.append("*"); + } + encode_json("source_bucket", sb, &jf); + encode_json("target_path", target_path, &jf); + encode_json("connection_id", connection_id, &jf); + encode_json("acls_id", acls_id, &jf); + if (conn_conf.get()) { + conn_conf->dump_conf(cct, jf); + } + if (acls.get()) { + acls->dump_conf(cct, jf); + } + } +}; + +/* + * Writes to *dest a copy of src in which every occurrence of find is replaced by replace. + * Replaced text is not scanned again. An empty find leaves the text unchanged. + * src may refer to the same string as *dest because src is copied first. + */ +static void find_and_replace(const string& src, const string& find, const string& replace, string *dest) +{ + string s = src; + + /* an empty pattern matches at every position and would never terminate */ + if (!find.empty()) { + size_t pos = s.find(find); + while (pos != string::npos) { + s.replace(pos, find.size(), replace); + /* resume right after the inserted text; resuming at pos + find.size() skipped later matches whenever replace is shorter than find */ + pos = s.find(find, pos + replace.size()); + } + } + + *dest = s; +} + +/* Replaces every ${param} placeholder in src with val and stores the result in *dest. */ +static void apply_meta_param(const string& src, const string& param, const string& val, string *dest) +{ + string s = string("${") + param + "}"; + find_and_replace(src, s, val, dest); +} + + +struct AWSSyncConfig { + AWSSyncConfig_Profile default_profile; + std::shared_ptr root_profile; + + map > connections; + AWSSyncConfig_ACLProfiles acl_profiles; + + map > explicit_profiles; + + AWSSyncConfig_S3 s3; + + int init_profile(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, AWSSyncConfig_Profile& profile, + bool
connection_must_exist) { + /* Step 1: pick the connection. A connection_id must name an entry of connections and may not be combined with an inline connection; when neither is given, the connection_id of the default profile is tried. */ if (!profile.connection_id.empty()) { + if (profile.conn_conf) { + ldpp_dout(dpp, 0) << "ERROR: ambiguous profile connection configuration, connection_id=" << profile.connection_id << dendl; + return -EINVAL; + } + if (connections.find(profile.connection_id) == connections.end()) { + ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent connection_id=" << profile.connection_id << dendl; + return -EINVAL; + } + profile.conn_conf = connections[profile.connection_id]; + } else if (!profile.conn_conf) { + profile.connection_id = default_profile.connection_id; + auto i = connections.find(profile.connection_id); + if (i != connections.end()) { + profile.conn_conf = i->second; + } + } + + if (connection_must_exist && !profile.conn_conf) { + ldpp_dout(dpp, 0) << "ERROR: remote connection undefined for sync profile" << dendl; + return -EINVAL; + } + + /* Step 2: fields the connection did not set itself are inherited from the connection of the default profile. The writes go through the shared pointer, so a connection object shared with other profiles is modified as well. */ if (profile.conn_conf && default_profile.conn_conf) { + if (!profile.conn_conf->has_endpoint) { + profile.conn_conf->endpoint = default_profile.conn_conf->endpoint; + } + if (!profile.conn_conf->has_host_style) { + profile.conn_conf->host_style = default_profile.conn_conf->host_style; + } + if (!profile.conn_conf->has_key) { + profile.conn_conf->key = default_profile.conn_conf->key; + } + } + + /* Step 3: pick the ACL mappings. acl_mappings only receives the copy made by find() and is not used afterwards; the profile keeps the shared pointer. */ ACLMappings acl_mappings; + + if (!profile.acls_id.empty()) { + if (!acl_profiles.find(profile.acls_id, &acl_mappings)) { + ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent acls id=" << profile.acls_id << dendl; + return -EINVAL; + } + profile.acls = acl_profiles.acl_profiles[profile.acls_id]; + } else if (!profile.acls) { + if (default_profile.acls) { + profile.acls = default_profile.acls; + profile.acls_id = default_profile.acls_id; + } + } + + /* Step 4: an empty target path falls back to the default profile, then to default_target_path. */ if (profile.target_path.empty()) { + profile.target_path = default_profile.target_path; + } + if (profile.target_path.empty()) { + profile.target_path = default_target_path; + } + + return 0; + } + + int init_target(const
DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, std::shared_ptr *ptarget) { + /* Builds a profile from profile_conf, validates it with init_profile() (a connection is mandatory), and registers it under its source_bucket. An existing entry with the same key is replaced after a warning is logged. */ std::shared_ptr profile; + profile.reset(new AWSSyncConfig_Profile); + profile->init(profile_conf); + + int ret = init_profile(dpp, cct, profile_conf, *profile, true); + if (ret < 0) { + return ret; + } + + auto& sb = profile->source_bucket; + + if (explicit_profiles.find(sb) != explicit_profiles.end()) { + ldpp_dout(dpp, 0) << "WARNING: duplicate target configuration in sync module" << dendl; + } + + explicit_profiles[sb] = profile; + if (ptarget) { + *ptarget = profile; + } + return 0; + } + + /* Looks up the profile for bucket.name. upper_bound() followed by one step back yields the greatest configured key that is not greater than the name. It matches when that key is a prefix of the name and, unless the profile was declared as a prefix profile, has the same length as the name. */ bool do_find_profile(const rgw_bucket bucket, std::shared_ptr *result) { + const string& name = bucket.name; + auto iter = explicit_profiles.upper_bound(name); + if (iter == explicit_profiles.begin()) { + return false; + } + + --iter; + if (iter->first.size() > name.size()) { + return false; + } + if (name.compare(0, iter->first.size(), iter->first) != 0) { + return false; + } + + std::shared_ptr& target = iter->second; + + if (!target->prefix && + name.size() != iter->first.size()) { + return false; + } + + *result = target; + return true; + } + + /* Same lookup, falling back to root_profile when no explicit profile matches. */ void find_profile(const rgw_bucket bucket, std::shared_ptr *result) { + if (!do_find_profile(bucket, result)) { + *result = root_profile; + } + } + + AWSSyncConfig() {} + + int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) { + auto& default_conf = config["default"]; + + if (config.exists("default")) { + default_profile.init(default_conf); + /* NOTE(review): the result of init_profile() is dropped here, and connections is still empty at this point because it is filled by the loop below. */ init_profile(dpp, cct, default_conf, default_profile, false); + } + + for (auto& conn : config["connections"].array()) { + auto new_conn = conn; + + std::shared_ptr c{new AWSSyncConfig_Connection}; + c->init(new_conn); + + connections[new_conn["id"]] = c; + } + + acl_profiles.init(config["acl_profiles"]); + + int r = s3.init(dpp, cct, config["s3"]); + if (r < 0) { + return r; + } + + auto new_root_conf = config; + + r = init_target(dpp, cct,
new_root_conf, &root_profile); /* the root profile config */ + if (r < 0) { + return r; + } + + for (auto target_conf : config["profiles"].array()) { + int r = init_target(dpp, cct, target_conf, nullptr); + if (r < 0) { + return r; + } + } + + JSONFormatter jf(true); + dump_conf(cct, jf); + stringstream ss; + jf.flush(ss); + + ldpp_dout(dpp, 5) << "sync module config (parsed representation):\n" << ss.str() << dendl; + + return 0; + } + + void expand_target(RGWDataSyncCtx *sc, const string& sid, const string& path, string *dest) { + apply_meta_param(path, "sid", sid, dest); + + const RGWZoneGroup& zg = sc->env->svc->zone->get_zonegroup(); + apply_meta_param(path, "zonegroup", zg.get_name(), dest); + apply_meta_param(path, "zonegroup_id", zg.get_id(), dest); + + const RGWZone& zone = sc->env->svc->zone->get_zone(); + apply_meta_param(path, "zone", zone.name, dest); + apply_meta_param(path, "zone_id", zone.id, dest); + } + + void update_config(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, const string& sid) { + expand_target(sc, sid, root_profile->target_path, &root_profile->target_path); + ldpp_dout(dpp, 20) << "updated target: (root) -> " << root_profile->target_path << dendl; + for (auto& t : explicit_profiles) { + expand_target(sc, sid, t.second->target_path, &t.second->target_path); + ldpp_dout(dpp, 20) << "updated target: " << t.first << " -> " << t.second->target_path << dendl; + } + } + + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection config(jf, "config"); + root_profile->dump_conf(cct, jf); + jf.open_array_section("connections"); + for (auto c : connections) { + c.second->dump_conf(cct, jf); + } + jf.close_section(); + + acl_profiles.dump_conf(cct, jf); + + { // targets + Formatter::ArraySection as(jf, "profiles"); + for (auto& t : explicit_profiles) { + Formatter::ObjectSection target_section(jf, "profile"); + encode_json("name", t.first, &jf); + t.second->dump_conf(cct, jf); + } + } + } + + string 
get_path(std::shared_ptr& profile, + const RGWBucketInfo& bucket_info, + const rgw_obj_key& obj) { + /* Expands the bucket and owner variables of the profile target path and appends a slash plus the key oid. NOTE(review): owner is only assigned inside the tenant branch, so for a bucket owner without a tenant the owner variable expands to an empty string; confirm whether the owner id was meant to be appended unconditionally. */ string bucket_str; + string owner; + if (!bucket_info.owner.tenant.empty()) { + bucket_str = owner = bucket_info.owner.tenant + "-"; + owner += bucket_info.owner.id; + } + bucket_str += bucket_info.bucket.name; + + const string& path = profile->target_path; + + string new_path; + apply_meta_param(path, "bucket", bucket_str, &new_path); + apply_meta_param(new_path, "owner", owner, &new_path); + + new_path += string("/") + get_key_oid(obj); + + return new_path; + } + + /* Splits the path from get_path() at its first slash: the part before it becomes *bucket_name, the rest *obj_name. */ void get_target(std::shared_ptr& profile, + const RGWBucketInfo& bucket_info, + const rgw_obj_key& obj, + string *bucket_name, + string *obj_name) { + string path = get_path(profile, bucket_info, obj); + size_t pos = path.find('/'); + + *bucket_name = path.substr(0, pos); + *obj_name = path.substr(pos + 1); + } + + /* Expands the target paths, then creates one S3RESTConn for the root profile and one for every explicit profile. Both dereference conn_conf without a null check. */ void init_conns(RGWDataSyncCtx *sc, const string& id) { + auto sync_env = sc->env; + + update_config(sync_env->dpp, sc, id); + + auto& root_conf = root_profile->conn_conf; + + root_profile->conn.reset(new S3RESTConn(sc->cct, + id, + { root_conf->endpoint }, + root_conf->key, + sync_env->svc->zone->get_zonegroup().get_id(), + root_conf->region, + root_conf->host_style)); + + for (auto i : explicit_profiles) { + auto& c = i.second; + + c->conn.reset(new S3RESTConn(sc->cct, + id, + { c->conn_conf->endpoint }, + c->conn_conf->key, + sync_env->svc->zone->get_zonegroup().get_id(), + c->conn_conf->region, + c->conn_conf->host_style)); + } + } +}; + + +/* Holds its own copy of the config plus the instance id string, which init() formats as lower-case hex before creating the connections. */ struct AWSSyncInstanceEnv { + AWSSyncConfig conf; + string id; + + explicit AWSSyncInstanceEnv(AWSSyncConfig& _conf) : conf(_conf) {} + + void init(RGWDataSyncCtx *sc, uint64_t instance_id) { + char buf[32]; + snprintf(buf, sizeof(buf), "%llx", (unsigned long long)instance_id); + id = buf; + + conf.init_conns(sc, id); + } + + void get_profile(const rgw_bucket& bucket, std::shared_ptr *ptarget) { + conf.find_profile(bucket, ptarget); +
ceph_assert(ptarget); + } +}; + +static int do_decode_rest_obj(const DoutPrefixProvider *dpp, CephContext *cct, map& attrs, map& headers, rgw_rest_obj *info) +{ + for (auto header : headers) { + const string& val = header.second; + if (header.first == "RGWX_OBJECT_SIZE") { + info->content_len = atoi(val.c_str()); + } else { + info->attrs[header.first] = val; + } + } + + info->acls.set_ctx(cct); + auto aiter = attrs.find(RGW_ATTR_ACL); + if (aiter != attrs.end()) { + bufferlist& bl = aiter->second; + auto bliter = bl.cbegin(); + try { + info->acls.decode(bliter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl; + return -EIO; + } + } else { + ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl; + } + + return 0; +} + +class RGWRESTStreamGetCRF : public RGWStreamReadHTTPResourceCRF +{ + RGWDataSyncCtx *sc; + RGWRESTConn *conn; + const rgw_obj& src_obj; + RGWRESTConn::get_obj_params req_params; + + rgw_sync_aws_src_obj_properties src_properties; +public: + RGWRESTStreamGetCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWDataSyncCtx *_sc, + RGWRESTConn *_conn, + const rgw_obj& _src_obj, + const rgw_sync_aws_src_obj_properties& _src_properties) : RGWStreamReadHTTPResourceCRF(_cct, _env, _caller, + _sc->env->http_manager, _src_obj.key), + sc(_sc), conn(_conn), src_obj(_src_obj), + src_properties(_src_properties) { + } + + int init(const DoutPrefixProvider *dpp) override { + /* init input connection */ + + + req_params.get_op = true; + req_params.prepend_metadata = true; + + req_params.unmod_ptr = &src_properties.mtime; + req_params.etag = src_properties.etag; + req_params.mod_zone_id = src_properties.zone_short_id; + req_params.mod_pg_ver = src_properties.pg_ver; + + if (range.is_set) { + req_params.range_is_set = true; + req_params.range_start = range.ofs; + req_params.range_end = range.ofs + range.size - 1; + } + + RGWRESTStreamRWRequest *in_req; + int ret = 
conn->get_obj(dpp, src_obj, req_params, false /* send */, &in_req); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): conn->get_obj() returned ret=" << ret << dendl; + return ret; + } + + set_req(in_req); + + return RGWStreamReadHTTPResourceCRF::init(dpp); + } + + /* Parses the JSON extra data, when there is any, to obtain the attrs map, then passes attrs and headers to do_decode_rest_obj() to fill rest_obj. Returns -EIO when the extra data is not valid JSON. */ int decode_rest_obj(const DoutPrefixProvider *dpp, map& headers, bufferlist& extra_data) override { + map src_attrs; + + ldpp_dout(dpp, 20) << __func__ << ":" << " headers=" << headers << " extra_data.length()=" << extra_data.length() << dendl; + + if (extra_data.length() > 0) { + JSONParser jp; + if (!jp.parse(extra_data.c_str(), extra_data.length())) { + ldpp_dout(dpp, 0) << "ERROR: failed to parse response extra data. len=" << extra_data.length() << " data=" << extra_data.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + } + return do_decode_rest_obj(dpp, sc->cct, src_attrs, headers, &rest_obj); + } + + bool need_extra_data() override { + return true; + } +}; + +/* header names that keep_attr() accepts as they are */ static std::set keep_headers = { "CONTENT_TYPE", + "CONTENT_ENCODING", + "CONTENT_DISPOSITION", + "CONTENT_LANGUAGE" }; + +/* Write side of a transfer: prepares a PUT, or a multipart part PUT, on target->conn for dest_obj. NOTE(review): dest_obj is stored as a reference, so the caller has to keep the object alive. */ class RGWAWSStreamPutCRF : public RGWStreamWriteHTTPResourceCRF +{ + RGWDataSyncCtx *sc; + rgw_sync_aws_src_obj_properties src_properties; + std::shared_ptr target; + const rgw_obj& dest_obj; + string etag; +public: + RGWAWSStreamPutCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWDataSyncCtx *_sc, + const rgw_sync_aws_src_obj_properties& _src_properties, + std::shared_ptr& _target, + const rgw_obj& _dest_obj) : RGWStreamWriteHTTPResourceCRF(_cct, _env, _caller, _sc->env->http_manager), + sc(_sc), src_properties(_src_properties), target(_target), dest_obj(_dest_obj) { + } + + int init() override { + /* init output connection */ + RGWRESTStreamS3PutObj *out_req{nullptr}; + + if (multipart.is_multipart) { + char buf[32]; + snprintf(buf, sizeof(buf), "%d", multipart.part_num); + rgw_http_param_pair params[] = { { "uploadId",
multipart.upload_id.c_str() }, + { "partNumber", buf }, + { nullptr, nullptr } }; + /* NOTE(review): whatever put_obj_send_init() returns is not checked in either branch; out_req stays nullptr if the call did not set it. */ target->conn->put_obj_send_init(dest_obj, params, &out_req); + } else { + target->conn->put_obj_send_init(dest_obj, nullptr, &out_req); + } + + set_req(out_req); + + return RGWStreamWriteHTTPResourceCRF::init(); + } + + /* true for the names listed in keep_headers and for every name that starts with X_AMZ_ */ static bool keep_attr(const string& h) { + return (keep_headers.find(h) != keep_headers.end() || + boost::algorithm::starts_with(h, "X_AMZ_")); + } + + /* Rebuilds *attrs for the outgoing request. It first clears the map, then copies the source attributes accepted by keep_attr(), then translates the source ACL grants through the ACL mappings of the target profile. Grantees without a mapping are skipped. */ static void init_send_attrs(const DoutPrefixProvider *dpp, + CephContext *cct, + const rgw_rest_obj& rest_obj, + const rgw_sync_aws_src_obj_properties& src_properties, + const AWSSyncConfig_Profile *target, + map *attrs) { + auto& new_attrs = *attrs; + + new_attrs.clear(); + + for (auto& hi : rest_obj.attrs) { + if (keep_attr(hi.first)) { + new_attrs.insert(hi); + } + } + + auto acl = rest_obj.acls.get_acl(); + + /* permission value mapped to the list of translated grantees */ map > access_map; + + if (target->acls) { + for (auto& grant : acl.get_grant_map()) { + auto& orig_grantee = grant.first; + auto& perm = grant.second; + + string grantee; + + const auto& am = target->acls->acl_mappings; + + auto iter = am.find(orig_grantee); + if (iter == am.end()) { + ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " ..
ignoring" << dendl; + continue; + } + + grantee = iter->second.dest_id; + + string type; + + switch (iter->second.type) { + case ACL_TYPE_CANON_USER: + type = "id"; + break; + case ACL_TYPE_EMAIL_USER: + type = "emailAddress"; + break; + case ACL_TYPE_GROUP: + type = "uri"; + break; + default: + continue; + } + + string tv = type + "=" + grantee; + + int flags = perm.get_permission().get_permissions(); + if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) { + access_map[flags].push_back(tv); + continue; + } + + for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) { + if (flags & i) { + access_map[i].push_back(tv); + } + } + } + } + + for (auto aiter : access_map) { + int grant_type = aiter.first; + + string header_str("x-amz-grant-"); + + switch (grant_type) { + case RGW_PERM_READ: + header_str.append("read"); + break; + case RGW_PERM_WRITE: + header_str.append("write"); + break; + case RGW_PERM_READ_ACP: + header_str.append("read-acp"); + break; + case RGW_PERM_WRITE_ACP: + header_str.append("write-acp"); + break; + case RGW_PERM_FULL_CONTROL: + header_str.append("full-control"); + break; + } + + string s; + + for (auto viter : aiter.second) { + if (!s.empty()) { + s.append(", "); + } + s.append(viter); + } + + ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl; + + new_attrs[header_str] = s; + } + + char buf[32]; + snprintf(buf, sizeof(buf), "%llu", (long long)src_properties.versioned_epoch); + new_attrs["x-amz-meta-rgwx-versioned-epoch"] = buf; + + utime_t ut(src_properties.mtime); + snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + (long long)ut.nsec()); + + new_attrs["x-amz-meta-rgwx-source-mtime"] = buf; + new_attrs["x-amz-meta-rgwx-source-etag"] = src_properties.etag; + new_attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name; + if (!rest_obj.key.instance.empty()) { + new_attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance; + } + } + + void send_ready(const DoutPrefixProvider *dpp, 
const rgw_rest_obj& rest_obj) override { + RGWRESTStreamS3PutObj *r = static_cast(req); + + /* the attribute set is only built for a plain PUT; a multipart part is sent with an empty map */ map new_attrs; + if (!multipart.is_multipart) { + init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs); + } + + r->set_send_length(rest_obj.content_len); + + RGWAccessControlPolicy policy; + + r->send_ready(dpp, target->conn->get_key(), new_attrs, policy); + } + + /* remembers the value of the ETAG response header */ void handle_headers(const map& headers) { + for (auto h : headers) { + if (h.first == "ETAG") { + etag = h.second; + } + } + } + + /* copies the remembered etag into *petag; returns false when none has been seen */ bool get_etag(string *petag) { + if (etag.empty()) { + return false; + } + *petag = etag; + return true; + } +}; + + +/* Copies src_obj from source_conn to dest_obj on the target in one PUT by splicing a GET CRF into a PUT CRF. NOTE(review): src_obj and dest_obj are stored as references, so the caller has to keep the objects alive while the coroutine runs. */ class RGWAWSStreamObjToCloudPlainCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWRESTConn *source_conn; + std::shared_ptr target; + const rgw_obj& src_obj; + const rgw_obj& dest_obj; + + rgw_sync_aws_src_obj_properties src_properties; + + std::shared_ptr in_crf; + std::shared_ptr out_crf; + +public: + RGWAWSStreamObjToCloudPlainCR(RGWDataSyncCtx *_sc, + RGWRESTConn *_source_conn, + const rgw_obj& _src_obj, + const rgw_sync_aws_src_obj_properties& _src_properties, + std::shared_ptr _target, + const rgw_obj& _dest_obj) : RGWCoroutine(_sc->cct), + sc(_sc), + source_conn(_source_conn), + target(_target), + src_obj(_src_obj), + dest_obj(_dest_obj), + src_properties(_src_properties) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + /* init input */ + in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc, + source_conn, src_obj, + src_properties)); + + /* init output */ + out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc, + src_properties, target, dest_obj)); + + yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSStreamObjToCloudMultipartPartCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWRESTConn *source_conn; + std::shared_ptr target;
+ const rgw_obj& src_obj; + const rgw_obj& dest_obj; + + rgw_sync_aws_src_obj_properties src_properties; + + string upload_id; + + rgw_sync_aws_multipart_part_info part_info; + + std::shared_ptr in_crf; + std::shared_ptr out_crf; + + /* receives the etag of the uploaded part on success */ string *petag; + +public: + RGWAWSStreamObjToCloudMultipartPartCR(RGWDataSyncCtx *_sc, + RGWRESTConn *_source_conn, + const rgw_obj& _src_obj, + std::shared_ptr& _target, + const rgw_obj& _dest_obj, + const rgw_sync_aws_src_obj_properties& _src_properties, + const string& _upload_id, + const rgw_sync_aws_multipart_part_info& _part_info, + string *_petag) : RGWCoroutine(_sc->cct), + sc(_sc), + source_conn(_source_conn), + target(_target), + src_obj(_src_obj), + dest_obj(_dest_obj), + src_properties(_src_properties), + upload_id(_upload_id), + part_info(_part_info), + petag(_petag) {} + + /* Streams one part: the GET is limited to the range given by part_info.ofs and part_info.size, and the PUT carries upload_id and part_info.part_num. Fails with -EIO when the PUT reported no etag. */ int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + /* init input */ + in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc, + source_conn, src_obj, + src_properties)); + + in_crf->set_range(part_info.ofs, part_info.size); + + /* init output */ + out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc, + src_properties, target, dest_obj)); + + out_crf->set_multipart(upload_id, part_info.part_num, part_info.size); + + yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + if (!(static_cast(out_crf.get()))->get_etag(petag)) { + ldpp_dout(dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl; + return set_cr_error(-EIO); + } + + return set_cr_done(); + } + + return 0; + } +}; + +/* Aborts a multipart upload by issuing a DELETE on the destination object path with the uploadId parameter. */ class RGWAWSAbortMultipartCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWRESTConn *dest_conn; + const rgw_obj& dest_obj; + + string upload_id; + +public: + RGWAWSAbortMultipartCR(RGWDataSyncCtx *_sc, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + const string& _upload_id) : RGWCoroutine(_sc->cct), + sc(_sc), + dest_conn(_dest_conn),
+ dest_obj(_dest_obj), + upload_id(_upload_id) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + + yield { + rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} }; + /* bl is not used by the DELETE call below */ bufferlist bl; + call(new RGWDeleteRESTResourceCR(sc->cct, dest_conn, sc->env->http_manager, + obj_to_aws_path(dest_obj), params)); + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (retcode=" << retcode << ")" << dendl; + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; + } +}; + +/* Starts a multipart upload: POSTs to the destination object path with the uploads parameter and the given attrs, then extracts UploadId from the InitiateMultipartUploadResult XML reply into *upload_id. obj_size is stored but not read in the code shown here. */ class RGWAWSInitMultipartCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWRESTConn *dest_conn; + const rgw_obj& dest_obj; + + uint64_t obj_size; + map attrs; + + bufferlist out_bl; + + string *upload_id; + + struct InitMultipartResult { + string bucket; + string key; + string upload_id; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Bucket", bucket, obj); + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("UploadId", upload_id, obj); + } + } result; + +public: + RGWAWSInitMultipartCR(RGWDataSyncCtx *_sc, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + uint64_t _obj_size, + const map& _attrs, + string *_upload_id) : RGWCoroutine(_sc->cct), + sc(_sc), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + obj_size(_obj_size), + attrs(_attrs), + upload_id(_upload_id) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + + yield { + rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} }; + /* empty request body */ bufferlist bl; + call(new RGWPostRawRESTResourceCR (sc->cct, dest_conn, sc->env->http_manager, + obj_to_aws_path(dest_obj), params, &attrs, bl, &out_bl)); + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl; + return set_cr_error(retcode); + } + { + /* + * If one of the following fails we cannot
abort upload, as we cannot + * extract the upload id. If one of these fail it's very likely that that's + * the least of our problem. + */ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return set_cr_error(-EIO); + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl; + return set_cr_error(-EIO); + } + + try { + RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl; + return set_cr_error(-EIO); + } + } + + ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl; + + *upload_id = result.upload_id; + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSCompleteMultipartCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWRESTConn *dest_conn; + const rgw_obj& dest_obj; + + bufferlist out_bl; + + string upload_id; + + struct CompleteMultipartReq { + map parts; + + explicit CompleteMultipartReq(const map& _parts) : parts(_parts) {} + + void dump_xml(Formatter *f) const { + for (auto p : parts) { + f->open_object_section("Part"); + encode_xml("PartNumber", p.first, f); + encode_xml("ETag", p.second.etag, f); + f->close_section(); + }; + } + } req_enc; + + struct CompleteMultipartResult { + string location; + string bucket; + string key; + string etag; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Location", bucket, obj); + RGWXMLDecoder::decode_xml("Bucket", bucket, obj); + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("ETag", etag, obj); + } + } result; + +public: + 
RGWAWSCompleteMultipartCR(RGWDataSyncCtx *_sc, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + string _upload_id, + const map& _parts) : RGWCoroutine(_sc->cct), + sc(_sc), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + upload_id(_upload_id), + req_enc(_parts) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + + yield { + rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} }; + stringstream ss; + XMLFormatter formatter; + + encode_xml("CompleteMultipartUpload", req_enc, &formatter); + + formatter.flush(ss); + + bufferlist bl; + bl.append(ss.str()); + + call(new RGWPostRawRESTResourceCR (sc->cct, dest_conn, sc->env->http_manager, + obj_to_aws_path(dest_obj), params, nullptr, bl, &out_bl)); + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl; + return set_cr_error(retcode); + } + { + /* + * If one of the following fails we cannot abort upload, as we cannot + * extract the upload id. If one of these fail it's very likely that that's + * the least of our problem. 
+ */ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return set_cr_error(-EIO); + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl; + return set_cr_error(-EIO); + } + + try { + RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl; + return set_cr_error(-EIO); + } + } + + ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl; + + return set_cr_done(); + } + + return 0; + } +}; + + +class RGWAWSStreamAbortMultipartUploadCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWRESTConn *dest_conn; + const rgw_obj& dest_obj; + const rgw_raw_obj status_obj; + + string upload_id; + +public: + + RGWAWSStreamAbortMultipartUploadCR(RGWDataSyncCtx *_sc, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + const rgw_raw_obj& _status_obj, + const string& _upload_id) : RGWCoroutine(_sc->cct), sc(_sc), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + status_obj(_status_obj), + upload_id(_upload_id) {} + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield call(new RGWAWSAbortMultipartCR(sc, dest_conn, dest_obj, upload_id)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " retcode=" << retcode << dendl; + /* ignore error, best effort */ + } + yield call(new RGWRadosRemoveCR(sc->env->driver, status_obj)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj obj=" << 
status_obj << " retcode=" << retcode << dendl; + /* ignore error, best effort */ + } + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSStreamObjToCloudMultipartCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + AWSSyncConfig& conf; + RGWRESTConn *source_conn; + std::shared_ptr target; + const rgw_obj& src_obj; + const rgw_obj& dest_obj; + + uint64_t obj_size; + string src_etag; + rgw_sync_aws_src_obj_properties src_properties; + rgw_rest_obj rest_obj; + + rgw_sync_aws_multipart_upload_info status; + + map new_attrs; + + rgw_sync_aws_multipart_part_info *pcur_part_info{nullptr}; + + int ret_err{0}; + + rgw_raw_obj status_obj; + +public: + RGWAWSStreamObjToCloudMultipartCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, + AWSSyncConfig& _conf, + RGWRESTConn *_source_conn, + const rgw_obj& _src_obj, + std::shared_ptr& _target, + const rgw_obj& _dest_obj, + uint64_t _obj_size, + const rgw_sync_aws_src_obj_properties& _src_properties, + const rgw_rest_obj& _rest_obj) : RGWCoroutine(_sc->cct), + sc(_sc), + sync_env(_sc->env), + conf(_conf), + source_conn(_source_conn), + target(_target), + src_obj(_src_obj), + dest_obj(_dest_obj), + obj_size(_obj_size), + src_properties(_src_properties), + rest_obj(_rest_obj), + status_obj(sync_env->svc->zone->get_zone_params().log_pool, + RGWBucketPipeSyncStatusManager::obj_status_oid(_sync_pipe, sc->source_zone, src_obj)) { + } + + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + yield call(new RGWSimpleRadosReadCR( + dpp, sync_env->driver, status_obj, &status, false)); + + if (retcode < 0 && retcode != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " retcode=" << retcode << dendl; + return retcode; + } + + if (retcode >= 0) { + /* check here that mtime and size did not change */ + + if (status.src_properties.mtime != src_properties.mtime || status.obj_size != obj_size || + status.src_properties.etag != 
src_properties.etag) { + yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id)); + retcode = -ENOENT; + } + } + + if (retcode == -ENOENT) { + RGWAWSStreamPutCRF::init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs); + + yield call(new RGWAWSInitMultipartCR(sc, target->conn.get(), dest_obj, status.obj_size, std::move(new_attrs), &status.upload_id)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + status.obj_size = obj_size; + status.src_properties = src_properties; +#define MULTIPART_MAX_PARTS 10000 + uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS; + status.part_size = std::max(conf.s3.multipart_min_part_size, min_part_size); + status.num_parts = (obj_size + status.part_size - 1) / status.part_size; + status.cur_part = 1; + } + + for (; (uint32_t)status.cur_part <= status.num_parts; ++status.cur_part) { + yield { + rgw_sync_aws_multipart_part_info& cur_part_info = status.parts[status.cur_part]; + cur_part_info.part_num = status.cur_part; + cur_part_info.ofs = status.cur_ofs; + cur_part_info.size = std::min((uint64_t)status.part_size, status.obj_size - status.cur_ofs); + + pcur_part_info = &cur_part_info; + + status.cur_ofs += status.part_size; + + call(new RGWAWSStreamObjToCloudMultipartPartCR(sc, + source_conn, src_obj, + target, + dest_obj, + status.src_properties, + status.upload_id, + cur_part_info, + &cur_part_info.etag)); + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to sync obj=" << src_obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << status.cur_part << " (error: " << cpp_strerror(-retcode) << ")" << dendl; + ret_err = retcode; + yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id)); + return set_cr_error(ret_err); + } + + yield call(new RGWSimpleRadosWriteCR(dpp, sync_env->driver, status_obj, status)); + if (retcode < 0) { + 
ldpp_dout(dpp, 0) << "ERROR: failed to store multipart upload state, retcode=" << retcode << dendl; + /* continue with upload anyway */ + } + ldpp_dout(dpp, 20) << "sync of object=" << src_obj << " via multipart upload, finished sending part #" << status.cur_part << " etag=" << pcur_part_info->etag << dendl; + } + + yield call(new RGWAWSCompleteMultipartCR(sc, target->conn.get(), dest_obj, status.upload_id, status.parts)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << src_obj << " (error: " << cpp_strerror(-retcode) << ")" << dendl; + ret_err = retcode; + yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id)); + return set_cr_error(ret_err); + } + + /* remove status obj */ + yield call(new RGWRadosRemoveCR(sync_env->driver, status_obj)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload obj=" << src_obj << " upload_id=" << status.upload_id << " part number " << status.cur_part << " (" << cpp_strerror(-retcode) << ")" << dendl; + /* ignore error, best effort */ + } + return set_cr_done(); + } + + return 0; + } +}; +template +int decode_attr(map& attrs, const char *attr_name, T *result, T def_val) +{ + map::iterator iter = attrs.find(attr_name); + if (iter == attrs.end()) { + *result = def_val; + return 0; + } + bufferlist& bl = iter->second; + if (bl.length() == 0) { + *result = def_val; + return 0; + } + auto bliter = bl.cbegin(); + try { + decode(*result, bliter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +// maybe use Fetch Remote Obj instead? 
+class RGWAWSHandleRemoteObjCBCR: public RGWStatRemoteObjCBCR { + rgw_bucket_sync_pipe sync_pipe; + AWSSyncInstanceEnv& instance; + + uint64_t versioned_epoch{0}; + + RGWRESTConn *source_conn{nullptr}; + std::shared_ptr target; + bufferlist res; + unordered_map bucket_created; + rgw_rest_obj rest_obj; + int ret{0}; + + uint32_t src_zone_short_id{0}; + uint64_t src_pg_ver{0}; + + bufferlist out_bl; + + struct CreateBucketResult { + string code; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Code", code, obj); + } + } result; + + rgw_obj src_obj; + rgw_obj dest_obj; + +public: + RGWAWSHandleRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, + rgw_obj_key& _key, + AWSSyncInstanceEnv& _instance, + uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key), + sync_pipe(_sync_pipe), + instance(_instance), versioned_epoch(_versioned_epoch) + {} + + ~RGWAWSHandleRemoteObjCBCR(){ + } + + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + ret = decode_attr(attrs, RGW_ATTR_PG_VER, &src_pg_ver, (uint64_t)0); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl; + } else { + ret = decode_attr(attrs, RGW_ATTR_SOURCE_ZONE, &src_zone_short_id, (uint32_t)0); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode source zone short_id attr, ignoring" << dendl; + src_pg_ver = 0; /* all or nothing */ + } + } + ldpp_dout(dpp, 4) << "AWS: download begin: z=" << sc->source_zone + << " b=" << src_bucket << " k=" << key << " size=" << size + << " mtime=" << mtime << " etag=" << etag + << " zone_short_id=" << src_zone_short_id << " pg_ver=" << src_pg_ver + << dendl; + + source_conn = sync_env->svc->zone->get_zone_conn(sc->source_zone); + if (!source_conn) { + ldpp_dout(dpp, 0) << "ERROR: cannot find http connection to zone " << sc->source_zone << dendl; + return set_cr_error(-EINVAL); + } + + 
instance.get_profile(sync_pipe.info.source_bs.bucket, &target); + instance.conf.get_target(target, sync_pipe.dest_bucket_info, key, &dest_obj.bucket.name, &dest_obj.key.name); + + if (bucket_created.find(dest_obj.bucket.name) == bucket_created.end()){ + yield { + ldpp_dout(dpp, 0) << "AWS: creating bucket " << dest_obj.bucket.name << dendl; + bufferlist bl; + call(new RGWPutRawRESTResourceCR (sc->cct, target->conn.get(), + sync_env->http_manager, + dest_obj.bucket.name, nullptr, bl, &out_bl)); + } + if (retcode < 0 ) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return set_cr_error(retcode); + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl; + return set_cr_error(retcode); + } + + try { + RGWXMLDecoder::decode_xml("Error", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl; + return set_cr_error(retcode); + } + + if (result.code != "BucketAlreadyOwnedByYou") { + return set_cr_error(retcode); + } + } + + bucket_created[dest_obj.bucket.name] = true; + } + + yield { + src_obj.bucket = src_bucket; + src_obj.key = key; + + /* init output */ + rgw_sync_aws_src_obj_properties src_properties; + src_properties.mtime = mtime; + src_properties.etag = etag; + src_properties.zone_short_id = src_zone_short_id; + src_properties.pg_ver = src_pg_ver; + src_properties.versioned_epoch = versioned_epoch; + + if (size < instance.conf.s3.multipart_sync_threshold) { + call(new RGWAWSStreamObjToCloudPlainCR(sc, source_conn, src_obj, + src_properties, + target, + dest_obj)); + } else { + rgw_rest_obj rest_obj; + rest_obj.init(key); + if (do_decode_rest_obj(dpp, sc->cct, attrs, headers, &rest_obj)) { + 
ldpp_dout(dpp, 0) << "ERROR: failed to decode rest obj out of headers=" << headers << ", attrs=" << attrs << dendl; + return set_cr_error(-EINVAL); + } + call(new RGWAWSStreamObjToCloudMultipartCR(sc, sync_pipe, instance.conf, source_conn, src_obj, + target, dest_obj, size, src_properties, rest_obj)); + } + } + if (retcode < 0) { + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSHandleRemoteObjCR : public RGWCallStatRemoteObjCR { + rgw_bucket_sync_pipe sync_pipe; + AWSSyncInstanceEnv& instance; + uint64_t versioned_epoch; +public: + RGWAWSHandleRemoteObjCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, + AWSSyncInstanceEnv& _instance, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key), + sync_pipe(_sync_pipe), + instance(_instance), versioned_epoch(_versioned_epoch) { + } + + ~RGWAWSHandleRemoteObjCR() {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWAWSHandleRemoteObjCBCR(sc, sync_pipe, key, instance, versioned_epoch); + } +}; + +class RGWAWSRemoveRemoteObjCBCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + std::shared_ptr target; + rgw_bucket_sync_pipe sync_pipe; + rgw_obj_key key; + ceph::real_time mtime; + AWSSyncInstanceEnv& instance; + int ret{0}; +public: + RGWAWSRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime, + AWSSyncInstanceEnv& _instance) : RGWCoroutine(_sc->cct), sc(_sc), + sync_pipe(_sync_pipe), key(_key), + mtime(_mtime), instance(_instance) {} + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + ldpp_dout(dpp, 0) << ": remove remote obj: z=" << sc->source_zone + << " b=" <cct, target->conn.get(), + sc->env->http_manager, + path, nullptr /* params */)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } + +}; + + +class RGWAWSDataSyncModule: 
public RGWDataSyncModule { + CephContext *cct; + AWSSyncInstanceEnv instance; +public: + RGWAWSDataSyncModule(CephContext *_cct, AWSSyncConfig& _conf) : + cct(_cct), + instance(_conf) { + } + + void init(RGWDataSyncCtx *sc, uint64_t instance_id) override { + instance.init(sc, instance_id); + } + + ~RGWAWSDataSyncModule() {} + + RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, + std::optional versioned_epoch, + const rgw_zone_set_entry& source_trace_entry, + rgw_zone_set *zones_trace) override { + ldout(sc->cct, 0) << instance.id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + return new RGWAWSHandleRemoteObjCR(sc, sync_pipe, key, instance, versioned_epoch.value_or(0)); + } + RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, + rgw_zone_set *zones_trace) override { + ldout(sc->cct, 0) <<"rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return new RGWAWSRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, instance); + } + RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, + rgw_zone_set *zones_trace) override { + ldout(sc->cct, 0) <<"AWS Not implemented: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return NULL; + } +}; + +class RGWAWSSyncModuleInstance : public RGWSyncModuleInstance { + RGWAWSDataSyncModule data_handler; +public: + 
RGWAWSSyncModuleInstance(CephContext *cct, AWSSyncConfig& _conf) : data_handler(cct, _conf) {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } +}; + +int RGWAWSSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance){ + AWSSyncConfig conf; + + int r = conf.init(dpp, cct, config); + if (r < 0) { + return r; + } + + instance->reset(new RGWAWSSyncModuleInstance(cct, conf)); + return 0; +} diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.h b/src/rgw/driver/rados/rgw_sync_module_aws.h new file mode 100644 index 000000000..92532ff00 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_aws.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_sync_module.h" + +struct rgw_sync_aws_multipart_part_info { + int part_num{0}; + uint64_t ofs{0}; + uint64_t size{0}; + std::string etag; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(part_num, bl); + encode(ofs, bl); + encode(size, bl); + encode(etag, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(part_num, bl); + decode(ofs, bl); + decode(size, bl); + decode(etag, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_part_info) + +struct rgw_sync_aws_src_obj_properties { + ceph::real_time mtime; + std::string etag; + uint32_t zone_short_id{0}; + uint64_t pg_ver{0}; + uint64_t versioned_epoch{0}; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(mtime, bl); + encode(etag, bl); + encode(zone_short_id, bl); + encode(pg_ver, bl); + encode(versioned_epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(mtime, bl); + decode(etag, bl); + decode(zone_short_id, bl); + decode(pg_ver, bl); + 
decode(versioned_epoch, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_aws_src_obj_properties) + +struct rgw_sync_aws_multipart_upload_info { + std::string upload_id; + uint64_t obj_size{0}; /* default-initialized like the other counters so a fresh status never carries garbage */ + rgw_sync_aws_src_obj_properties src_properties; + uint32_t part_size{0}; + uint32_t num_parts{0}; + + int cur_part{0}; + uint64_t cur_ofs{0}; + + std::map parts; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(upload_id, bl); + encode(obj_size, bl); + encode(src_properties, bl); + encode(part_size, bl); + encode(num_parts, bl); + encode(cur_part, bl); + encode(cur_ofs, bl); + encode(parts, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(upload_id, bl); + decode(obj_size, bl); + decode(src_properties, bl); + decode(part_size, bl); + decode(num_parts, bl); + decode(cur_part, bl); + decode(cur_ofs, bl); + decode(parts, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_upload_info) + +class RGWAWSSyncModule : public RGWSyncModule { + public: + RGWAWSSyncModule() {} + bool supports_data_export() override { return false;} + int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; diff --git a/src/rgw/driver/rados/rgw_sync_module_es.cc b/src/rgw/driver/rados/rgw_sync_module_es.cc new file mode 100644 index 000000000..4e8eb6201 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_es.cc @@ -0,0 +1,962 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_b64.h" +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_es.h" +#include "rgw_sync_module_es_rest.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rest.h" +#include "rgw_op.h" +#include "rgw_es_query.h" +#include "rgw_zone.h" + +#include 
"services/svc_zone.h" + +#include "include/str_list.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +/* + * allowlist utility. Config string is a list of entries, where an entry is either an item, + * a prefix, or a suffix. An item would be the name of the entity that we'd look up, + * a prefix would be a string ending with an asterisk, a suffix would be a string starting + * with an asterisk. For example: + * + * bucket1, bucket2, foo*, *bar + */ +class ItemList { + bool approve_all{false}; + + set entries; + set prefixes; + set suffixes; + + void parse(const string& str) { + list l; + + get_str_list(str, ",", l); + + for (auto& entry : l) { + entry = rgw_trim_whitespace(entry); + if (entry.empty()) { + continue; + } + + if (entry == "*") { + approve_all = true; + return; + } + + if (entry[0] == '*') { + suffixes.insert(entry.substr(1)); + continue; + } + + if (entry.back() == '*') { + prefixes.insert(entry.substr(0, entry.size() - 1)); + continue; + } + + entries.insert(entry); + } + } + +public: + ItemList() {} + void init(const string& str, bool def_val) { + if (str.empty()) { + approve_all = def_val; + } else { + parse(str); + } + } + + /* returns true when entry is approved: approve-all, an exact item, a stored prefix or a stored suffix */ + bool exists(const string& entry) { + if (approve_all) { + return true; + } + + if (entries.find(entry) != entries.end()) { + return true; + } + + auto i = prefixes.upper_bound(entry); /* every stored prefix of entry sorts at or before entry */ + while (i != prefixes.begin()) { /* test all of them: the nearest predecessor alone misses "a" when "ab" is stored and entry is "ac" */ + --i; + if (boost::algorithm::starts_with(entry, *i)) { + return true; + } + } + + for (i = suffixes.begin(); i != suffixes.end(); ++i) { + if (boost::algorithm::ends_with(entry, *i)) { + return true; + } + } + + return false; + } +}; + +#define ES_NUM_SHARDS_MIN 5 + +#define ES_NUM_SHARDS_DEFAULT 16 +#define ES_NUM_REPLICAS_DEFAULT 1 + +using ESVersion = std::pair; +static constexpr ESVersion ES_V5{5,0}; +static constexpr ESVersion ES_V7{7,0}; + +struct ESInfo { + std::string name; + std::string cluster_name; + std::string cluster_uuid; + ESVersion version; + + void 
decode_json(JSONObj *obj); + + std::string get_version_str(){ + return std::to_string(version.first) + "." + std::to_string(version.second); + } +}; + +// simple wrapper structure to wrap the es version nested type +struct es_version_decoder { + ESVersion version; + + /* parses "major.minor[...]" into version; returns 0 on success, negative when no number could be read */ + int parse_version(const std::string& s) { + int major = 0, minor = 0; /* defaults keep a bare "7" well-defined as {7,0} */ + int ret = sscanf(s.c_str(), "%d.%d", &major, &minor); + if (ret < 1) { /* sscanf returns the number of converted fields (EOF on empty input), so 0 is a failure too */ + return -1; + } + version = std::make_pair(major,minor); + return 0; + } + + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("number",s,obj); + if (parse_version(s) < 0) + throw JSONDecoder::err("Failed to parse ElasticVersion"); + } +}; + + +void ESInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("cluster_name", cluster_name, obj); + JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj); + es_version_decoder esv; + JSONDecoder::decode_json("version", esv, obj); + version = std::move(esv.version); +} + +struct ElasticConfig { + uint64_t sync_instance{0}; + string id; + string index_path; + std::unique_ptr conn; + bool explicit_custom_meta{true}; + string override_index_path; + ItemList index_buckets; + ItemList allow_owners; + uint32_t num_shards{0}; + uint32_t num_replicas{0}; + std::map default_headers = {{ "Content-Type", "application/json" }}; + ESInfo es_info; + + void init(CephContext *cct, const JSONFormattable& config) { + string elastic_endpoint = config["endpoint"]; + id = string("elastic:") + elastic_endpoint; + conn.reset(new RGWRESTConn(cct, (rgw::sal::Driver*)nullptr, id, { elastic_endpoint }, nullopt /* region */ )); + explicit_custom_meta = config["explicit_custom_meta"](true); + index_buckets.init(config["index_buckets_list"], true); /* approve all buckets by default */ + allow_owners.init(config["approved_owners_list"], true); /* approve all bucket owners by default */ + override_index_path = config["override_index_path"]; + num_shards = 
config["num_shards"](ES_NUM_SHARDS_DEFAULT); + if (num_shards < ES_NUM_SHARDS_MIN) { + num_shards = ES_NUM_SHARDS_MIN; + } + num_replicas = config["num_replicas"](ES_NUM_REPLICAS_DEFAULT); + if (string user = config["username"], pw = config["password"]; + !user.empty() && !pw.empty()) { + auto auth_string = user + ":" + pw; + default_headers.emplace("AUTHORIZATION", "Basic " + rgw::to_base64(auth_string)); + } + + } + + void init_instance(const RGWRealm& realm, uint64_t instance_id) { + sync_instance = instance_id; + + if (!override_index_path.empty()) { + index_path = override_index_path; + return; + } + + char buf[32]; + snprintf(buf, sizeof(buf), "-%08x", (uint32_t)(sync_instance & 0xFFFFFFFF)); + + index_path = "/rgw-" + realm.get_name() + buf; + } + + string get_index_path() { + return index_path; + } + + map& get_request_headers() { + return default_headers; + } + + string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) { + if (es_info.version >= ES_V7) { + return index_path+ "/_doc/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance)); +; + } else { + return index_path + "/object/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? 
"null" : key.instance)); + } + } + + bool should_handle_operation(RGWBucketInfo& bucket_info) { + return index_buckets.exists(bucket_info.bucket.name) && + allow_owners.exists(bucket_info.owner.to_str()); + } +}; + +using ElasticConfigRef = std::shared_ptr; + +/* maps an ESType to the type name written into the index mapping; unknown values yield "" */ +static const char *es_type_to_str(const ESType& t) { + switch (t) { + case ESType::String: return "string"; + case ESType::Text: return "text"; + case ESType::Keyword: return "keyword"; + case ESType::Long: return "long"; + case ESType::Integer: return "integer"; + case ESType::Short: return "short"; + case ESType::Byte: return "byte"; + case ESType::Double: return "double"; + case ESType::Float: return "float"; + case ESType::Half_Float: return "half_float"; + case ESType::Scaled_Float: return "scaled_float"; + case ESType::Date: return "date"; + case ESType::Boolean: return "boolean"; + case ESType::Integer_Range: return "integer_range"; + case ESType::Float_Range: return "float_range"; + case ESType::Double_Range: return "double_range"; /* was "date_range", a copy of the Date_Range entry below */ + case ESType::Date_Range: return "date_range"; + case ESType::Geo_Point: return "geo_point"; + case ESType::Ip: return "ip"; + default: + return ""; + } +} + +struct es_type_v2 { + ESType estype; + const char *format{nullptr}; + std::optional analyzed; + + es_type_v2(ESType et) : estype(et) {} + + void dump(Formatter *f) const { + const char *type_str = es_type_to_str(estype); + encode_json("type", type_str, f); + if (format) { + encode_json("format", format, f); + } + + auto is_analyzed = analyzed; + + if (estype == ESType::String && + !is_analyzed) { + is_analyzed = false; + } + + if (is_analyzed) { + encode_json("index", (is_analyzed.value() ? 
"analyzed" : "not_analyzed"), f); + } + } +}; + +struct es_type_v5 { + ESType estype; + const char *format{nullptr}; + std::optional analyzed; + std::optional index; + + es_type_v5(ESType et) : estype(et) {} + + void dump(Formatter *f) const { + ESType new_estype; + if (estype != ESType::String) { + new_estype = estype; + } else { + bool is_analyzed = analyzed.value_or(false); + new_estype = (is_analyzed ? ESType::Text : ESType::Keyword); + /* index = true; ... Not setting index=true, because that's the default, + * and dumping a boolean value *might* be a problem when backporting this + * because value might get quoted + */ + } + + const char *type_str = es_type_to_str(new_estype); + encode_json("type", type_str, f); + if (format) { + encode_json("format", format, f); + } + if (index) { + encode_json("index", index.value(), f); + } + } +}; + +template +struct es_type : public T { + es_type(T t) : T(t) {} + es_type& set_format(const char *f) { + T::format = f; + return *this; + } + + es_type& set_analyzed(bool a) { + T::analyzed = a; + return *this; + } +}; + +template +struct es_index_mappings { + ESVersion es_version; + ESType string_type {ESType::String}; + + es_index_mappings(ESVersion esv):es_version(esv) { + } + + es_type est(ESType t) const { + return es_type(t); + } + + void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const { + f->open_object_section(section); + ::encode_json("type", "nested", f); + f->open_object_section("properties"); + encode_json("name", est(string_type), f); + encode_json("value", est(type).set_format(format), f); + f->close_section(); // entry + f->close_section(); // custom-string + } + + void dump(Formatter *f) const { + if (es_version <= ES_V7) + f->open_object_section("object"); + f->open_object_section("properties"); + encode_json("bucket", est(string_type), f); + encode_json("name", est(string_type), f); + encode_json("instance", est(string_type), f); + encode_json("versioned_epoch", 
est(ESType::Long), f); + f->open_object_section("meta"); + f->open_object_section("properties"); + encode_json("cache_control", est(string_type), f); + encode_json("content_disposition", est(string_type), f); + encode_json("content_encoding", est(string_type), f); + encode_json("content_language", est(string_type), f); + encode_json("content_type", est(string_type), f); + encode_json("storage_class", est(string_type), f); + encode_json("etag", est(string_type), f); + encode_json("expires", est(string_type), f); + encode_json("mtime", est(ESType::Date) + .set_format("strict_date_optional_time||epoch_millis"), f); + encode_json("size", est(ESType::Long), f); + dump_custom("custom-string", string_type, nullptr, f); + dump_custom("custom-int", ESType::Long, nullptr, f); + dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f); + f->close_section(); // properties + f->close_section(); // meta + f->close_section(); // properties + + if (es_version <= ES_V7) + f->close_section(); // object + } +}; + +struct es_index_settings { + uint32_t num_replicas; + uint32_t num_shards; + + es_index_settings(uint32_t _replicas, uint32_t _shards) : num_replicas(_replicas), num_shards(_shards) {} + + void dump(Formatter *f) const { + encode_json("number_of_replicas", num_replicas, f); + encode_json("number_of_shards", num_shards, f); + } +}; + +struct es_index_config_base { + virtual ~es_index_config_base() {} + virtual void dump(Formatter *f) const = 0; +}; + +template +struct es_index_config : public es_index_config_base { + es_index_settings settings; + es_index_mappings mappings; + + es_index_config(es_index_settings& _s, ESVersion esv) : settings(_s), mappings(esv) { + } + + void dump(Formatter *f) const { + encode_json("settings", settings, f); + encode_json("mappings", mappings, f); + } +}; + +static bool is_sys_attr(const std::string& attr_name){ + static constexpr std::initializer_list rgw_sys_attrs = + {RGW_ATTR_PG_VER, + RGW_ATTR_SOURCE_ZONE, + 
RGW_ATTR_ID_TAG, + RGW_ATTR_TEMPURL_KEY1, + RGW_ATTR_TEMPURL_KEY2, + RGW_ATTR_UNIX1, + RGW_ATTR_UNIX_KEY1 + }; + + return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end(); +} + +static size_t attr_len(const bufferlist& val) +{ + size_t len = val.length(); + if (len && val[len - 1] == '\0') { + --len; + } + + return len; +} + +struct es_obj_metadata { + const DoutPrefixProvider *dpp; + CephContext *cct; + ElasticConfigRef es_conf; + RGWBucketInfo bucket_info; + rgw_obj_key key; + ceph::real_time mtime; + uint64_t size; + map attrs; + uint64_t versioned_epoch; + + es_obj_metadata(CephContext *_cct, ElasticConfigRef _es_conf, const RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, ceph::real_time& _mtime, uint64_t _size, + map& _attrs, uint64_t _versioned_epoch) : cct(_cct), es_conf(_es_conf), bucket_info(_bucket_info), key(_key), + mtime(_mtime), size(_size), attrs(std::move(_attrs)), versioned_epoch(_versioned_epoch) {} + + void dump(Formatter *f) const { + map out_attrs; + map custom_meta; + RGWAccessControlPolicy policy; + set permissions; + RGWObjTags obj_tags; + + for (auto i : attrs) { + const string& attr_name = i.first; + bufferlist& val = i.second; + + if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) { + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) { + custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1), + string(val.c_str(), attr_len(val))); + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) { + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) { + // skip versioned object olh info + continue; + } + + if (attr_name == RGW_ATTR_ACL) { + try { + auto i = val.cbegin(); + decode(policy, i); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl; + continue; + } + + const RGWAccessControlList& 
acl = policy.get_acl(); + + permissions.insert(policy.get_owner().get_id().to_str()); + for (auto acliter : acl.get_grant_map()) { + const ACLGrant& grant = acliter.second; + if (grant.get_type().get_type() == ACL_TYPE_CANON_USER && + ((uint32_t)grant.get_permission().get_permissions() & RGW_PERM_READ) != 0) { + rgw_user user; + if (grant.get_id(user)) { + permissions.insert(user.to_str()); + } + } + } + } else if (attr_name == RGW_ATTR_TAGS) { + try { + auto tags_bl = val.cbegin(); + decode(obj_tags, tags_bl); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode obj tags for " + << bucket_info.bucket << "/" << key << dendl; + continue; + } + } else if (attr_name == RGW_ATTR_COMPRESSION) { + RGWCompressionInfo cs_info; + try { + auto vals_bl = val.cbegin(); + decode(cs_info, vals_bl); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode compression attr for " + << bucket_info.bucket << "/" << key << dendl; + continue; + } + out_attrs.emplace("compression",std::move(cs_info.compression_type)); + } else { + if (!is_sys_attr(attr_name)) { + out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1), + std::string(val.c_str(), attr_len(val))); + } + } + } + ::encode_json("bucket", bucket_info.bucket.name, f); + ::encode_json("name", key.name, f); + string instance = key.instance; + if (instance.empty()) + instance = "null"; + ::encode_json("instance", instance, f); + ::encode_json("versioned_epoch", versioned_epoch, f); + ::encode_json("owner", policy.get_owner(), f); + ::encode_json("permissions", permissions, f); + f->open_object_section("meta"); + ::encode_json("size", size, f); + + string mtime_str; + rgw_to_iso8601(mtime, &mtime_str); + ::encode_json("mtime", mtime_str, f); + for (auto i : out_attrs) { + ::encode_json(i.first.c_str(), i.second, f); + } + map custom_str; + map custom_int; + map custom_date; + + for (auto i : custom_meta) { + auto config = bucket_info.mdsearch_config.find(i.first); + if 
(config == bucket_info.mdsearch_config.end()) { + if (!es_conf->explicit_custom_meta) { + /* default custom meta is of type string */ + custom_str[i.first] = i.second; + } else { + ldpp_dout(dpp, 20) << "custom meta entry key=" << i.first << " not found in bucket mdsearch config: " << bucket_info.mdsearch_config << dendl; + } + continue; + } + switch (config->second) { + case ESEntityTypeMap::ES_ENTITY_DATE: + custom_date[i.first] = i.second; + break; + case ESEntityTypeMap::ES_ENTITY_INT: + custom_int[i.first] = i.second; + break; + default: + custom_str[i.first] = i.second; + } + } + + if (!custom_str.empty()) { + f->open_array_section("custom-string"); + for (auto i : custom_str) { + f->open_object_section("entity"); + ::encode_json("name", i.first.c_str(), f); + ::encode_json("value", i.second, f); + f->close_section(); + } + f->close_section(); + } + if (!custom_int.empty()) { + f->open_array_section("custom-int"); + for (auto i : custom_int) { + f->open_object_section("entity"); + ::encode_json("name", i.first.c_str(), f); + ::encode_json("value", i.second, f); + f->close_section(); + } + f->close_section(); + } + if (!custom_date.empty()) { + f->open_array_section("custom-date"); + for (auto i : custom_date) { + /* + * try to exlicitly parse date field, otherwise elasticsearch could reject the whole doc, + * which will end up with failed sync + */ + real_time t; + int r = parse_time(i.second.c_str(), &t); + if (r < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): failed to parse time (" << i.second << "), skipping encoding of custom date attribute" << dendl; + continue; + } + + string time_str; + rgw_to_iso8601(t, &time_str); + + f->open_object_section("entity"); + ::encode_json("name", i.first.c_str(), f); + ::encode_json("value", time_str.c_str(), f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); // meta + const auto& m = obj_tags.get_tags(); + if (m.size() > 0){ + f->open_array_section("tagging"); + for (const auto &it : m) { + 
f->open_object_section("tag"); + ::encode_json("key", it.first, f); + ::encode_json("value",it.second, f); + f->close_section(); + } + f->close_section(); // tagging + } + } +}; + +class RGWElasticGetESInfoCBCR : public RGWCoroutine { +public: + RGWElasticGetESInfoCBCR(RGWDataSyncCtx *_sc, + ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + conf(_conf) {} + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch info for zone: " << sc->source_zone << dendl; + yield call(new RGWReadRESTResourceCR (sync_env->cct, + conf->conn.get(), + sync_env->http_manager, + "/", nullptr /*params*/, + &(conf->default_headers), + &(conf->es_info))); + if (retcode < 0) { + ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch failed: " << retcode << dendl; + return set_cr_error(retcode); + } + + ldpp_dout(dpp, 5) << conf->id << ": got elastic version=" << conf->es_info.get_version_str() << dendl; + return set_cr_done(); + } + return 0; + } +private: + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + ElasticConfigRef conf; +}; + +class RGWElasticPutIndexCBCR : public RGWCoroutine { +public: + RGWElasticPutIndexCBCR(RGWDataSyncCtx *_sc, + ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + conf(_conf) {} + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + ldpp_dout(dpp, 5) << conf->id << ": put elasticsearch index for zone: " << sc->source_zone << dendl; + + yield { + string path = conf->get_index_path(); + es_index_settings settings(conf->num_replicas, conf->num_shards); + std::unique_ptr index_conf; + + if (conf->es_info.version >= ES_V5) { + ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version >= 5" << dendl; + index_conf.reset(new es_index_config(settings, conf->es_info.version)); + } else { + ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version < 5" << dendl; + index_conf.reset(new es_index_config(settings, 
conf->es_info.version)); + } + call(new RGWPutRESTResourceCR (sc->cct, + conf->conn.get(), + sync_env->http_manager, + path, nullptr /*params*/, + &(conf->default_headers), + *index_conf, nullptr, &err_response)); + } + if (retcode < 0) { + + if (err_response.error.type != "index_already_exists_exception" && + err_response.error.type != "resource_already_exists_exception") { + ldpp_dout(dpp, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl; + return set_cr_error(retcode); + } + + ldpp_dout(dpp, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl; + } + return set_cr_done(); + } + return 0; + } + +private: + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + ElasticConfigRef conf; + + struct _err_response { + struct err_reason { + vector root_cause; + string type; + string reason; + string index; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("root_cause", root_cause, obj); + JSONDecoder::decode_json("type", type, obj); + JSONDecoder::decode_json("reason", reason, obj); + JSONDecoder::decode_json("index", index, obj); + } + } error; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("error", error, obj); + } + } err_response; +}; + +class RGWElasticInitConfigCBCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + ElasticConfigRef conf; + +public: + RGWElasticInitConfigCBCR(RGWDataSyncCtx *_sc, + ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), + sc(_sc), sync_env(_sc->env), + conf(_conf) {} + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + + yield call(new RGWElasticGetESInfoCBCR(sc, conf)); + + if (retcode < 0) { + return set_cr_error(retcode); + } + + yield call(new RGWElasticPutIndexCBCR(sc, conf)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } + +}; + +class RGWElasticHandleRemoteObjCBCR : 
public RGWStatRemoteObjCBCR { + rgw_bucket_sync_pipe sync_pipe; + ElasticConfigRef conf; + uint64_t versioned_epoch; +public: + RGWElasticHandleRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, + ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key), + sync_pipe(_sync_pipe), conf(_conf), + versioned_epoch(_versioned_epoch) {} + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + ldpp_dout(dpp, 10) << ": stat of remote obj: z=" << sc->source_zone + << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key + << " size=" << size << " mtime=" << mtime << dendl; + + yield { + string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key); + es_obj_metadata doc(sync_env->cct, conf, sync_pipe.dest_bucket_info, key, mtime, size, attrs, versioned_epoch); + + call(new RGWPutRESTResourceCR(sync_env->cct, conf->conn.get(), + sync_env->http_manager, + path, nullptr /* params */, + &(conf->default_headers), + doc, nullptr /* result */)); + + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +class RGWElasticHandleRemoteObjCR : public RGWCallStatRemoteObjCR { + rgw_bucket_sync_pipe sync_pipe; + ElasticConfigRef conf; + uint64_t versioned_epoch; +public: + RGWElasticHandleRemoteObjCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, + ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key), + sync_pipe(_sync_pipe), + conf(_conf), versioned_epoch(_versioned_epoch) { + } + + ~RGWElasticHandleRemoteObjCR() override {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWElasticHandleRemoteObjCBCR(sc, sync_pipe, key, conf, versioned_epoch); + } +}; + +class RGWElasticRemoveRemoteObjCBCR : public RGWCoroutine { + RGWDataSyncCtx *sc; + RGWDataSyncEnv *sync_env; + rgw_bucket_sync_pipe 
sync_pipe; + rgw_obj_key key; + ceph::real_time mtime; + ElasticConfigRef conf; +public: + RGWElasticRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime, + ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), + sync_pipe(_sync_pipe), key(_key), + mtime(_mtime), conf(_conf) {} + int operate(const DoutPrefixProvider *dpp) override { + reenter(this) { + ldpp_dout(dpp, 10) << ": remove remote obj: z=" << sc->source_zone + << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl; + yield { + string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key); + + call(new RGWDeleteRESTResourceCR(sync_env->cct, conf->conn.get(), + sync_env->http_manager, + path, nullptr /* params */)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } + +}; + +class RGWElasticDataSyncModule : public RGWDataSyncModule { + ElasticConfigRef conf; +public: + RGWElasticDataSyncModule(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) : conf(std::make_shared()) { + conf->init(cct, config); + } + ~RGWElasticDataSyncModule() override {} + + void init(RGWDataSyncCtx *sc, uint64_t instance_id) override { + conf->init_instance(sc->env->svc->zone->get_realm(), instance_id); + } + + RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override { + ldpp_dout(dpp, 5) << conf->id << ": init" << dendl; + return new RGWElasticInitConfigCBCR(sc, conf); + } + + RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override { + ldpp_dout(dpp, 5) << conf->id << ": start_sync" << dendl; + // try to get elastic search version + return new RGWElasticGetESInfoCBCR(sc, conf); + } + + RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional versioned_epoch, const 
rgw_zone_set_entry& source_trace_entry, rgw_zone_set *zones_trace) override { + ldpp_dout(dpp, 10) << conf->id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) { + ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl; + return nullptr; + } + return new RGWElasticHandleRemoteObjCR(sc, sync_pipe, key, conf, versioned_epoch.value_or(0)); + } + RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + /* versioned and versioned epoch params are useless in the elasticsearch backend case */ + ldpp_dout(dpp, 10) << conf->id << ": rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) { + ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl; + return nullptr; + } + return new RGWElasticRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, conf); + } + RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldpp_dout(dpp, 10) << conf->id << ": create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + ldpp_dout(dpp, 10) << conf->id << ": skipping operation (not handled)" << dendl; + return NULL; + } + RGWRESTConn *get_rest_conn() { + return conf->conn.get(); + } + + string 
get_index_path() { + return conf->get_index_path(); + } + + map& get_request_headers() { + return conf->get_request_headers(); + } +}; + +RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) +{ + data_handler = std::unique_ptr(new RGWElasticDataSyncModule(dpp, cct, config)); +} + +RGWDataSyncModule *RGWElasticSyncModuleInstance::get_data_handler() +{ + return data_handler.get(); +} + +RGWRESTConn *RGWElasticSyncModuleInstance::get_rest_conn() +{ + return data_handler->get_rest_conn(); +} + +string RGWElasticSyncModuleInstance::get_index_path() { + return data_handler->get_index_path(); +} + +map& RGWElasticSyncModuleInstance::get_request_headers() { + return data_handler->get_request_headers(); +} + +RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) { + if (dialect != RGW_REST_S3) { + return orig; + } + delete orig; + return new RGWRESTMgr_MDSearch_S3(); +} + +int RGWElasticSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + string endpoint = config["endpoint"]; + instance->reset(new RGWElasticSyncModuleInstance(dpp, cct, config)); + return 0; +} + diff --git a/src/rgw/driver/rados/rgw_sync_module_es.h b/src/rgw/driver/rados/rgw_sync_module_es.h new file mode 100644 index 000000000..c8c9fcc43 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_es.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_sync_module.h" + +enum class ESType { + /* string datatypes */ + String, /* Deprecated Since 5.X+ */ + Text, + Keyword, + + /* Numeric Types */ + Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float, + + /* Date Type */ + Date, + + /* Boolean */ + Boolean, + + /* Binary; Must Be Base64 Encoded */ + Binary, + + /* Range Types */ + 
Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range, + + /* A Few Specialized Types */ + Geo_Point, + Ip +}; + + +class RGWElasticSyncModule : public RGWSyncModule { +public: + RGWElasticSyncModule() {} + bool supports_data_export() override { + return false; + } + int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +class RGWElasticDataSyncModule; +class RGWRESTConn; + +class RGWElasticSyncModuleInstance : public RGWSyncModuleInstance { + std::unique_ptr data_handler; +public: + RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config); + RGWDataSyncModule *get_data_handler() override; + RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override; + RGWRESTConn *get_rest_conn(); + std::string get_index_path(); + std::map& get_request_headers(); + bool supports_user_writes() override { + return true; + } +}; diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.cc b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc new file mode 100644 index 000000000..db9d48adb --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc @@ -0,0 +1,428 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_sync_module_es.h" +#include "rgw_sync_module_es_rest.h" +#include "rgw_es_query.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_sal_rados.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +struct es_index_obj_response { + string bucket; + rgw_obj_key key; + uint64_t versioned_epoch{0}; + ACLOwner owner; + set read_permissions; + + struct { + uint64_t size{0}; + ceph::real_time mtime; + string etag; + string content_type; + string storage_class; + map custom_str; + map custom_int; + map custom_date; + + template + struct _custom_entry { + 
string name; + T value; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("value", value, obj); + } + }; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("size", size, obj); + string mtime_str; + JSONDecoder::decode_json("mtime", mtime_str, obj); + parse_time(mtime_str.c_str(), &mtime); + JSONDecoder::decode_json("etag", etag, obj); + JSONDecoder::decode_json("content_type", content_type, obj); + JSONDecoder::decode_json("storage_class", storage_class, obj); + list<_custom_entry > str_entries; + JSONDecoder::decode_json("custom-string", str_entries, obj); + for (auto& e : str_entries) { + custom_str[e.name] = e.value; + } + list<_custom_entry > int_entries; + JSONDecoder::decode_json("custom-int", int_entries, obj); + for (auto& e : int_entries) { + custom_int[e.name] = e.value; + } + list<_custom_entry > date_entries; + JSONDecoder::decode_json("custom-date", date_entries, obj); + for (auto& e : date_entries) { + custom_date[e.name] = e.value; + } + } + } meta; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket", bucket, obj); + JSONDecoder::decode_json("name", key.name, obj); + JSONDecoder::decode_json("instance", key.instance, obj); + JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj); + JSONDecoder::decode_json("permissions", read_permissions, obj); + JSONDecoder::decode_json("owner", owner, obj); + JSONDecoder::decode_json("meta", meta, obj); + } +}; + +struct es_search_response { + uint32_t took; + bool timed_out; + struct { + uint32_t total; + uint32_t successful; + uint32_t failed; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("total", total, obj); + JSONDecoder::decode_json("successful", successful, obj); + JSONDecoder::decode_json("failed", failed, obj); + } + } shards; + struct obj_hit { + string index; + string type; + string id; + // double score + es_index_obj_response source; + void decode_json(JSONObj *obj) { + 
JSONDecoder::decode_json("_index", index, obj); + JSONDecoder::decode_json("_type", type, obj); + JSONDecoder::decode_json("_id", id, obj); + JSONDecoder::decode_json("_source", source, obj); + } + }; + struct { + uint32_t total; + // double max_score; + list hits; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("total", total, obj); + // JSONDecoder::decode_json("max_score", max_score, obj); + JSONDecoder::decode_json("hits", hits, obj); + } + } hits; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("took", took, obj); + JSONDecoder::decode_json("timed_out", timed_out, obj); + JSONDecoder::decode_json("_shards", shards, obj); + JSONDecoder::decode_json("hits", hits, obj); + } +}; + +class RGWMetadataSearchOp : public RGWOp { + RGWSyncModuleInstanceRef sync_module_ref; + RGWElasticSyncModuleInstance *es_module; +protected: + string expression; + string custom_prefix; +#define MAX_KEYS_DEFAULT 100 + uint64_t max_keys{MAX_KEYS_DEFAULT}; + string marker_str; + uint64_t marker{0}; + string next_marker; + bool is_truncated{false}; + string err; + + es_search_response response; + +public: + RGWMetadataSearchOp(const RGWSyncModuleInstanceRef& sync_module) : sync_module_ref(sync_module) { + es_module = static_cast(sync_module_ref.get()); + } + + int verify_permission(optional_yield) override { + return 0; + } + virtual int get_params() = 0; + void pre_exec() override; + void execute(optional_yield y) override; + + const char* name() const override { return "metadata_search"; } + virtual RGWOpType get_type() override { return RGW_OP_METADATA_SEARCH; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +void RGWMetadataSearchOp::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWMetadataSearchOp::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) + return; + + list > conds; + + if (!s->user->get_info().system) { + conds.push_back(make_pair("permissions", s->user->get_id().to_str())); + } + + if 
(!s->bucket_name.empty()) { + conds.push_back(make_pair("bucket", s->bucket_name)); + } + + ESQueryCompiler es_query(expression, &conds, custom_prefix); + + static map aliases = { + { "bucket", "bucket" }, /* forces lowercase */ + { "name", "name" }, + { "key", "name" }, + { "instance", "instance" }, + { "etag", "meta.etag" }, + { "size", "meta.size" }, + { "mtime", "meta.mtime" }, + { "lastmodified", "meta.mtime" }, + { "last_modified", "meta.mtime" }, + { "contenttype", "meta.content_type" }, + { "content_type", "meta.content_type" }, + { "storageclass", "meta.storage_class" }, + { "storage_class", "meta.storage_class" }, + }; + es_query.set_field_aliases(&aliases); + + static map generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR}, + {"name", ESEntityTypeMap::ES_ENTITY_STR}, + {"instance", ESEntityTypeMap::ES_ENTITY_STR}, + {"permissions", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE}, + {"meta.size", ESEntityTypeMap::ES_ENTITY_INT}, + {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} }; + ESEntityTypeMap gm(generic_map); + es_query.set_generic_type_map(&gm); + + static set restricted_fields = { {"permissions"} }; + es_query.set_restricted_fields(&restricted_fields); + + map custom_map; + for (auto& i : s->bucket->get_info().mdsearch_config) { + custom_map[i.first] = (ESEntityTypeMap::EntityType)i.second; + } + + ESEntityTypeMap em(custom_map); + es_query.set_custom_type_map(&em); + + bool valid = es_query.compile(&err); + if (!valid) { + ldpp_dout(this, 10) << "invalid query, failed generating request json" << dendl; + op_ret = -EINVAL; + return; + } + + JSONFormatter f; + encode_json("root", es_query, &f); + + RGWRESTConn *conn = es_module->get_rest_conn(); + + bufferlist in; + bufferlist out; + + stringstream ss; + + f.flush(ss); + in.append(ss.str()); + + string resource = es_module->get_index_path() + 
"/_search"; + param_vec_t params; + static constexpr int BUFSIZE = 32; + char buf[BUFSIZE]; + snprintf(buf, sizeof(buf), "%lld", (long long)max_keys); + params.push_back(param_pair_t("size", buf)); + if (marker > 0) { + params.push_back(param_pair_t("from", marker_str.c_str())); + } + ldpp_dout(this, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl; + auto& extra_headers = es_module->get_request_headers(); + op_ret = conn->get_resource(s, resource, ¶ms, &extra_headers, + out, &in, nullptr, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl; + return; + } + + ldpp_dout(this, 20) << "response: " << string(out.c_str(), out.length()) << dendl; + + JSONParser jparser; + if (!jparser.parse(out.c_str(), out.length())) { + ldpp_dout(this, 0) << "ERROR: failed to parse elasticsearch response" << dendl; + op_ret = -EINVAL; + return; + } + + try { + decode_json_obj(response, &jparser); + } catch (const JSONDecoder::err& e) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON input: " << e.what() << dendl; + op_ret = -EINVAL; + return; + } + +} + +class RGWMetadataSearch_ObjStore_S3 : public RGWMetadataSearchOp { +public: + explicit RGWMetadataSearch_ObjStore_S3(const RGWSyncModuleInstanceRef& _sync_module) : RGWMetadataSearchOp(_sync_module) { + custom_prefix = "x-amz-meta-"; + } + + int get_params() override { + expression = s->info.args.get("query"); + bool exists; + string max_keys_str = s->info.args.get("max-keys", &exists); +#define MAX_KEYS_MAX 10000 + if (exists) { + string err; + max_keys = strict_strtoll(max_keys_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + if (max_keys > MAX_KEYS_MAX) { + max_keys = MAX_KEYS_MAX; + } + } + marker_str = s->info.args.get("marker", &exists); + if (exists) { + string err; + marker = strict_strtoll(marker_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + } + 
uint64_t nm = marker + max_keys; + static constexpr int BUFSIZE = 32; + char buf[BUFSIZE]; + snprintf(buf, sizeof(buf), "%lld", (long long)nm); + next_marker = buf; + return 0; + } + void send_response() override { + if (op_ret) { + s->err.message = err; + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + is_truncated = (response.hits.hits.size() >= max_keys); + + s->formatter->open_object_section("SearchMetadataResponse"); + s->formatter->dump_string("Marker", marker_str); + s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false")); + if (is_truncated) { + s->formatter->dump_string("NextMarker", next_marker); + } + if (s->format == RGWFormat::JSON) { + s->formatter->open_array_section("Objects"); + } + for (auto& i : response.hits.hits) { + s->formatter->open_object_section("Contents"); + es_index_obj_response& e = i.source; + s->formatter->dump_string("Bucket", e.bucket); + s->formatter->dump_string("Key", e.key.name); + string instance = (!e.key.instance.empty() ? 
e.key.instance : "null"); + s->formatter->dump_string("Instance", instance.c_str()); + s->formatter->dump_int("VersionedEpoch", e.versioned_epoch); + dump_time(s, "LastModified", e.meta.mtime); + s->formatter->dump_int("Size", e.meta.size); + s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str()); + s->formatter->dump_string("ContentType", e.meta.content_type.c_str()); + s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str()); + dump_owner(s, e.owner.get_id(), e.owner.get_display_name()); + s->formatter->open_array_section("CustomMetadata"); + for (auto& m : e.meta.custom_str) { + s->formatter->open_object_section("Entry"); + s->formatter->dump_string("Name", m.first.c_str()); + s->formatter->dump_string("Value", m.second); + s->formatter->close_section(); + } + for (auto& m : e.meta.custom_int) { + s->formatter->open_object_section("Entry"); + s->formatter->dump_string("Name", m.first.c_str()); + s->formatter->dump_int("Value", m.second); + s->formatter->close_section(); + } + for (auto& m : e.meta.custom_date) { + s->formatter->open_object_section("Entry"); + s->formatter->dump_string("Name", m.first.c_str()); + s->formatter->dump_string("Value", m.second); + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + s->formatter->close_section(); + }; + if (s->format == RGWFormat::JSON) { + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +class RGWHandler_REST_MDSearch_S3 : public RGWHandler_REST_S3 { +protected: + RGWOp *op_get() override { + if (s->info.args.exists("query")) { + return new RGWMetadataSearch_ObjStore_S3(driver->get_sync_module()); + } + if (!s->init_state.url_bucket.empty() && + s->info.args.exists("mdsearch")) { + return new RGWGetBucketMetaSearch_ObjStore_S3; + } + return nullptr; + } + RGWOp *op_head() override { + return nullptr; + } + RGWOp *op_post() override { + return nullptr; + } 
+public: + explicit RGWHandler_REST_MDSearch_S3(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {} + virtual ~RGWHandler_REST_MDSearch_S3() {} +}; + + +RGWHandler_REST* RGWRESTMgr_MDSearch_S3::get_handler(rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + int ret = + RGWHandler_REST_S3::init_from_header(driver, s, + RGWFormat::XML, true); + if (ret < 0) { + return nullptr; + } + + if (!s->object->empty()) { + return nullptr; + } + + RGWHandler_REST *handler = new RGWHandler_REST_MDSearch_S3(auth_registry); + + ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name() + << dendl; + return handler; +} + diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.h b/src/rgw/driver/rados/rgw_sync_module_es_rest.h new file mode 100644 index 000000000..b18271a69 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_es_rest.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest.h" + +class RGWElasticSyncModuleInstance; + +class RGWRESTMgr_MDSearch_S3 : public RGWRESTMgr { +public: + explicit RGWRESTMgr_MDSearch_S3() {} + + RGWHandler_REST *get_handler(rgw::sal::Driver* driver, + req_state* s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override; +}; diff --git a/src/rgw/driver/rados/rgw_sync_module_log.cc b/src/rgw/driver/rados/rgw_sync_module_log.cc new file mode 100644 index 000000000..9666ecc4c --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_log.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_log.h" + +#define dout_subsys 
ceph_subsys_rgw + +using namespace std; + +class RGWLogStatRemoteObjCBCR : public RGWStatRemoteObjCBCR { +public: + RGWLogStatRemoteObjCBCR(RGWDataSyncCtx *_sc, + rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWStatRemoteObjCBCR(_sc, _src_bucket, _key) {} + int operate(const DoutPrefixProvider *dpp) override { + ldpp_dout(dpp, 0) << "SYNC_LOG: stat of remote obj: z=" << sc->source_zone + << " b=" << src_bucket << " k=" << key << " size=" << size << " mtime=" << mtime + << " attrs=" << attrs << dendl; + return set_cr_done(); + } + +}; + +class RGWLogStatRemoteObjCR : public RGWCallStatRemoteObjCR { +public: + RGWLogStatRemoteObjCR(RGWDataSyncCtx *_sc, + rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCallStatRemoteObjCR(_sc, _src_bucket, _key) { + } + + ~RGWLogStatRemoteObjCR() override {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWLogStatRemoteObjCBCR(sc, src_bucket, key); + } +}; + +class RGWLogDataSyncModule : public RGWDataSyncModule { + string prefix; +public: + explicit RGWLogDataSyncModule(const string& _prefix) : prefix(_prefix) {} + + RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional versioned_epoch, const rgw_zone_set_entry& source_trace_entry, rgw_zone_set *zones_trace) override { + ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + return new RGWLogStatRemoteObjCR(sc, sync_pipe.info.source_bs.bucket, key); + } + RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " 
versioned_epoch=" << versioned_epoch << dendl; + return NULL; + } + RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return NULL; + } +}; + +class RGWLogSyncModuleInstance : public RGWSyncModuleInstance { + RGWLogDataSyncModule data_handler; +public: + explicit RGWLogSyncModuleInstance(const string& prefix) : data_handler(prefix) {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } +}; + +int RGWLogSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + string prefix = config["prefix"]; + instance->reset(new RGWLogSyncModuleInstance(prefix)); + return 0; +} + diff --git a/src/rgw/driver/rados/rgw_sync_module_log.h b/src/rgw/driver/rados/rgw_sync_module_log.h new file mode 100644 index 000000000..ab475959d --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_module_log.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_sync_module.h" + +class RGWLogSyncModule : public RGWSyncModule { +public: + RGWLogSyncModule() {} + bool supports_data_export() override { + return false; + } + int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; diff --git a/src/rgw/driver/rados/rgw_sync_trace.cc b/src/rgw/driver/rados/rgw_sync_trace.cc new file mode 100644 index 000000000..b34683593 --- /dev/null +++ 
b/src/rgw/driver/rados/rgw_sync_trace.cc @@ -0,0 +1,290 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#ifndef CEPH_RGW_SYNC_TRACE_H +#define CEPH_RGW_SYNC_TRACE_H + +#include + +#include "common/debug.h" +#include "common/ceph_json.h" + +#include "rgw_sync_trace.h" +#include "rgw_rados.h" +#include "rgw_worker.h" + +#define dout_context g_ceph_context + +static constexpr auto dout_subsys = ceph_subsys_rgw; + +using namespace std; + + +RGWSyncTraceNode::RGWSyncTraceNode(CephContext *_cct, uint64_t _handle, + const RGWSyncTraceNodeRef& _parent, + const string& _type, const string& _id) : cct(_cct), + parent(_parent), + type(_type), + id(_id), + handle(_handle), + history(cct->_conf->rgw_sync_trace_per_node_log_size) +{ + if (parent.get()) { + prefix = parent->get_prefix(); + } + + if (!type.empty()) { + prefix += type; + if (!id.empty()) { + prefix += "[" + id + "]"; + } + prefix += ":"; + } +} + +void RGWSyncTraceNode::log(int level, const string& s) +{ + status = s; + history.push_back(status); + /* dump output on either rgw_sync, or rgw -- but only once */ + if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_sync, level)) { + lsubdout(cct, rgw_sync, + ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl; + } else { + lsubdout(cct, rgw, + ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl; + } +} + + +class RGWSyncTraceServiceMapThread : public RGWRadosThread { + RGWRados *store; + RGWSyncTraceManager *manager; + + uint64_t interval_msec() override { + return cct->_conf->rgw_sync_trace_servicemap_update_interval * 1000; + } +public: + RGWSyncTraceServiceMapThread(RGWRados *_store, RGWSyncTraceManager *_manager) + : RGWRadosThread(_store, "sync-trace"), store(_store), manager(_manager) {} + + int process(const DoutPrefixProvider *dpp) override; +}; + +int RGWSyncTraceServiceMapThread::process(const DoutPrefixProvider *dpp) +{ + map status; + status["current_sync"] 
= manager->get_active_names(); + int ret = store->update_service_map(dpp, std::move(status)); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: update_service_map() returned ret=" << ret << dendl; + } + return 0; +} + +RGWSyncTraceNodeRef RGWSyncTraceManager::add_node(const RGWSyncTraceNodeRef& parent, + const std::string& type, + const std::string& id) +{ + shunique_lock wl(lock, ceph::acquire_unique); + auto handle = alloc_handle(); + RGWSyncTraceNodeRef& ref = nodes[handle]; + ref.reset(new RGWSyncTraceNode(cct, handle, parent, type, id)); + // return a separate shared_ptr that calls finish_node() on the node instead of + // deleting it. the lambda capture holds a reference to the original 'ref' + auto deleter = [ref, this] (RGWSyncTraceNode *node) { finish_node(node); }; + return {ref.get(), deleter}; +} + +bool RGWSyncTraceNode::match(const string& search_term, bool search_history) +{ + try { + std::regex expr(search_term); + std::smatch m; + + if (regex_search(prefix, m, expr)) { + return true; + } + if (regex_search(status, m,expr)) { + return true; + } + if (!search_history) { + return false; + } + + for (const auto& h : history) { // by reference: no per-entry string copy + if (regex_search(h, m, expr)) { + return true; + } + } + } catch (const std::regex_error& e) { + ldout(cct, 5) << "NOTICE: sync trace: bad expression: bad regex search term" << dendl; + } + + return false; +} + +void RGWSyncTraceManager::init(RGWRados *store) +{ + service_map_thread = new RGWSyncTraceServiceMapThread(store, this); + service_map_thread->start(); +} + +RGWSyncTraceManager::~RGWSyncTraceManager() +{ + cct->get_admin_socket()->unregister_commands(this); + if (service_map_thread) service_map_thread->stop(); // only init() creates the thread + delete service_map_thread; + + nodes.clear(); +} + +int RGWSyncTraceManager::hook_to_admin_command() +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + + admin_commands = { { "sync trace show name=search,type=CephString,req=false", "sync trace show [filter_str]: show current multisite tracing information" }, + { "sync trace history 
name=search,type=CephString,req=false", "sync trace history [filter_str]: show history of multisite tracing information" }, + { "sync trace active name=search,type=CephString,req=false", "show active multisite sync entities information" }, + { "sync trace active_short name=search,type=CephString,req=false", "show active multisite sync entities entries" } }; + for (const auto& cmd : admin_commands) { // cmd[0] = command signature, cmd[1] = help text + int r = admin_socket->register_command(cmd[0], this, + cmd[1]); + if (r < 0) { + lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl; + return r; + } + } + return 0; +} + +static void dump_node(RGWSyncTraceNode *entry, bool show_history, Formatter *f) +{ + f->open_object_section("entry"); + ::encode_json("status", entry->to_str(), f); + if (show_history) { + f->open_array_section("history"); + for (const auto& h : entry->get_history()) { + ::encode_json("entry", h, f); + } + f->close_section(); + } + f->close_section(); +} + +string RGWSyncTraceManager::get_active_names() +{ + shunique_lock rl(lock, ceph::acquire_shared); + + stringstream ss; + JSONFormatter f; + + f.open_array_section("result"); + for (const auto& n : nodes) { // by reference: no shared_ptr copy while the lock is held + auto& entry = n.second; + + if (!entry->test_flags(RGW_SNS_FLAG_ACTIVE)) { + continue; + } + const string& name = entry->get_resource_name(); + if (!name.empty()) { + ::encode_json("entry", name, &f); + } + f.flush(ss); + } + f.close_section(); + f.flush(ss); + + return ss.str(); +} + +int RGWSyncTraceManager::call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) { + + bool show_history = (command == "sync trace history"); + bool show_short = (command == "sync trace active_short"); + bool show_active = (command == "sync trace active") || show_short; + + string search; + + auto si = cmdmap.find("search"); + if (si != cmdmap.end()) { + search = boost::get(si->second); + } + + shunique_lock rl(lock, ceph::acquire_shared); + + 
f->open_object_section("result"); + f->open_array_section("running"); + for (const auto& n : nodes) { // by reference: no shared_ptr copy while the lock is held + auto& entry = n.second; + + if (!search.empty() && !entry->match(search, show_history)) { + continue; + } + if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) { + continue; + } + if (show_short) { + const string& name = entry->get_resource_name(); + if (!name.empty()) { + ::encode_json("entry", name, f); + } + } else { + dump_node(entry.get(), show_history, f); + } + f->flush(out); + } + f->close_section(); + + f->open_array_section("complete"); + for (auto& entry : complete_nodes) { + if (!search.empty() && !entry->match(search, show_history)) { + continue; + } + if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) { + continue; + } + dump_node(entry.get(), show_history, f); + f->flush(out); + } + f->close_section(); + + f->close_section(); + + return 0; +} + +void RGWSyncTraceManager::finish_node(RGWSyncTraceNode *node) +{ + RGWSyncTraceNodeRef old_node; + + { + shunique_lock wl(lock, ceph::acquire_unique); + if (!node) { + return; + } + auto iter = nodes.find(node->handle); + if (iter == nodes.end()) { + /* not found, already finished */ + return; + } + + if (complete_nodes.full()) { + /* take a reference to the entry that is going to be evicted, + * can't let it get evicted under lock held, otherwise + * it's a deadlock as it will call finish_node() + */ + old_node = complete_nodes.front(); + } + + complete_nodes.push_back(iter->second); + nodes.erase(iter); + } +} + +#endif + diff --git a/src/rgw/driver/rados/rgw_sync_trace.h b/src/rgw/driver/rados/rgw_sync_trace.h new file mode 100644 index 000000000..1fcc8bed8 --- /dev/null +++ b/src/rgw/driver/rados/rgw_sync_trace.h @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "common/ceph_mutex.h" +#include "common/shunique_lock.h" +#include "common/admin_socket.h" + +#include +#include 
+#include +#include +#include + +#define SSTR(o) ({ \ + std::stringstream ss; \ + ss << o; \ + ss.str(); \ +}) + +#define RGW_SNS_FLAG_ACTIVE 1 +#define RGW_SNS_FLAG_ERROR 2 + +class RGWRados; +class RGWSyncTraceManager; +class RGWSyncTraceNode; +class RGWSyncTraceServiceMapThread; + +using RGWSyncTraceNodeRef = std::shared_ptr; + +class RGWSyncTraceNode final { + friend class RGWSyncTraceManager; + + CephContext *cct; + RGWSyncTraceNodeRef parent; + + uint16_t state{0}; + std::string status; + + ceph::mutex lock = ceph::make_mutex("RGWSyncTraceNode::lock"); + + std::string type; + std::string id; + + std::string prefix; + + std::string resource_name; + + uint64_t handle; + + boost::circular_buffer history; + + // private constructor, create with RGWSyncTraceManager::add_node() + RGWSyncTraceNode(CephContext *_cct, uint64_t _handle, + const RGWSyncTraceNodeRef& _parent, + const std::string& _type, const std::string& _id); + + public: + void set_resource_name(const std::string& s) { + resource_name = s; + } + + const std::string& get_resource_name() { + return resource_name; + } + + void set_flag(uint16_t s) { + state |= s; + } + void unset_flag(uint16_t s) { + state &= ~s; + } + bool test_flags(uint16_t f) { + return (state & f) == f; + } + void log(int level, const std::string& s); + + std::string to_str() { + return prefix + " " + status; + } + + const std::string& get_prefix() { + return prefix; + } + + std::ostream& operator<<(std::ostream& os) { + os << to_str(); + return os; + } + + boost::circular_buffer& get_history() { + return history; + } + + bool match(const std::string& search_term, bool search_history); +}; + +class RGWSyncTraceManager : public AdminSocketHook { + friend class RGWSyncTraceNode; + + mutable std::shared_timed_mutex lock; + using shunique_lock = ceph::shunique_lock; + + CephContext *cct; + RGWSyncTraceServiceMapThread *service_map_thread{nullptr}; + + std::map nodes; + boost::circular_buffer complete_nodes; + + std::atomic count = { 0 }; 
+ + std::list > admin_commands; + + uint64_t alloc_handle() { + return ++count; + } + void finish_node(RGWSyncTraceNode *node); + +public: + RGWSyncTraceManager(CephContext *_cct, int max_lru) : cct(_cct), complete_nodes(max_lru) {} + ~RGWSyncTraceManager(); + + void init(RGWRados *store); + + const RGWSyncTraceNodeRef root_node; + + RGWSyncTraceNodeRef add_node(const RGWSyncTraceNodeRef& parent, + const std::string& type, + const std::string& id = ""); + + int hook_to_admin_command(); + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) override; + std::string get_active_names(); +}; diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc new file mode 100644 index 000000000..66651da5c --- /dev/null +++ b/src/rgw/driver/rados/rgw_tools.cc @@ -0,0 +1,437 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" +#include "librados/librados_asio.h" + +#include "include/stringify.h" + +#include "rgw_tools.h" +#include "rgw_acl_s3.h" +#include "rgw_aio_throttle.h" +#include "rgw_compression.h" +#include "common/BackTrace.h" + +#define dout_subsys ceph_subsys_rgw + +#define READ_CHUNK_LEN (512 * 1024) + +using namespace std; + +int rgw_init_ioctx(const DoutPrefixProvider *dpp, + librados::Rados *rados, const rgw_pool& pool, + librados::IoCtx& ioctx, bool create, + bool mostly_omap, + bool bulk) +{ + int r = rados->ioctx_create(pool.name.c_str(), ioctx); + if (r == -ENOENT && create) { + r = rados->pool_create(pool.name.c_str()); + if (r == -ERANGE) { + ldpp_dout(dpp, 0) + << __func__ + << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r) + << " (this can be due to a pool or placement group misconfiguration, e.g."
+ << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)" + << dendl; + } + if (r < 0 && r != -EEXIST) { + return r; + } + + r = rados->ioctx_create(pool.name.c_str(), ioctx); + if (r < 0) { + return r; + } + + r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false); + if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + if (mostly_omap) { + // set pg_autoscale_bias + bufferlist inbl; + float bias = g_conf().get_val("rgw_rados_pool_autoscale_bias"); + int r = rados->mon_command( + "{\"prefix\": \"osd pool set\", \"pool\": \"" + + pool.name + "\", \"var\": \"pg_autoscale_bias\", \"val\": \"" + + stringify(bias) + "\"}", + inbl, NULL, NULL); + if (r < 0) { + ldpp_dout(dpp, 10) << __func__ << " warning: failed to set pg_autoscale_bias on " + << pool.name << dendl; + } + // set recovery_priority (same "var"/"val" JSON shape as the other pool-set commands here) + int p = g_conf().get_val("rgw_rados_pool_recovery_priority"); + r = rados->mon_command( + "{\"prefix\": \"osd pool set\", \"pool\": \"" + + pool.name + "\", \"var\": \"recovery_priority\", \"val\": \"" + + stringify(p) + "\"}", + inbl, NULL, NULL); + if (r < 0) { + ldpp_dout(dpp, 10) << __func__ << " warning: failed to set recovery_priority on " + << pool.name << dendl; + } + } + if (bulk) { + // set bulk + bufferlist inbl; + int r = rados->mon_command( + "{\"prefix\": \"osd pool set\", \"pool\": \"" + + pool.name + "\", \"var\": \"bulk\", \"val\": \"true\"}", + inbl, NULL, NULL); + if (r < 0) { + ldpp_dout(dpp, 10) << __func__ << " warning: failed to set 'bulk' on " + << pool.name << dendl; + } + } + } else if (r < 0) { + return r; + } + if (!pool.ns.empty()) { + ioctx.set_namespace(pool.ns); + } + return 0; +} + +map* no_change_attrs() { + static map no_change; + return &no_change; +} + +int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj, + const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive, + RGWObjVersionTracker *objv_tracker, real_time set_mtime, optional_yield y, map *pattrs) +{ + map no_attrs; + if (!pattrs) { + 
pattrs = &no_attrs; + } + + rgw_raw_obj obj(pool, oid); + + auto sysobj = svc_sysobj->get_obj(obj); + int ret; + + if (pattrs != no_change_attrs()) { + ret = sysobj.wop() + .set_objv_tracker(objv_tracker) + .set_exclusive(exclusive) + .set_mtime(set_mtime) + .set_attrs(*pattrs) + .write(dpp, data, y); + } else { + ret = sysobj.wop() + .set_objv_tracker(objv_tracker) + .set_exclusive(exclusive) + .set_mtime(set_mtime) + .write_data(dpp, data, y); + } + + return ret; +} + +int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj, + const rgw_pool& pool, const std::string& key, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, optional_yield y, + std::map *pattrs) +{ + rgw_raw_obj obj(pool, key); + auto sysobj = svc_sysobj->get_obj(obj); + return sysobj.rop() + .set_attrs(pattrs) + .set_last_mod(pmtime) + .stat(y, dpp); +} + + +int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool, const string& key, bufferlist& bl, + RGWObjVersionTracker *objv_tracker, real_time *pmtime, optional_yield y, + const DoutPrefixProvider *dpp, map *pattrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version, bool raw_attrs) +{ + const rgw_raw_obj obj(pool, key); + auto sysobj = svc_sysobj->get_obj(obj); + auto rop = sysobj.rop(); + return rop.set_attrs(pattrs) + .set_last_mod(pmtime) + .set_objv_tracker(objv_tracker) + .set_raw_attrs(raw_attrs) + .set_cache_info(cache_info) + .set_refresh_version(refresh_version) + .read(dpp, &bl, y); +} + +int rgw_delete_system_obj(const DoutPrefixProvider *dpp, + RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const string& oid, + RGWObjVersionTracker *objv_tracker, optional_yield y) +{ + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + rgw_raw_obj obj(pool, oid); + return sysobj.wop() + .set_objv_tracker(objv_tracker) + .remove(dpp, y); +} + +int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectReadOperation 
*op, bufferlist* pbl, + optional_yield y, int flags) +{ + // given a yield_context, call async_operate() to yield the coroutine instead + // of blocking + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + boost::system::error_code ec; + auto bl = librados::async_operate( + context, ioctx, oid, op, flags, yield[ec]); + if (pbl) { + *pbl = std::move(bl); + } + return -ec.value(); + } + // work on asio threads should be asynchronous, so warn when they block + if (is_asio_thread) { + ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl; +#ifdef _BACKTRACE_LOGGING + ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl; +#endif + } + return ioctx.operate(oid, op, nullptr, flags); +} + +int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectWriteOperation *op, optional_yield y, + int flags) +{ + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + boost::system::error_code ec; + librados::async_operate(context, ioctx, oid, op, flags, yield[ec]); + return -ec.value(); + } + if (is_asio_thread) { + ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl; +#ifdef _BACKTRACE_LOGGING + ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl; +#endif + } + return ioctx.operate(oid, op, flags); +} + +int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl, + optional_yield y) +{ + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + boost::system::error_code ec; + auto reply = librados::async_notify(context, ioctx, oid, + bl, timeout_ms, yield[ec]); + if (pbl) { + *pbl = std::move(reply); + } + return -ec.value(); + } + if (is_asio_thread) { + ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl; +#ifdef 
_BACKTRACE_LOGGING + ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl; +#endif + } + return ioctx.notify2(oid, bl, timeout_ms, pbl); +} + +void rgw_filter_attrset(map& unfiltered_attrset, const string& check_prefix, + map *attrset) +{ + attrset->clear(); + map::iterator iter; + for (iter = unfiltered_attrset.lower_bound(check_prefix); + iter != unfiltered_attrset.end(); ++iter) { + if (!boost::algorithm::starts_with(iter->first, check_prefix)) + break; + (*attrset)[iter->first] = iter->second; + } +} + +RGWDataAccess::RGWDataAccess(rgw::sal::Driver* _driver) : driver(_driver) +{ +} + + +int RGWDataAccess::Bucket::finish_init() +{ + auto iter = attrs.find(RGW_ATTR_ACL); + if (iter == attrs.end()) { + return 0; + } + + bufferlist::const_iterator bliter = iter->second.begin(); + try { + policy.decode(bliter); + } catch (buffer::error& err) { + return -EIO; + } + + return 0; +} + +int RGWDataAccess::Bucket::init(const DoutPrefixProvider *dpp, optional_yield y) +{ + std::unique_ptr bucket; + int ret = sd->driver->get_bucket(dpp, nullptr, tenant, name, &bucket, y); + if (ret < 0) { + return ret; + } + + bucket_info = bucket->get_info(); + mtime = bucket->get_modification_time(); + attrs = bucket->get_attrs(); + + return finish_init(); +} + +int RGWDataAccess::Bucket::init(const RGWBucketInfo& _bucket_info, + const map& _attrs) +{ + bucket_info = _bucket_info; + attrs = _attrs; + + return finish_init(); +} + +int RGWDataAccess::Bucket::get_object(const rgw_obj_key& key, + ObjectRef *obj) { + obj->reset(new Object(sd, shared_from_this(), key)); + return 0; +} + +int RGWDataAccess::Object::put(bufferlist& data, + map& attrs, + const DoutPrefixProvider *dpp, + optional_yield y) +{ + rgw::sal::Driver* driver = sd->driver; + CephContext *cct = driver->ctx(); + + string tag; + append_rand_alpha(cct, tag, tag, 32); + + RGWBucketInfo& bucket_info = bucket->bucket_info; + + rgw::BlockingAioThrottle 
aio(driver->ctx()->_conf->rgw_put_obj_min_window_size); + + std::unique_ptr b; + driver->get_bucket(NULL, bucket_info, &b); + std::unique_ptr obj = b->get_object(key); + + auto& owner = bucket->policy.get_owner(); + + string req_id = driver->zone_unique_id(driver->get_new_req_id()); + + std::unique_ptr processor; + processor = driver->get_atomic_writer(dpp, y, obj.get(), + owner.get_id(), + nullptr, olh_epoch, req_id); + + int ret = processor->prepare(y); + if (ret < 0) + return ret; + + rgw::sal::DataProcessor *filter = processor.get(); + + CompressorRef plugin; + boost::optional compressor; + + const auto& compression_type = driver->get_compression_type(bucket_info.placement_rule); + if (compression_type != "none") { + plugin = Compressor::create(driver->ctx(), compression_type); + if (!plugin) { + ldpp_dout(dpp, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } else { + compressor.emplace(driver->ctx(), plugin, filter); + filter = &*compressor; + } + } + + off_t ofs = 0; + auto obj_size = data.length(); + + RGWMD5Etag etag_calc; + + do { + size_t read_len = std::min(data.length(), (unsigned int)cct->_conf->rgw_max_chunk_size); + + bufferlist bl; + + data.splice(0, read_len, &bl); + etag_calc.update(bl); + + ret = filter->process(std::move(bl), ofs); + if (ret < 0) + return ret; + + ofs += read_len; + } while (data.length() > 0); + + ret = filter->process({}, ofs); + if (ret < 0) { + return ret; + } + bool has_etag_attr = false; + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + etag = bl.to_str(); + has_etag_attr = true; + } + + if (!aclbl) { + RGWAccessControlPolicy_S3 policy(cct); + + policy.create_canned(bucket->policy.get_owner(), bucket->policy.get_owner(), string()); /* default private policy */ + + policy.encode(aclbl.emplace()); + } + + if (etag.empty()) { + etag_calc.finish(&etag); + } + + if (!has_etag_attr) { + bufferlist etagbl; + etagbl.append(etag); + 
attrs[RGW_ATTR_ETAG] = etagbl; + } + attrs[RGW_ATTR_ACL] = *aclbl; + + string *puser_data = nullptr; + if (user_data) { + puser_data = &(*user_data); + } + + return processor->complete(obj_size, etag, + &mtime, mtime, + attrs, delete_at, + nullptr, nullptr, + puser_data, + nullptr, nullptr, y); +} + +void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy) +{ + policy.encode(aclbl.emplace()); +} + +void rgw_complete_aio_completion(librados::AioCompletion* c, int r) { + auto pc = c->pc; + librados::CB_AioCompleteAndSafe cb(pc); + cb(r); +} diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h new file mode 100644 index 000000000..66600856d --- /dev/null +++ b/src/rgw/driver/rados/rgw_tools.h @@ -0,0 +1,276 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "include/types.h" +#include "include/ceph_hash.h" + +#include "common/ceph_time.h" + +#include "rgw_common.h" +#include "rgw_sal_fwd.h" + +class RGWSI_SysObj; + +class RGWRados; +struct RGWObjVersionTracker; +class optional_yield; + +struct obj_version; + + +int rgw_init_ioctx(const DoutPrefixProvider *dpp, + librados::Rados *rados, const rgw_pool& pool, + librados::IoCtx& ioctx, + bool create = false, + bool mostly_omap = false, + bool bulk = false); + +#define RGW_NO_SHARD -1 + +#define RGW_SHARDS_PRIME_0 7877 +#define RGW_SHARDS_PRIME_1 65521 + +extern const std::string MP_META_SUFFIX; + +inline int rgw_shards_max() +{ + return RGW_SHARDS_PRIME_1; +} + +// only called by rgw_shard_id and rgw_bucket_shard_index +static inline int rgw_shards_mod(unsigned hval, int max_shards) +{ + if (max_shards <= RGW_SHARDS_PRIME_0) { + return hval % RGW_SHARDS_PRIME_0 % max_shards; + } + return hval % RGW_SHARDS_PRIME_1 % max_shards; +} + +// used for logging and tagging +inline int rgw_shard_id(const std::string& key, int max_shards) +{ + return 
rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()), + max_shards); +} + +void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id); +void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name); +void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name); + +int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj, + const rgw_pool& pool, const std::string& oid, + bufferlist& data, bool exclusive, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime, optional_yield y, + std::map *pattrs = nullptr); +int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool, + const std::string& key, bufferlist& bl, + RGWObjVersionTracker *objv_tracker, real_time *pmtime, + optional_yield y, const DoutPrefixProvider *dpp, + std::map *pattrs = nullptr, + rgw_cache_entry_info *cache_info = nullptr, + boost::optional refresh_version = boost::none, + bool raw_attrs=false); +int rgw_delete_system_obj(const DoutPrefixProvider *dpp, + RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const std::string& oid, + RGWObjVersionTracker *objv_tracker, optional_yield y); +int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj, + const rgw_pool& pool, const std::string& key, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, optional_yield y, + std::map *pattrs = nullptr); + +const char *rgw_find_mime_by_ext(std::string& ext); + +void rgw_filter_attrset(std::map& unfiltered_attrset, const std::string& check_prefix, + std::map *attrset); + +/// indicates whether the current thread is in boost::asio::io_context::run(), +/// used to log warnings if synchronous librados calls are made +extern thread_local bool is_asio_thread; + +/// perform the rados operation, using the yield context when given +int rgw_rados_operate(const DoutPrefixProvider *dpp, 
librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectReadOperation *op, bufferlist* pbl, + optional_yield y, int flags = 0); +int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectWriteOperation *op, optional_yield y, + int flags = 0); +int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl, + optional_yield y); + +int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct); +void rgw_tools_cleanup(); + +template +class RGWEtag +{ + H hash; + +public: + RGWEtag() { + if constexpr (std::is_same_v) { + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + } + } + + void update(const char *buf, size_t len) { + hash.Update((const unsigned char *)buf, len); + } + + void update(bufferlist& bl) { + if (bl.length() > 0) { + update(bl.c_str(), bl.length()); + } + } + + void update(const std::string& s) { + if (!s.empty()) { + update(s.c_str(), s.size()); + } + } + void finish(std::string *etag) { + char etag_buf[S]; + char etag_buf_str[S * 2 + 16]; + + hash.Final((unsigned char *)etag_buf); + buf_to_hex((const unsigned char *)etag_buf, S, + etag_buf_str); + + *etag = etag_buf_str; + } +}; + +using RGWMD5Etag = RGWEtag; + +class RGWDataAccess +{ + rgw::sal::Driver* driver; + +public: + RGWDataAccess(rgw::sal::Driver* _driver); + + class Object; + class Bucket; + + using BucketRef = std::shared_ptr; + using ObjectRef = std::shared_ptr; + + class Bucket : public std::enable_shared_from_this { + friend class RGWDataAccess; + friend class Object; + + RGWDataAccess *sd{nullptr}; + RGWBucketInfo bucket_info; + std::string tenant; + std::string name; + std::string bucket_id; + ceph::real_time mtime; + std::map attrs; + + RGWAccessControlPolicy policy; + int finish_init(); + + Bucket(RGWDataAccess *_sd, + const std::string& _tenant, 
+ const std::string& _name, + const std::string& _bucket_id) : sd(_sd), + tenant(_tenant), + name(_name), + bucket_id(_bucket_id) {} + Bucket(RGWDataAccess *_sd) : sd(_sd) {} + int init(const DoutPrefixProvider *dpp, optional_yield y); + int init(const RGWBucketInfo& _bucket_info, const std::map& _attrs); + public: + int get_object(const rgw_obj_key& key, + ObjectRef *obj); + + }; + + + class Object { + RGWDataAccess *sd{nullptr}; + BucketRef bucket; + rgw_obj_key key; + + ceph::real_time mtime; + std::string etag; + uint64_t olh_epoch{0}; + ceph::real_time delete_at; + std::optional user_data; + + std::optional aclbl; + + Object(RGWDataAccess *_sd, + BucketRef&& _bucket, + const rgw_obj_key& _key) : sd(_sd), + bucket(_bucket), + key(_key) {} + public: + int put(bufferlist& data, std::map& attrs, const DoutPrefixProvider *dpp, optional_yield y); /* might modify attrs */ + + void set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + } + + void set_etag(const std::string& _etag) { + etag = _etag; + } + + void set_olh_epoch(uint64_t epoch) { + olh_epoch = epoch; + } + + void set_delete_at(ceph::real_time _delete_at) { + delete_at = _delete_at; + } + + void set_user_data(const std::string& _user_data) { + user_data = _user_data; + } + + void set_policy(const RGWAccessControlPolicy& policy); + + friend class Bucket; + }; + + int get_bucket(const DoutPrefixProvider *dpp, + const std::string& tenant, + const std::string name, + const std::string bucket_id, + BucketRef *bucket, + optional_yield y) { + bucket->reset(new Bucket(this, tenant, name, bucket_id)); + return (*bucket)->init(dpp, y); + } + + int get_bucket(const RGWBucketInfo& bucket_info, + const std::map& attrs, + BucketRef *bucket) { + bucket->reset(new Bucket(this)); + return (*bucket)->init(bucket_info, attrs); + } + friend class Bucket; + friend class Object; +}; + +using RGWDataAccessRef = std::shared_ptr; + +/// Complete an AioCompletion. To return error values or otherwise +/// satisfy the caller. 
Useful for making complicated asynchronous +/// calls and error handling. +void rgw_complete_aio_completion(librados::AioCompletion* c, int r); + +/// This returns a static, non-NULL pointer, recognized only by +/// rgw_put_system_obj(). When supplied instead of the attributes, the +/// attributes will be unmodified. +/// +// (Currently providing nullptr will wipe all attributes.) + +std::map* no_change_attrs(); diff --git a/src/rgw/driver/rados/rgw_trim_bilog.cc b/src/rgw/driver/rados/rgw_trim_bilog.cc new file mode 100644 index 000000000..4e34abf51 --- /dev/null +++ b/src/rgw/driver/rados/rgw_trim_bilog.cc @@ -0,0 +1,1445 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * Author: Casey Bodley + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
/// Interface for the handler of one notify message type. BucketTrimWatcher
/// keeps one handler per TrimNotifyType and dispatches each incoming
/// notification to it.
struct TrimNotifyHandler {
  virtual ~TrimNotifyHandler() = default;

  /// Handle one notification.
  /// @param input  iterator over the notify payload, positioned after the
  ///               TrimNotifyType tag has been decoded
  /// @param output buffer that receives the encoded reply
  virtual void handle(bufferlist::const_iterator& input, bufferlist& output) = 0;
};
+/// each gateway will process different datalog shards, so the gateway that runs +/// the trim process needs to accumulate their counters +struct TrimCounters { + /// counter for a single bucket + struct BucketCounter { + std::string bucket; //< bucket instance metadata key + int count{0}; + + BucketCounter() = default; + BucketCounter(const std::string& bucket, int count) + : bucket(bucket), count(count) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + using Vector = std::vector; + + /// request bucket trim counters from peer gateways + struct Request { + uint16_t max_buckets; //< maximum number of bucket counters to return + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + + /// return the current bucket trim counters + struct Response { + Vector bucket_counters; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + + /// server interface to query the hottest buckets + struct Server { + virtual ~Server() = default; + + virtual void get_bucket_counters(int count, Vector& counters) = 0; + virtual void reset_bucket_counters() = 0; + }; + + /// notify handler + class Handler : public TrimNotifyHandler { + Server *const server; + public: + explicit Handler(Server *server) : server(server) {} + + void handle(bufferlist::const_iterator& input, bufferlist& output) override; + }; +}; +std::ostream& operator<<(std::ostream& out, const TrimCounters::BucketCounter& rhs) +{ + return out << rhs.bucket << ":" << rhs.count; +} + +void TrimCounters::BucketCounter::encode(bufferlist& bl) const +{ + using ceph::encode; + // no versioning to save space + encode(bucket, bl); + encode(count, bl); +} +void TrimCounters::BucketCounter::decode(bufferlist::const_iterator& p) +{ + using ceph::decode; + decode(bucket, p); + decode(count, p); +} +WRITE_CLASS_ENCODER(TrimCounters::BucketCounter); + +void TrimCounters::Request::encode(bufferlist& bl) const +{ + 
ENCODE_START(1, 1, bl); + encode(max_buckets, bl); + ENCODE_FINISH(bl); +} +void TrimCounters::Request::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(max_buckets, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimCounters::Request); + +void TrimCounters::Response::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(bucket_counters, bl); + ENCODE_FINISH(bl); +} +void TrimCounters::Response::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(bucket_counters, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimCounters::Response); + +void TrimCounters::Handler::handle(bufferlist::const_iterator& input, + bufferlist& output) +{ + Request request; + decode(request, input); + auto count = std::min(request.max_buckets, 128); + + Response response; + server->get_bucket_counters(count, response.bucket_counters); + encode(response, output); +} + +/// api to notify peer gateways that trim has completed and their bucket change +/// counters can be reset +struct TrimComplete { + struct Request { + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + struct Response { + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + + /// server interface to reset bucket counters + using Server = TrimCounters::Server; + + /// notify handler + class Handler : public TrimNotifyHandler { + Server *const server; + public: + explicit Handler(Server *server) : server(server) {} + + void handle(bufferlist::const_iterator& input, bufferlist& output) override; + }; +}; + +void TrimComplete::Request::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ENCODE_FINISH(bl); +} +void TrimComplete::Request::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimComplete::Request); + +void TrimComplete::Response::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ENCODE_FINISH(bl); +} +void 
TrimComplete::Response::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimComplete::Response); + +void TrimComplete::Handler::handle(bufferlist::const_iterator& input, + bufferlist& output) +{ + Request request; + decode(request, input); + + server->reset_bucket_counters(); + + Response response; + encode(response, output); +} + + +/// rados watcher for bucket trim notifications +class BucketTrimWatcher : public librados::WatchCtx2 { + rgw::sal::RadosStore* const store; + const rgw_raw_obj& obj; + rgw_rados_ref ref; + uint64_t handle{0}; + + using HandlerPtr = std::unique_ptr; + boost::container::flat_map handlers; + + public: + BucketTrimWatcher(rgw::sal::RadosStore* store, const rgw_raw_obj& obj, + TrimCounters::Server *counters) + : store(store), obj(obj) { + handlers.emplace(NotifyTrimCounters, + std::make_unique(counters)); + handlers.emplace(NotifyTrimComplete, + std::make_unique(counters)); + } + + ~BucketTrimWatcher() { + stop(); + } + + int start(const DoutPrefixProvider *dpp) { + int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + // register a watch on the realm's control object + r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this); + if (r == -ENOENT) { + constexpr bool exclusive = true; + r = ref.pool.ioctx().create(ref.obj.oid, exclusive); + if (r == -EEXIST || r == 0) { + r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this); + } + } + if (r < 0) { + ldpp_dout(dpp, -1) << "Failed to watch " << ref.obj + << " with " << cpp_strerror(-r) << dendl; + ref.pool.ioctx().close(); + return r; + } + + ldpp_dout(dpp, 10) << "Watching " << ref.obj.oid << dendl; + return 0; + } + + int restart() { + int r = ref.pool.ioctx().unwatch2(handle); + if (r < 0) { + lderr(store->ctx()) << "Failed to unwatch on " << ref.obj + << " with " << cpp_strerror(-r) << dendl; + } + r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this); + if (r < 0) { + 
lderr(store->ctx()) << "Failed to restart watch on " << ref.obj + << " with " << cpp_strerror(-r) << dendl; + ref.pool.ioctx().close(); + } + return r; + } + + void stop() { + if (handle) { + ref.pool.ioctx().unwatch2(handle); + ref.pool.ioctx().close(); + } + } + + /// respond to bucket trim notifications + void handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) override { + if (cookie != handle) { + return; + } + bufferlist reply; + try { + auto p = bl.cbegin(); + TrimNotifyType type; + decode(type, p); + + auto handler = handlers.find(type); + if (handler != handlers.end()) { + handler->second->handle(p, reply); + } else { + lderr(store->ctx()) << "no handler for notify type " << type << dendl; + } + } catch (const buffer::error& e) { + lderr(store->ctx()) << "Failed to decode notification: " << e.what() << dendl; + } + ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, reply); + } + + /// reestablish the watch if it gets disconnected + void handle_error(uint64_t cookie, int err) override { + if (cookie != handle) { + return; + } + if (err == -ENOTCONN) { + ldout(store->ctx(), 4) << "Disconnected watch on " << ref.obj << dendl; + restart(); + } + } +}; + + +/// Interface to communicate with the trim manager about completed operations +struct BucketTrimObserver { + virtual ~BucketTrimObserver() = default; + + virtual void on_bucket_trimmed(std::string&& bucket_instance) = 0; + virtual bool trimmed_recently(const std::string_view& bucket_instance) = 0; +}; + +/// trim each bilog shard to the given marker, while limiting the number of +/// concurrent requests +class BucketTrimShardCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* const store; + const RGWBucketInfo& bucket_info; + rgw::bucket_index_layout_generation generation; + const std::vector& markers; //< shard markers to trim + size_t i{0}; //< index of current 
shard marker + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to trim bilog shard: " << cpp_strerror(r) << dendl; + } + return r; + } + public: + BucketTrimShardCollectCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& generation, + const std::vector& markers) + : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS), + dpp(dpp), store(store), bucket_info(bucket_info), + generation(generation), markers(markers) + {} + bool spawn_next() override; +}; + +bool BucketTrimShardCollectCR::spawn_next() +{ + while (i < markers.size()) { + const auto& marker = markers[i]; + const auto shard_id = i++; + + // skip empty markers + if (!marker.empty()) { + ldpp_dout(dpp, 10) << "trimming bilog shard " << shard_id + << " of " << bucket_info.bucket << " at marker " << marker << dendl; + spawn(new RGWRadosBILogTrimCR(dpp, store, bucket_info, shard_id, + generation, std::string{}, marker), + false); + return true; + } + } + return false; +} + +/// Delete a BI generation, limiting the number of requests in flight. 
+class BucketCleanIndexCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* const store; + const RGWBucketInfo& bucket_info; + rgw::bucket_index_layout_generation index; + uint32_t shard = 0; + const uint32_t num_shards = rgw::num_shards(index); + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "clean index: " << cpp_strerror(r) << dendl; + } + return r; + } + public: + BucketCleanIndexCollectCR(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + const RGWBucketInfo& bucket_info, + rgw::bucket_index_layout_generation index) + : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS), + dpp(dpp), store(store), bucket_info(bucket_info), + index(index) + {} + bool spawn_next() override { + if (shard < num_shards) { + RGWRados::BucketShard bs(store->getRados()); + bs.init(dpp, bucket_info, index, shard); + spawn(new RGWRadosRemoveOidCR(store, std::move(bs.bucket_obj), nullptr), + false); + ++shard; + return true; + } else { + return false; + } + } +}; + + +/// trim the bilog of all of the given bucket instance's shards +class BucketTrimInstanceCR : public RGWCoroutine { + static constexpr auto MAX_RETRIES = 25u; + rgw::sal::RadosStore* const store; + RGWHTTPManager *const http; + BucketTrimObserver *const observer; + std::string bucket_instance; + rgw_bucket_get_sync_policy_params get_policy_params; + std::shared_ptr source_policy; + rgw_bucket bucket; + const std::string& zone_id; //< my zone id + RGWBucketInfo _bucket_info; + const RGWBucketInfo *pbucket_info; //< pointer to bucket instance info to locate bucket indices + int child_ret = 0; + const DoutPrefixProvider *dpp; +public: + struct StatusShards { + uint64_t generation = 0; + std::vector shards; + }; +private: + std::vector peer_status; //< sync status for each peer + std::vector min_markers; //< min marker per 
shard + + /// The log generation to trim + rgw::bucket_log_layout_generation totrim; + + /// Generation to be cleaned/New bucket info (if any) + std::optional> clean_info; + /// Maximum number of times to attempt to put bucket info + unsigned retries = 0; + + int take_min_generation() { + // Initialize the min_generation to the bucket's current + // generation, used in case we have no peers. + auto min_generation = pbucket_info->layout.logs.back().gen; + + // Determine the minimum generation + if (auto m = std::min_element(peer_status.begin(), + peer_status.end(), + [](const StatusShards& l, + const StatusShards& r) { + return l.generation < r.generation; + }); m != peer_status.end()) { + min_generation = m->generation; + } + + auto& logs = pbucket_info->layout.logs; + auto log = std::find_if(logs.begin(), logs.end(), + rgw::matches_gen(min_generation)); + if (log == logs.end()) { + ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << "ERROR: No log layout for min_generation=" + << min_generation << dendl; + return -ENOENT; + } + + totrim = *log; + return 0; + } + + /// If there is a generation below the minimum, prepare to clean it up. + int maybe_remove_generation() { + if (clean_info) + return 0; + + + if (pbucket_info->layout.logs.front().gen < totrim.gen) { + clean_info = {*pbucket_info, {}}; + auto log = clean_info->first.layout.logs.cbegin(); + clean_info->second = *log; + + if (clean_info->first.layout.logs.size() == 1) { + ldpp_dout(dpp, -1) + << "Critical error! Attempt to remove only log generation! 
" + << "log.gen=" << log->gen << ", totrim.gen=" << totrim.gen + << dendl; + return -EIO; + } + clean_info->first.layout.logs.erase(log); + } + return 0; + } + + public: + BucketTrimInstanceCR(rgw::sal::RadosStore* store, RGWHTTPManager *http, + BucketTrimObserver *observer, + const std::string& bucket_instance, + const DoutPrefixProvider *dpp) + : RGWCoroutine(store->ctx()), store(store), + http(http), observer(observer), + bucket_instance(bucket_instance), + zone_id(store->svc()->zone->get_zone().id), + dpp(dpp) { + rgw_bucket_parse_bucket_key(cct, bucket_instance, &bucket, nullptr); + source_policy = make_shared(); + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +namespace { +/// populate the status with the minimum stable marker of each shard +int take_min_status( + CephContext *cct, + const uint64_t min_generation, + std::vector::const_iterator first, + std::vector::const_iterator last, + std::vector *status) { + for (auto peer = first; peer != last; ++peer) { + // Peers on later generations don't get a say in the matter + if (peer->generation > min_generation) { + continue; + } + if (peer->shards.size() != status->size()) { + // all peers must agree on the number of shards + return -EINVAL; + } + + auto m = status->begin(); + for (auto& shard : peer->shards) { + auto& marker = *m++; + // always take the first marker, or any later marker that's smaller + if (peer == first || marker > shard.inc_marker.position) { + marker = std::move(shard.inc_marker.position); + } + } + } + return 0; +} +} + +template<> +inline int parse_decode_json( + BucketTrimInstanceCR::StatusShards& s, bufferlist& bl) +{ + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + return -EINVAL; + } + + try { + bilog_status_v2 v; + decode_json_obj(v, &p); + s.generation = v.sync_status.incremental_gen; + s.shards = std::move(v.inc_status); + } catch (JSONDecoder::err& e) { + try { + // Fall back if we're talking to an old node that can't give v2 + // output. 
+ s.generation = 0; + decode_json_obj(s.shards, &p); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + } + return 0; +} + +int BucketTrimInstanceCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + ldpp_dout(dpp, 4) << "starting trim on bucket=" << bucket_instance << dendl; + + get_policy_params.zone = zone_id; + get_policy_params.bucket = bucket; + yield call(new RGWBucketGetSyncPolicyHandlerCR(store->svc()->rados->get_async_processor(), + store, + get_policy_params, + source_policy, + dpp)); + if (retcode < 0) { + if (retcode != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch policy handler for bucket=" << bucket << dendl; + } + + return set_cr_error(retcode); + } + + if (auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info(); + opt_bucket_info) { + pbucket_info = &(*opt_bucket_info); + } else { + /* this shouldn't really happen */ + return set_cr_error(-ENOENT); + } + + if (pbucket_info->layout.logs.empty()) { + return set_cr_done(); // no bilogs to trim + } + + // query peers for sync status + set_status("fetching sync status from relevant peers"); + yield { + const auto& all_dests = source_policy->policy_handler->get_all_dests(); + + vector zids; + rgw_zone_id last_zid; + for (auto& diter : all_dests) { + const auto& zid = diter.first; + if (zid == last_zid) { + continue; + } + last_zid = zid; + zids.push_back(zid); + } + + peer_status.resize(zids.size()); + + auto& zone_conn_map = store->svc()->zone->get_zone_conn_map(); + + auto p = peer_status.begin(); + for (auto& zid : zids) { + // query data sync status from each sync peer + rgw_http_param_pair params[] = { + { "type", "bucket-index" }, + { "status", nullptr }, + { "options", "merge" }, + { "bucket", bucket_instance.c_str() }, /* equal to source-bucket when `options==merge` and source-bucket + param is not provided */ + { "source-zone", zone_id.c_str() }, + { "version", "2" }, + { nullptr, nullptr } + }; + + auto ziter = zone_conn_map.find(zid); + if 
(ziter == zone_conn_map.end()) { + ldpp_dout(dpp, 0) << "WARNING: no connection to zone " << zid << ", can't trim bucket: " << bucket << dendl; + return set_cr_error(-ECANCELED); + } + + using StatusCR = RGWReadRESTResourceCR; + spawn(new StatusCR(cct, ziter->second, http, "/admin/log/", params, &*p), + false); + ++p; + } + } + // wait for a response from each peer. all must respond to attempt trim + while (num_spawned()) { + yield wait_for_child(); + collect(&child_ret, nullptr); + if (child_ret < 0) { + drain_all(); + return set_cr_error(child_ret); + } + } + + // Determine the minimum generation + retcode = take_min_generation(); + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to find minimum generation" << dendl; + return set_cr_error(retcode); + } + retcode = maybe_remove_generation(); + if (retcode < 0) { + ldpp_dout(dpp, 4) << "error removing old generation from log: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (clean_info) { + if (clean_info->second.layout.type != rgw::BucketLogType::InIndex) { + ldpp_dout(dpp, 0) << "Unable to convert log of unknown type " + << clean_info->second.layout.type + << " to rgw::bucket_index_layout_generation " << dendl; + return set_cr_error(-EINVAL); + } + + yield call(new BucketCleanIndexCollectCR(dpp, store, clean_info->first, + clean_info->second.layout.in_index)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "failed to remove previous generation: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + while (clean_info && retries < MAX_RETRIES) { + yield call(new RGWPutBucketInstanceInfoCR( + store->svc()->rados->get_async_processor(), + store, clean_info->first, false, {}, + no_change_attrs(), dpp)); + + // Raced, try again. 
+ if (retcode == -ECANCELED) { + yield call(new RGWGetBucketInstanceInfoCR( + store->svc()->rados->get_async_processor(), + store, clean_info->first.bucket, + &(clean_info->first), nullptr, dpp)); + if (retcode < 0) { + ldpp_dout(dpp, 0) << "failed to get bucket info: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + if (clean_info->first.layout.logs.front().gen == + clean_info->second.gen) { + clean_info->first.layout.logs.erase( + clean_info->first.layout.logs.begin()); + ++retries; + continue; + } + // Raced, but someone else did what we needed to. + retcode = 0; + } + + if (retcode < 0) { + ldpp_dout(dpp, 0) << "failed to put bucket info: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + clean_info = std::nullopt; + } + } else { + if (totrim.layout.type != rgw::BucketLogType::InIndex) { + ldpp_dout(dpp, 0) << "Unable to convert log of unknown type " + << totrim.layout.type + << " to rgw::bucket_index_layout_generation " << dendl; + return set_cr_error(-EINVAL); + } + // To avoid hammering the OSD too hard, either trim old + // generations OR trim the current one. 
+ + // determine the minimum marker for each shard + + // initialize each shard with the maximum marker, which is only used when + // there are no peers syncing from us + min_markers.assign(std::max(1u, rgw::num_shards(totrim.layout.in_index)), + RGWSyncLogTrimCR::max_marker); + + + retcode = take_min_status(cct, totrim.gen, peer_status.cbegin(), + peer_status.cend(), &min_markers); + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to correlate bucket sync status from peers" << dendl; + return set_cr_error(retcode); + } + + // trim shards with a ShardCollectCR + ldpp_dout(dpp, 10) << "trimming bilogs for bucket=" << pbucket_info->bucket + << " markers=" << min_markers << ", shards=" << min_markers.size() << dendl; + set_status("trimming bilog shards"); + yield call(new BucketTrimShardCollectCR(dpp, store, *pbucket_info, totrim.layout.in_index, + min_markers)); + // ENODATA just means there were no keys to trim + if (retcode == -ENODATA) { + retcode = 0; + } + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to trim bilog shards: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + } + + observer->on_bucket_trimmed(std::move(bucket_instance)); + return set_cr_done(); + } + return 0; +} + +/// trim each bucket instance while limiting the number of concurrent operations + +class BucketTrimInstanceCollectCR : public RGWShardCollectCR { + rgw::sal::RadosStore* const store; + RGWHTTPManager *const http; + BucketTrimObserver *const observer; + std::vector::const_iterator bucket; + std::vector::const_iterator end; + const DoutPrefixProvider *dpp; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to trim bucket instance: " << cpp_strerror(r) << dendl; + } + return r; + } + public: + BucketTrimInstanceCollectCR(rgw::sal::RadosStore* store, RGWHTTPManager *http, + BucketTrimObserver *observer, + const std::vector& buckets, + int max_concurrent, + 
const DoutPrefixProvider *dpp) + : RGWShardCollectCR(store->ctx(), max_concurrent), + store(store), http(http), observer(observer), + bucket(buckets.begin()), end(buckets.end()), + dpp(dpp) + {} + bool spawn_next() override; +}; + +bool BucketTrimInstanceCollectCR::spawn_next() +{ + if (bucket == end) { + return false; + } + spawn(new BucketTrimInstanceCR(store, http, observer, *bucket, dpp), false); + ++bucket; + return true; +} + +/// correlate the replies from each peer gateway into the given counter +int accumulate_peer_counters(bufferlist& bl, BucketChangeCounter& counter) +{ + counter.clear(); + + try { + // decode notify responses + auto p = bl.cbegin(); + std::map, bufferlist> replies; + std::set> timeouts; + decode(replies, p); + decode(timeouts, p); + + for (auto& peer : replies) { + auto q = peer.second.cbegin(); + TrimCounters::Response response; + decode(response, q); + for (const auto& b : response.bucket_counters) { + counter.insert(b.bucket, b.count); + } + } + } catch (const buffer::error& e) { + return -EIO; + } + return 0; +} + +/// metadata callback has the signature bool(string&& key, string&& marker) +using MetadataListCallback = std::function; + +/// lists metadata keys, passing each to a callback until it returns false. 
+/// on reaching the end, it will restart at the beginning and list up to the +/// initial marker +class AsyncMetadataList : public RGWAsyncRadosRequest { + CephContext *const cct; + RGWMetadataManager *const mgr; + const std::string section; + const std::string start_marker; + MetadataListCallback callback; + + int _send_request(const DoutPrefixProvider *dpp) override; + public: + AsyncMetadataList(CephContext *cct, RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, RGWMetadataManager *mgr, + const std::string& section, const std::string& start_marker, + const MetadataListCallback& callback) + : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr), + section(section), start_marker(start_marker), callback(callback) + {} +}; + +int AsyncMetadataList::_send_request(const DoutPrefixProvider *dpp) +{ + void* handle = nullptr; + std::list keys; + bool truncated{false}; + std::string marker; + + // start a listing at the given marker + int r = mgr->list_keys_init(dpp, section, start_marker, &handle); + if (r == -EINVAL) { + // restart with empty marker below + } else if (r < 0) { + ldpp_dout(dpp, 10) << "failed to init metadata listing: " + << cpp_strerror(r) << dendl; + return r; + } else { + ldpp_dout(dpp, 20) << "starting metadata listing at " << start_marker << dendl; + + // release the handle when scope exits + auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); }); + + do { + // get the next key and marker + r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to list metadata: " + << cpp_strerror(r) << dendl; + return r; + } + marker = mgr->get_marker(handle); + + if (!keys.empty()) { + ceph_assert(keys.size() == 1); + auto& key = keys.front(); + if (!callback(std::move(key), std::move(marker))) { + return 0; + } + } + } while (truncated); + + if (start_marker.empty()) { + // already listed all keys + return 0; + } + } + + // restart the listing from the beginning (empty marker) + handle = 
nullptr; + + r = mgr->list_keys_init(dpp, section, "", &handle); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to restart metadata listing: " + << cpp_strerror(r) << dendl; + return r; + } + ldpp_dout(dpp, 20) << "restarting metadata listing" << dendl; + + // release the handle when scope exits + auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); }); + do { + // get the next key and marker + r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to list metadata: " + << cpp_strerror(r) << dendl; + return r; + } + marker = mgr->get_marker(handle); + + if (!keys.empty()) { + ceph_assert(keys.size() == 1); + auto& key = keys.front(); + // stop at original marker + if (marker > start_marker) { + return 0; + } + if (!callback(std::move(key), std::move(marker))) { + return 0; + } + } + } while (truncated); + + return 0; +} + +/// coroutine wrapper for AsyncMetadataList +class MetadataListCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *const async_rados; + RGWMetadataManager *const mgr; + const std::string& section; + const std::string& start_marker; + MetadataListCallback callback; + RGWAsyncRadosRequest *req{nullptr}; + public: + MetadataListCR(CephContext *cct, RGWAsyncRadosProcessor *async_rados, + RGWMetadataManager *mgr, const std::string& section, + const std::string& start_marker, + const MetadataListCallback& callback) + : RGWSimpleCoroutine(cct), async_rados(async_rados), mgr(mgr), + section(section), start_marker(start_marker), callback(callback) + {} + ~MetadataListCR() override { + request_cleanup(); + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new AsyncMetadataList(cct, this, stack->create_completion_notifier(), + mgr, section, start_marker, callback); + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = nullptr; 
+    }
+  }
+};
+
+/// Coroutine that performs one bucket index log trim pass: it asks peer
+/// gateways (watch/notify) for their most active buckets, tops the selection up
+/// with cold buckets listed from the bucket.instance metadata section, trims
+/// the selected buckets with bounded concurrency, persists the listing marker
+/// and finally notifies the peers that the pass completed.
+class BucketTrimCR : public RGWCoroutine {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  const BucketTrimConfig& config;
+  BucketTrimObserver *const observer;
+  const rgw_raw_obj& obj;
+  ceph::mono_time start_time;
+  bufferlist notify_replies;
+  BucketChangeCounter counter;
+  std::vector<std::string> buckets; //< buckets selected for trim
+  BucketTrimStatus status;
+  RGWObjVersionTracker objv; //< version tracker for trim status object
+  std::string last_cold_marker; //< position for next trim marker
+  const DoutPrefixProvider *dpp;
+
+  static const std::string section; //< metadata section for bucket instances
+ public:
+  BucketTrimCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+               const BucketTrimConfig& config, BucketTrimObserver *observer,
+               const rgw_raw_obj& obj, const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store), http(http), config(config),
+      observer(observer), obj(obj), counter(config.counter_size), dpp(dpp)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+const std::string BucketTrimCR::section{"bucket.instance"};
+
+/// Runs the trim pass. Any failing step except the per-bucket trims aborts the
+/// coroutine through set_cr_error(retcode); otherwise it ends with
+/// set_cr_done().
+int BucketTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    start_time = ceph::mono_clock::now();
+
+    if (config.buckets_per_interval) {
+      // query watch/notify for hot buckets
+      ldpp_dout(dpp, 10) << "fetching active bucket counters" << dendl;
+      set_status("fetching active bucket counters");
+      yield {
+        // request the top bucket counters from each peer gateway
+        const TrimNotifyType type = NotifyTrimCounters;
+        TrimCounters::Request request{32};
+        bufferlist bl;
+        encode(type, bl);
+        encode(request, bl);
+        call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+                                  &notify_replies));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 10) << "failed to fetch peer bucket counters" << dendl;
+        return set_cr_error(retcode);
+      }
+
+      // select the hottest buckets for trim
+      retcode = accumulate_peer_counters(notify_replies, counter);
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to correlate peer bucket counters" << dendl;
+        return set_cr_error(retcode);
+      }
+      buckets.reserve(config.buckets_per_interval);
+
+      // both operands are unsigned: clamp at 0 instead of wrapping when
+      // min_cold_buckets_per_interval exceeds buckets_per_interval
+      const int max_count = config.buckets_per_interval > config.min_cold_buckets_per_interval
+          ? config.buckets_per_interval - config.min_cold_buckets_per_interval : 0;
+      counter.get_highest(max_count,
+        [this] (const std::string& bucket, int count) {
+          buckets.push_back(bucket);
+        });
+    }
+
+    if (buckets.size() < config.buckets_per_interval) {
+      // read BucketTrimStatus for marker position
+      set_status("reading trim status");
+      using ReadStatus = RGWSimpleRadosReadCR<BucketTrimStatus>;
+      yield call(new ReadStatus(dpp, store, obj, &status, true, &objv));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 10) << "failed to read bilog trim status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      if (status.marker == "MAX") {
+        status.marker.clear(); // restart at the beginning
+      }
+      ldpp_dout(dpp, 10) << "listing cold buckets from marker="
+          << status.marker << dendl;
+
+      set_status("listing cold buckets for trim");
+      yield {
+        // capture a reference so 'this' remains valid in the callback
+        auto ref = boost::intrusive_ptr<RGWCoroutine>{this};
+        // list cold buckets to consider for trim
+        auto cb = [this, ref] (std::string&& bucket, std::string&& marker) {
+          // filter out keys that we trimmed recently
+          if (observer->trimmed_recently(bucket)) {
+            return true;
+          }
+          // filter out active buckets that we've already selected
+          auto i = std::find(buckets.begin(), buckets.end(), bucket);
+          if (i != buckets.end()) {
+            return true;
+          }
+          buckets.emplace_back(std::move(bucket));
+          // remember the last cold bucket spawned to update the status marker
+          last_cold_marker = std::move(marker);
+          // return true if there's room for more
+          return buckets.size() < config.buckets_per_interval;
+        };
+
+        call(new MetadataListCR(cct, store->svc()->rados->get_async_processor(),
+                                store->ctl()->meta.mgr,
+                                section, status.marker, cb));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to list bucket instance metadata: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    // trim bucket instances with limited concurrency
+    set_status("trimming buckets");
+    ldpp_dout(dpp, 4) << "collected " << buckets.size() << " buckets for trim" << dendl;
+    yield call(new BucketTrimInstanceCollectCR(store, http, observer, buckets,
+                                               config.concurrent_buckets, dpp));
+    // ignore errors from individual buckets
+
+    // write updated trim status
+    if (!last_cold_marker.empty() && status.marker != last_cold_marker) {
+      set_status("writing updated trim status");
+      status.marker = std::move(last_cold_marker);
+      ldpp_dout(dpp, 20) << "writing bucket trim marker=" << status.marker << dendl;
+      using WriteStatus = RGWSimpleRadosWriteCR<BucketTrimStatus>;
+      yield call(new WriteStatus(dpp, store, obj, status, &objv));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to write updated trim status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    // notify peers that trim completed
+    set_status("trim completed");
+    yield {
+      const TrimNotifyType type = NotifyTrimComplete;
+      TrimComplete::Request request;
+      bufferlist bl;
+      encode(type, bl);
+      encode(request, bl);
+      call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+                                nullptr));
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 10) << "failed to notify peers of trim completion" << dendl;
+      return set_cr_error(retcode);
+    }
+
+    ldpp_dout(dpp, 4) << "bucket index log processing completed in "
+        << ceph::mono_clock::now() - start_time << dendl;
+    return set_cr_done();
+  }
+  return 0;
+}
+
+class BucketTrimPollCR : public RGWCoroutine {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  const BucketTrimConfig& config;
+  BucketTrimObserver *const observer;
+  const rgw_raw_obj& obj;
+  const std::string name{"trim"}; //< lock name
+  const std::string cookie;
+  const DoutPrefixProvider *dpp;
+
+ public:
+  BucketTrimPollCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                   const BucketTrimConfig& config,
+
BucketTrimObserver *observer, const rgw_raw_obj& obj, + const DoutPrefixProvider *dpp) + : RGWCoroutine(store->ctx()), store(store), http(http), + config(config), observer(observer), obj(obj), + cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)), + dpp(dpp) {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int BucketTrimPollCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + for (;;) { + set_status("sleeping"); + wait(utime_t{static_cast(config.trim_interval_sec), 0}); + + // prevent others from trimming for our entire wait interval + set_status("acquiring trim lock"); + yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store, + obj, name, cookie, + config.trim_interval_sec)); + if (retcode < 0) { + ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl; + continue; + } + + set_status("trimming"); + yield call(new BucketTrimCR(store, http, config, observer, obj, dpp)); + if (retcode < 0) { + // on errors, unlock so other gateways can try + set_status("unlocking"); + yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store, + obj, name, cookie)); + } + } + } + return 0; +} + +/// tracks a bounded list of events with timestamps. old events can be expired, +/// and recent events can be searched by key. expiration depends on events being +/// inserted in temporal order +template +class RecentEventList { + public: + using clock_type = Clock; + using time_point = typename clock_type::time_point; + + RecentEventList(size_t max_size, const ceph::timespan& max_duration) + : events(max_size), max_duration(max_duration) + {} + + /// insert an event at the given point in time. 
this time must be at least as + /// recent as the last inserted event + void insert(T&& value, const time_point& now) { + // ceph_assert(events.empty() || now >= events.back().time) + events.push_back(Event{std::move(value), now}); + } + + /// performs a linear search for an event matching the given key, whose type + /// U can be any that provides operator==(U, T) + template + bool lookup(const U& key) const { + for (const auto& event : events) { + if (key == event.value) { + return true; + } + } + return false; + } + + /// remove events that are no longer recent compared to the given point in time + void expire_old(const time_point& now) { + const auto expired_before = now - max_duration; + while (!events.empty() && events.front().time < expired_before) { + events.pop_front(); + } + } + + private: + struct Event { + T value; + time_point time; + }; + boost::circular_buffer events; + const ceph::timespan max_duration; +}; + +namespace rgw { + +// read bucket trim configuration from ceph context +void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config) +{ + const auto& conf = cct->_conf; + + config.trim_interval_sec = + conf.get_val("rgw_sync_log_trim_interval"); + config.counter_size = 512; + config.buckets_per_interval = + conf.get_val("rgw_sync_log_trim_max_buckets"); + config.min_cold_buckets_per_interval = + conf.get_val("rgw_sync_log_trim_min_cold_buckets"); + config.concurrent_buckets = + conf.get_val("rgw_sync_log_trim_concurrent_buckets"); + config.notify_timeout_ms = 10000; + config.recent_size = 128; + config.recent_duration = std::chrono::hours(2); +} + +class BucketTrimManager::Impl : public TrimCounters::Server, + public BucketTrimObserver { + public: + rgw::sal::RadosStore* const store; + const BucketTrimConfig config; + + const rgw_raw_obj status_obj; + + /// count frequency of bucket instance entries in the data changes log + BucketChangeCounter counter; + + using RecentlyTrimmedBucketList = RecentEventList; + using clock_type = 
RecentlyTrimmedBucketList::clock_type; + /// track recently trimmed buckets to focus trim activity elsewhere + RecentlyTrimmedBucketList trimmed; + + /// serve the bucket trim watch/notify api + BucketTrimWatcher watcher; + + /// protect data shared between data sync, trim, and watch/notify threads + std::mutex mutex; + + Impl(rgw::sal::RadosStore* store, const BucketTrimConfig& config) + : store(store), config(config), + status_obj(store->svc()->zone->get_zone_params().log_pool, BucketTrimStatus::oid), + counter(config.counter_size), + trimmed(config.recent_size, config.recent_duration), + watcher(store, status_obj, this) + {} + + /// TrimCounters::Server interface for watch/notify api + void get_bucket_counters(int count, TrimCounters::Vector& buckets) { + buckets.reserve(count); + std::lock_guard lock(mutex); + counter.get_highest(count, [&buckets] (const std::string& key, int count) { + buckets.emplace_back(key, count); + }); + ldout(store->ctx(), 20) << "get_bucket_counters: " << buckets << dendl; + } + + void reset_bucket_counters() override { + ldout(store->ctx(), 20) << "bucket trim completed" << dendl; + std::lock_guard lock(mutex); + counter.clear(); + trimmed.expire_old(clock_type::now()); + } + + /// BucketTrimObserver interface to remember successfully-trimmed buckets + void on_bucket_trimmed(std::string&& bucket_instance) override { + ldout(store->ctx(), 20) << "trimmed bucket instance " << bucket_instance << dendl; + std::lock_guard lock(mutex); + trimmed.insert(std::move(bucket_instance), clock_type::now()); + } + + bool trimmed_recently(const std::string_view& bucket_instance) override { + std::lock_guard lock(mutex); + return trimmed.lookup(bucket_instance); + } +}; + +BucketTrimManager::BucketTrimManager(rgw::sal::RadosStore* store, + const BucketTrimConfig& config) + : impl(new Impl(store, config)) +{ +} +BucketTrimManager::~BucketTrimManager() = default; + +int BucketTrimManager::init() +{ + return impl->watcher.start(this); +} + +void 
BucketTrimManager::on_bucket_changed(const std::string_view& bucket) +{ + std::lock_guard lock(impl->mutex); + // filter recently trimmed bucket instances out of bucket change counter + if (impl->trimmed.lookup(bucket)) { + return; + } + impl->counter.insert(std::string(bucket)); +} + +RGWCoroutine* BucketTrimManager::create_bucket_trim_cr(RGWHTTPManager *http) +{ + return new BucketTrimPollCR(impl->store, http, impl->config, + impl.get(), impl->status_obj, this); +} + +RGWCoroutine* BucketTrimManager::create_admin_bucket_trim_cr(RGWHTTPManager *http) +{ + // return the trim coroutine without any polling + return new BucketTrimCR(impl->store, http, impl->config, + impl.get(), impl->status_obj, this); +} + +CephContext* BucketTrimManager::get_cct() const +{ + return impl->store->ctx(); +} + +unsigned BucketTrimManager::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& BucketTrimManager::gen_prefix(std::ostream& out) const +{ + return out << "rgw bucket trim manager: "; +} + +} // namespace rgw + +int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, uint64_t gen, int shard_id, + std::string_view start_marker, std::string_view end_marker) +{ + auto& logs = bucket_info.layout.logs; + auto log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(gen)); + if (log == logs.end()) { + ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << "ERROR: no log layout with gen=" << gen << dendl; + return -ENOENT; + } + + auto log_layout = *log; + + auto r = store->svc()->bilog_rados->log_trim(p, bucket_info, log_layout, shard_id, start_marker, end_marker); + if (r < 0) { + ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << "ERROR: bilog_rados->log_trim returned r=" << r << dendl; + } + return r; +} diff --git a/src/rgw/driver/rados/rgw_trim_bilog.h b/src/rgw/driver/rados/rgw_trim_bilog.h new file mode 100644 index 000000000..6a11d2476 --- /dev/null +++ b/src/rgw/driver/rados/rgw_trim_bilog.h @@ 
-0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * Author: Casey Bodley + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#pragma once + +#include +#include + +#include "include/common_fwd.h" +#include "include/encoding.h" +#include "common/ceph_time.h" +#include "common/dout.h" +#include "rgw_common.h" + +class RGWCoroutine; +class RGWHTTPManager; + +namespace rgw { + +namespace sal { + class RadosStore; +} + +/// Interface to inform the trim process about which buckets are most active +struct BucketChangeObserver { + virtual ~BucketChangeObserver() = default; + + virtual void on_bucket_changed(const std::string_view& bucket_instance) = 0; +}; + +/// Configuration for BucketTrimManager +struct BucketTrimConfig { + /// time interval in seconds between bucket trim attempts + uint32_t trim_interval_sec{0}; + /// maximum number of buckets to track with BucketChangeObserver + size_t counter_size{0}; + /// maximum number of buckets to process each trim interval + uint32_t buckets_per_interval{0}; + /// minimum number of buckets to choose from the global bucket instance list + uint32_t min_cold_buckets_per_interval{0}; + /// maximum number of buckets to process in parallel + uint32_t concurrent_buckets{0}; + /// timeout in ms for bucket trim notify replies + uint64_t notify_timeout_ms{0}; + /// maximum number of recently trimmed buckets to remember (should be small + /// enough for a linear search) + size_t recent_size{0}; + /// maximum duration to consider a trim as 'recent' (should be some multiple + /// of the trim interval, at least) + ceph::timespan recent_duration{0}; +}; + +/// fill out the BucketTrimConfig from the ceph 
context +void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config); + +/// Determines the buckets on which to focus trim activity, using two sources of +/// input: the frequency of entries read from the data changes log, and a global +/// listing of the bucket.instance metadata. This allows us to trim active +/// buckets quickly, while also ensuring that all buckets will eventually trim +class BucketTrimManager : public BucketChangeObserver, public DoutPrefixProvider { + class Impl; + std::unique_ptr impl; + public: + BucketTrimManager(sal::RadosStore *store, const BucketTrimConfig& config); + ~BucketTrimManager(); + + int init(); + + /// increment a counter for the given bucket instance + void on_bucket_changed(const std::string_view& bucket_instance) override; + + /// create a coroutine to run the bucket trim process every trim interval + RGWCoroutine* create_bucket_trim_cr(RGWHTTPManager *http); + + /// create a coroutine to trim buckets directly via radosgw-admin + RGWCoroutine* create_admin_bucket_trim_cr(RGWHTTPManager *http); + + CephContext *get_cct() const override; + unsigned get_subsys() const; + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/// provides persistent storage for the trim manager's current position in the +/// list of bucket instance metadata +struct BucketTrimStatus { + std::string marker; //< metadata key of current bucket instance + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(marker, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(marker, p); + DECODE_FINISH(p); + } + + static const std::string oid; +}; + +} // namespace rgw + +WRITE_CLASS_ENCODER(rgw::BucketTrimStatus); + +int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, uint64_t gen, int shard_id, + std::string_view start_marker, std::string_view end_marker); diff --git a/src/rgw/driver/rados/rgw_trim_datalog.cc 
b/src/rgw/driver/rados/rgw_trim_datalog.cc new file mode 100644 index 000000000..72a160039 --- /dev/null +++ b/src/rgw/driver/rados/rgw_trim_datalog.cc @@ -0,0 +1,252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "common/errno.h" + +#include "rgw_trim_datalog.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_datalog.h" +#include "rgw_data_sync.h" +#include "rgw_zone.h" +#include "rgw_bucket.h" + +#include "services/svc_zone.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "data trim: ") + +namespace { + +class DatalogTrimImplCR : public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + boost::intrusive_ptr cn; + int shard; + std::string marker; + std::string* last_trim_marker; + + public: + DatalogTrimImplCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, int shard, + const std::string& marker, std::string* last_trim_marker) + : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), shard(shard), + marker(marker), last_trim_marker(last_trim_marker) { + set_description() << "Datalog trim shard=" << shard + << " marker=" << marker; + } + + int send_request(const DoutPrefixProvider *dpp) override { + set_status() << "sending request"; + cn = stack->create_completion_notifier(); + return store->svc()->datalog_rados->trim_entries(dpp, shard, marker, + cn->completion()); + } + int request_complete() override { + int r = cn->completion()->get_return_value(); + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << "(): trim of shard=" << shard + << " marker=" << marker << " returned r=" << r << dendl; + + set_status() << "request complete; ret=" << r; + if (r != -ENODATA) { + return r; + } + // nothing left to trim, update last_trim_marker + if (*last_trim_marker < marker && + marker != store->svc()->datalog_rados->max_marker()) { + *last_trim_marker = 
marker; + } + return 0; + } +}; + +/// return the marker that it's safe to trim up to +const std::string& get_stable_marker(const rgw_data_sync_marker& m) +{ + return m.state == m.FullSync ? m.next_step_marker : m.marker; +} + +/// populate the container starting with 'dest' with the minimum stable marker +/// of each shard for all of the peers in [first, last) +template +void take_min_markers(IterIn first, IterIn last, IterOut dest) +{ + if (first == last) { + return; + } + for (auto p = first; p != last; ++p) { + auto m = dest; + for (auto &shard : p->sync_markers) { + const auto& stable = get_stable_marker(shard.second); + if (*m > stable) { + *m = stable; + } + ++m; + } + } +} + +} // anonymous namespace + +class DataLogTrimCR : public RGWCoroutine { + using TrimCR = DatalogTrimImplCR; + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + RGWHTTPManager *http; + const int num_shards; + const std::string& zone_id; //< my zone id + std::vector peer_status; //< sync status for each peer + std::vector min_shard_markers; //< min marker per shard + std::vector& last_trim; //< last trimmed marker per shard + int ret{0}; + + public: + DataLogTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, + int num_shards, std::vector& last_trim) + : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http), + num_shards(num_shards), + zone_id(store->svc()->zone->get_zone().id), + peer_status(store->svc()->zone->get_zone_data_notify_to_map().size()), + min_shard_markers(num_shards, + std::string(store->svc()->datalog_rados->max_marker())), + last_trim(last_trim) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int DataLogTrimCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + ldpp_dout(dpp, 10) << "fetching sync status for zone " << zone_id << dendl; + set_status("fetching sync status"); + yield { + // query data sync status from each sync peer + rgw_http_param_pair params[] = { + { "type", 
"data" }, + { "status", nullptr }, + { "source-zone", zone_id.c_str() }, + { nullptr, nullptr } + }; + + auto p = peer_status.begin(); + for (auto& c : store->svc()->zone->get_zone_data_notify_to_map()) { + ldpp_dout(dpp, 20) << "query sync status from " << c.first << dendl; + using StatusCR = RGWReadRESTResourceCR; + spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p), + false); + ++p; + } + } + + // must get a successful reply from all peers to consider trimming + ret = 0; + while (ret == 0 && num_spawned() > 0) { + yield wait_for_child(); + collect_next(&ret); + } + drain_all(); + + if (ret < 0) { + ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl; + return set_cr_error(ret); + } + + ldpp_dout(dpp, 10) << "trimming log shards" << dendl; + set_status("trimming log shards"); + yield { + // determine the minimum marker for each shard + take_min_markers(peer_status.begin(), peer_status.end(), + min_shard_markers.begin()); + + for (int i = 0; i < num_shards; i++) { + const auto& m = min_shard_markers[i]; + if (m <= last_trim[i]) { + continue; + } + ldpp_dout(dpp, 10) << "trimming log shard " << i + << " at marker=" << m + << " last_trim=" << last_trim[i] << dendl; + spawn(new TrimCR(dpp, store, i, m, &last_trim[i]), + true); + } + } + return set_cr_done(); + } + return 0; +} + +RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards, + std::vector& markers) +{ + return new DataLogTrimCR(dpp, store, http, num_shards, markers); +} + +class DataLogTrimPollCR : public RGWCoroutine { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* store; + RGWHTTPManager *http; + const int num_shards; + const utime_t interval; //< polling interval + const std::string lock_oid; //< use first data log shard for lock + const std::string lock_cookie; + std::vector last_trim; //< last trimmed marker per shard + + public: + DataLogTrimPollCR(const 
DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, + int num_shards, utime_t interval) + : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http), + num_shards(num_shards), interval(interval), + lock_oid(store->svc()->datalog_rados->get_oid(0, 0)), + lock_cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)), + last_trim(num_shards) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int DataLogTrimPollCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + for (;;) { + set_status("sleeping"); + wait(interval); + + // request a 'data_trim' lock that covers the entire wait interval to + // prevent other gateways from attempting to trim for the duration + set_status("acquiring trim lock"); + yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store, + rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, lock_oid), + "data_trim", lock_cookie, + interval.sec())); + if (retcode < 0) { + // if the lock is already held, go back to sleep and try again later + ldpp_dout(dpp, 4) << "failed to lock " << lock_oid << ", trying again in " + << interval.sec() << "s" << dendl; + continue; + } + + set_status("trimming"); + yield call(new DataLogTrimCR(dpp, store, http, num_shards, last_trim)); + + // note that the lock is not released. 
this is intentional, as it avoids + // duplicating this work in other gateways + } + } + return 0; +} + +RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards, utime_t interval) +{ + return new DataLogTrimPollCR(dpp, store, http, num_shards, interval); +} diff --git a/src/rgw/driver/rados/rgw_trim_datalog.h b/src/rgw/driver/rados/rgw_trim_datalog.h new file mode 100644 index 000000000..9f5bf7252 --- /dev/null +++ b/src/rgw/driver/rados/rgw_trim_datalog.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include + +#include "common/dout.h" + +class RGWCoroutine; +class RGWRados; +class RGWHTTPManager; +class utime_t; +namespace rgw { namespace sal { + class RadosStore; +} } + +// DataLogTrimCR factory function +extern RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards, utime_t interval); + +// factory function for datalog trim via radosgw-admin +RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards, + std::vector& markers); diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.cc b/src/rgw/driver/rados/rgw_trim_mdlog.cc new file mode 100644 index 000000000..d8e19594a --- /dev/null +++ b/src/rgw/driver/rados/rgw_trim_mdlog.cc @@ -0,0 +1,795 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" + +#include "rgw_trim_mdlog.h" +#include "rgw_sync.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_zone.h" +#include "services/svc_zone.h" +#include "services/svc_meta.h" +#include "services/svc_mdlog.h" +#include "services/svc_cls.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define 
dout_prefix (*_dout << "meta trim: ") + +/// purge all log shards for the given mdlog +class PurgeLogShardsCR : public RGWShardCollectCR { + rgw::sal::RadosStore* const store; + const RGWMetadataLog* mdlog; + const int num_shards; + rgw_raw_obj obj; + int i{0}; + + static constexpr int max_concurrent = 16; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to remove mdlog shard: " << cpp_strerror(r) << dendl; + } + return r; + } + public: + PurgeLogShardsCR(rgw::sal::RadosStore* store, const RGWMetadataLog* mdlog, + const rgw_pool& pool, int num_shards) + : RGWShardCollectCR(store->ctx(), max_concurrent), + store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "") + {} + + bool spawn_next() override { + if (i == num_shards) { + return false; + } + mdlog->get_shard_oid(i++, obj.oid); + spawn(new RGWRadosRemoveCR(store, obj), false); + return true; + } +}; + +using Cursor = RGWPeriodHistory::Cursor; + +/// purge mdlogs from the oldest up to (but not including) the given realm_epoch +class PurgePeriodLogsCR : public RGWCoroutine { + struct Svc { + RGWSI_Zone *zone; + RGWSI_MDLog *mdlog; + } svc; + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* const store; + RGWMetadataManager *const metadata; + RGWObjVersionTracker objv; + Cursor cursor; + epoch_t realm_epoch; + epoch_t *last_trim_epoch; //< update last trim on success + + public: + PurgePeriodLogsCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, epoch_t realm_epoch, epoch_t *last_trim) + : RGWCoroutine(store->ctx()), dpp(dpp), store(store), metadata(store->ctl()->meta.mgr), + realm_epoch(realm_epoch), last_trim_epoch(last_trim) { + svc.zone = store->svc()->zone; + svc.mdlog = store->svc()->mdlog; + } + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int PurgePeriodLogsCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // read our current oldest log period + 
yield call(svc.mdlog->read_oldest_log_period_cr(dpp, &cursor, &objv)); + if (retcode < 0) { + return set_cr_error(retcode); + } + ceph_assert(cursor); + ldpp_dout(dpp, 20) << "oldest log realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + + // trim -up to- the given realm_epoch + while (cursor.get_epoch() < realm_epoch) { + ldpp_dout(dpp, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + yield { + const auto mdlog = svc.mdlog->get_log(cursor.get_period().get_id()); + const auto& pool = svc.zone->get_zone_params().log_pool; + auto num_shards = cct->_conf->rgw_md_log_max_shards; + call(new PurgeLogShardsCR(store, mdlog, pool, num_shards)); + } + if (retcode < 0) { + ldpp_dout(dpp, 1) << "failed to remove log shards: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + ldpp_dout(dpp, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + + // update our mdlog history + yield call(svc.mdlog->trim_log_period_cr(dpp, cursor, &objv)); + if (retcode == -ENOENT) { + // must have raced to update mdlog history. 
return success and allow the + // winner to continue purging + ldpp_dout(dpp, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + return set_cr_done(); + } else if (retcode < 0) { + ldpp_dout(dpp, 1) << "failed to remove log shards for realm_epoch=" + << cursor.get_epoch() << " period=" << cursor.get_period().get_id() + << " with: " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (*last_trim_epoch < cursor.get_epoch()) { + *last_trim_epoch = cursor.get_epoch(); + } + + ceph_assert(cursor.has_next()); // get_current() should always come after + cursor.next(); + } + return set_cr_done(); + } + return 0; +} + +namespace { + +using connection_map = std::map>; + +/// construct a RGWRESTConn for each zone in the realm +template +connection_map make_peer_connections(rgw::sal::RadosStore* store, + const Zonegroups& zonegroups) +{ + connection_map connections; + for (auto& g : zonegroups) { + for (auto& z : g.second.zones) { + std::unique_ptr conn{ + new RGWRESTConn(store->ctx(), store, z.first.id, z.second.endpoints, g.second.api_name)}; + connections.emplace(z.first.id, std::move(conn)); + } + } + return connections; +} + +/// return the marker that it's safe to trim up to +const std::string& get_stable_marker(const rgw_meta_sync_marker& m) +{ + return m.state == m.FullSync ? 
m.next_step_marker : m.marker; +} + +/// comparison operator for take_min_status() +bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs) +{ + // sort by stable marker + return get_stable_marker(lhs) < get_stable_marker(rhs); +} + +/// populate the status with the minimum stable marker of each shard for any +/// peer whose realm_epoch matches the minimum realm_epoch in the input +template +int take_min_status(CephContext *cct, Iter first, Iter last, + rgw_meta_sync_status *status) +{ + if (first == last) { + return -EINVAL; + } + const size_t num_shards = cct->_conf->rgw_md_log_max_shards; + + status->sync_info.realm_epoch = std::numeric_limits::max(); + for (auto p = first; p != last; ++p) { + // validate peer's shard count + if (p->sync_markers.size() != num_shards) { + ldout(cct, 1) << "take_min_status got peer status with " + << p->sync_markers.size() << " shards, expected " + << num_shards << dendl; + return -EINVAL; + } + if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) { + // earlier epoch, take its entire status + *status = std::move(*p); + } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) { + // same epoch, take any earlier markers + auto m = status->sync_markers.begin(); + for (auto& shard : p->sync_markers) { + if (shard.second < m->second) { + m->second = std::move(shard.second); + } + ++m; + } + } + } + return 0; +} + +struct TrimEnv { + const DoutPrefixProvider *dpp; + rgw::sal::RadosStore* const store; + RGWHTTPManager *const http; + int num_shards; + const rgw_zone_id& zone; + Cursor current; //< cursor to current period + epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged + + TrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards) + : dpp(dpp), store(store), http(http), num_shards(num_shards), + zone(store->svc()->zone->zone_id()), + current(store->svc()->mdlog->get_period_history()->get_current()) + {} +}; + +struct 
MasterTrimEnv : public TrimEnv { + connection_map connections; //< peer connections + std::vector peer_status; //< sync status for each peer + /// last trim marker for each shard, only applies to current period's mdlog + std::vector last_trim_markers; + + MasterTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards) + : TrimEnv(dpp, store, http, num_shards), + last_trim_markers(num_shards) + { + auto& period = current.get_period(); + connections = make_peer_connections(store, period.get_map().zonegroups); + connections.erase(zone.id); + peer_status.resize(connections.size()); + } +}; + +struct PeerTrimEnv : public TrimEnv { + /// last trim timestamp for each shard, only applies to current period's mdlog + std::vector last_trim_timestamps; + + PeerTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards) + : TrimEnv(dpp, store, http, num_shards), + last_trim_timestamps(num_shards) + {} + + void set_num_shards(int num_shards) { + this->num_shards = num_shards; + last_trim_timestamps.resize(num_shards); + } +}; + +} // anonymous namespace + + +/// spawn a trim cr for each shard that needs it, while limiting the number +/// of concurrent shards +class MetaMasterTrimShardCollectCR : public RGWShardCollectCR { + private: + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + MasterTrimEnv& env; + RGWMetadataLog *mdlog; + int shard_id{0}; + std::string oid; + const rgw_meta_sync_status& sync_status; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl; + } + return r; + } + public: + MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog, + const rgw_meta_sync_status& sync_status) + : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS), + env(env), mdlog(mdlog), sync_status(sync_status) + {} + + bool 
spawn_next() override; +}; + +bool MetaMasterTrimShardCollectCR::spawn_next() +{ + while (shard_id < env.num_shards) { + auto m = sync_status.sync_markers.find(shard_id); + if (m == sync_status.sync_markers.end()) { + shard_id++; + continue; + } + auto& stable = get_stable_marker(m->second); + auto& last_trim = env.last_trim_markers[shard_id]; + + if (stable <= last_trim) { + // already trimmed + ldpp_dout(env.dpp, 20) << "skipping log shard " << shard_id + << " at marker=" << stable + << " last_trim=" << last_trim + << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl; + shard_id++; + continue; + } + + mdlog->get_shard_oid(shard_id, oid); + + ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id + << " at marker=" << stable + << " last_trim=" << last_trim + << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl; + spawn(new RGWSyncLogTrimCR(env.dpp, env.store, oid, stable, &last_trim), false); + shard_id++; + return true; + } + return false; +} + +/// spawn rest requests to read each peer's sync status +class MetaMasterStatusCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + MasterTrimEnv& env; + connection_map::iterator c; + std::vector::iterator s; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to fetch metadata sync status: " + << cpp_strerror(r) << dendl; + } + return r; + } + public: + explicit MetaMasterStatusCollectCR(MasterTrimEnv& env) + : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS), + env(env), c(env.connections.begin()), s(env.peer_status.begin()) + {} + + bool spawn_next() override { + if (c == env.connections.end()) { + return false; + } + static rgw_http_param_pair params[] = { + { "type", "metadata" }, + { "status", nullptr }, + { nullptr, nullptr } + }; + + ldout(cct, 20) << "query sync status from " << c->first << dendl; + auto conn = c->second.get(); + using 
StatusCR = RGWReadRESTResourceCR; + spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s), + false); + ++c; + ++s; + return true; + } +}; + +class MetaMasterTrimCR : public RGWCoroutine { + MasterTrimEnv& env; + rgw_meta_sync_status min_status; //< minimum sync status of all peers + int ret{0}; + + public: + explicit MetaMasterTrimCR(MasterTrimEnv& env) + : RGWCoroutine(env.store->ctx()), env(env) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int MetaMasterTrimCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // TODO: detect this and fail before we spawn the trim thread? + if (env.connections.empty()) { + ldpp_dout(dpp, 4) << "no peers, exiting" << dendl; + return set_cr_done(); + } + + ldpp_dout(dpp, 10) << "fetching sync status for zone " << env.zone << dendl; + // query mdlog sync status from peers + yield call(new MetaMasterStatusCollectCR(env)); + + // must get a successful reply from all peers to consider trimming + if (ret < 0) { + ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl; + return set_cr_error(ret); + } + + // determine the minimum epoch and markers + ret = take_min_status(env.store->ctx(), env.peer_status.begin(), + env.peer_status.end(), &min_status); + if (ret < 0) { + ldpp_dout(dpp, 4) << "failed to calculate min sync status from peers" << dendl; + return set_cr_error(ret); + } + yield { + auto store = env.store; + auto epoch = min_status.sync_info.realm_epoch; + ldpp_dout(dpp, 4) << "realm epoch min=" << epoch + << " current=" << env.current.get_epoch()<< dendl; + if (epoch > env.last_trim_epoch + 1) { + // delete any prior mdlog periods + spawn(new PurgePeriodLogsCR(dpp, store, epoch, &env.last_trim_epoch), true); + } else { + ldpp_dout(dpp, 10) << "mdlogs already purged up to realm_epoch " + << env.last_trim_epoch << dendl; + } + + // if realm_epoch == current, trim mdlog based on markers + if (epoch == env.current.get_epoch()) { + auto mdlog = 
store->svc()->mdlog->get_log(env.current.get_period().get_id()); + spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true); + } + } + // ignore any errors during purge/trim because we want to hold the lock open + return set_cr_done(); + } + return 0; +} + + +/// read the first entry of the master's mdlog shard and trim to that position +class MetaPeerTrimShardCR : public RGWCoroutine { + RGWMetaSyncEnv& env; + RGWMetadataLog *mdlog; + const std::string& period_id; + const int shard_id; + RGWMetadataLogInfo info; + ceph::real_time stable; //< safe timestamp to trim, according to master + ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim + rgw_mdlog_shard_data result; //< result from master's mdlog listing + + public: + MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog, + const std::string& period_id, int shard_id, + ceph::real_time *last_trim) + : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog), + period_id(period_id), shard_id(shard_id), last_trim(last_trim) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int MetaPeerTrimShardCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // query master's first mdlog entry for this shard + yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id, + "", 1, &result)); + if (retcode < 0) { + ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard " + << shard_id << " for period " << period_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + if (result.entries.empty()) { + // if there are no mdlog entries, we don't have a timestamp to compare. we + // can't just trim everything, because there could be racing updates since + // this empty reply. 
query the mdlog shard info to read its max timestamp, + // then retry the listing to make sure it's still empty before trimming to + // that + ldpp_dout(dpp, 10) << "empty master mdlog shard " << shard_id + << ", reading last timestamp from shard info" << dendl; + // read the mdlog shard info for the last timestamp + yield call(create_read_remote_mdlog_shard_info_cr(&env, period_id, shard_id, &info)); + if (retcode < 0) { + ldpp_dout(dpp, 5) << "failed to read info from master's mdlog shard " + << shard_id << " for period " << period_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + if (ceph::real_clock::is_zero(info.last_update)) { + return set_cr_done(); // nothing to trim + } + ldpp_dout(dpp, 10) << "got mdlog shard info with last update=" + << info.last_update << dendl; + // re-read the master's first mdlog entry to make sure it hasn't changed + yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id, + "", 1, &result)); + if (retcode < 0) { + ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard " + << shard_id << " for period " << period_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + // if the mdlog is still empty, trim to max marker + if (result.entries.empty()) { + stable = info.last_update; + } else { + stable = result.entries.front().timestamp; + + // can only trim -up to- master's first timestamp, so subtract a second. 
+ // (this is why we use timestamps instead of markers for the peers) + stable -= std::chrono::seconds(1); + } + } else { + stable = result.entries.front().timestamp; + stable -= std::chrono::seconds(1); + } + + if (stable <= *last_trim) { + ldpp_dout(dpp, 10) << "skipping log shard " << shard_id + << " at timestamp=" << stable + << " last_trim=" << *last_trim << dendl; + return set_cr_done(); + } + + ldpp_dout(dpp, 10) << "trimming log shard " << shard_id + << " at timestamp=" << stable + << " last_trim=" << *last_trim << dendl; + yield { + std::string oid; + mdlog->get_shard_oid(shard_id, oid); + call(new RGWRadosTimelogTrimCR(dpp, env.store, oid, real_time{}, stable, "", "")); + } + if (retcode < 0 && retcode != -ENODATA) { + ldpp_dout(dpp, 1) << "failed to trim mdlog shard " << shard_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + *last_trim = stable; + return set_cr_done(); + } + return 0; +} + +class MetaPeerTrimShardCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + PeerTrimEnv& env; + RGWMetadataLog *mdlog; + const std::string& period_id; + RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR + int shard_id{0}; + + int handle_result(int r) override { + if (r == -ENOENT) { // ENOENT is not a fatal error + return 0; + } + if (r < 0) { + ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl; + } + return r; + } + public: + MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog) + : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS), + env(env), mdlog(mdlog), period_id(env.current.get_period().get_id()) + { + meta_env.init(env.dpp, cct, env.store, env.store->svc()->zone->get_master_conn(), + env.store->svc()->rados->get_async_processor(), env.http, nullptr, + env.store->getRados()->get_sync_tracer()); + } + + bool spawn_next() override; +}; + +bool MetaPeerTrimShardCollectCR::spawn_next() +{ + if (shard_id >= env.num_shards) { + 
return false; + } + auto& last_trim = env.last_trim_timestamps[shard_id]; + spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim), + false); + shard_id++; + return true; +} + +class MetaPeerTrimCR : public RGWCoroutine { + PeerTrimEnv& env; + rgw_mdlog_info mdlog_info; //< master's mdlog info + + public: + explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int MetaPeerTrimCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + ldpp_dout(dpp, 10) << "fetching master mdlog info" << dendl; + yield { + // query mdlog_info from master for oldest_log_period + rgw_http_param_pair params[] = { + { "type", "metadata" }, + { nullptr, nullptr } + }; + + using LogInfoCR = RGWReadRESTResourceCR; + call(new LogInfoCR(cct, env.store->svc()->zone->get_master_conn(), env.http, + "/admin/log/", params, &mdlog_info)); + } + if (retcode < 0) { + ldpp_dout(dpp, 4) << "failed to read mdlog info from master" << dendl; + return set_cr_error(retcode); + } + // use master's shard count instead + env.set_num_shards(mdlog_info.num_shards); + + if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) { + // delete any prior mdlog periods + yield call(new PurgePeriodLogsCR(dpp, env.store, mdlog_info.realm_epoch, + &env.last_trim_epoch)); + } else { + ldpp_dout(dpp, 10) << "mdlogs already purged through realm_epoch " + << env.last_trim_epoch << dendl; + } + + // if realm_epoch == current, trim mdlog based on master's markers + if (mdlog_info.realm_epoch == env.current.get_epoch()) { + yield { + auto mdlog = env.store->svc()->mdlog->get_log(env.current.get_period().get_id()); + call(new MetaPeerTrimShardCollectCR(env, mdlog)); + // ignore any errors during purge/trim because we want to hold the lock open + } + } + return set_cr_done(); + } + return 0; +} + +class MetaTrimPollCR : public RGWCoroutine { + rgw::sal::RadosStore* const store; + const utime_t interval; 
//< polling interval + const rgw_raw_obj obj; + const std::string name{"meta_trim"}; //< lock name + const std::string cookie; + + protected: + /// allocate the coroutine to run within the lease + virtual RGWCoroutine* alloc_cr() = 0; + + public: + MetaTrimPollCR(rgw::sal::RadosStore* store, utime_t interval) + : RGWCoroutine(store->ctx()), store(store), interval(interval), + obj(store->svc()->zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid), + cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int MetaTrimPollCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + for (;;) { + set_status("sleeping"); + wait(interval); + + // prevent others from trimming for our entire wait interval + set_status("acquiring trim lock"); + yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store, + obj, name, cookie, interval.sec())); + if (retcode < 0) { + ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl; + continue; + } + + set_status("trimming"); + yield call(alloc_cr()); + + if (retcode < 0) { + // on errors, unlock so other gateways can try + set_status("unlocking"); + yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store, + obj, name, cookie)); + } + } + } + return 0; +} + +class MetaMasterTrimPollCR : public MetaTrimPollCR { + MasterTrimEnv env; //< trim state to share between calls + RGWCoroutine* alloc_cr() override { + return new MetaMasterTrimCR(env); + } + public: + MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, + int num_shards, utime_t interval) + : MetaTrimPollCR(store, interval), + env(dpp, store, http, num_shards) + {} +}; + +class MetaPeerTrimPollCR : public MetaTrimPollCR { + PeerTrimEnv env; //< trim state to share between calls + RGWCoroutine* alloc_cr() override { + return new MetaPeerTrimCR(env); + } + public: + 
MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, + int num_shards, utime_t interval) + : MetaTrimPollCR(store, interval), + env(dpp, store, http, num_shards) + {} +}; + +namespace { +bool sanity_check_endpoints(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store) { + bool retval = true; + auto current = store->svc()->mdlog->get_period_history()->get_current(); + const auto& period = current.get_period(); + for (const auto& [_, zonegroup] : period.get_map().zonegroups) { + if (zonegroup.endpoints.empty()) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " WARNING: Cluster is is misconfigured! " + << " Zonegroup " << zonegroup.get_name() + << " (" << zonegroup.get_id() << ") in Realm " + << period.get_realm_name() << " ( " << period.get_realm() << ") " + << " has no endpoints!" << dendl; + } + for (const auto& [_, zone] : zonegroup.zones) { + if (zone.endpoints.empty()) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " ERROR: Cluster is is misconfigured! " + << " Zone " << zone.name << " (" << zone.id << ") in Zonegroup " + << zonegroup.get_name() << " ( " << zonegroup.get_id() + << ") in Realm " << period.get_realm_name() + << " ( " << period.get_realm() << ") " + << " has no endpoints! Trimming is impossible." << dendl; + retval = false; + } + } + } + return retval; +} +} + +RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, + int num_shards, utime_t interval) +{ + if (!sanity_check_endpoints(dpp, store)) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " ERROR: Cluster is is misconfigured! Refusing to trim." 
<< dendl; + return nullptr; + } + if (store->svc()->zone->is_meta_master()) { + return new MetaMasterTrimPollCR(dpp, store, http, num_shards, interval); + } + return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval); +} + + +struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR { + MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards) + : MasterTrimEnv(dpp, store, http, num_shards), + MetaMasterTrimCR(*static_cast(this)) + {} +}; + +struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR { + MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards) + : PeerTrimEnv(dpp, store, http, num_shards), + MetaPeerTrimCR(*static_cast(this)) + {} +}; + +RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards) +{ + if (!sanity_check_endpoints(dpp, store)) { + ldpp_dout(dpp, -1) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " ERROR: Cluster is is misconfigured! Refusing to trim." 
<< dendl; + return nullptr; + } + if (store->svc()->zone->is_meta_master()) { + return new MetaMasterAdminTrimCR(dpp, store, http, num_shards); + } + return new MetaPeerAdminTrimCR(dpp, store, http, num_shards); +} diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.h b/src/rgw/driver/rados/rgw_trim_mdlog.h new file mode 100644 index 000000000..1dba8612b --- /dev/null +++ b/src/rgw/driver/rados/rgw_trim_mdlog.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +class RGWCoroutine; +class DoutPrefixProvider; +class RGWRados; +class RGWHTTPManager; +class utime_t; +namespace rgw { namespace sal { + class RadosStore; +} } + +// MetaLogTrimCR factory function +RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards, utime_t interval); + +// factory function for mdlog trim via radosgw-admin +RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + RGWHTTPManager *http, + int num_shards); diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc new file mode 100644 index 000000000..51b38c082 --- /dev/null +++ b/src/rgw/driver/rados/rgw_user.cc @@ -0,0 +1,2776 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" + +#include "rgw_user.h" + +#include "rgw_bucket.h" +#include "rgw_quota.h" + +#include "services/svc_user.h" +#include "services/svc_meta.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +extern void op_type_to_str(uint32_t mask, char *buf, int len); + +static string key_type_to_str(int key_type) { + switch (key_type) { + case KEY_TYPE_SWIFT: + return "swift"; + break; + + default: + return "s3"; + break; + } +} + +static bool char_is_unreserved_url(char c) +{ + if (isalnum(c)) + return true; + + switch (c) { + case 
'-': + case '.': + case '_': + case '~': + return true; + default: + return false; + } +} + +static bool validate_access_key(string& key) +{ + const char *p = key.c_str(); + while (*p) { + if (!char_is_unreserved_url(*p)) + return false; + p++; + } + return true; +} + +static void set_err_msg(std::string *sink, std::string msg) +{ + if (sink && !msg.empty()) + *sink = msg; +} + +/* + * Dump either the full user info or a subset to a formatter. + * + * NOTE: It is the caller's responsibility to ensure that the + * formatter is flushed at the correct time. + */ + +static void dump_subusers_info(Formatter *f, RGWUserInfo &info) +{ + map::iterator uiter; + + f->open_array_section("subusers"); + for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) { + RGWSubUser& u = uiter->second; + f->open_object_section("user"); + string s; + info.user_id.to_str(s); + f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str()); + char buf[256]; + rgw_perm_to_str(u.perm_mask, buf, sizeof(buf)); + f->dump_string("permissions", buf); + f->close_section(); + } + f->close_section(); +} + +static void dump_access_keys_info(Formatter *f, RGWUserInfo &info) +{ + map::iterator kiter; + f->open_array_section("keys"); + for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) { + RGWAccessKey& k = kiter->second; + const char *sep = (k.subuser.empty() ? "" : ":"); + const char *subuser = (k.subuser.empty() ? 
"" : k.subuser.c_str()); + f->open_object_section("key"); + string s; + info.user_id.to_str(s); + f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser); + f->dump_string("access_key", k.id); + f->dump_string("secret_key", k.key); + f->close_section(); + } + f->close_section(); +} + +static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info) +{ + map::iterator kiter; + f->open_array_section("swift_keys"); + for (kiter = info.swift_keys.begin(); kiter != info.swift_keys.end(); ++kiter) { + RGWAccessKey& k = kiter->second; + const char *sep = (k.subuser.empty() ? "" : ":"); + const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str()); + f->open_object_section("key"); + string s; + info.user_id.to_str(s); + f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser); + f->dump_string("secret_key", k.key); + f->close_section(); + } + f->close_section(); +} + +static void dump_user_info(Formatter *f, RGWUserInfo &info, + RGWStorageStats *stats = NULL) +{ + f->open_object_section("user_info"); + encode_json("tenant", info.user_id.tenant, f); + encode_json("user_id", info.user_id.id, f); + encode_json("display_name", info.display_name, f); + encode_json("email", info.user_email, f); + encode_json("suspended", (int)info.suspended, f); + encode_json("max_buckets", (int)info.max_buckets, f); + + dump_subusers_info(f, info); + dump_access_keys_info(f, info); + dump_swift_keys_info(f, info); + + encode_json("caps", info.caps, f); + + char buf[256]; + op_type_to_str(info.op_mask, buf, sizeof(buf)); + encode_json("op_mask", (const char *)buf, f); + encode_json("system", (bool)info.system, f); + encode_json("admin", (bool)info.admin, f); + encode_json("default_placement", info.default_placement.name, f); + encode_json("default_storage_class", info.default_placement.storage_class, f); + encode_json("placement_tags", info.placement_tags, f); + encode_json("bucket_quota", info.quota.bucket_quota, f); + encode_json("user_quota", info.quota.user_quota, f); + 
encode_json("temp_url_keys", info.temp_url_keys, f); + + string user_source_type; + switch ((RGWIdentityType)info.type) { + case TYPE_RGW: + user_source_type = "rgw"; + break; + case TYPE_KEYSTONE: + user_source_type = "keystone"; + break; + case TYPE_LDAP: + user_source_type = "ldap"; + break; + case TYPE_NONE: + user_source_type = "none"; + break; + default: + user_source_type = "none"; + break; + } + encode_json("type", user_source_type, f); + encode_json("mfa_ids", info.mfa_ids, f); + if (stats) { + encode_json("stats", *stats, f); + } + f->close_section(); +} + +static int user_add_helper(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + int ret = 0; + const rgw_user& uid = op_state.get_user_id(); + std::string user_email = op_state.get_user_email(); + std::string display_name = op_state.get_display_name(); + + // fail if the user exists already + if (op_state.has_existing_user()) { + if (op_state.found_by_email) { + set_err_msg(err_msg, "email: " + user_email + + " is the email address of an existing user"); + ret = -ERR_EMAIL_EXIST; + } else if (op_state.found_by_key) { + set_err_msg(err_msg, "duplicate key provided"); + ret = -ERR_KEY_EXIST; + } else { + set_err_msg(err_msg, "user: " + uid.to_str() + " exists"); + ret = -EEXIST; + } + return ret; + } + + // fail if the user_info has already been populated + if (op_state.is_populated()) { + set_err_msg(err_msg, "cannot overwrite already populated user"); + return -EEXIST; + } + + // fail if the display name was not included + if (display_name.empty()) { + set_err_msg(err_msg, "no display name specified"); + return -EINVAL; + } + + return ret; +} + +RGWAccessKeyPool::RGWAccessKeyPool(RGWUser* usr) +{ + if (!usr) { + return; + } + + user = usr; + + driver = user->get_driver(); +} + +int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state) +{ + if (!op_state.is_initialized()) { + keys_allowed = false; + return -EINVAL; + } + + const rgw_user& uid = op_state.get_user_id(); + if 
(uid.compare(RGW_USER_ANON_ID) == 0) { + keys_allowed = false; + return -EINVAL; + } + + swift_keys = op_state.get_swift_keys(); + access_keys = op_state.get_access_keys(); + + keys_allowed = true; + + return 0; +} + +RGWUserAdminOpState::RGWUserAdminOpState(rgw::sal::Driver* driver) +{ + user = driver->get_user(rgw_user(RGW_USER_ANON_ID)); +} + +void RGWUserAdminOpState::set_user_id(const rgw_user& id) +{ + if (id.empty()) + return; + + user->get_info().user_id = id; +} + +void RGWUserAdminOpState::set_subuser(std::string& _subuser) +{ + if (_subuser.empty()) + return; + + size_t pos = _subuser.find(":"); + if (pos != string::npos) { + rgw_user tmp_id; + tmp_id.from_str(_subuser.substr(0, pos)); + if (tmp_id.tenant.empty()) { + user->get_info().user_id.id = tmp_id.id; + } else { + user->get_info().user_id = tmp_id; + } + subuser = _subuser.substr(pos+1); + } else { + subuser = _subuser; + } + + subuser_specified = true; +} + +void RGWUserAdminOpState::set_user_info(RGWUserInfo& user_info) +{ + user->get_info() = user_info; +} + +void RGWUserAdminOpState::set_user_version_tracker(RGWObjVersionTracker& objv_tracker) +{ + user->get_version_tracker() = objv_tracker; +} + +const rgw_user& RGWUserAdminOpState::get_user_id() +{ + return user->get_id(); +} + +RGWUserInfo& RGWUserAdminOpState::get_user_info() +{ + return user->get_info(); +} + +map* RGWUserAdminOpState::get_swift_keys() +{ + return &user->get_info().swift_keys; +} + +map* RGWUserAdminOpState::get_access_keys() +{ + return &user->get_info().access_keys; +} + +map* RGWUserAdminOpState::get_subusers() +{ + return &user->get_info().subusers; +} + +RGWUserCaps *RGWUserAdminOpState::get_caps_obj() +{ + return &user->get_info().caps; +} + +std::string RGWUserAdminOpState::build_default_swift_kid() +{ + if (user->get_id().empty() || subuser.empty()) + return ""; + + std::string kid; + user->get_id().to_str(kid); + kid.append(":"); + kid.append(subuser); + + return kid; +} + +std::string 
RGWUserAdminOpState::generate_subuser() { + if (user->get_id().empty()) + return ""; + + std::string generated_subuser; + user->get_id().to_str(generated_subuser); + std::string rand_suffix; + + int sub_buf_size = RAND_SUBUSER_LEN + 1; + char sub_buf[RAND_SUBUSER_LEN + 1]; + + gen_rand_alphanumeric_upper(g_ceph_context, sub_buf, sub_buf_size); + + rand_suffix = sub_buf; + if (rand_suffix.empty()) + return ""; + + generated_subuser.append(rand_suffix); + subuser = generated_subuser; + + return generated_subuser; +} + +/* + * Do a fairly exhaustive search for an existing key matching the parameters + * given. Also handles the case where no key type was specified and updates + * the operation state if needed. + */ + +bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state) +{ + bool existing_key = false; + + int key_type = op_state.get_key_type(); + std::string kid = op_state.get_access_key(); + std::map::iterator kiter; + std::string swift_kid = op_state.build_default_swift_kid(); + + RGWUserInfo dup_info; + + if (kid.empty() && swift_kid.empty()) + return false; + + switch (key_type) { + case KEY_TYPE_SWIFT: + kiter = swift_keys->find(swift_kid); + + existing_key = (kiter != swift_keys->end()); + if (existing_key) + op_state.set_access_key(swift_kid); + + break; + case KEY_TYPE_S3: + kiter = access_keys->find(kid); + existing_key = (kiter != access_keys->end()); + + break; + default: + kiter = access_keys->find(kid); + + existing_key = (kiter != access_keys->end()); + if (existing_key) { + op_state.set_key_type(KEY_TYPE_S3); + break; + } + + kiter = swift_keys->find(kid); + + existing_key = (kiter != swift_keys->end()); + if (existing_key) { + op_state.set_key_type(KEY_TYPE_SWIFT); + break; + } + + // handle the case where the access key was not provided in user:key format + if (swift_kid.empty()) + return false; + + kiter = swift_keys->find(swift_kid); + + existing_key = (kiter != swift_keys->end()); + if (existing_key) { + 
op_state.set_access_key(swift_kid); + op_state.set_key_type(KEY_TYPE_SWIFT); + } + } + + op_state.set_existing_key(existing_key); + + return existing_key; +} + +int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state, + std::string *err_msg) +{ + RGWUserInfo dup_info; + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!keys_allowed) { + set_err_msg(err_msg, "keys not allowed for this user"); + return -EACCES; + } + + int32_t key_type = op_state.get_key_type(); + + // if a key type wasn't specified + if (key_type < 0) { + if (op_state.has_subuser()) { + key_type = KEY_TYPE_SWIFT; + } else { + key_type = KEY_TYPE_S3; + } + } + + op_state.set_key_type(key_type); + + /* see if the access key was specified */ + if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() && + op_state.get_access_key().empty()) { + set_err_msg(err_msg, "empty access key"); + return -ERR_INVALID_ACCESS_KEY; + } + + // don't check for secret key because we may be doing a removal + + if (check_existing_key(op_state)) { + op_state.set_access_key_exist(); + } + return 0; +} + +// Generate a new random key +int RGWAccessKeyPool::generate_key(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, + optional_yield y, std::string *err_msg) +{ + std::string id; + std::string key; + + std::pair key_pair; + RGWAccessKey new_key; + std::unique_ptr duplicate_check; + + int key_type = op_state.get_key_type(); + bool gen_access = op_state.will_gen_access(); + bool gen_secret = op_state.will_gen_secret(); + + if (!keys_allowed) { + set_err_msg(err_msg, "access keys not allowed for this user"); + return -EACCES; + } + + if (op_state.has_existing_key()) { + set_err_msg(err_msg, "cannot create existing key"); + return -ERR_KEY_EXIST; + } + + if (!gen_access) { + id = op_state.get_access_key(); + } + + if (!id.empty()) { + switch (key_type) { + case KEY_TYPE_SWIFT: + if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 
0) { + set_err_msg(err_msg, "existing swift key in RGW system:" + id); + return -ERR_KEY_EXIST; + } + break; + case KEY_TYPE_S3: + if (driver->get_user_by_access_key(dpp, id, y, &duplicate_check) >= 0) { + set_err_msg(err_msg, "existing S3 key in RGW system:" + id); + return -ERR_KEY_EXIST; + } + } + } + + //key's subuser + if (op_state.has_subuser()) { + //create user and subuser at the same time, user's s3 key should not be set this + if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) { + new_key.subuser = op_state.get_subuser(); + } + } + + //Secret key + if (!gen_secret) { + if (op_state.get_secret_key().empty()) { + set_err_msg(err_msg, "empty secret key"); + return -ERR_INVALID_SECRET_KEY; + } + + key = op_state.get_secret_key(); + } else { + char secret_key_buf[SECRET_KEY_LEN + 1]; + gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf)); + key = secret_key_buf; + } + + // Generate the access key + if (key_type == KEY_TYPE_S3 && gen_access) { + char public_id_buf[PUBLIC_ID_LEN + 1]; + + do { + int id_buf_size = sizeof(public_id_buf); + gen_rand_alphanumeric_upper(g_ceph_context, public_id_buf, id_buf_size); + id = public_id_buf; + if (!validate_access_key(id)) + continue; + + } while (!driver->get_user_by_access_key(dpp, id, y, &duplicate_check)); + } + + if (key_type == KEY_TYPE_SWIFT) { + id = op_state.build_default_swift_kid(); + if (id.empty()) { + set_err_msg(err_msg, "empty swift access key"); + return -ERR_INVALID_ACCESS_KEY; + } + + // check that the access key doesn't exist + if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) { + set_err_msg(err_msg, "cannot create existing swift key"); + return -ERR_KEY_EXIST; + } + } + + // finally create the new key + new_key.id = id; + new_key.key = key; + + key_pair.first = id; + key_pair.second = new_key; + + if (key_type == KEY_TYPE_S3) { + access_keys->insert(key_pair); + } else if (key_type == KEY_TYPE_SWIFT) { + swift_keys->insert(key_pair); + } 
+ + return 0; +} + +// modify an existing key +int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string id; + std::string key = op_state.get_secret_key(); + int key_type = op_state.get_key_type(); + + RGWAccessKey modify_key; + + pair key_pair; + map::iterator kiter; + + switch (key_type) { + case KEY_TYPE_S3: + id = op_state.get_access_key(); + if (id.empty()) { + set_err_msg(err_msg, "no access key specified"); + return -ERR_INVALID_ACCESS_KEY; + } + break; + case KEY_TYPE_SWIFT: + id = op_state.build_default_swift_kid(); + if (id.empty()) { + set_err_msg(err_msg, "no subuser specified"); + return -EINVAL; + } + break; + default: + set_err_msg(err_msg, "invalid key type"); + return -ERR_INVALID_KEY_TYPE; + } + + if (!op_state.has_existing_key()) { + set_err_msg(err_msg, "key does not exist"); + return -ERR_INVALID_ACCESS_KEY; + } + + key_pair.first = id; + + if (key_type == KEY_TYPE_SWIFT) { + modify_key.id = id; + modify_key.subuser = op_state.get_subuser(); + } else if (key_type == KEY_TYPE_S3) { + kiter = access_keys->find(id); + if (kiter != access_keys->end()) { + modify_key = kiter->second; + } + } + + if (op_state.will_gen_secret()) { + char secret_key_buf[SECRET_KEY_LEN + 1]; + int key_buf_size = sizeof(secret_key_buf); + gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size); + key = secret_key_buf; + } + + if (key.empty()) { + set_err_msg(err_msg, "empty secret key"); + return -ERR_INVALID_SECRET_KEY; + } + + // update the access key with the new secret key + modify_key.key = key; + + key_pair.second = modify_key; + + + if (key_type == KEY_TYPE_S3) { + (*access_keys)[id] = modify_key; + } else if (key_type == KEY_TYPE_SWIFT) { + (*swift_keys)[id] = modify_key; + } + + return 0; +} + +int RGWAccessKeyPool::execute_add(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update, + optional_yield y) +{ + int ret = 0; + + std::string 
subprocess_msg; + int key_op = GENERATE_KEY; + + // set the op + if (op_state.has_existing_key()) + key_op = MODIFY_KEY; + + switch (key_op) { + case GENERATE_KEY: + ret = generate_key(dpp, op_state, y, &subprocess_msg); + break; + case MODIFY_KEY: + ret = modify_key(op_state, &subprocess_msg); + break; + } + + if (ret < 0) { + set_err_msg(err_msg, subprocess_msg); + return ret; + } + + // store the updated info + if (!defer_user_update) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg) +{ + return add(dpp, op_state, err_msg, false, y); +} + +int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_user_update, optional_yield y) +{ + int ret; + std::string subprocess_msg; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to add access key, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWAccessKeyPool::execute_remove(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, + std::string *err_msg, + bool defer_user_update, + optional_yield y) +{ + int ret = 0; + + int key_type = op_state.get_key_type(); + std::string id = op_state.get_access_key(); + map::iterator kiter; + map *keys_map; + + if (!op_state.has_existing_key()) { + set_err_msg(err_msg, "unable to find access key, with key type: " + + key_type_to_str(key_type)); + return -ERR_INVALID_ACCESS_KEY; + } + + if (key_type == KEY_TYPE_S3) { + keys_map = access_keys; + } else if (key_type == KEY_TYPE_SWIFT) { + keys_map = swift_keys; + } else { + keys_map = NULL; + set_err_msg(err_msg, "invalid access key"); + return 
-ERR_INVALID_ACCESS_KEY; + } + + kiter = keys_map->find(id); + if (kiter == keys_map->end()) { + set_err_msg(err_msg, "key not found"); + return -ERR_INVALID_ACCESS_KEY; + } + + keys_map->erase(kiter); + + if (!defer_user_update) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg) +{ + return remove(dpp, op_state, err_msg, false, y); +} + +int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update, + optional_yield y) +{ + int ret; + + std::string subprocess_msg; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove access key, " + subprocess_msg); + return ret; + } + + return 0; +} + +// remove all keys associated with a subuser +int RGWAccessKeyPool::remove_subuser_keys(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, + std::string *err_msg, + bool defer_user_update, + optional_yield y) +{ + int ret = 0; + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!op_state.has_subuser()) { + set_err_msg(err_msg, "no subuser specified"); + return -EINVAL; + } + + std::string swift_kid = op_state.build_default_swift_kid(); + if (swift_kid.empty()) { + set_err_msg(err_msg, "empty swift access key"); + return -EINVAL; + } + + map::iterator kiter; + map *keys_map; + + // a subuser can have at most one swift key + keys_map = swift_keys; + kiter = keys_map->find(swift_kid); + if (kiter != keys_map->end()) { + keys_map->erase(kiter); + } + + // a subuser may have multiple s3 key pairs + std::string 
subuser_str = op_state.get_subuser(); + keys_map = access_keys; + RGWUserInfo user_info = op_state.get_user_info(); + auto user_kiter = user_info.access_keys.begin(); + for (; user_kiter != user_info.access_keys.end(); ++user_kiter) { + if (user_kiter->second.subuser == subuser_str) { + kiter = keys_map->find(user_kiter->first); + if (kiter != keys_map->end()) { + keys_map->erase(kiter); + } + } + } + + if (!defer_user_update) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +RGWSubUserPool::RGWSubUserPool(RGWUser *usr) +{ + if (!usr) { + return; + } + + user = usr; + + subusers_allowed = true; + driver = user->get_driver(); +} + +int RGWSubUserPool::init(RGWUserAdminOpState& op_state) +{ + if (!op_state.is_initialized()) { + subusers_allowed = false; + return -EINVAL; + } + + const rgw_user& uid = op_state.get_user_id(); + if (uid.compare(RGW_USER_ANON_ID) == 0) { + subusers_allowed = false; + return -EACCES; + } + + subuser_map = op_state.get_subusers(); + if (subuser_map == NULL) { + subusers_allowed = false; + return -EINVAL; + } + + subusers_allowed = true; + + return 0; +} + +bool RGWSubUserPool::exists(std::string subuser) +{ + if (subuser.empty()) + return false; + + if (!subuser_map) + return false; + + if (subuser_map->count(subuser)) + return true; + + return false; +} + +int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state, + std::string *err_msg) +{ + bool existing = false; + std::string subuser = op_state.get_subuser(); + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!subusers_allowed) { + set_err_msg(err_msg, "subusers not allowed for this user"); + return -EACCES; + } + + if (subuser.empty() && !op_state.will_gen_subuser()) { + set_err_msg(err_msg, "empty subuser name"); + return -EINVAL; + } + + if (op_state.get_subuser_perm() == RGW_PERM_INVALID) { + set_err_msg(err_msg, "invalid subuser access"); + return -EINVAL; + } + 
+ //set key type when it not set or set by context + if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) { + op_state.set_key_type(KEY_TYPE_SWIFT); + op_state.key_type_setbycontext = true; + } + + // check if the subuser exists + if (!subuser.empty()) + existing = exists(subuser); + + op_state.set_existing_subuser(existing); + + return 0; +} + +int RGWSubUserPool::execute_add(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update, + optional_yield y) +{ + int ret = 0; + std::string subprocess_msg; + + RGWSubUser subuser; + std::pair subuser_pair; + std::string subuser_str = op_state.get_subuser(); + + subuser_pair.first = subuser_str; + + // assumes key should be created + if (op_state.has_key_op()) { + ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to create subuser key, " + subprocess_msg); + return ret; + } + } + + // create the subuser + subuser.name = subuser_str; + + if (op_state.has_subuser_perm()) + subuser.perm_mask = op_state.get_subuser_perm(); + + // insert the subuser into user info + subuser_pair.second = subuser; + subuser_map->insert(subuser_pair); + + // attempt to save the subuser + if (!defer_user_update) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg) +{ + return add(dpp, op_state, err_msg, false, y); +} + +int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y) +{ + std::string subprocess_msg; + int ret; + int32_t key_type = op_state.get_key_type(); + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + if (op_state.get_access_key_exist()) { 
+ set_err_msg(err_msg, "cannot create existing key"); + return -ERR_KEY_EXIST; + } + + if (key_type == KEY_TYPE_S3 && op_state.get_access_key().empty()) { + op_state.set_gen_access(); + } + + if (op_state.get_secret_key().empty()) { + op_state.set_gen_secret(); + } + + ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWSubUserPool::execute_remove(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update, + optional_yield y) +{ + int ret = 0; + std::string subprocess_msg; + + std::string subuser_str = op_state.get_subuser(); + + map::iterator siter; + siter = subuser_map->find(subuser_str); + if (siter == subuser_map->end()){ + set_err_msg(err_msg, "subuser not found: " + subuser_str); + return -ERR_NO_SUCH_SUBUSER; + } + if (!op_state.has_existing_subuser()) { + set_err_msg(err_msg, "subuser not found: " + subuser_str); + return -ERR_NO_SUCH_SUBUSER; + } + + // always purge all associate keys + user->keys.remove_subuser_keys(dpp, op_state, &subprocess_msg, true, y); + + // remove the subuser from the user info + subuser_map->erase(siter); + + // attempt to save the subuser + if (!defer_user_update) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg) +{ + return remove(dpp, op_state, err_msg, false, y); +} + +int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_user_update, optional_yield y) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_remove(dpp, 
op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove subuser, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWSubUserPool::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y) +{ + int ret = 0; + std::string subprocess_msg; + std::map::iterator siter; + std::pair subuser_pair; + + std::string subuser_str = op_state.get_subuser(); + RGWSubUser subuser; + + if (!op_state.has_existing_subuser()) { + set_err_msg(err_msg, "subuser does not exist"); + return -ERR_NO_SUCH_SUBUSER; + } + + subuser_pair.first = subuser_str; + + siter = subuser_map->find(subuser_str); + subuser = siter->second; + + if (op_state.has_key_op()) { + ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to create subuser keys, " + subprocess_msg); + return ret; + } + } + + if (op_state.has_subuser_perm()) + subuser.perm_mask = op_state.get_subuser_perm(); + + subuser_pair.second = subuser; + + subuser_map->erase(siter); + subuser_map->insert(subuser_pair); + + // attempt to save the subuser + if (!defer_user_update) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg) +{ + return RGWSubUserPool::modify(dpp, op_state, y, err_msg, false); +} + +int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_user_update) +{ + std::string subprocess_msg; + int ret; + + RGWSubUser subuser; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_modify(dpp, op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + 
set_err_msg(err_msg, "unable to modify subuser, " + subprocess_msg); + return ret; + } + + return 0; +} + +RGWUserCapPool::RGWUserCapPool(RGWUser *usr) +{ + if (!usr) { + return; + } + user = usr; + caps_allowed = true; +} + +int RGWUserCapPool::init(RGWUserAdminOpState& op_state) +{ + if (!op_state.is_initialized()) { + caps_allowed = false; + return -EINVAL; + } + + const rgw_user& uid = op_state.get_user_id(); + if (uid.compare(RGW_USER_ANON_ID) == 0) { + caps_allowed = false; + return -EACCES; + } + + caps = op_state.get_caps_obj(); + if (!caps) { + caps_allowed = false; + return -ERR_INVALID_CAP; + } + + caps_allowed = true; + + return 0; +} + +int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg) +{ + return add(dpp, op_state, err_msg, false, y); +} + +int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_save, optional_yield y) +{ + int ret = 0; + std::string caps_str = op_state.get_caps(); + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!caps_allowed) { + set_err_msg(err_msg, "caps not allowed for this user"); + return -EACCES; + } + + if (caps_str.empty()) { + set_err_msg(err_msg, "empty user caps"); + return -ERR_INVALID_CAP; + } + + int r = caps->add_from_string(caps_str); + if (r < 0) { + set_err_msg(err_msg, "unable to add caps: " + caps_str); + return r; + } + + if (!defer_save) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg) +{ + return remove(dpp, op_state, err_msg, false, y); +} + +int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_save, optional_yield y) +{ + int ret = 0; + + 
std::string caps_str = op_state.get_caps(); + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!caps_allowed) { + set_err_msg(err_msg, "caps not allowed for this user"); + return -EACCES; + } + + if (caps_str.empty()) { + set_err_msg(err_msg, "empty user caps"); + return -ERR_INVALID_CAP; + } + + int r = caps->remove_from_string(caps_str); + if (r < 0) { + set_err_msg(err_msg, "unable to remove caps: " + caps_str); + return r; + } + + if (!defer_save) + ret = user->update(dpp, op_state, err_msg, y); + + if (ret < 0) + return ret; + + return 0; +} + +RGWUser::RGWUser() : caps(this), keys(this), subusers(this) +{ + init_default(); +} + +int RGWUser::init(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver, + RGWUserAdminOpState& op_state, optional_yield y) +{ + init_default(); + int ret = init_storage(_driver); + if (ret < 0) + return ret; + + ret = init(dpp, op_state, y); + if (ret < 0) + return ret; + + return 0; +} + +void RGWUser::init_default() +{ + // use anonymous user info as a placeholder + rgw_get_anon_user(old_info); + user_id = RGW_USER_ANON_ID; + + clear_populated(); +} + +int RGWUser::init_storage(rgw::sal::Driver* _driver) +{ + if (!_driver) { + return -EINVAL; + } + + driver = _driver; + + clear_populated(); + + /* API wrappers */ + keys = RGWAccessKeyPool(this); + caps = RGWUserCapPool(this); + subusers = RGWSubUserPool(this); + + return 0; +} + +int RGWUser::init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y) +{ + bool found = false; + std::string swift_user; + user_id = op_state.get_user_id(); + std::string user_email = op_state.get_user_email(); + std::string access_key = op_state.get_access_key(); + std::string subuser = op_state.get_subuser(); + + int key_type = op_state.get_key_type(); + if (key_type == KEY_TYPE_SWIFT) { + swift_user = op_state.get_access_key(); + access_key.clear(); + } + + std::unique_ptr user; + + clear_populated(); + 
+ if (user_id.empty() && !subuser.empty()) { + size_t pos = subuser.find(':'); + if (pos != string::npos) { + user_id = subuser.substr(0, pos); + op_state.set_user_id(user_id); + } + } + + if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) { + user = driver->get_user(user_id); + found = (user->load_user(dpp, y) >= 0); + op_state.found_by_uid = found; + } + if (driver->ctx()->_conf.get_val("rgw_user_unique_email")) { + if (!user_email.empty() && !found) { + found = (driver->get_user_by_email(dpp, user_email, y, &user) >= 0); + op_state.found_by_email = found; + } + } + if (!swift_user.empty() && !found) { + found = (driver->get_user_by_swift(dpp, swift_user, y, &user) >= 0); + op_state.found_by_key = found; + } + if (!access_key.empty() && !found) { + found = (driver->get_user_by_access_key(dpp, access_key, y, &user) >= 0); + op_state.found_by_key = found; + } + + op_state.set_existing_user(found); + if (found) { + op_state.set_user_info(user->get_info()); + op_state.set_populated(); + op_state.objv = user->get_version_tracker(); + op_state.set_user_version_tracker(user->get_version_tracker()); + + old_info = user->get_info(); + set_populated(); + } + + if (user_id.empty()) { + user_id = user->get_id(); + } + op_state.set_initialized(); + + // this may have been called by a helper object + int ret = init_members(op_state); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::init_members(RGWUserAdminOpState& op_state) +{ + int ret = 0; + + ret = keys.init(op_state); + if (ret < 0) + return ret; + + ret = subusers.init(op_state); + if (ret < 0) + return ret; + + ret = caps.init(op_state); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + optional_yield y) +{ + int ret; + std::string subprocess_msg; + rgw::sal::User* user = op_state.get_user(); + + if (!driver) { + set_err_msg(err_msg, "couldn't initialize storage"); + return -EINVAL; + } + 
+ // if op_state.op_access_keys is not empty most recent keys have been fetched from master zone + if(!op_state.op_access_keys.empty()) { + auto user_access_keys = op_state.get_access_keys(); + *(user_access_keys) = op_state.op_access_keys; + } + + RGWUserInfo *pold_info = (is_populated() ? &old_info : nullptr); + + ret = user->store_user(dpp, y, false, pold_info); + op_state.objv = user->get_version_tracker(); + op_state.set_user_version_tracker(user->get_version_tracker()); + + if (ret < 0) { + set_err_msg(err_msg, "unable to store user info"); + return ret; + } + + old_info = user->get_info(); + set_populated(); + + return 0; +} + +int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + int ret = 0; + const rgw_user& uid = op_state.get_user_id(); + + if (uid.compare(RGW_USER_ANON_ID) == 0) { + set_err_msg(err_msg, "unable to perform operations on the anonymous user"); + return -EINVAL; + } + + if (is_populated() && user_id.compare(uid) != 0) { + set_err_msg(err_msg, "user id mismatch, operation id: " + uid.to_str() + + " does not match: " + user_id.to_str()); + + return -EINVAL; + } + + ret = rgw_validate_tenant_name(uid.tenant); + if (ret) { + set_err_msg(err_msg, + "invalid tenant only alphanumeric and _ characters are allowed"); + return ret; + } + + //set key type when it not set or set by context + if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) { + op_state.set_key_type(KEY_TYPE_S3); + op_state.key_type_setbycontext = true; + } + + return 0; +} + +// update swift_keys with new user id +static void rename_swift_keys(const rgw_user& user, + std::map& keys) +{ + std::string user_id; + user.to_str(user_id); + + auto modify_keys = std::move(keys); + for ([[maybe_unused]] auto& [k, key] : modify_keys) { + std::string id = user_id + ":" + key.subuser; + key.id = id; + keys[id] = std::move(key); + } +} + +int RGWUser::execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, 
// Continuation of the RGWUser::execute_rename() signature.
                            optional_yield y)
{
  int ret;
  bool populated = op_state.is_populated();

  if (!op_state.has_existing_user() && !populated) {
    set_err_msg(err_msg, "user not found");
    return -ENOENT;
  }

  if (!populated) {
    ret = init(dpp, op_state, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to retrieve user info");
      return ret;
    }
  }

  std::unique_ptr<rgw::sal::User> old_user = driver->get_user(op_state.get_user_info().user_id);
  std::unique_ptr<rgw::sal::User> new_user = driver->get_user(op_state.get_new_uid());
  if (old_user->get_tenant() != new_user->get_tenant()) {
    set_err_msg(err_msg, "users have to be under the same tenant namespace "
                + old_user->get_tenant() + " != " + new_user->get_tenant());
    return -EINVAL;
  }

  // create a stub user and write only the uid index and buckets object
  std::unique_ptr<rgw::sal::User> user;
  user = driver->get_user(new_user->get_id());

  const bool exclusive = !op_state.get_overwrite_new_user(); // overwrite if requested

  ret = user->store_user(dpp, y, exclusive);
  if (ret == -EEXIST) {
    set_err_msg(err_msg, "user name given by --new-uid already exists");
    return ret;
  }
  if (ret < 0) {
    set_err_msg(err_msg, "unable to store new user info");
    return ret;
  }

  // default ACL naming the new uid; applied to every bucket below
  RGWAccessControlPolicy policy_instance;
  policy_instance.create_default(new_user->get_id(), old_user->get_display_name());

  //unlink and link buckets to new user
  string marker;
  CephContext *cct = driver->ctx();
  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
  rgw::sal::BucketList buckets;

  do {
    ret = old_user->list_buckets(dpp, marker, "", max_buckets, false, buckets, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to list user buckets");
      return ret;
    }

    auto& m = buckets.get_buckets();

    for (auto it = m.begin(); it != m.end(); ++it) {
      auto& bucket = it->second;
      // remember how far we got so the next listing resumes after it
      marker = it->first;

      ret = bucket->load_bucket(dpp, y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket->get_name());
        return ret;
+ } + + ret = bucket->set_acl(dpp, policy_instance, y); + if (ret < 0) { + set_err_msg(err_msg, "failed to set acl on bucket " + bucket->get_name()); + return ret; + } + + ret = rgw_chown_bucket_and_objects(driver, bucket.get(), new_user.get(), + std::string(), nullptr, dpp, y); + if (ret < 0) { + set_err_msg(err_msg, "failed to run bucket chown" + cpp_strerror(-ret)); + return ret; + } + } + + } while (buckets.is_truncated()); + + // update the 'stub user' with all of the other fields and rewrite all of the + // associated index objects + RGWUserInfo& user_info = op_state.get_user_info(); + user_info.user_id = new_user->get_id(); + op_state.objv = user->get_version_tracker(); + op_state.set_user_version_tracker(user->get_version_tracker()); + + rename_swift_keys(new_user->get_id(), user_info.swift_keys); + + return update(dpp, op_state, err_msg, y); +} + +int RGWUser::execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + optional_yield y) +{ + const rgw_user& uid = op_state.get_user_id(); + std::string user_email = op_state.get_user_email(); + std::string display_name = op_state.get_display_name(); + + // set the user info + RGWUserInfo user_info; + user_id = uid; + user_info.user_id = user_id; + user_info.display_name = display_name; + user_info.type = TYPE_RGW; + + if (!user_email.empty()) + user_info.user_email = user_email; + + CephContext *cct = driver->ctx(); + if (op_state.max_buckets_specified) { + user_info.max_buckets = op_state.get_max_buckets(); + } else { + user_info.max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + } + + user_info.suspended = op_state.get_suspension_status(); + user_info.admin = op_state.admin; + user_info.system = op_state.system; + + if (op_state.op_mask_specified) + user_info.op_mask = op_state.get_op_mask(); + + if (op_state.has_bucket_quota()) { + user_info.quota.bucket_quota = op_state.get_bucket_quota(); + } else { + 
rgw_apply_default_bucket_quota(user_info.quota.bucket_quota, cct->_conf); + } + + if (op_state.temp_url_key_specified) { + map::iterator iter; + for (iter = op_state.temp_url_keys.begin(); + iter != op_state.temp_url_keys.end(); ++iter) { + user_info.temp_url_keys[iter->first] = iter->second; + } + } + + if (op_state.has_user_quota()) { + user_info.quota.user_quota = op_state.get_user_quota(); + } else { + rgw_apply_default_user_quota(user_info.quota.user_quota, cct->_conf); + } + + if (op_state.default_placement_specified) { + user_info.default_placement = op_state.default_placement; + } + + if (op_state.placement_tags_specified) { + user_info.placement_tags = op_state.placement_tags; + } + + // update the request + op_state.set_user_info(user_info); + op_state.set_populated(); + + // update the helper objects + int ret = init_members(op_state); + if (ret < 0) { + set_err_msg(err_msg, "unable to initialize user"); + return ret; + } + + // see if we need to add an access key + std::string subprocess_msg; + bool defer_user_update = true; + if (op_state.has_key_op()) { + ret = keys.add(dpp, op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to create access key, " + subprocess_msg); + return ret; + } + } + + // see if we need to add some caps + if (op_state.has_caps_op()) { + ret = caps.add(dpp, op_state, &subprocess_msg, defer_user_update, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to add user capabilities, " + subprocess_msg); + return ret; + } + } + + ret = update(dpp, op_state, err_msg, y); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg) +{ + std::string subprocess_msg; + int ret = user_add_helper(op_state, &subprocess_msg); + if (ret != 0) { + set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg); + return ret; + } + + ret = check_op(op_state, &subprocess_msg); + if (ret < 
0) { + set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg); + return ret; + } + + ret = execute_add(dpp, op_state, &subprocess_msg, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to create user, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWUser::rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg); + return ret; + } + + ret = execute_rename(dpp, op_state, &subprocess_msg, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to rename user, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWUser::execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y) +{ + int ret; + + bool purge_data = op_state.will_purge_data(); + rgw::sal::User* user = op_state.get_user(); + + if (!op_state.has_existing_user()) { + set_err_msg(err_msg, "user does not exist"); + return -ENOENT; + } + + rgw::sal::BucketList buckets; + string marker; + CephContext *cct = driver->ctx(); + size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk; + do { + ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to read user bucket info"); + return ret; + } + + auto& m = buckets.get_buckets(); + if (!m.empty() && !purge_data) { + set_err_msg(err_msg, "must specify purge data to remove user with buckets"); + return -EEXIST; // change to code that maps to 409: conflict + } + + for (auto it = m.begin(); it != m.end(); ++it) { + ret = it->second->remove_bucket(dpp, true, false, nullptr, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to delete user data"); + return ret; + } + + marker = it->first; + } + + } while (buckets.is_truncated()); + + ret = user->remove_user(dpp, y); + if 
(ret < 0) { + set_err_msg(err_msg, "unable to remove user from RADOS"); + return ret; + } + + op_state.clear_populated(); + clear_populated(); + + return 0; +} + +int RGWUser::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg); + return ret; + } + + ret = execute_remove(dpp, op_state, &subprocess_msg, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove user, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWUser::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y) +{ + bool populated = op_state.is_populated(); + int ret = 0; + std::string subprocess_msg; + std::string op_email = op_state.get_user_email(); + std::string display_name = op_state.get_display_name(); + + RGWUserInfo user_info; + std::unique_ptr duplicate_check; + + // ensure that the user info has been populated or is populate-able + if (!op_state.has_existing_user() && !populated) { + set_err_msg(err_msg, "user not found"); + return -ENOENT; + } + + // if the user hasn't already been populated...attempt to + if (!populated) { + ret = init(dpp, op_state, y); + if (ret < 0) { + set_err_msg(err_msg, "unable to retrieve user info"); + return ret; + } + } + + // ensure that we can modify the user's attributes + if (user_id.compare(RGW_USER_ANON_ID) == 0) { + set_err_msg(err_msg, "unable to modify anonymous user's info"); + return -EACCES; + } + + user_info = old_info; + + std::string old_email = old_info.user_email; + if (!op_email.empty()) { + // make sure we are not adding a duplicate email + if (old_email != op_email) { + ret = driver->get_user_by_email(dpp, op_email, y, &duplicate_check); + if (ret >= 0 && duplicate_check->get_id().compare(user_id) != 0) { + set_err_msg(err_msg, 
"cannot add duplicate email"); + return -ERR_EMAIL_EXIST; + } + } + user_info.user_email = op_email; + } else if (op_email.empty() && op_state.user_email_specified) { + ldpp_dout(dpp, 10) << "removing email index: " << user_info.user_email << dendl; + /* will be physically removed later when calling update() */ + user_info.user_email.clear(); + } + + // update the remaining user info + if (!display_name.empty()) + user_info.display_name = display_name; + + if (op_state.max_buckets_specified) + user_info.max_buckets = op_state.get_max_buckets(); + + if (op_state.admin_specified) + user_info.admin = op_state.admin; + + if (op_state.system_specified) + user_info.system = op_state.system; + + if (op_state.temp_url_key_specified) { + map::iterator iter; + for (iter = op_state.temp_url_keys.begin(); + iter != op_state.temp_url_keys.end(); ++iter) { + user_info.temp_url_keys[iter->first] = iter->second; + } + } + + if (op_state.op_mask_specified) + user_info.op_mask = op_state.get_op_mask(); + + if (op_state.has_bucket_quota()) + user_info.quota.bucket_quota = op_state.get_bucket_quota(); + + if (op_state.has_user_quota()) + user_info.quota.user_quota = op_state.get_user_quota(); + + if (op_state.has_suspension_op()) { + __u8 suspended = op_state.get_suspension_status(); + user_info.suspended = suspended; + + rgw::sal::BucketList buckets; + + if (user_id.empty()) { + set_err_msg(err_msg, "empty user id passed...aborting"); + return -EINVAL; + } + + string marker; + CephContext *cct = driver->ctx(); + size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk; + std::unique_ptr user = driver->get_user(user_id); + do { + ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y); + if (ret < 0) { + set_err_msg(err_msg, "could not get buckets for uid: " + user_id.to_str()); + return ret; + } + + auto& m = buckets.get_buckets(); + + vector bucket_names; + for (auto iter = m.begin(); iter != m.end(); ++iter) { + auto& bucket = iter->second; + 
// Continuation of RGWUser::execute_modify(): inside the loop that collects
// the keys of the buckets in the current listing chunk.
        bucket_names.push_back(bucket->get_key());

        // the next listing resumes after the last bucket seen
        marker = iter->first;
      }

      // buckets are enabled exactly when the user is not suspended
      ret = driver->set_buckets_enabled(dpp, bucket_names, !suspended);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to modify bucket");
        return ret;
      }

    } while (buckets.is_truncated());
  }

  if (op_state.mfa_ids_specified) {
    user_info.mfa_ids = op_state.mfa_ids;
  }

  if (op_state.default_placement_specified) {
    user_info.default_placement = op_state.default_placement;
  }

  if (op_state.placement_tags_specified) {
    user_info.placement_tags = op_state.placement_tags;
  }

  // hand the merged info back to the request before keys are touched
  op_state.set_user_info(user_info);

  // if we're supposed to modify keys, do so
  if (op_state.has_key_op()) {
    // the key change is deferred (true); update() below stores it
    ret = keys.add(dpp, op_state, &subprocess_msg, true, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to create or modify keys, " + subprocess_msg);
      return ret;
    }
  }

  ret = update(dpp, op_state, err_msg, y);
  if (ret < 0)
    return ret;

  return 0;
}

// Modify a user: validate with check_op(), then run execute_modify().
// Returns 0 on success, otherwise the negative code of the failing step;
// err_msg, when given, is prefixed with the stage that failed.
int RGWUser::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
{
  std::string subprocess_msg;
  int ret;

  ret = check_op(op_state, &subprocess_msg);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
    return ret;
  }

  ret = execute_modify(dpp, op_state, &subprocess_msg, y);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to modify user, " + subprocess_msg);
    return ret;
  }

  return 0;
}

// Load the user described by op_state via init() and copy its info into
// fetched_info. Returns 0 on success or the negative code from init().
int RGWUser::info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info,
                  optional_yield y, std::string *err_msg)
{
  int ret = init(dpp, op_state, y);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to fetch user info");
    return ret;
  }

  fetched_info = op_state.get_user_info();

  return 0;
}

// Copy the cached old_info into fetched_info without any lookup.
// Returns -EINVAL when no user info has been populated.
int RGWUser::info(RGWUserInfo& fetched_info, std::string *err_msg)
{
  if (!is_populated()) {
    set_err_msg(err_msg, "no user info saved");
    return -EINVAL;
  }

  fetched_info = old_info;

return 0; +} + +int RGWUser::list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher) +{ + Formatter *formatter = flusher.get_formatter(); + void *handle = nullptr; + std::string metadata_key = "user"; + if (op_state.max_entries > 1000) { + op_state.max_entries = 1000; + } + + int ret = driver->meta_list_keys_init(dpp, metadata_key, op_state.marker, &handle); + if (ret < 0) { + return ret; + } + + bool truncated = false; + uint64_t count = 0; + uint64_t left = 0; + flusher.start(0); + + // open the result object section + formatter->open_object_section("result"); + + // open the user id list array section + formatter->open_array_section("keys"); + do { + std::list keys; + left = op_state.max_entries - count; + ret = driver->meta_list_keys_next(dpp, handle, left, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + return ret; + } if (ret != -ENOENT) { + for (std::list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + formatter->dump_string("key", *iter); + ++count; + } + } + } while (truncated && left > 0); + // close user id list section + formatter->close_section(); + + formatter->dump_bool("truncated", truncated); + formatter->dump_int("count", count); + if (truncated) { + formatter->dump_string("marker", driver->meta_get_marker(handle)); + } + + // close result object section + formatter->close_section(); + + driver->meta_list_keys_complete(handle); + + flusher.flush(); + return 0; +} + +int RGWUserAdminOp_User::list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUser user; + + int ret = user.init_storage(driver); + if (ret < 0) + return ret; + + ret = user.list(dpp, op_state, flusher); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserAdminOp_User::info(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y) +{ + RGWUserInfo info; 
+ RGWUser user; + std::unique_ptr ruser; + + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + ruser = driver->get_user(info.user_id); + + if (op_state.sync_stats) { + ret = rgw_user_sync_all_stats(dpp, driver, ruser.get(), y); + if (ret < 0) { + return ret; + } + } + + RGWStorageStats stats; + RGWStorageStats *arg_stats = NULL; + if (op_state.fetch_stats) { + int ret = ruser->read_stats(dpp, y, &stats); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + + arg_stats = &stats; + } + + if (formatter) { + flusher.start(0); + + dump_user_info(formatter, info, arg_stats); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_User::create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.add(dpp, op_state, y, NULL); + if (ret < 0) { + if (ret == -EEXIST) + ret = -ERR_USER_EXIST; + return ret; + } + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_user_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_User::modify(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + Formatter *formatter = flusher.get_formatter(); + + ret = user.modify(dpp, op_state, y, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = -ERR_NO_SUCH_USER; + return ret; + } + + ret = user.info(info, NULL); + if (ret 
< 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_user_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_User::remove(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + + ret = user.remove(dpp, op_state, y, NULL); + + if (ret == -ENOENT) + ret = -ERR_NO_SUCH_USER; + return ret; +} + +int RGWUserAdminOp_Subuser::create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.subusers.add(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_subusers_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_Subuser::modify(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.subusers.modify(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_subusers_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_Subuser::remove(const DoutPrefixProvider *dpp, + 
rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + ret = user.subusers.remove(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserAdminOp_Key::create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.keys.add(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + int key_type = op_state.get_key_type(); + + if (key_type == KEY_TYPE_SWIFT) + dump_swift_keys_info(formatter, info); + + else if (key_type == KEY_TYPE_S3) + dump_access_keys_info(formatter, info); + + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_Key::remove(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, + optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + + ret = user.keys.remove(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserAdminOp_Caps::add(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if 
(!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.caps.add(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + info.caps.dump(formatter); + flusher.flush(); + } + + return 0; +} + + +int RGWUserAdminOp_Caps::remove(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher, optional_yield y) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(dpp, driver, op_state, y); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.caps.remove(dpp, op_state, y, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + info.caps.dump(formatter); + flusher.flush(); + } + + return 0; +} + +class RGWUserMetadataHandler : public RGWMetadataHandler_GenericMetaBE { +public: + struct Svc { + RGWSI_User *user{nullptr}; + } svc; + + RGWUserMetadataHandler(RGWSI_User *user_svc) { + base_init(user_svc->ctx(), user_svc->get_be_handler()); + svc.user = user_svc; + } + + ~RGWUserMetadataHandler() {} + + string get_type() override { return "user"; } + + int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override { + RGWUserCompleteInfo uci; + RGWObjVersionTracker objv_tracker; + real_time mtime; + + rgw_user user = RGWSI_User::user_from_meta_key(entry); + + int ret = svc.user->read_user_info(op->ctx(), user, &uci.info, &objv_tracker, + &mtime, nullptr, &uci.attrs, + y, dpp); + if (ret < 0) { + return ret; + } + + RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime); + *obj = mdo; + + return 0; + } + + RGWMetadataObject 
*get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override { + RGWUserCompleteInfo uci; + + try { + decode_json_obj(uci, jo); + } catch (JSONDecoder::err& e) { + return nullptr; + } + + return new RGWUserMetadataObject(uci, objv, mtime); + } + + int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) override; + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + RGWUserInfo info; + + rgw_user user = RGWSI_User::user_from_meta_key(entry); + + int ret = svc.user->read_user_info(op->ctx(), user, &info, nullptr, + nullptr, nullptr, nullptr, + y, dpp); + if (ret < 0) { + return ret; + } + + return svc.user->remove_user_info(op->ctx(), info, &objv_tracker, + y, dpp); + } +}; + +class RGWMetadataHandlerPut_User : public RGWMetadataHandlerPut_SObj +{ + RGWUserMetadataHandler *uhandler; + RGWUserMetadataObject *uobj; +public: + RGWMetadataHandlerPut_User(RGWUserMetadataHandler *_handler, + RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, + RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone), + uhandler(_handler) { + uobj = static_cast(obj); + } + + int put_checked(const DoutPrefixProvider *dpp) override; +}; + +int RGWUserMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, bool from_remote_zone) +{ + RGWMetadataHandlerPut_User put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone); + return do_put_operate(&put_op, dpp); 
/**
 * Read-only view over a std::optional<T> that falls back to a
 * default-constructed T when the optional is empty.
 *
 * The wrapped optional is held by reference, so it must outlive this object
 * (do not construct from a temporary). The fallback value, when needed, is
 * stored inside this object and `p` points at it; for that reason the type
 * is non-copyable - a memberwise copy would keep pointing into the source
 * object's storage.
 */
template <class T>
class optional_default
{
  const std::optional<T>& opt;
  std::optional<T> def;   // engaged only when `opt` was empty at construction
  const T *p;             // points at *opt or at *def
public:
  optional_default(const std::optional<T>& _o) : opt(_o) {
    if (opt) {
      p = &(*opt);
    } else {
      def.emplace();
      p = &(*def);
    }
  }

  optional_default(const optional_default&) = delete;
  optional_default& operator=(const optional_default&) = delete;

  const T *operator->() const {
    return p;
  }

  const T& operator*() const {
    return *p;
  }
};
*info, + optional_yield y, + const GetParams& params) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return svc.user->get_user_info_by_swift(op->ctx(), swift_name, + info, + params.objv_tracker, + params.mtime, + y, + dpp); + }); +} + +int RGWUserCtl::get_info_by_access_key(const DoutPrefixProvider *dpp, + const string& access_key, + RGWUserInfo *info, + optional_yield y, + const GetParams& params) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return svc.user->get_user_info_by_access_key(op->ctx(), access_key, + info, + params.objv_tracker, + params.mtime, + y, + dpp); + }); +} + +int RGWUserCtl::get_attrs_by_uid(const DoutPrefixProvider *dpp, + const rgw_user& user_id, + map *pattrs, + optional_yield y, + RGWObjVersionTracker *objv_tracker) +{ + RGWUserInfo user_info; + + return get_info_by_uid(dpp, user_id, &user_info, y, RGWUserCtl::GetParams() + .set_attrs(pattrs) + .set_objv_tracker(objv_tracker)); +} + +int RGWUserCtl::store_info(const DoutPrefixProvider *dpp, + const RGWUserInfo& info, optional_yield y, + const PutParams& params) +{ + string key = RGWSI_User::get_meta_key(info.user_id); + + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return svc.user->store_user_info(op->ctx(), info, + params.old_info, + params.objv_tracker, + params.mtime, + params.exclusive, + params.attrs, + y, + dpp); + }); +} + +int RGWUserCtl::remove_info(const DoutPrefixProvider *dpp, + const RGWUserInfo& info, optional_yield y, + const RemoveParams& params) + +{ + string key = RGWSI_User::get_meta_key(info.user_id); + + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return svc.user->remove_user_info(op->ctx(), info, + params.objv_tracker, + y, dpp); + }); +} + +int RGWUserCtl::list_buckets(const DoutPrefixProvider *dpp, + const rgw_user& user, + const string& marker, + const string& end_marker, + uint64_t max, + bool need_stats, + RGWUserBuckets *buckets, + bool *is_truncated, + 
optional_yield y, + uint64_t default_max) +{ + if (!max) { + max = default_max; + } + + int ret = svc.user->list_buckets(dpp, user, marker, end_marker, + max, buckets, is_truncated, y); + if (ret < 0) { + return ret; + } + if (need_stats) { + map& m = buckets->get_buckets(); + ret = ctl.bucket->read_buckets_stats(m, y, dpp); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: could not get stats for buckets" << dendl; + return ret; + } + } + return 0; +} + +int RGWUserCtl::read_stats(const DoutPrefixProvider *dpp, + const rgw_user& user, RGWStorageStats *stats, + optional_yield y, + ceph::real_time *last_stats_sync, + ceph::real_time *last_stats_update) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return svc.user->read_stats(dpp, op->ctx(), user, stats, + last_stats_sync, last_stats_update, y); + }); +} + +RGWMetadataHandler *RGWUserMetaHandlerAllocator::alloc(RGWSI_User *user_svc) { + return new RGWUserMetadataHandler(user_svc); +} + +void rgw_user::dump(Formatter *f) const +{ + ::encode_json("user", *this, f); +} + diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h new file mode 100644 index 000000000..ea05de806 --- /dev/null +++ b/src/rgw/driver/rados/rgw_user.h @@ -0,0 +1,885 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include "include/ceph_assert.h" + +#include "include/types.h" +#include "rgw_common.h" +#include "rgw_tools.h" + +#include "rgw_string.h" + +#include "common/Formatter.h" +#include "rgw_formats.h" +#include "rgw_metadata.h" +#include "rgw_sal_fwd.h" + +#define RGW_USER_ANON_ID "anonymous" + +#define SECRET_KEY_LEN 40 +#define PUBLIC_ID_LEN 20 +#define RAND_SUBUSER_LEN 5 + +#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/" + +class RGWUserCtl; +class RGWBucketCtl; +class RGWUserBuckets; + +class RGWGetUserStats_CB; + +/** + * A string wrapper that includes 
encode/decode functions + * for easily accessing a UID in all forms + */ +struct RGWUID +{ + rgw_user user_id; + void encode(bufferlist& bl) const { + std::string s; + user_id.to_str(s); + using ceph::encode; + encode(s, bl); + } + void decode(bufferlist::const_iterator& bl) { + std::string s; + using ceph::decode; + decode(s, bl); + user_id.from_str(s); + } +}; +WRITE_CLASS_ENCODER(RGWUID) + +/** Entry for bucket metadata collection */ +struct bucket_meta_entry { + size_t size; + size_t size_rounded; + ceph::real_time creation_time; + uint64_t count; +}; + +extern int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::User* user, optional_yield y); +extern int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, rgw::sal::User* user, + std::map& buckets_usage_map, optional_yield y); + +/** + * Get the anonymous (ie, unauthenticated) user info. + */ +extern void rgw_get_anon_user(RGWUserInfo& info); + +extern void rgw_perm_to_str(uint32_t mask, char *buf, int len); +extern uint32_t rgw_str_to_perm(const char *str); + +extern int rgw_validate_tenant_name(const std::string& t); + +enum ObjectKeyType { + KEY_TYPE_SWIFT, + KEY_TYPE_S3, + KEY_TYPE_UNDEFINED +}; + +enum RGWKeyPoolOp { + GENERATE_KEY, + MODIFY_KEY +}; + +enum RGWUserId { + RGW_USER_ID, + RGW_SWIFT_USERNAME, + RGW_USER_EMAIL, + RGW_ACCESS_KEY, +}; + +/* + * An RGWUser class along with supporting classes created + * to support the creation of an RESTful administrative API + */ +struct RGWUserAdminOpState { + // user attributes + std::unique_ptr user; + std::string user_email; + std::string display_name; + rgw_user new_user_id; + bool overwrite_new_user = false; + int32_t max_buckets{RGW_DEFAULT_MAX_BUCKETS}; + __u8 suspended{0}; + __u8 admin{0}; + __u8 system{0}; + __u8 exclusive{0}; + __u8 fetch_stats{0}; + __u8 sync_stats{0}; + std::string caps; + RGWObjVersionTracker objv; + uint32_t op_mask{0}; + std::map temp_url_keys; + + // 
subuser attributes + std::string subuser; + uint32_t perm_mask{RGW_PERM_NONE}; + + // key_attributes + std::string id; // access key + std::string key; // secret key + // access keys fetched for a user in the middle of an op + std::map op_access_keys; + int32_t key_type{-1}; + bool access_key_exist = false; + + std::set mfa_ids; + + // operation attributes + bool existing_user{false}; + bool existing_key{false}; + bool existing_subuser{false}; + bool existing_email{false}; + bool subuser_specified{false}; + bool gen_secret{false}; + bool gen_access{false}; + bool gen_subuser{false}; + bool id_specified{false}; + bool key_specified{false}; + bool type_specified{false}; + bool key_type_setbycontext{false}; // key type set by user or subuser context + bool purge_data{false}; + bool purge_keys{false}; + bool display_name_specified{false}; + bool user_email_specified{false}; + bool max_buckets_specified{false}; + bool perm_specified{false}; + bool op_mask_specified{false}; + bool caps_specified{false}; + bool suspension_op{false}; + bool admin_specified{false}; + bool system_specified{false}; + bool key_op{false}; + bool temp_url_key_specified{false}; + bool found_by_uid{false}; + bool found_by_email{false}; + bool found_by_key{false}; + bool mfa_ids_specified{false}; + + // req parameters + bool populated{false}; + bool initialized{false}; + bool key_params_checked{false}; + bool subuser_params_checked{false}; + bool user_params_checked{false}; + + bool bucket_quota_specified{false}; + bool user_quota_specified{false}; + bool bucket_ratelimit_specified{false}; + bool user_ratelimit_specified{false}; + + RGWQuota quota; + RGWRateLimitInfo user_ratelimit; + RGWRateLimitInfo bucket_ratelimit; + + // req parameters for listing user + std::string marker{""}; + uint32_t max_entries{1000}; + rgw_placement_rule default_placement; // user default placement + bool default_placement_specified{false}; + + std::list placement_tags; // user default placement_tags + bool 
placement_tags_specified{false}; + + void set_access_key(const std::string& access_key) { + if (access_key.empty()) + return; + + id = access_key; + id_specified = true; + gen_access = false; + key_op = true; + } + + void set_secret_key(const std::string& secret_key) { + if (secret_key.empty()) + return; + + key = secret_key; + key_specified = true; + gen_secret = false; + key_op = true; + } + + void set_user_id(const rgw_user& id); + + void set_new_user_id(const rgw_user& id) { + if (id.empty()) + return; + + new_user_id = id; + } + void set_overwrite_new_user(bool b) { + overwrite_new_user = b; + } + + void set_user_email(std::string& email) { + /* always lowercase email address */ + boost::algorithm::to_lower(email); + user_email = email; + user_email_specified = true; + } + + void set_display_name(const std::string& name) { + if (name.empty()) + return; + + display_name = name; + display_name_specified = true; + } + + void set_subuser(std::string& _subuser); + + void set_caps(const std::string& _caps) { + if (_caps.empty()) + return; + + caps = _caps; + caps_specified = true; + } + + void set_perm(uint32_t perm) { + perm_mask = perm; + perm_specified = true; + } + + void set_op_mask(uint32_t mask) { + op_mask = mask; + op_mask_specified = true; + } + + void set_temp_url_key(const std::string& key, int index) { + temp_url_keys[index] = key; + temp_url_key_specified = true; + } + + void set_key_type(int32_t type) { + key_type = type; + type_specified = true; + } + + void set_access_key_exist() { + access_key_exist = true; + } + + void set_suspension(__u8 is_suspended) { + suspended = is_suspended; + suspension_op = true; + } + + void set_admin(__u8 is_admin) { + admin = is_admin; + admin_specified = true; + } + + void set_system(__u8 is_system) { + system = is_system; + system_specified = true; + } + + void set_exclusive(__u8 is_exclusive) { + exclusive = is_exclusive; + } + + void set_fetch_stats(__u8 is_fetch_stats) { + fetch_stats = is_fetch_stats; + } + + 
void set_sync_stats(__u8 is_sync_stats) { + sync_stats = is_sync_stats; + } + + void set_user_info(RGWUserInfo& user_info); + + void set_user_version_tracker(RGWObjVersionTracker& objv_tracker); + + void set_max_buckets(int32_t mb) { + max_buckets = mb; + max_buckets_specified = true; + } + + void set_gen_access() { + gen_access = true; + key_op = true; + } + + void set_gen_secret() { + gen_secret = true; + key_op = true; + } + + void set_generate_key() { + if (id.empty()) + gen_access = true; + if (key.empty()) + gen_secret = true; + key_op = true; + } + + void clear_generate_key() { + gen_access = false; + gen_secret = false; + } + + void set_purge_keys() { + purge_keys = true; + key_op = true; + } + + void set_bucket_quota(RGWQuotaInfo& quotas) { + quota.bucket_quota = quotas; + bucket_quota_specified = true; + } + + void set_user_quota(RGWQuotaInfo& quotas) { + quota.user_quota = quotas; + user_quota_specified = true; + } + + void set_bucket_ratelimit(RGWRateLimitInfo& ratelimit) { + bucket_ratelimit = ratelimit; + bucket_ratelimit_specified = true; + } + + void set_user_ratelimit(RGWRateLimitInfo& ratelimit) { + user_ratelimit = ratelimit; + user_ratelimit_specified = true; + } + + void set_mfa_ids(const std::set& ids) { + mfa_ids = ids; + mfa_ids_specified = true; + } + + void set_default_placement(const rgw_placement_rule& _placement) { + default_placement = _placement; + default_placement_specified = true; + } + + void set_placement_tags(const std::list& _tags) { + placement_tags = _tags; + placement_tags_specified = true; + } + + bool is_populated() { return populated; } + bool is_initialized() { return initialized; } + bool has_existing_user() { return existing_user; } + bool has_existing_key() { return existing_key; } + bool has_existing_subuser() { return existing_subuser; } + bool has_existing_email() { return existing_email; } + bool has_subuser() { return subuser_specified; } + bool has_key_op() { return key_op; } + bool has_caps_op() { return 
caps_specified; } + bool has_suspension_op() { return suspension_op; } + bool has_subuser_perm() { return perm_specified; } + bool has_op_mask() { return op_mask_specified; } + bool will_gen_access() { return gen_access; } + bool will_gen_secret() { return gen_secret; } + bool will_gen_subuser() { return gen_subuser; } + bool will_purge_keys() { return purge_keys; } + bool will_purge_data() { return purge_data; } + bool will_generate_subuser() { return gen_subuser; } + bool has_bucket_quota() { return bucket_quota_specified; } + bool has_user_quota() { return user_quota_specified; } + void set_populated() { populated = true; } + void clear_populated() { populated = false; } + void set_initialized() { initialized = true; } + void set_existing_user(bool flag) { existing_user = flag; } + void set_existing_key(bool flag) { existing_key = flag; } + void set_existing_subuser(bool flag) { existing_subuser = flag; } + void set_existing_email(bool flag) { existing_email = flag; } + void set_purge_data(bool flag) { purge_data = flag; } + void set_generate_subuser(bool flag) { gen_subuser = flag; } + __u8 get_suspension_status() { return suspended; } + int32_t get_key_type() {return key_type; } + bool get_access_key_exist() {return access_key_exist; } + uint32_t get_subuser_perm() { return perm_mask; } + int32_t get_max_buckets() { return max_buckets; } + uint32_t get_op_mask() { return op_mask; } + RGWQuotaInfo& get_bucket_quota() { return quota.bucket_quota; } + RGWQuotaInfo& get_user_quota() { return quota.user_quota; } + std::set& get_mfa_ids() { return mfa_ids; } + + rgw::sal::User* get_user() { return user.get(); } + const rgw_user& get_user_id(); + std::string get_subuser() { return subuser; } + std::string get_access_key() { return id; } + std::string get_secret_key() { return key; } + std::string get_caps() { return caps; } + std::string get_user_email() { return user_email; } + std::string get_display_name() { return display_name; } + rgw_user& get_new_uid() { 
return new_user_id; } + bool get_overwrite_new_user() const { return overwrite_new_user; } + std::map& get_temp_url_keys() { return temp_url_keys; } + + RGWUserInfo& get_user_info(); + + std::map* get_swift_keys(); + std::map* get_access_keys(); + std::map* get_subusers(); + + RGWUserCaps* get_caps_obj(); + + std::string build_default_swift_kid(); + + std::string generate_subuser(); + + RGWUserAdminOpState(rgw::sal::Driver* driver); +}; + +class RGWUser; + +class RGWAccessKeyPool +{ + RGWUser *user{nullptr}; + + std::map key_type_map; + rgw_user user_id; + rgw::sal::Driver* driver{nullptr}; + + std::map *swift_keys{nullptr}; + std::map *access_keys{nullptr}; + + // we don't want to allow keys for the anonymous user or a null user + bool keys_allowed{false}; + +private: + int create_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int generate_key(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg = NULL); + int modify_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + int check_key_owner(RGWUserAdminOpState& op_state); + bool check_existing_key(RGWUserAdminOpState& op_state); + int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + /* API Contract Fulfilment */ + int execute_add(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_save, optional_yield y); + int execute_remove(const DoutPrefixProvider *dpp, + RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_save, optional_yield y); + int remove_subuser_keys(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_save, optional_yield y); + + int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, + optional_yield y); + int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, + bool defer_save, optional_yield y); 
+public: + explicit RGWAccessKeyPool(RGWUser* usr); + + int init(RGWUserAdminOpState& op_state); + + /* API Contracted Methods */ + int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg = NULL); + int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg = NULL); + + friend class RGWUser; + friend class RGWSubUserPool; +}; + +class RGWSubUserPool +{ + RGWUser *user{nullptr}; + + rgw_user user_id; + rgw::sal::Driver* driver{nullptr}; + bool subusers_allowed{false}; + + std::map *subuser_map{nullptr}; + +private: + int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + /* API Contract Fulfillment */ + int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y); + int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y); + int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y); + + int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, + optional_yield y); + int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y); + int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_save); +public: + explicit RGWSubUserPool(RGWUser *user); + + bool exists(std::string subuser); + int init(RGWUserAdminOpState& op_state); + + /* API contracted methods */ + int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, + std::string *err_msg = NULL); + int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL); + int modify(const 
/*
 * Capability ("caps") sub-component of RGWUser. RGWUser is a friend and
 * exposes an instance as its public `caps` member.
 */
class RGWUserCapPool
{
  RGWUserCaps *caps{nullptr};
  bool caps_allowed{false};
  RGWUser *user{nullptr};

private:
  // Internal overloads taking an explicit `defer_save` flag.
  // NOTE(review): presumably defer_save lets the caller batch the user
  // update instead of saving immediately - confirm in the implementation.
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
          optional_yield y);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
             optional_yield y);

public:
  explicit RGWUserCapPool(RGWUser *user);

  int init(RGWUserAdminOpState& op_state);

  /* API contracted methods */
  // Public entry points; `err_msg` is optional and defaults to NULL.
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
          std::string *err_msg = NULL);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);

  friend class RGWUser;
};
RGWUser(); + + int init(const DoutPrefixProvider *dpp, rgw::sal::Driver* storage, RGWUserAdminOpState& op_state, + optional_yield y); + + int init_storage(rgw::sal::Driver* storage); + int init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y); + int init_members(RGWUserAdminOpState& op_state); + + rgw::sal::Driver* get_driver() { return driver; } + + /* API Contracted Members */ + RGWUserCapPool caps; + RGWAccessKeyPool keys; + RGWSubUserPool subusers; + + /* API Contracted Methods */ + int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL); + + int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL); + + int rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL); + + /* remove an already populated RGWUser */ + int remove(std::string *err_msg = NULL); + + int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL); + + /* retrieve info from an existing user in the RGW system */ + int info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, optional_yield y, + std::string *err_msg = NULL); + + /* info from an already populated RGWUser */ + int info (RGWUserInfo& fetched_info, std::string *err_msg = NULL); + + /* list the existing users */ + int list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + friend class RGWAccessKeyPool; + friend class RGWSubUserPool; + friend class RGWUserCapPool; +}; + +/* Wrappers for admin API functionality */ + +class RGWUserAdminOp_User +{ +public: + static int list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int info(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + 
RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); + + static int create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); + + static int modify(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y); + + static int remove(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y); +}; + +class RGWUserAdminOp_Subuser +{ +public: + static int create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); + + static int modify(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); + + static int remove(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); +}; + +class RGWUserAdminOp_Key +{ +public: + static int create(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); + + static int remove(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); +}; + +class RGWUserAdminOp_Caps +{ +public: + static int add(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); + + static int remove(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, + optional_yield y); +}; + +struct RGWUserCompleteInfo { + RGWUserInfo info; + std::map attrs; + bool has_attrs{false}; + + void dump(Formatter * const f) const { + 
info.dump(f); + encode_json("attrs", attrs, f); + } + + void decode_json(JSONObj *obj) { + decode_json_obj(info, obj); + has_attrs = JSONDecoder::decode_json("attrs", attrs, obj); + } +}; + +class RGWUserMetadataObject : public RGWMetadataObject { + RGWUserCompleteInfo uci; +public: + RGWUserMetadataObject() {} + RGWUserMetadataObject(const RGWUserCompleteInfo& _uci, const obj_version& v, real_time m) + : uci(_uci) { + objv = v; + mtime = m; + } + + void dump(Formatter *f) const override { + uci.dump(f); + } + + RGWUserCompleteInfo& get_uci() { + return uci; + } +}; + +class RGWUserMetadataHandler; + +class RGWUserCtl +{ + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_User *user{nullptr}; + } svc; + + struct Ctl { + RGWBucketCtl *bucket{nullptr}; + } ctl; + + RGWUserMetadataHandler *umhandler; + RGWSI_MetaBackend_Handler *be_handler{nullptr}; + +public: + RGWUserCtl(RGWSI_Zone *zone_svc, + RGWSI_User *user_svc, + RGWUserMetadataHandler *_umhandler); + + void init(RGWBucketCtl *bucket_ctl) { + ctl.bucket = bucket_ctl; + } + + RGWBucketCtl *get_bucket_ctl() { + return ctl.bucket; + } + + struct GetParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + ceph::real_time *mtime{nullptr}; + rgw_cache_entry_info *cache_info{nullptr}; + std::map *attrs{nullptr}; + + GetParams() {} + + GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + GetParams& set_mtime(ceph::real_time *_mtime) { + mtime = _mtime; + return *this; + } + + GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) { + cache_info = _cache_info; + return *this; + } + + GetParams& set_attrs(std::map *_attrs) { + attrs = _attrs; + return *this; + } + }; + + struct PutParams { + RGWUserInfo *old_info{nullptr}; + RGWObjVersionTracker *objv_tracker{nullptr}; + ceph::real_time mtime; + bool exclusive{false}; + std::map *attrs{nullptr}; + + PutParams() {} + + PutParams& set_old_info(RGWUserInfo *_info) { + old_info = _info; + return 
*this; + } + + PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + PutParams& set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + return *this; + } + + PutParams& set_exclusive(bool _exclusive) { + exclusive = _exclusive; + return *this; + } + + PutParams& set_attrs(std::map *_attrs) { + attrs = _attrs; + return *this; + } + }; + + struct RemoveParams { + RGWObjVersionTracker *objv_tracker{nullptr}; + + RemoveParams() {} + + RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + }; + + int get_info_by_uid(const DoutPrefixProvider *dpp, + const rgw_user& uid, RGWUserInfo *info, + optional_yield y, const GetParams& params = {}); + int get_info_by_email(const DoutPrefixProvider *dpp, + const std::string& email, RGWUserInfo *info, + optional_yield y, const GetParams& params = {}); + int get_info_by_swift(const DoutPrefixProvider *dpp, + const std::string& swift_name, RGWUserInfo *info, + optional_yield y, const GetParams& params = {}); + int get_info_by_access_key(const DoutPrefixProvider *dpp, + const std::string& access_key, RGWUserInfo *info, + optional_yield y, const GetParams& params = {}); + + int get_attrs_by_uid(const DoutPrefixProvider *dpp, + const rgw_user& user_id, + std::map *attrs, + optional_yield y, + RGWObjVersionTracker *objv_tracker = nullptr); + + int store_info(const DoutPrefixProvider *dpp, + const RGWUserInfo& info, optional_yield y, + const PutParams& params = {}); + int remove_info(const DoutPrefixProvider *dpp, + const RGWUserInfo& info, optional_yield y, + const RemoveParams& params = {}); + + int list_buckets(const DoutPrefixProvider *dpp, + const rgw_user& user, + const std::string& marker, + const std::string& end_marker, + uint64_t max, + bool need_stats, + RGWUserBuckets *buckets, + bool *is_truncated, + optional_yield y, + uint64_t default_max = 1000); + + int read_stats(const 
DoutPrefixProvider *dpp, + const rgw_user& user, RGWStorageStats *stats, + optional_yield y, + ceph::real_time *last_stats_sync = nullptr, /* last time a full stats sync completed */ + ceph::real_time *last_stats_update = nullptr); /* last time a stats update was done */ +}; + +class RGWUserMetaHandlerAllocator { +public: + static RGWMetadataHandler *alloc(RGWSI_User *user_svc); +}; diff --git a/src/rgw/driver/rados/rgw_zone.cc b/src/rgw/driver/rados/rgw_zone.cc new file mode 100644 index 000000000..ed09f24f6 --- /dev/null +++ b/src/rgw/driver/rados/rgw_zone.cc @@ -0,0 +1,1288 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_zone.h" +#include "rgw_realm_watcher.h" +#include "rgw_sal_config.h" +#include "rgw_sync.h" + +#include "services/svc_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw_zone_defaults; + +RGWMetaSyncStatusManager::~RGWMetaSyncStatusManager(){} + +#define FIRST_EPOCH 1 + +struct RGWAccessKey; + +/// Generate a random uuid for realm/period/zonegroup/zone ids +static std::string gen_random_uuid() +{ + uuid_d uuid; + uuid.generate_random(); + return uuid.to_string(); +} + +void RGWDefaultZoneGroupInfo::dump(Formatter *f) const { + encode_json("default_zonegroup", default_zonegroup, f); +} + +void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) { + + JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj); + /* backward compatibility: if "default_zonegroup" decoded empty, fall back to the "default_region" key */ + if (default_zonegroup.empty()) { + JSONDecoder::decode_json("default_region", default_zonegroup, obj); + } +} + +int RGWZoneGroup::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format) +{ + name = default_zonegroup_name; + api_name = default_zonegroup_name; + is_master = true; + + RGWZoneGroupPlacementTarget placement_target; + placement_target.name = "default-placement"; + 
placement_targets[placement_target.name] = placement_target; + default_placement.name = "default-placement"; + + RGWZoneParams zone_params(default_zone_name); + + int r = zone_params.init(dpp, cct, sysobj_svc, y, false); + if (r < 0) { + ldpp_dout(dpp, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl; + return r; + } + + r = zone_params.create_default(dpp, y); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl; + return r; + } else if (r == -EEXIST) { + ldpp_dout(dpp, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl; + zone_params.clear_id(); + r = zone_params.init(dpp, cct, sysobj_svc, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl; + return r; + } + ldpp_dout(dpp, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id() + << dendl; + } + + RGWZone& default_zone = zones[zone_params.get_id()]; + default_zone.name = zone_params.get_name(); + default_zone.id = zone_params.get_id(); + master_zone = default_zone.id; + + // initialize supported zone features + default_zone.supported_features.insert(rgw::zone_features::supported.begin(), + rgw::zone_features::supported.end()); + // enable default zonegroup features + enabled_features.insert(rgw::zone_features::enabled.begin(), + rgw::zone_features::enabled.end()); + + r = create(dpp, y); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl; + return r; + } + + if (r == -EEXIST) { + ldpp_dout(dpp, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl; + id.clear(); + r = init(dpp, cct, sysobj_svc, y); + if (r < 0) { + return r; + } + } + + if (old_format) { + name = id; + } + + post_process_params(dpp, y); + + 
return 0; +} + +int RGWZoneGroup::equals(const string& other_zonegroup) const +{ + if (is_master && other_zonegroup.empty()) + return true; + + return (id == other_zonegroup); +} + +int RGWZoneGroup::add_zone(const DoutPrefixProvider *dpp, + const RGWZoneParams& zone_params, bool *is_master, bool *read_only, + const list& endpoints, const string *ptier_type, + bool *psync_from_all, list& sync_from, list& sync_from_rm, + string *predirect_zone, std::optional bucket_index_max_shards, + RGWSyncModulesManager *sync_mgr, + const rgw::zone_features::set& enable_features, + const rgw::zone_features::set& disable_features, + optional_yield y) +{ + auto& zone_id = zone_params.get_id(); + auto& zone_name = zone_params.get_name(); + + // check for duplicate zone name on insert + if (!zones.count(zone_id)) { + for (const auto& zone : zones) { + if (zone.second.name == zone_name) { + ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name + << " (" << zone.first << ") in zonegroup " << get_name() << dendl; + return -EEXIST; + } + } + } + + if (is_master) { + if (*is_master) { + if (!master_zone.empty() && master_zone != zone_id) { + ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: " << master_zone << dendl; + } + master_zone = zone_id; + } else if (master_zone == zone_id) { + master_zone.clear(); + } + } + + RGWZone& zone = zones[zone_id]; + zone.name = zone_name; + zone.id = zone_id; + if (!endpoints.empty()) { + zone.endpoints = endpoints; + } + if (read_only) { + zone.read_only = *read_only; + } + if (ptier_type) { + zone.tier_type = *ptier_type; + if (!sync_mgr->get_module(*ptier_type, nullptr)) { + ldpp_dout(dpp, 0) << "ERROR: could not found sync module: " << *ptier_type + << ", valid sync modules: " + << sync_mgr->get_registered_module_names() + << dendl; + return -ENOENT; + } + } + + if (psync_from_all) { + zone.sync_from_all = *psync_from_all; + } + + if (predirect_zone) { + zone.redirect_zone = *predirect_zone; + } + + if (bucket_index_max_shards) { 
+ zone.bucket_index_max_shards = *bucket_index_max_shards; + } + + for (auto add : sync_from) { + zone.sync_from.insert(add); + } + + for (auto rm : sync_from_rm) { + zone.sync_from.erase(rm); + } + + zone.supported_features.insert(enable_features.begin(), + enable_features.end()); + + for (const auto& feature : disable_features) { + if (enabled_features.contains(feature)) { + lderr(cct) << "ERROR: Cannot disable zone feature \"" << feature + << "\" until it's been disabled in zonegroup " << name << dendl; + return -EINVAL; + } + auto i = zone.supported_features.find(feature); + if (i == zone.supported_features.end()) { + ldout(cct, 1) << "WARNING: zone feature \"" << feature + << "\" was not enabled in zone " << zone.name << dendl; + continue; + } + zone.supported_features.erase(i); + } + + post_process_params(dpp, y); + + return update(dpp,y); +} + + +int RGWZoneGroup::rename_zone(const DoutPrefixProvider *dpp, + const RGWZoneParams& zone_params, + optional_yield y) +{ + RGWZone& zone = zones[zone_params.get_id()]; + zone.name = zone_params.get_name(); + + return update(dpp, y); +} + +void RGWZoneGroup::post_process_params(const DoutPrefixProvider *dpp, optional_yield y) +{ + bool log_data = zones.size() > 1; + + if (master_zone.empty()) { + auto iter = zones.begin(); + if (iter != zones.end()) { + master_zone = iter->first; + } + } + + for (auto& item : zones) { + RGWZone& zone = item.second; + zone.log_data = log_data; + + RGWZoneParams zone_params(zone.id, zone.name); + int ret = zone_params.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl; + continue; + } + + for (auto& pitem : zone_params.placement_pools) { + const string& placement_name = pitem.first; + if (placement_targets.find(placement_name) == placement_targets.end()) { + RGWZoneGroupPlacementTarget placement_target; + placement_target.name = placement_name; + 
placement_targets[placement_name] = placement_target; + } + } + } + + if (default_placement.empty() && !placement_targets.empty()) { + default_placement.init(placement_targets.begin()->first, RGW_STORAGE_CLASS_STANDARD); + } +} + +int RGWZoneGroup::remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y) +{ + auto iter = zones.find(zone_id); + if (iter == zones.end()) { + ldpp_dout(dpp, 0) << "zone id " << zone_id << " is not a part of zonegroup " + << name << dendl; + return -ENOENT; + } + + zones.erase(iter); + + post_process_params(dpp, y); + + return update(dpp, y); +} + +void RGWDefaultSystemMetaObjInfo::dump(Formatter *f) const { + encode_json("default_id", default_id, f); +} + +void RGWDefaultSystemMetaObjInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("default_id", default_id, obj); +} + +int RGWSystemMetaObj::rename(const DoutPrefixProvider *dpp, const string& new_name, optional_yield y) +{ + string new_id; + int ret = read_id(dpp, new_name, new_id, y); + if (!ret) { + return -EEXIST; + } + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + string old_name = name; + name = new_name; + ret = update(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = store_name(dpp, true, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + /* delete old name */ + rgw_pool pool(get_pool(cct)); + string oid = get_names_oid_prefix() + old_name; + rgw_raw_obj old_name_obj(pool, oid); + auto sysobj = sysobj_svc->get_obj(old_name_obj); + ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return ret; +} + +int 
RGWSystemMetaObj::read(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = read_id(dpp, name, id, y); + if (ret < 0) { + return ret; + } + + return read_info(dpp, id, y); +} + +int RGWZoneParams::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format) +{ + name = default_zone_name; + + int r = create(dpp, y); + if (r < 0) { + return r; + } + + if (old_format) { + name = id; + } + + return r; +} + +const string& RGWZoneParams::get_compression_type(const rgw_placement_rule& placement_rule) const +{ + static const std::string NONE{"none"}; + auto p = placement_pools.find(placement_rule.name); + if (p == placement_pools.end()) { + return NONE; + } + const auto& type = p->second.get_compression_type(placement_rule.get_storage_class()); + return !type.empty() ? type : NONE; +} + +// run an MD5 hash on the zone_id and return the first 32 bits +static uint32_t gen_short_zone_id(const std::string zone_id) +{ + unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + hash.Update((const unsigned char *)zone_id.c_str(), zone_id.size()); + hash.Final(md5); + + uint32_t short_id; + memcpy((char *)&short_id, md5, sizeof(short_id)); + return std::max(short_id, 1u); +} + +int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct) +{ + if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) { + ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl; + ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <::iterator iter = zonegroups.find(zonegroup.get_id()); + if (iter != zonegroups.end()) { + RGWZoneGroup& old_zonegroup = iter->second; + if (!old_zonegroup.api_name.empty()) { + zonegroups_by_api.erase(old_zonegroup.api_name); + } + } + zonegroups[zonegroup.get_id()] = zonegroup; + + 
if (!zonegroup.api_name.empty()) { + zonegroups_by_api[zonegroup.api_name] = zonegroup; + } + + if (zonegroup.is_master_zonegroup()) { + master_zonegroup = zonegroup.get_id(); + } else if (master_zonegroup == zonegroup.get_id()) { + master_zonegroup = ""; + } + + for (auto& i : zonegroup.zones) { + auto& zone = i.second; + if (short_zone_ids.find(zone.id) != short_zone_ids.end()) { + continue; + } + // calculate the zone's short id + uint32_t short_id = gen_short_zone_id(zone.id); + + // search for an existing zone with the same short id + for (auto& s : short_zone_ids) { + if (s.second == short_id) { + ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id + << ") generates the same short_zone_id " << short_id + << " as existing zone id " << s.first << dendl; + return -EEXIST; + } + } + + short_zone_ids[zone.id] = short_id; + } + + return 0; +} + +uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const +{ + auto i = short_zone_ids.find(zone_id); + if (i == short_zone_ids.end()) { + return 0; + } + return i->second; +} + +bool RGWPeriodMap::find_zone_by_name(const string& zone_name, + RGWZoneGroup *zonegroup, + RGWZone *zone) const +{ + for (auto& iter : zonegroups) { + auto& zg = iter.second; + for (auto& ziter : zg.zones) { + auto& z = ziter.second; + + if (z.name == zone_name) { + *zonegroup = zg; + *zone = z; + return true; + } + } + } + + return false; +} + +namespace rgw { + +int read_realm(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + std::string_view realm_id, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr* writer) +{ + if (!realm_id.empty()) { + return cfgstore->read_realm_by_id(dpp, y, realm_id, info, writer); + } + if (!realm_name.empty()) { + return cfgstore->read_realm_by_name(dpp, y, realm_name, info, writer); + } + return cfgstore->read_default_realm(dpp, y, info, writer); +} + +int create_realm(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, 
bool exclusive, + RGWRealm& info, + std::unique_ptr* writer_out) +{ + if (info.name.empty()) { + ldpp_dout(dpp, -1) << __func__ << " requires a realm name" << dendl; + return -EINVAL; + } + if (info.id.empty()) { + info.id = gen_random_uuid(); + } + + // if the realm already has a current_period, just make sure it exists + std::optional period; + if (!info.current_period.empty()) { + period.emplace(); + int r = cfgstore->read_period(dpp, y, info.current_period, + std::nullopt, *period); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << " failed to read realm's current_period=" + << info.current_period << " with " << cpp_strerror(r) << dendl; + return r; + } + } + + // create the realm + std::unique_ptr writer; + int r = cfgstore->create_realm(dpp, y, exclusive, info, &writer); + if (r < 0) { + return r; + } + + if (!period) { + // initialize and exclusive-create the initial period + period.emplace(); + period->id = gen_random_uuid(); + period->period_map.id = period->id; + period->epoch = FIRST_EPOCH; + period->realm_id = info.id; + period->realm_name = info.name; + + r = cfgstore->create_period(dpp, y, true, *period); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << " failed to create the initial period id=" + << period->id << " for realm " << info.name + << " with " << cpp_strerror(r) << dendl; + return r; + } + } + + // update the realm's current_period + r = realm_set_current_period(dpp, y, cfgstore, *writer, info, *period); + if (r < 0) { + return r; + } + + // try to set as default. 
may race with another create, so pass exclusive=true + // so we don't override an existing default + r = set_default_realm(dpp, y, cfgstore, info, true); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default: " + << cpp_strerror(r) << dendl; + } + + if (writer_out) { + *writer_out = std::move(writer); + } + return 0; +} + +int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWRealm& info, + bool exclusive) +{ + return cfgstore->write_default_realm_id(dpp, y, exclusive, info.id); +} + +int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + sal::RealmWriter& writer, RGWRealm& realm, + const RGWPeriod& period) +{ + // update realm epoch to match the period's + if (realm.epoch > period.realm_epoch) { + ldpp_dout(dpp, -1) << __func__ << " with old realm epoch " + << period.realm_epoch << ", current epoch=" << realm.epoch << dendl; + return -EINVAL; + } + if (realm.epoch == period.realm_epoch && realm.current_period != period.id) { + ldpp_dout(dpp, -1) << __func__ << " with same realm epoch " + << period.realm_epoch << ", but different period id " + << period.id << " != " << realm.current_period << dendl; + return -EINVAL; + } + + realm.epoch = period.realm_epoch; + realm.current_period = period.id; + + // update the realm object + int r = writer.write(dpp, y, realm); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << " failed to overwrite realm " + << realm.name << " with " << cpp_strerror(r) << dendl; + return r; + } + + // reflect the zonegroup and period config + (void) reflect_period(dpp, y, cfgstore, period); + return 0; +} + +int reflect_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWPeriod& info) +{ + // overwrite the local period config and zonegroup objects + constexpr bool exclusive = false; + + int r = cfgstore->write_period_config(dpp, y, exclusive, 
info.realm_id, + info.period_config); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << " failed to store period config for realm id=" + << info.realm_id << " with " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& [zonegroup_id, zonegroup] : info.period_map.zonegroups) { + r = cfgstore->create_zonegroup(dpp, y, exclusive, zonegroup, nullptr); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << " failed to store zonegroup id=" + << zonegroup_id << " with " << cpp_strerror(r) << dendl; + return r; + } + if (zonegroup.is_master) { + // set master as default if no default exists + constexpr bool exclusive = true; + r = set_default_zonegroup(dpp, y, cfgstore, zonegroup, exclusive); + if (r == 0) { + ldpp_dout(dpp, 1) << "Set the period's master zonegroup " + << zonegroup.name << " as the default" << dendl; + } + } + } + return 0; +} + +std::string get_staging_period_id(std::string_view realm_id) +{ + return string_cat_reserve(realm_id, ":staging"); +} + +void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info) +{ + ldpp_dout(dpp, 20) << __func__ << " realm id=" << info.realm_id + << " period id=" << info.id << dendl; + + info.predecessor_uuid = std::move(info.id); + info.id = get_staging_period_id(info.realm_id); + info.period_map.reset(); + info.realm_epoch++; +} + +int update_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, RGWPeriod& info) +{ + // clear zone short ids of removed zones. 
period_map.update() will add the + // remaining zones back + info.period_map.short_zone_ids.clear(); + + // list all zonegroups in the realm + rgw::sal::ListResult listing; + std::array zonegroup_names; // list in pages of 1000 + do { + int ret = cfgstore->list_zonegroup_names(dpp, y, listing.next, + zonegroup_names, listing); + if (ret < 0) { + std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + for (const auto& name : listing.entries) { + RGWZoneGroup zg; + ret = cfgstore->read_zonegroup_by_name(dpp, y, name, zg, nullptr); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to read zonegroup " + << name << ": " << cpp_strerror(-ret) << dendl; + continue; + } + + if (zg.realm_id != info.realm_id) { + ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name() + << " with realm id " << zg.realm_id + << ", not on our realm " << info.realm_id << dendl; + continue; + } + + if (zg.master_zone.empty()) { + ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl; + return -EINVAL; + } + + if (zg.zones.find(zg.master_zone) == zg.zones.end()) { + ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() + << " has a non existent master zone "<< dendl; + return -EINVAL; + } + + if (zg.is_master_zonegroup()) { + info.master_zonegroup = zg.get_id(); + info.master_zone = zg.master_zone; + } + + ret = info.period_map.update(zg, dpp->get_cct()); + if (ret < 0) { + return ret; + } + } // foreach name in listing.entries + } while (!listing.next.empty()); + + // read the realm's current period config + int ret = cfgstore->read_period_config(dpp, y, info.realm_id, + info.period_config); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: failed to read period config: " + << cpp_strerror(ret) << dendl; + return ret; + } + + return 0; +} + +int commit_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, sal::Driver* driver, + RGWRealm& realm, 
sal::RealmWriter& realm_writer, + const RGWPeriod& current_period, + RGWPeriod& info, std::ostream& error_stream, + bool force_if_stale) +{ + auto zone_svc = static_cast(driver)->svc()->zone; // XXX + + ldpp_dout(dpp, 20) << __func__ << " realm " << realm.id + << " period " << current_period.id << dendl; + // gateway must be in the master zone to commit + if (info.master_zone != zone_svc->get_zone_params().id) { + error_stream << "Cannot commit period on zone " + << zone_svc->get_zone_params().id << ", it must be sent to " + "the period's master zone " << info.master_zone << '.' << std::endl; + return -EINVAL; + } + // period predecessor must match current period + if (info.predecessor_uuid != current_period.id) { + error_stream << "Period predecessor " << info.predecessor_uuid + << " does not match current period " << current_period.id + << ". Use 'period pull' to get the latest period from the master, " + "reapply your changes, and try again." << std::endl; + return -EINVAL; + } + // realm epoch must be 1 greater than current period + if (info.realm_epoch != current_period.realm_epoch + 1) { + error_stream << "Period's realm epoch " << info.realm_epoch + << " does not come directly after current realm epoch " + << current_period.realm_epoch << ". Use 'realm pull' to get the " + "latest realm and period from the master zone, reapply your changes, " + "and try again." << std::endl; + return -EINVAL; + } + // did the master zone change? 
+ if (info.master_zone != current_period.master_zone) { + // store the current metadata sync status in the period + int r = info.update_sync_status(dpp, driver, current_period, + error_stream, force_if_stale); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to update metadata sync status: " + << cpp_strerror(-r) << dendl; + return r; + } + // create an object with a new period id + info.period_map.id = info.id = gen_random_uuid(); + info.epoch = FIRST_EPOCH; + + constexpr bool exclusive = true; + r = cfgstore->create_period(dpp, y, exclusive, info); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl; + return r; + } + // set as current period + r = realm_set_current_period(dpp, y, cfgstore, realm_writer, realm, info); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to update realm's current period: " + << cpp_strerror(-r) << dendl; + return r; + } + ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period " + << info.id << dendl; + (void) cfgstore->realm_notify_new_period(dpp, y, info); + return 0; + } + // period must be based on current epoch + if (info.epoch != current_period.epoch) { + error_stream << "Period epoch " << info.epoch << " does not match " + "predecessor epoch " << current_period.epoch << ". Use " + "'period pull' to get the latest epoch from the master zone, " + "reapply your changes, and try again." 
<< std::endl; + return -EINVAL; + } + // set period as next epoch + info.id = current_period.id; + info.epoch = current_period.epoch + 1; + info.predecessor_uuid = current_period.predecessor_uuid; + info.realm_epoch = current_period.realm_epoch; + // write the period + constexpr bool exclusive = true; + int r = cfgstore->create_period(dpp, y, exclusive, info); + if (r == -EEXIST) { + // already have this epoch (or a more recent one) + return 0; + } + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(r) << dendl; + return r; + } + r = reflect_period(dpp, y, cfgstore, info); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(r) << dendl; + return r; + } + ldpp_dout(dpp, 4) << "Committed new epoch " << info.epoch + << " for period " << info.id << dendl; + (void) cfgstore->realm_notify_new_period(dpp, y, info); + return 0; +} + + +int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + std::string_view zonegroup_id, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer) +{ + if (!zonegroup_id.empty()) { + return cfgstore->read_zonegroup_by_id(dpp, y, zonegroup_id, info, writer); + } + if (!zonegroup_name.empty()) { + return cfgstore->read_zonegroup_by_name(dpp, y, zonegroup_name, info, writer); + } + + std::string realm_id; + int r = cfgstore->read_default_realm_id(dpp, y, realm_id); + if (r == -ENOENT) { + return cfgstore->read_zonegroup_by_name(dpp, y, default_zonegroup_name, + info, writer); + } + if (r < 0) { + return r; + } + return cfgstore->read_default_zonegroup(dpp, y, realm_id, info, writer); +} + +int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, bool exclusive, + RGWZoneGroup& info) +{ + if (info.name.empty()) { + ldpp_dout(dpp, -1) << __func__ << " requires a zonegroup name" << dendl; + return -EINVAL; + } + if (info.id.empty()) { + info.id = gen_random_uuid(); + } + + 
// insert the default placement target if it doesn't exist + constexpr std::string_view default_placement_name = "default-placement"; + + RGWZoneGroupPlacementTarget placement_target; + placement_target.name = default_placement_name; + + info.placement_targets.emplace(default_placement_name, placement_target); + if (info.default_placement.name.empty()) { + info.default_placement.name = default_placement_name; + } + + int r = cfgstore->create_zonegroup(dpp, y, exclusive, info, nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to create zonegroup with " + << cpp_strerror(r) << dendl; + return r; + } + + // try to set as default. may race with another create, so pass exclusive=true + // so we don't override an existing default + r = set_default_zonegroup(dpp, y, cfgstore, info, true); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 0) << "WARNING: failed to set zonegroup as default: " + << cpp_strerror(r) << dendl; + } + + return 0; +} + +int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWZoneGroup& info, + bool exclusive) +{ + return cfgstore->write_default_zonegroup_id( + dpp, y, exclusive, info.realm_id, info.id); +} + +int remove_zone_from_group(const DoutPrefixProvider* dpp, + RGWZoneGroup& zonegroup, + const rgw_zone_id& zone_id) +{ + auto z = zonegroup.zones.find(zone_id); + if (z == zonegroup.zones.end()) { + return -ENOENT; + } + zonegroup.zones.erase(z); + + if (zonegroup.master_zone == zone_id) { + // choose a new master zone + auto m = zonegroup.zones.begin(); + if (m != zonegroup.zones.end()) { + zonegroup.master_zone = m->first; + ldpp_dout(dpp, 0) << "NOTICE: promoted " << m->second.name + << " as new master_zone of zonegroup " << zonegroup.name << dendl; + } else { + ldpp_dout(dpp, 0) << "NOTICE: removed master_zone of zonegroup " + << zonegroup.name << dendl; + } + } + + const bool log_data = zonegroup.zones.size() > 1; + for (auto& [id, zone] : zonegroup.zones) { + zone.log_data = 
log_data; + } + + return 0; +} + +// try to remove the given zone id from every zonegroup in the cluster +static int remove_zone_from_groups(const DoutPrefixProvider* dpp, + optional_yield y, + sal::ConfigStore* cfgstore, + const rgw_zone_id& zone_id) +{ + std::array zonegroup_names; + sal::ListResult listing; + do { + int r = cfgstore->list_zonegroup_names(dpp, y, listing.next, + zonegroup_names, listing); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to list zonegroups with " + << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& name : listing.entries) { + RGWZoneGroup zonegroup; + std::unique_ptr writer; + r = cfgstore->read_zonegroup_by_name(dpp, y, name, zonegroup, &writer); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to load zonegroup " << name + << " with " << cpp_strerror(r) << dendl; + continue; + } + + r = remove_zone_from_group(dpp, zonegroup, zone_id); + if (r < 0) { + continue; + } + + // write the updated zonegroup + r = writer->write(dpp, y, zonegroup); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: failed to write zonegroup " << name + << " with " << cpp_strerror(r) << dendl; + continue; + } + ldpp_dout(dpp, 0) << "Removed zone from zonegroup " << name << dendl; + } + } while (!listing.next.empty()); + + return 0; +} + + +int read_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + std::string_view zone_id, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) +{ + if (!zone_id.empty()) { + return cfgstore->read_zone_by_id(dpp, y, zone_id, info, writer); + } + if (!zone_name.empty()) { + return cfgstore->read_zone_by_name(dpp, y, zone_name, info, writer); + } + + std::string realm_id; + int r = cfgstore->read_default_realm_id(dpp, y, realm_id); + if (r == -ENOENT) { + return cfgstore->read_zone_by_name(dpp, y, default_zone_name, info, writer); + } + if (r < 0) { + return r; + } + return cfgstore->read_default_zone(dpp, y, realm_id, info, writer); +} + +extern int 
get_zones_pool_set(const DoutPrefixProvider *dpp, optional_yield y, + rgw::sal::ConfigStore* cfgstore, + std::string_view my_zone_id, + std::set& pools); + +int create_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, bool exclusive, + RGWZoneParams& info, std::unique_ptr* writer) +{ + if (info.name.empty()) { + ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl; + return -EINVAL; + } + if (info.id.empty()) { + info.id = gen_random_uuid(); + } + + // add default placement with empty pool name + rgw_pool pool; + auto& placement = info.placement_pools["default-placement"]; + placement.storage_classes.set_storage_class( + RGW_STORAGE_CLASS_STANDARD, &pool, nullptr); + + // build a set of all pool names used by other zones + std::set pools; + int r = get_zones_pool_set(dpp, y, cfgstore, info.id, pools); + if (r < 0) { + return r; + } + + // initialize pool names with the zone name prefix + r = init_zone_pool_names(dpp, y, pools, info); + if (r < 0) { + return r; + } + + r = cfgstore->create_zone(dpp, y, exclusive, info, nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to create zone with " + << cpp_strerror(r) << dendl; + return r; + } + + // try to set as default. 
may race with another create, so pass exclusive=true + // so we don't override an existing default + r = set_default_zone(dpp, y, cfgstore, info, true); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 0) << "WARNING: failed to set zone as default: " + << cpp_strerror(r) << dendl; + } + + return 0; + +} + +int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWZoneParams& info, + bool exclusive) +{ + return cfgstore->write_default_zone_id( + dpp, y, exclusive, info.realm_id, info.id); +} + +int delete_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWZoneParams& info, + sal::ZoneWriter& writer) +{ + // remove this zone from any zonegroups that contain it + int r = remove_zone_from_groups(dpp, y, cfgstore, info.id); + if (r < 0) { + return r; + } + + return writer.remove(dpp, y); +} + +} // namespace rgw + +static inline int conf_to_uint64(const JSONFormattable& config, const string& key, uint64_t *pval) +{ + string sval; + if (config.find(key, &sval)) { + string err; + uint64_t val = strict_strtoll(sval.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + *pval = val; + } + return 0; +} + +int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config) +{ + int r = -1; + + if (config.exists("retain_head_object")) { + string s = config["retain_head_object"]; + if (s == "true") { + retain_head_object = true; + } else { + retain_head_object = false; + } + } + + if (tier_type == "cloud-s3") { + r = t.s3.update_params(config); + } + + return r; +} + +int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config) +{ + if (config.exists("retain_head_object")) { + retain_head_object = false; + } + + if (tier_type == "cloud-s3") { + t.s3.clear_params(config); + } + + return 0; +} + +int RGWZoneGroupPlacementTierS3::update_params(const JSONFormattable& config) +{ + int r = -1; + + if (config.exists("endpoint")) { + endpoint = 
config["endpoint"]; + } + if (config.exists("target_path")) { + target_path = config["target_path"]; + } + if (config.exists("region")) { + region = config["region"]; + } + if (config.exists("host_style")) { + string s; + s = config["host_style"]; + if (s != "virtual") { + host_style = PathStyle; + } else { + host_style = VirtualStyle; + } + } + if (config.exists("target_storage_class")) { + target_storage_class = config["target_storage_class"]; + } + if (config.exists("access_key")) { + key.id = config["access_key"]; + } + if (config.exists("secret")) { + key.key = config["secret"]; + } + if (config.exists("multipart_sync_threshold")) { + r = conf_to_uint64(config, "multipart_sync_threshold", &multipart_sync_threshold); + if (r < 0) { + multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE; + } + } + + if (config.exists("multipart_min_part_size")) { + r = conf_to_uint64(config, "multipart_min_part_size", &multipart_min_part_size); + if (r < 0) { + multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE; + } + } + + if (config.exists("acls")) { + const JSONFormattable& cc = config["acls"]; + if (cc.is_array()) { + for (auto& c : cc.array()) { + RGWTierACLMapping m; + m.init(c); + if (!m.source_id.empty()) { + acl_mappings[m.source_id] = m; + } + } + } else { + RGWTierACLMapping m; + m.init(cc); + if (!m.source_id.empty()) { + acl_mappings[m.source_id] = m; + } + } + } + return 0; +} + +int RGWZoneGroupPlacementTierS3::clear_params(const JSONFormattable& config) +{ + if (config.exists("endpoint")) { + endpoint.clear(); + } + if (config.exists("target_path")) { + target_path.clear(); + } + if (config.exists("region")) { + region.clear(); + } + if (config.exists("host_style")) { + /* default */ + host_style = PathStyle; + } + if (config.exists("target_storage_class")) { + target_storage_class.clear(); + } + if (config.exists("access_key")) { + key.id.clear(); + } + if (config.exists("secret")) { + key.key.clear(); + } + if 
(config.exists("multipart_sync_threshold")) { + multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE; + } + if (config.exists("multipart_min_part_size")) { + multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE; + } + if (config.exists("acls")) { + const JSONFormattable& cc = config["acls"]; + if (cc.is_array()) { + for (auto& c : cc.array()) { + RGWTierACLMapping m; + m.init(c); + acl_mappings.erase(m.source_id); + } + } else { + RGWTierACLMapping m; + m.init(cc); + acl_mappings.erase(m.source_id); + } + } + return 0; +} + +void rgw_meta_sync_info::generate_test_instances(list& o) +{ + auto info = new rgw_meta_sync_info; + info->state = rgw_meta_sync_info::StateBuildingFullSyncMaps; + info->period = "periodid"; + info->realm_epoch = 5; + o.push_back(info); + o.push_back(new rgw_meta_sync_info); +} + +void rgw_meta_sync_marker::generate_test_instances(list& o) +{ + auto marker = new rgw_meta_sync_marker; + marker->state = rgw_meta_sync_marker::IncrementalSync; + marker->marker = "01234"; + marker->realm_epoch = 5; + o.push_back(marker); + o.push_back(new rgw_meta_sync_marker); +} + +void rgw_meta_sync_status::generate_test_instances(list& o) +{ + o.push_back(new rgw_meta_sync_status); +} + +void RGWZoneParams::generate_test_instances(list &o) +{ + o.push_back(new RGWZoneParams); + o.push_back(new RGWZoneParams); +} + +void RGWPeriodLatestEpochInfo::generate_test_instances(list &o) +{ + RGWPeriodLatestEpochInfo *z = new RGWPeriodLatestEpochInfo; + o.push_back(z); + o.push_back(new RGWPeriodLatestEpochInfo); +} + +void RGWZoneGroup::generate_test_instances(list& o) +{ + RGWZoneGroup *r = new RGWZoneGroup; + o.push_back(r); + o.push_back(new RGWZoneGroup); +} + +void RGWPeriodLatestEpochInfo::dump(Formatter *f) const { + encode_json("latest_epoch", epoch, f); +} + +void RGWPeriodLatestEpochInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("latest_epoch", epoch, obj); +} + +void RGWNameToId::dump(Formatter *f) const { + encode_json("obj_id", 
// Common base for the named, id-addressed configuration objects declared in
// this header (RGWZoneParams, RGWZoneGroup and RGWRealm derive from it).
// Derived classes supply the pool and oid naming through the pure virtual
// accessors near the end of the class.
class RGWSystemMetaObj {
public:
  std::string id;
  std::string name;

  // service handles; set by reinit_instance() in the constructors below
  CephContext *cct{nullptr};
  RGWSI_SysObj *sysobj_svc{nullptr};
  RGWSI_Zone *zone_svc{nullptr};

  int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
  int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
  int read_info(const DoutPrefixProvider *dpp, const std::string& obj_id, optional_yield y, bool old_format = false);
  int read_id(const DoutPrefixProvider *dpp, const std::string& obj_name, std::string& obj_id, optional_yield y);
  int read_default(const DoutPrefixProvider *dpp,
                   RGWDefaultSystemMetaObjInfo& default_info,
		   const std::string& oid,
		   optional_yield y);
  /* read and use default id */
  int use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);

public:
  RGWSystemMetaObj() {}
  RGWSystemMetaObj(const std::string& _name): name(_name)  {}
  RGWSystemMetaObj(const std::string& _id, const std::string& _name) : id(_id), name(_name) {}
  RGWSystemMetaObj(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) {
    reinit_instance(_cct, _sysobj_svc);
  }
  RGWSystemMetaObj(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): name(_name) {
    reinit_instance(_cct, _sysobj_svc);
  }

  const std::string& get_name() const { return name; }
  const std::string& get_id() const { return id; }

  void set_name(const std::string& _name) { name = _name;}
  void set_id(const std::string& _id) { id = _id;}
  void clear_id() { id.clear(); }

  virtual ~RGWSystemMetaObj() {}

  // wire format v1: id followed by name
  virtual void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(id, bl);
    encode(name, bl);
    ENCODE_FINISH(bl);
  }

  virtual void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(id, bl);
    decode(name, bl);
    DECODE_FINISH(bl);
  }

  void reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc);
  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
	   optional_yield y,
	   bool setup_obj = true, bool old_format = false);
  virtual int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y,
			      bool old_format = false);
  virtual int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false);
  int delete_default();
  virtual int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
  int rename(const DoutPrefixProvider *dpp, const std::string& new_name, optional_yield y);
  // update()/update_name() are non-exclusive store_info()/store_name() calls
  int update(const DoutPrefixProvider *dpp, optional_yield y) { return store_info(dpp, false, y);}
  int update_name(const DoutPrefixProvider *dpp, optional_yield y) { return store_name(dpp, false, y);}
  int read(const DoutPrefixProvider *dpp, optional_yield y);
  int write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);

  // storage location and naming hooks implemented by each derived type
  virtual rgw_pool get_pool(CephContext *cct) const = 0;
  virtual const std::string get_default_oid(bool old_format = false) const = 0;
  virtual const std::string& get_names_oid_prefix() const = 0;
  virtual const std::string& get_info_oid_prefix(bool old_format = false) const = 0;
  virtual std::string get_predefined_id(CephContext *cct) const = 0;
  virtual const std::string& get_predefined_name(CephContext *cct) const = 0;

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(RGWSystemMetaObj)
get_predefined_name(CephContext *cct) const = 0; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWSystemMetaObj) + +struct RGWZoneParams : RGWSystemMetaObj { + rgw_pool domain_root; + rgw_pool control_pool; + rgw_pool gc_pool; + rgw_pool lc_pool; + rgw_pool log_pool; + rgw_pool intent_log_pool; + rgw_pool usage_log_pool; + rgw_pool user_keys_pool; + rgw_pool user_email_pool; + rgw_pool user_swift_pool; + rgw_pool user_uid_pool; + rgw_pool roles_pool; + rgw_pool reshard_pool; + rgw_pool otp_pool; + rgw_pool oidc_pool; + rgw_pool notif_pool; + + RGWAccessKey system_key; + + std::map placement_pools; + + std::string realm_id; + + JSONFormattable tier_config; + + RGWZoneParams() : RGWSystemMetaObj() {} + explicit RGWZoneParams(const std::string& name) : RGWSystemMetaObj(name){} + RGWZoneParams(const rgw_zone_id& id, const std::string& name) : RGWSystemMetaObj(id.id, name) {} + RGWZoneParams(const rgw_zone_id& id, const std::string& name, const std::string& _realm_id) + : RGWSystemMetaObj(id.id, name), realm_id(_realm_id) {} + virtual ~RGWZoneParams(); + + rgw_pool get_pool(CephContext *cct) const override; + const std::string get_default_oid(bool old_format = false) const override; + const std::string& get_names_oid_prefix() const override; + const std::string& get_info_oid_prefix(bool old_format = false) const override; + std::string get_predefined_id(CephContext *cct) const override; + const std::string& get_predefined_name(CephContext *cct) const override; + + int init(const DoutPrefixProvider *dpp, + CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y, + bool setup_obj = true, bool old_format = false); + using RGWSystemMetaObj::init; + int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override; + int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override; + int create_default(const 
DoutPrefixProvider *dpp, optional_yield y, bool old_format = false); + int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override; + int fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y); + + const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const; + + void encode(bufferlist& bl) const override { + ENCODE_START(14, 1, bl); + encode(domain_root, bl); + encode(control_pool, bl); + encode(gc_pool, bl); + encode(log_pool, bl); + encode(intent_log_pool, bl); + encode(usage_log_pool, bl); + encode(user_keys_pool, bl); + encode(user_email_pool, bl); + encode(user_swift_pool, bl); + encode(user_uid_pool, bl); + RGWSystemMetaObj::encode(bl); + encode(system_key, bl); + encode(placement_pools, bl); + rgw_pool unused_metadata_heap; + encode(unused_metadata_heap, bl); + encode(realm_id, bl); + encode(lc_pool, bl); + std::map old_tier_config; + encode(old_tier_config, bl); + encode(roles_pool, bl); + encode(reshard_pool, bl); + encode(otp_pool, bl); + encode(tier_config, bl); + encode(oidc_pool, bl); + encode(notif_pool, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(14, bl); + decode(domain_root, bl); + decode(control_pool, bl); + decode(gc_pool, bl); + decode(log_pool, bl); + decode(intent_log_pool, bl); + decode(usage_log_pool, bl); + decode(user_keys_pool, bl); + decode(user_email_pool, bl); + decode(user_swift_pool, bl); + decode(user_uid_pool, bl); + if (struct_v >= 6) { + RGWSystemMetaObj::decode(bl); + } else if (struct_v >= 2) { + decode(name, bl); + id = name; + } + if (struct_v >= 3) + decode(system_key, bl); + if (struct_v >= 4) + decode(placement_pools, bl); + if (struct_v >= 5) { + rgw_pool unused_metadata_heap; + decode(unused_metadata_heap, bl); + } + if (struct_v >= 6) { + decode(realm_id, bl); + } + if (struct_v >= 7) { + decode(lc_pool, bl); + } else { + lc_pool = log_pool.name + ":lc"; + } + std::map old_tier_config; + if 
(struct_v >= 8) { + decode(old_tier_config, bl); + } + if (struct_v >= 9) { + decode(roles_pool, bl); + } else { + roles_pool = name + ".rgw.meta:roles"; + } + if (struct_v >= 10) { + decode(reshard_pool, bl); + } else { + reshard_pool = log_pool.name + ":reshard"; + } + if (struct_v >= 11) { + ::decode(otp_pool, bl); + } else { + otp_pool = name + ".rgw.otp"; + } + if (struct_v >= 12) { + ::decode(tier_config, bl); + } else { + for (auto& kv : old_tier_config) { + tier_config.set(kv.first, kv.second); + } + } + if (struct_v >= 13) { + ::decode(oidc_pool, bl); + } else { + oidc_pool = name + ".rgw.meta:oidc"; + } + if (struct_v >= 14) { + decode(notif_pool, bl); + } else { + notif_pool = log_pool.name + ":notif"; + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); + + bool get_placement(const std::string& placement_id, RGWZonePlacementInfo *placement) const { + auto iter = placement_pools.find(placement_id); + if (iter == placement_pools.end()) { + return false; + } + *placement = iter->second; + return true; + } + + /* + * return data pool of the head object + */ + bool get_head_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool* pool) const { + const rgw_data_placement_target& explicit_placement = obj.bucket.explicit_placement; + if (!explicit_placement.data_pool.empty()) { + if (!obj.in_extra_data) { + *pool = explicit_placement.data_pool; + } else { + *pool = explicit_placement.get_data_extra_pool(); + } + return true; + } + if (placement_rule.empty()) { + return false; + } + auto iter = placement_pools.find(placement_rule.name); + if (iter == placement_pools.end()) { + return false; + } + if (!obj.in_extra_data) { + *pool = iter->second.get_data_pool(placement_rule.storage_class); + } else { + *pool = iter->second.get_data_extra_pool(); + } + return true; + } + + bool valid_placement(const rgw_placement_rule& rule) const { + auto iter 
= placement_pools.find(rule.name); + if (iter == placement_pools.end()) { + return false; + } + return iter->second.storage_class_exists(rule.storage_class); + } +}; +WRITE_CLASS_ENCODER(RGWZoneParams) + +struct RGWZoneGroup : public RGWSystemMetaObj { + std::string api_name; + std::list endpoints; + bool is_master = false; + + rgw_zone_id master_zone; + std::map zones; + + std::map placement_targets; + rgw_placement_rule default_placement; + + std::list hostnames; + std::list hostnames_s3website; + // TODO: Maybe convert hostnames to a map> for + // endpoint_type->hostnames +/* +20:05 < _robbat21irssi> maybe I do someting like: if (hostname_map.empty()) { populate all map keys from hostnames; }; +20:05 < _robbat21irssi> but that's a later compatability migration planning bit +20:06 < yehudasa> more like if (!hostnames.empty()) { +20:06 < yehudasa> for (std::list::iterator iter = hostnames.begin(); iter != hostnames.end(); ++iter) { +20:06 < yehudasa> hostname_map["s3"].append(iter->second); +20:07 < yehudasa> hostname_map["s3website"].append(iter->second); +20:07 < yehudasa> s/append/push_back/g +20:08 < _robbat21irssi> inner loop over APIs +20:08 < yehudasa> yeah, probably +20:08 < _robbat21irssi> s3, s3website, swift, swith_auth, swift_website +*/ + std::map > api_hostname_map; + std::map > api_endpoints_map; + + std::string realm_id; + + rgw_sync_policy_info sync_policy; + rgw::zone_features::set enabled_features; + + RGWZoneGroup(): is_master(false){} + RGWZoneGroup(const std::string &id, const std::string &name):RGWSystemMetaObj(id, name) {} + explicit RGWZoneGroup(const std::string &_name):RGWSystemMetaObj(_name) {} + RGWZoneGroup(const std::string &_name, bool _is_master, CephContext *cct, RGWSI_SysObj* sysobj_svc, + const std::string& _realm_id, const std::list& _endpoints) + : RGWSystemMetaObj(_name, cct , sysobj_svc), endpoints(_endpoints), is_master(_is_master), + realm_id(_realm_id) {} + virtual ~RGWZoneGroup(); + + bool is_master_zonegroup() const { 
return is_master;} + void update_master(const DoutPrefixProvider *dpp, bool _is_master, optional_yield y) { + is_master = _is_master; + post_process_params(dpp, y); + } + void post_process_params(const DoutPrefixProvider *dpp, optional_yield y); + + void encode(bufferlist& bl) const override { + ENCODE_START(6, 1, bl); + encode(name, bl); + encode(api_name, bl); + encode(is_master, bl); + encode(endpoints, bl); + encode(master_zone, bl); + encode(zones, bl); + encode(placement_targets, bl); + encode(default_placement, bl); + encode(hostnames, bl); + encode(hostnames_s3website, bl); + RGWSystemMetaObj::encode(bl); + encode(realm_id, bl); + encode(sync_policy, bl); + encode(enabled_features, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(6, bl); + decode(name, bl); + decode(api_name, bl); + decode(is_master, bl); + decode(endpoints, bl); + decode(master_zone, bl); + decode(zones, bl); + decode(placement_targets, bl); + decode(default_placement, bl); + if (struct_v >= 2) { + decode(hostnames, bl); + } + if (struct_v >= 3) { + decode(hostnames_s3website, bl); + } + if (struct_v >= 4) { + RGWSystemMetaObj::decode(bl); + decode(realm_id, bl); + } else { + id = name; + } + if (struct_v >= 5) { + decode(sync_policy, bl); + } + if (struct_v >= 6) { + decode(enabled_features, bl); + } + DECODE_FINISH(bl); + } + + int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override; + int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override; + int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false); + int equals(const std::string& other_zonegroup) const; + int add_zone(const DoutPrefixProvider *dpp, + const RGWZoneParams& zone_params, bool *is_master, bool *read_only, + const std::list& endpoints, const std::string *ptier_type, + bool *psync_from_all, std::list& sync_from, + 
std::list& sync_from_rm, std::string *predirect_zone, + std::optional bucket_index_max_shards, RGWSyncModulesManager *sync_mgr, + const rgw::zone_features::set& enable_features, + const rgw::zone_features::set& disable_features, + optional_yield y); + int remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y); + int rename_zone(const DoutPrefixProvider *dpp, const RGWZoneParams& zone_params, optional_yield y); + rgw_pool get_pool(CephContext *cct) const override; + const std::string get_default_oid(bool old_region_format = false) const override; + const std::string& get_info_oid_prefix(bool old_region_format = false) const override; + const std::string& get_names_oid_prefix() const override; + std::string get_predefined_id(CephContext *cct) const override; + const std::string& get_predefined_name(CephContext *cct) const override; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); + + bool supports(std::string_view feature) const { + return enabled_features.contains(feature); + } +}; +WRITE_CLASS_ENCODER(RGWZoneGroup) + +struct RGWPeriodMap +{ + std::string id; + std::map zonegroups; + std::map zonegroups_by_api; + std::map short_zone_ids; + + std::string master_zonegroup; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + + int update(const RGWZoneGroup& zonegroup, CephContext *cct); + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + void reset() { + zonegroups.clear(); + zonegroups_by_api.clear(); + master_zonegroup.clear(); + } + + uint32_t get_zone_short_id(const std::string& zone_id) const; + + bool find_zone_by_id(const rgw_zone_id& zone_id, + RGWZoneGroup *zonegroup, + RGWZone *zone) const; + bool find_zone_by_name(const std::string& zone_id, + RGWZoneGroup *zonegroup, + RGWZone *zone) const; +}; +WRITE_CLASS_ENCODER(RGWPeriodMap) + +struct RGWPeriodConfig +{ + RGWQuota quota; + RGWRateLimitInfo 
user_ratelimit; + RGWRateLimitInfo bucket_ratelimit; + // rate limit unauthenticated user + RGWRateLimitInfo anon_ratelimit; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(quota.bucket_quota, bl); + encode(quota.user_quota, bl); + encode(bucket_ratelimit, bl); + encode(user_ratelimit, bl); + encode(anon_ratelimit, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(quota.bucket_quota, bl); + decode(quota.user_quota, bl); + if (struct_v >= 2) { + decode(bucket_ratelimit, bl); + decode(user_ratelimit, bl); + decode(anon_ratelimit, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + // the period config must be stored in a local object outside of the period, + // so that it can be used in a default configuration where no realm/period + // exists + int read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y); + int write(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y); + + static std::string get_oid(const std::string& realm_id); + static rgw_pool get_pool(CephContext *cct); +}; +WRITE_CLASS_ENCODER(RGWPeriodConfig) + +class RGWRealm; +class RGWPeriod; + +class RGWRealm : public RGWSystemMetaObj +{ +public: + std::string current_period; + epoch_t epoch{0}; //< realm epoch, incremented for each new period + + int create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y); + int delete_control(const DoutPrefixProvider *dpp, optional_yield y); +public: + RGWRealm() {} + RGWRealm(const std::string& _id, const std::string& _name = "") : RGWSystemMetaObj(_id, _name) {} + RGWRealm(CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_cct, _sysobj_svc) {} + RGWRealm(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_name, _cct, _sysobj_svc){} + virtual 
~RGWRealm() override; + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + RGWSystemMetaObj::encode(bl); + encode(current_period, bl); + encode(epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(1, bl); + RGWSystemMetaObj::decode(bl); + decode(current_period, bl); + decode(epoch, bl); + DECODE_FINISH(bl); + } + + int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override; + int delete_obj(const DoutPrefixProvider *dpp, optional_yield y); + rgw_pool get_pool(CephContext *cct) const override; + const std::string get_default_oid(bool old_format = false) const override; + const std::string& get_names_oid_prefix() const override; + const std::string& get_info_oid_prefix(bool old_format = false) const override; + std::string get_predefined_id(CephContext *cct) const override; + const std::string& get_predefined_name(CephContext *cct) const override; + + using RGWSystemMetaObj::read_id; // expose as public for radosgw-admin + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); + + const std::string& get_current_period() const { + return current_period; + } + int set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y); + void clear_current_period_and_epoch() { + current_period.clear(); + epoch = 0; + } + epoch_t get_epoch() const { return epoch; } + + std::string get_control_oid() const; + /// send a notify on the realm control object + int notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y); + /// notify the zone of a new period + int notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y); + + int find_zone(const DoutPrefixProvider *dpp, + const rgw_zone_id& zid, + RGWPeriod *pperiod, + RGWZoneGroup *pzonegroup, + bool *pfound, + optional_yield y) const; +}; +WRITE_CLASS_ENCODER(RGWRealm) + +struct 
RGWPeriodLatestEpochInfo { + epoch_t epoch = 0; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(epoch, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo) + + +/* + * The RGWPeriod object contains the entire configuration of a + * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of + * this configuration is maintained across all zones by passing around + * the RGWPeriod object in its JSON representation. + * + * If a new configuration changes which zone is the metadata master + * zone (i.e., master zone of the master zonegroup), then a new + * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is + * incremented, and the RGWRealm object is updated to reflect that new + * current_period id and epoch. If the configuration changes BUT which + * zone is the metadata master does NOT change, then only the + * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the + * same). + * + * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it + * is linked back to its predecessor RGWPeriod through the + * RGWPeriod::predecessor_uuid field, thus creating a "linked + * list"-like structure of RGWPeriods back to the cluster's creation. 
+ */ +class RGWPeriod +{ +public: + std::string id; //< a uuid + epoch_t epoch{0}; + std::string predecessor_uuid; + std::vector sync_status; + RGWPeriodMap period_map; + RGWPeriodConfig period_config; + std::string master_zonegroup; + rgw_zone_id master_zone; + + std::string realm_id; + std::string realm_name; + epoch_t realm_epoch{1}; //< realm epoch when period was made current + + CephContext *cct{nullptr}; + RGWSI_SysObj *sysobj_svc{nullptr}; + + int read_info(const DoutPrefixProvider *dpp, optional_yield y); + int read_latest_epoch(const DoutPrefixProvider *dpp, + RGWPeriodLatestEpochInfo& epoch_info, + optional_yield y, + RGWObjVersionTracker *objv = nullptr); + int use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y); + int use_current_period(); + + const std::string get_period_oid() const; + const std::string get_period_oid_prefix() const; + + // gather the metadata sync status for each shard; only for use on master zone + int update_sync_status(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const RGWPeriod ¤t_period, + std::ostream& error_stream, bool force_if_stale); + +public: + RGWPeriod() {} + + explicit RGWPeriod(const std::string& period_id, epoch_t _epoch = 0) + : id(period_id), epoch(_epoch) {} + + const std::string& get_id() const { return id; } + epoch_t get_epoch() const { return epoch; } + epoch_t get_realm_epoch() const { return realm_epoch; } + const std::string& get_predecessor() const { return predecessor_uuid; } + const rgw_zone_id& get_master_zone() const { return master_zone; } + const std::string& get_master_zonegroup() const { return master_zonegroup; } + const std::string& get_realm() const { return realm_id; } + const std::string& get_realm_name() const { return realm_name; } + const RGWPeriodMap& get_map() const { return period_map; } + RGWPeriodConfig& get_config() { return period_config; } + const RGWPeriodConfig& get_config() const { return period_config; } + const std::vector& get_sync_status() const 
{ return sync_status; } + rgw_pool get_pool(CephContext *cct) const; + const std::string& get_latest_epoch_oid() const; + const std::string& get_info_oid_prefix() const; + + void set_user_quota(RGWQuotaInfo& user_quota) { + period_config.quota.user_quota = user_quota; + } + + void set_bucket_quota(RGWQuotaInfo& bucket_quota) { + period_config.quota.bucket_quota = bucket_quota; + } + + void set_id(const std::string& _id) { + this->id = _id; + period_map.id = _id; + } + void set_epoch(epoch_t epoch) { this->epoch = epoch; } + void set_realm_epoch(epoch_t epoch) { realm_epoch = epoch; } + + void set_predecessor(const std::string& predecessor) + { + predecessor_uuid = predecessor; + } + + void set_realm_id(const std::string& _realm_id) { + realm_id = _realm_id; + } + + int reflect(const DoutPrefixProvider *dpp, optional_yield y); + + int get_zonegroup(RGWZoneGroup& zonegroup, + const std::string& zonegroup_id) const; + + bool is_single_zonegroup() const + { + return (period_map.zonegroups.size() <= 1); + } + + /* + returns true if there are several zone groups with a least one zone + */ + bool is_multi_zonegroups_with_zones() const + { + int count = 0; + for (const auto& zg: period_map.zonegroups) { + if (zg.second.zones.size() > 0) { + if (count++ > 0) { + return true; + } + } + } + return false; + } + + bool find_zone(const DoutPrefixProvider *dpp, + const rgw_zone_id& zid, + RGWZoneGroup *pzonegroup, + optional_yield y) const; + + int get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& epoch, optional_yield y); + int set_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y, + epoch_t epoch, bool exclusive = false, + RGWObjVersionTracker *objv = nullptr); + // update latest_epoch if the given epoch is higher, else return -EEXIST + int update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y); + + int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const std::string &period_realm_id, 
optional_yield y, + const std::string &period_realm_name = "", bool setup_obj = true); + int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y, bool setup_obj = true); + + int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true); + int delete_obj(const DoutPrefixProvider *dpp, optional_yield y); + int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y); + int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y); + + void fork(); + int update(const DoutPrefixProvider *dpp, optional_yield y); + + // commit a staging period; only for use on master zone + int commit(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + RGWRealm& realm, const RGWPeriod ¤t_period, + std::ostream& error_stream, optional_yield y, + bool force_if_stale = false); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(epoch, bl); + encode(realm_epoch, bl); + encode(predecessor_uuid, bl); + encode(sync_status, bl); + encode(period_map, bl); + encode(master_zone, bl); + encode(master_zonegroup, bl); + encode(period_config, bl); + encode(realm_id, bl); + encode(realm_name, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(epoch, bl); + decode(realm_epoch, bl); + decode(predecessor_uuid, bl); + decode(sync_status, bl); + decode(period_map, bl); + decode(master_zone, bl); + decode(master_zonegroup, bl); + decode(period_config, bl); + decode(realm_id, bl); + decode(realm_name, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); + + static std::string get_staging_id(const std::string& realm_id) { + return realm_id + ":staging"; + } +}; +WRITE_CLASS_ENCODER(RGWPeriod) + +namespace rgw { + +/// Look up a realm by its id. 
If no id is given, look it up by name. +/// If no name is given, fall back to the cluster's default realm. +int read_realm(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + std::string_view realm_id, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr* writer = nullptr); + +/// Create a realm and its initial period. If the info.id is empty, a +/// random uuid will be generated. +int create_realm(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, bool exclusive, + RGWRealm& info, + std::unique_ptr* writer = nullptr); + +/// Set the given realm as the cluster's default realm. +int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWRealm& info, + bool exclusive = false); + +/// Update the current_period of an existing realm. +int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + sal::RealmWriter& writer, RGWRealm& realm, + const RGWPeriod& period); + +/// Overwrite the local zonegroup and period config objects with the new +/// configuration contained in the given period. +int reflect_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWPeriod& info); + +/// Return the staging period id for the given realm. +std::string get_staging_period_id(std::string_view realm_id); + +/// Convert the given period into a separate staging period, where +/// radosgw-admin can make changes to it without effecting the running +/// configuration. +void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info); + +/// Read all zonegroups in the period's realm and add them to the period. +int update_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, RGWPeriod& info); + +/// Validates the given 'staging' period and tries to commit it as the +/// realm's new current period. 
+int commit_period(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, sal::Driver* driver, + RGWRealm& realm, sal::RealmWriter& realm_writer, + const RGWPeriod& current_period, + RGWPeriod& info, std::ostream& error_stream, + bool force_if_stale); + + +/// Look up a zonegroup by its id. If no id is given, look it up by name. +/// If no name is given, fall back to the cluster's default zonegroup. +int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + std::string_view zonegroup_id, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer = nullptr); + +/// Initialize and create the given zonegroup. If the given info.id is empty, +/// a random uuid will be generated. May fail with -EEXIST. +int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, bool exclusive, + RGWZoneGroup& info); + +/// Set the given zonegroup as its realm's default zonegroup. +int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWZoneGroup& info, + bool exclusive = false); + +/// Add a zone to the zonegroup, or update an existing zone entry. +int add_zone_to_group(const DoutPrefixProvider* dpp, + RGWZoneGroup& zonegroup, + const RGWZoneParams& zone_params, + const bool *pis_master, const bool *pread_only, + const std::list& endpoints, + const std::string *ptier_type, + const bool *psync_from_all, + const std::list& sync_from, + const std::list& sync_from_rm, + const std::string *predirect_zone, + std::optional bucket_index_max_shards, + const rgw::zone_features::set& enable_features, + const rgw::zone_features::set& disable_features); + +/// Remove a zone by id from its zonegroup, promoting a new master zone if +/// necessary. +int remove_zone_from_group(const DoutPrefixProvider* dpp, + RGWZoneGroup& info, + const rgw_zone_id& zone_id); + + +/// Look up a zone by its id. 
If no id is given, look it up by name. If no name +/// is given, fall back to the realm's default zone. +int read_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, + std::string_view zone_id, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer = nullptr); + +/// Initialize and create a new zone. If the given info.id is empty, a random +/// uuid will be generated. Pool names are initialized with the zone name as a +/// prefix. If any pool names conflict with existing zones, a random suffix is +/// added. +int create_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, bool exclusive, + RGWZoneParams& info, + std::unique_ptr* writer = nullptr); + +/// Initialize the zone's pool names using the zone name as a prefix. If a pool +/// name conflicts with an existing zone's pool, add a unique suffix. +int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y, + const std::set& pools, RGWZoneParams& info); + +/// Set the given zone as its realm's default zone. +int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y, + sal::ConfigStore* cfgstore, const RGWZoneParams& info, + bool exclusive = false); + +/// Delete an existing zone and remove it from any zonegroups that contain it. 
namespace jwt {
	namespace alphabet {
		/// Standard base64 alphabet (last two symbols '+' and '/'), padded with "=".
		struct base64 {
			static const std::array<char, 64>& data() {
				static const std::array<char, 64> data = {
					{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
					 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
					 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
					 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}};
				return data;
			}
			static const std::string& fill() {
				static const std::string fill = "=";
				return fill;
			}
		};
		/// URL-safe base64 alphabet (last two symbols '-' and '_'); the padding
		/// symbol is the percent-encoded form "%3d".
		struct base64url {
			static const std::array<char, 64>& data() {
				static const std::array<char, 64> data = {
					{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
					 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
					 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
					 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'}};
				return data;
			}
			static const std::string& fill() {
				static const std::string fill = "%3d";
				return fill;
			}
		};
	}

	/// Base64 encoder/decoder parameterised on an alphabet type that provides
	/// static data() (the 64 symbols) and fill() (the padding string).
	class base {
	public:
		/**
		 * Encode binary data using alphabet T.
		 * \param bin Bytes to encode
		 * \return Encoded text, padded with T::fill() to a whole 4-symbol group
		 */
		template<typename T>
		static std::string encode(const std::string& bin) {
			return encode(bin, T::data(), T::fill());
		}
		/**
		 * Decode text produced with alphabet T.
		 * \param base Encoded text, including its padding
		 * \return Decoded bytes
		 * \throws std::runtime_error on a symbol outside the alphabet, more than
		 *         two padding strings, or a length that is not a whole group
		 */
		template<typename T>
		static std::string decode(const std::string& base) {
			return decode(base, T::data(), T::fill());
		}

	private:
		static std::string encode(const std::string& bin, const std::array<char, 64>& alphabet, const std::string& fill) {
			const size_t size = bin.size();
			std::string res;
			// four output symbols per (possibly partial) three-byte group
			res.reserve((size + 2) / 3 * 4);

			// whole three-byte groups first
			size_t fast_size = size - size % 3;
			for (size_t i = 0; i < fast_size;) {
				uint32_t octet_a = static_cast<unsigned char>(bin[i++]);
				uint32_t octet_b = static_cast<unsigned char>(bin[i++]);
				uint32_t octet_c = static_cast<unsigned char>(bin[i++]);

				uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c;

				res += alphabet[(triple >> 3 * 6) & 0x3F];
				res += alphabet[(triple >> 2 * 6) & 0x3F];
				res += alphabet[(triple >> 1 * 6) & 0x3F];
				res += alphabet[(triple >> 0 * 6) & 0x3F];
			}

			if (fast_size == size)
				return res;

			// one or two trailing bytes: zero-extend to a triple and pad the output
			const size_t mod = size % 3;

			uint32_t octet_a = fast_size < size ? static_cast<unsigned char>(bin[fast_size++]) : 0;
			uint32_t octet_b = fast_size < size ? static_cast<unsigned char>(bin[fast_size++]) : 0;
			uint32_t octet_c = fast_size < size ? static_cast<unsigned char>(bin[fast_size++]) : 0;

			uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c;

			switch (mod) {
			case 1:
				res += alphabet[(triple >> 3 * 6) & 0x3F];
				res += alphabet[(triple >> 2 * 6) & 0x3F];
				res += fill;
				res += fill;
				break;
			case 2:
				res += alphabet[(triple >> 3 * 6) & 0x3F];
				res += alphabet[(triple >> 2 * 6) & 0x3F];
				res += alphabet[(triple >> 1 * 6) & 0x3F];
				res += fill;
				break;
			default:
				break;
			}

			return res;
		}

		static std::string decode(const std::string& base, const std::array<char, 64>& alphabet, const std::string& fill) {
			size_t size = base.size();

			// strip up to two trailing padding strings, counting them
			size_t fill_cnt = 0;
			while (size > fill.size()) {
				if (base.compare(size - fill.size(), fill.size(), fill) != 0)
					break;
				fill_cnt++;
				size -= fill.size();
				if (fill_cnt > 2)
					throw std::runtime_error("Invalid input");
			}

			// symbols plus paddings must form whole four-symbol groups
			if ((size + fill_cnt) % 4 != 0)
				throw std::runtime_error("Invalid input");

			const size_t out_size = size / 4 * 3;
			std::string res;
			res.reserve(out_size);

			// map the symbol at `offset` back to its 6-bit value
			auto get_sextet = [&](size_t offset) -> uint32_t {
				for (size_t i = 0; i < alphabet.size(); i++) {
					if (alphabet[i] == base[offset])
						return static_cast<uint32_t>(i);
				}
				throw std::runtime_error("Invalid input");
			};

			size_t fast_size = size - size % 4;
			for (size_t i = 0; i < fast_size;) {
				uint32_t sextet_a = get_sextet(i++);
				uint32_t sextet_b = get_sextet(i++);
				uint32_t sextet_c = get_sextet(i++);
				uint32_t sextet_d = get_sextet(i++);

				uint32_t triple = (sextet_a << 3 * 6)
					+ (sextet_b << 2 * 6)
					+ (sextet_c << 1 * 6)
					+ (sextet_d << 0 * 6);

				res += static_cast<char>((triple >> 2 * 8) & 0xFF);
				res += static_cast<char>((triple >> 1 * 8) & 0xFF);
				res += static_cast<char>((triple >> 0 * 8) & 0xFF);
			}

			if (fill_cnt == 0)
				return res;

			// partial last group: two symbols carry one byte, three carry two
			uint32_t triple = (get_sextet(fast_size) << 3 * 6)
				+ (get_sextet(fast_size + 1) << 2 * 6);

			switch (fill_cnt) {
			case 1:
				triple |= (get_sextet(fast_size + 2) << 1 * 6);
				res += static_cast<char>((triple >> 2 * 8) & 0xFF);
				res += static_cast<char>((triple >> 1 * 8) & 0xFF);
				break;
			case 2:
				res += static_cast<char>((triple >> 2 * 8) & 0xFF);
				break;
			default:
				break;
			}

			return res;
		}
	};
}
explicit signature_generation_exception(const char* msg) + : std::runtime_error(msg) + {} + }; + struct rsa_exception : public std::runtime_error { + explicit rsa_exception(const std::string& msg) + : std::runtime_error(msg) + {} + explicit rsa_exception(const char* msg) + : std::runtime_error(msg) + {} + }; + struct ecdsa_exception : public std::runtime_error { + explicit ecdsa_exception(const std::string& msg) + : std::runtime_error(msg) + {} + explicit ecdsa_exception(const char* msg) + : std::runtime_error(msg) + {} + }; + struct token_verification_exception : public std::runtime_error { + token_verification_exception() + : std::runtime_error("token verification failed") + {} + explicit token_verification_exception(const std::string& msg) + : std::runtime_error("token verification failed: " + msg) + {} + }; + + namespace helper { + inline + std::string extract_pubkey_from_cert(const std::string& certstr, const std::string& pw = "") { + // TODO: Cannot find the exact version this change happended +#if OPENSSL_VERSION_NUMBER <= 0x1000114fL + std::unique_ptr certbio(BIO_new_mem_buf(const_cast(certstr.data()), certstr.size()), BIO_free_all); +#else + std::unique_ptr certbio(BIO_new_mem_buf(certstr.data(), certstr.size()), BIO_free_all); +#endif + std::unique_ptr keybio(BIO_new(BIO_s_mem()), BIO_free_all); + + std::unique_ptr cert(PEM_read_bio_X509(certbio.get(), nullptr, nullptr, const_cast(pw.c_str())), X509_free); + if (!cert) throw rsa_exception("Error loading cert into memory"); + std::unique_ptr key(X509_get_pubkey(cert.get()), EVP_PKEY_free); + if(!key) throw rsa_exception("Error getting public key from certificate"); + if(!PEM_write_bio_PUBKEY(keybio.get(), key.get())) throw rsa_exception("Error writing public key data in PEM format"); + char* ptr = nullptr; + auto len = BIO_get_mem_data(keybio.get(), &ptr); + if(len <= 0 || ptr == nullptr) throw rsa_exception("Failed to convert pubkey to pem"); + std::string res(ptr, len); + return res; + } + + inline + 
std::shared_ptr load_public_key_from_string(const std::string& key, const std::string& password = "") { + std::unique_ptr pubkey_bio(BIO_new(BIO_s_mem()), BIO_free_all); + if(key.substr(0, 27) == "-----BEGIN CERTIFICATE-----") { + auto epkey = helper::extract_pubkey_from_cert(key, password); + if ((size_t)BIO_write(pubkey_bio.get(), epkey.data(), epkey.size()) != epkey.size()) + throw rsa_exception("failed to load public key: bio_write failed"); + } else { + if ((size_t)BIO_write(pubkey_bio.get(), key.data(), key.size()) != key.size()) + throw rsa_exception("failed to load public key: bio_write failed"); + } + + std::shared_ptr pkey(PEM_read_bio_PUBKEY(pubkey_bio.get(), nullptr, nullptr, (void*)password.c_str()), EVP_PKEY_free); + if (!pkey) + throw rsa_exception("failed to load public key: PEM_read_bio_PUBKEY failed:" + std::string(ERR_error_string(ERR_get_error(), NULL))); + return pkey; + } + + inline + std::shared_ptr load_private_key_from_string(const std::string& key, const std::string& password = "") { + std::unique_ptr privkey_bio(BIO_new(BIO_s_mem()), BIO_free_all); + if ((size_t)BIO_write(privkey_bio.get(), key.data(), key.size()) != key.size()) + throw rsa_exception("failed to load private key: bio_write failed"); + std::shared_ptr pkey(PEM_read_bio_PrivateKey(privkey_bio.get(), nullptr, nullptr, const_cast(password.c_str())), EVP_PKEY_free); + if (!pkey) + throw rsa_exception("failed to load private key: PEM_read_bio_PrivateKey failed"); + return pkey; + } + } + + namespace algorithm { + /** + * "none" algorithm. + * + * Returns and empty signature and checks if the given signature is empty. + */ + struct none { + /// Return an empty string + std::string sign(const std::string&) const { + return ""; + } + /// Check if the given signature is empty. JWT's with "none" algorithm should not contain a signature. 
+ void verify(const std::string&, const std::string& signature) const { + if (!signature.empty()) + throw signature_verification_exception(); + } + /// Get algorithm name + std::string name() const { + return "none"; + } + }; + /** + * Base class for HMAC family of algorithms + */ + struct hmacsha { + /** + * Construct new hmac algorithm + * \param key Key to use for HMAC + * \param md Pointer to hash function + * \param name Name of the algorithm + */ + hmacsha(std::string key, const EVP_MD*(*md)(), const std::string& name) + : secret(std::move(key)), md(md), alg_name(name) + {} + /** + * Sign jwt data + * \param data The data to sign + * \return HMAC signature for the given data + * \throws signature_generation_exception + */ + std::string sign(const std::string& data) const { + std::string res; + res.resize(EVP_MAX_MD_SIZE); + unsigned int len = res.size(); + if (HMAC(md(), secret.data(), secret.size(), (const unsigned char*)data.data(), data.size(), (unsigned char*)res.data(), &len) == nullptr) + throw signature_generation_exception(); + res.resize(len); + return res; + } + /** + * Check if signature is valid + * \param data The data to check signature against + * \param signature Signature provided by the jwt + * \throws signature_verification_exception If the provided signature does not match + */ + void verify(const std::string& data, const std::string& signature) const { + try { + auto res = sign(data); + bool matched = true; + for (size_t i = 0; i < std::min(res.size(), signature.size()); i++) + if (res[i] != signature[i]) + matched = false; + if (res.size() != signature.size()) + matched = false; + if (!matched) + throw signature_verification_exception(); + } + catch (const signature_generation_exception&) { + throw signature_verification_exception(); + } + } + /** + * Returns the algorithm name provided to the constructor + * \return Algorithmname + */ + std::string name() const { + return alg_name; + } + private: + /// HMAC secrect + const std::string 
secret; + /// HMAC hash generator + const EVP_MD*(*md)(); + /// Algorithmname + const std::string alg_name; + }; + /** + * Base class for RSA family of algorithms + */ + struct rsa { + /** + * Construct new rsa algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + * \param md Pointer to hash function + * \param name Name of the algorithm + */ + rsa(const std::string& public_key, const std::string& private_key, const std::string& public_key_password, const std::string& private_key_password, const EVP_MD*(*md)(), const std::string& name) + : md(md), alg_name(name) + { + if (!private_key.empty()) { + pkey = helper::load_private_key_from_string(private_key, private_key_password); + } else if(!public_key.empty()) { + pkey = helper::load_public_key_from_string(public_key, public_key_password); + } else + throw rsa_exception("at least one of public or private key need to be present"); + } + /** + * Sign jwt data + * \param data The data to sign + * \return RSA signature for the given data + * \throws signature_generation_exception + */ + std::string sign(const std::string& data) const { +#ifdef OPENSSL10 + std::unique_ptr ctx(EVP_MD_CTX_create(), EVP_MD_CTX_destroy); +#else + std::unique_ptr ctx(EVP_MD_CTX_create(), EVP_MD_CTX_free); +#endif + if (!ctx) + throw signature_generation_exception("failed to create signature: could not create context"); + if (!EVP_SignInit(ctx.get(), md())) + throw signature_generation_exception("failed to create signature: SignInit failed"); + + std::string res; + res.resize(EVP_PKEY_size(pkey.get())); + unsigned int len = 0; + + if (!EVP_SignUpdate(ctx.get(), data.data(), data.size())) + throw signature_generation_exception(); + if (!EVP_SignFinal(ctx.get(), (unsigned char*)res.data(), &len, 
pkey.get())) + throw signature_generation_exception(); + + res.resize(len); + return res; + } + /** + * Check if signature is valid + * \param data The data to check signature against + * \param signature Signature provided by the jwt + * \throws signature_verification_exception If the provided signature does not match + */ + void verify(const std::string& data, const std::string& signature) const { +#ifdef OPENSSL10 + std::unique_ptr ctx(EVP_MD_CTX_create(), EVP_MD_CTX_destroy); +#else + std::unique_ptr ctx(EVP_MD_CTX_create(), EVP_MD_CTX_free); +#endif + if (!ctx) + throw signature_verification_exception("failed to verify signature: could not create context"); + if (!EVP_VerifyInit(ctx.get(), md())) + throw signature_verification_exception("failed to verify signature: VerifyInit failed"); + if (!EVP_VerifyUpdate(ctx.get(), data.data(), data.size())) + throw signature_verification_exception("failed to verify signature: VerifyUpdate failed"); + auto res = EVP_VerifyFinal(ctx.get(), (const unsigned char*)signature.data(), signature.size(), pkey.get()); + if (res != 1) + throw signature_verification_exception("evp verify final failed: " + std::to_string(res) + " " + ERR_error_string(ERR_get_error(), NULL)); + } + /** + * Returns the algorithm name provided to the constructor + * \return Algorithmname + */ + std::string name() const { + return alg_name; + } + private: + /// OpenSSL structure containing converted keys + std::shared_ptr pkey; + /// Hash generator + const EVP_MD*(*md)(); + /// Algorithmname + const std::string alg_name; + }; + /** + * Base class for ECDSA family of algorithms + */ + struct ecdsa { + /** + * Construct new ecdsa algorithm + * \param public_key ECDSA public key in PEM format + * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. 
+ * \param md Pointer to hash function + * \param name Name of the algorithm + */ + ecdsa(const std::string& public_key, const std::string& private_key, const std::string& public_key_password, const std::string& private_key_password, const EVP_MD*(*md)(), const std::string& name, size_t siglen) + : md(md), alg_name(name), signature_length(siglen) + { + if (!public_key.empty()) { + std::unique_ptr pubkey_bio(BIO_new(BIO_s_mem()), BIO_free_all); + if(public_key.substr(0, 27) == "-----BEGIN CERTIFICATE-----") { + auto epkey = helper::extract_pubkey_from_cert(public_key, public_key_password); + if ((size_t)BIO_write(pubkey_bio.get(), epkey.data(), epkey.size()) != epkey.size()) + throw ecdsa_exception("failed to load public key: bio_write failed"); + } else { + if ((size_t)BIO_write(pubkey_bio.get(), public_key.data(), public_key.size()) != public_key.size()) + throw ecdsa_exception("failed to load public key: bio_write failed"); + } + + pkey.reset(PEM_read_bio_EC_PUBKEY(pubkey_bio.get(), nullptr, nullptr, (void*)public_key_password.c_str()), EC_KEY_free); + if (!pkey) + throw ecdsa_exception("failed to load public key: PEM_read_bio_EC_PUBKEY failed:" + std::string(ERR_error_string(ERR_get_error(), NULL))); + size_t keysize = EC_GROUP_get_degree(EC_KEY_get0_group(pkey.get())); + if(keysize != signature_length*4 && (signature_length != 132 || keysize != 521)) + throw ecdsa_exception("invalid key size"); + } + + if (!private_key.empty()) { + std::unique_ptr privkey_bio(BIO_new(BIO_s_mem()), BIO_free_all); + if ((size_t)BIO_write(privkey_bio.get(), private_key.data(), private_key.size()) != private_key.size()) + throw rsa_exception("failed to load private key: bio_write failed"); + pkey.reset(PEM_read_bio_ECPrivateKey(privkey_bio.get(), nullptr, nullptr, const_cast(private_key_password.c_str())), EC_KEY_free); + if (!pkey) + throw rsa_exception("failed to load private key: PEM_read_bio_ECPrivateKey failed"); + size_t keysize = 
EC_GROUP_get_degree(EC_KEY_get0_group(pkey.get())); + if(keysize != signature_length*4 && (signature_length != 132 || keysize != 521)) + throw ecdsa_exception("invalid key size"); + } + if(!pkey) + throw rsa_exception("at least one of public or private key need to be present"); + + if(EC_KEY_check_key(pkey.get()) == 0) + throw ecdsa_exception("failed to load key: key is invalid"); + } + /** + * Sign jwt data + * \param data The data to sign + * \return ECDSA signature for the given data + * \throws signature_generation_exception + */ + std::string sign(const std::string& data) const { + const std::string hash = generate_hash(data); + + std::unique_ptr + sig(ECDSA_do_sign((const unsigned char*)hash.data(), hash.size(), pkey.get()), ECDSA_SIG_free); + if(!sig) + throw signature_generation_exception(); +#ifdef OPENSSL10 + + auto rr = bn2raw(sig->r); + auto rs = bn2raw(sig->s); +#else + const BIGNUM *r; + const BIGNUM *s; + ECDSA_SIG_get0(sig.get(), &r, &s); + auto rr = bn2raw(r); + auto rs = bn2raw(s); +#endif + if(rr.size() > signature_length/2 || rs.size() > signature_length/2) + throw std::logic_error("bignum size exceeded expected length"); + while(rr.size() != signature_length/2) rr = '\0' + rr; + while(rs.size() != signature_length/2) rs = '\0' + rs; + return rr + rs; + } + + /** + * Check if signature is valid + * \param data The data to check signature against + * \param signature Signature provided by the jwt + * \throws signature_verification_exception If the provided signature does not match + */ + void verify(const std::string& data, const std::string& signature) const { + const std::string hash = generate_hash(data); + auto r = raw2bn(signature.substr(0, signature.size() / 2)); + auto s = raw2bn(signature.substr(signature.size() / 2)); + +#ifdef OPENSSL10 + ECDSA_SIG sig; + sig.r = r.get(); + sig.s = s.get(); + + if(ECDSA_do_verify((const unsigned char*)hash.data(), hash.size(), &sig, pkey.get()) != 1) + throw signature_verification_exception("Invalid 
signature"); +#else + std::unique_ptr sig(ECDSA_SIG_new(), ECDSA_SIG_free); + + ECDSA_SIG_set0(sig.get(), r.release(), s.release()); + + if(ECDSA_do_verify((const unsigned char*)hash.data(), hash.size(), sig.get(), pkey.get()) != 1) + throw signature_verification_exception("Invalid signature"); +#endif + } + /** + * Returns the algorithm name provided to the constructor + * \return Algorithmname + */ + std::string name() const { + return alg_name; + } + private: + /** + * Convert a OpenSSL BIGNUM to a std::string + * \param bn BIGNUM to convert + * \return bignum as string + */ +#ifdef OPENSSL10 + static std::string bn2raw(BIGNUM* bn) +#else + static std::string bn2raw(const BIGNUM* bn) +#endif + { + std::string res; + res.resize(BN_num_bytes(bn)); + BN_bn2bin(bn, (unsigned char*)res.data()); + return res; + } + /** + * Convert an std::string to a OpenSSL BIGNUM + * \param raw String to convert + * \return BIGNUM representation + */ + static std::unique_ptr raw2bn(const std::string& raw) { + return std::unique_ptr(BN_bin2bn((const unsigned char*)raw.data(), raw.size(), nullptr), BN_free); + } + + /** + * Hash the provided data using the hash function specified in constructor + * \param data Data to hash + * \return Hash of data + */ + std::string generate_hash(const std::string& data) const { +#ifdef OPENSSL10 + std::unique_ptr ctx(EVP_MD_CTX_create(), &EVP_MD_CTX_destroy); +#else + std::unique_ptr ctx(EVP_MD_CTX_new(), EVP_MD_CTX_free); +#endif + if(EVP_DigestInit(ctx.get(), md()) == 0) + throw signature_generation_exception("EVP_DigestInit failed"); + if(EVP_DigestUpdate(ctx.get(), data.data(), data.size()) == 0) + throw signature_generation_exception("EVP_DigestUpdate failed"); + unsigned int len = 0; + std::string res; + res.resize(EVP_MD_CTX_size(ctx.get())); + if(EVP_DigestFinal(ctx.get(), (unsigned char*)res.data(), &len) == 0) + throw signature_generation_exception("EVP_DigestFinal failed"); + res.resize(len); + return res; + } + + /// OpenSSL struct 
containing keys + std::shared_ptr pkey; + /// Hash generator function + const EVP_MD*(*md)(); + /// Algorithmname + const std::string alg_name; + /// Length of the resulting signature + const size_t signature_length; + }; + + /** + * Base class for PSS-RSA family of algorithms + */ + struct pss { + /** + * Construct new pss algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + * \param md Pointer to hash function + * \param name Name of the algorithm + */ + pss(const std::string& public_key, const std::string& private_key, const std::string& public_key_password, const std::string& private_key_password, const EVP_MD*(*md)(), const std::string& name) + : md(md), alg_name(name) + { + if (!private_key.empty()) { + pkey = helper::load_private_key_from_string(private_key, private_key_password); + } else if(!public_key.empty()) { + pkey = helper::load_public_key_from_string(public_key, public_key_password); + } else + throw rsa_exception("at least one of public or private key need to be present"); + } + /** + * Sign jwt data + * \param data The data to sign + * \return ECDSA signature for the given data + * \throws signature_generation_exception + */ + std::string sign(const std::string& data) const { + auto hash = this->generate_hash(data); + + std::unique_ptr key(EVP_PKEY_get1_RSA(pkey.get()), RSA_free); + const int size = RSA_size(key.get()); + + std::string padded(size, 0x00); + if (!RSA_padding_add_PKCS1_PSS_mgf1(key.get(), (unsigned char*)padded.data(), (const unsigned char*)hash.data(), md(), md(), -1)) + throw signature_generation_exception("failed to create signature: RSA_padding_add_PKCS1_PSS_mgf1 failed"); + + std::string res(size, 0x00); + if (RSA_private_encrypt(size, (const unsigned char*)padded.data(), 
(unsigned char*)res.data(), key.get(), RSA_NO_PADDING) < 0) + throw signature_generation_exception("failed to create signature: RSA_private_encrypt failed"); + return res; + } + /** + * Check if signature is valid + * \param data The data to check signature against + * \param signature Signature provided by the jwt + * \throws signature_verification_exception If the provided signature does not match + */ + void verify(const std::string& data, const std::string& signature) const { + auto hash = this->generate_hash(data); + + std::unique_ptr key(EVP_PKEY_get1_RSA(pkey.get()), RSA_free); + const int size = RSA_size(key.get()); + + std::string sig(size, 0x00); + if(!RSA_public_decrypt(signature.size(), (const unsigned char*)signature.data(), (unsigned char*)sig.data(), key.get(), RSA_NO_PADDING)) + throw signature_verification_exception("Invalid signature"); + + if(!RSA_verify_PKCS1_PSS_mgf1(key.get(), (const unsigned char*)hash.data(), md(), md(), (const unsigned char*)sig.data(), -1)) + throw signature_verification_exception("Invalid signature"); + } + /** + * Returns the algorithm name provided to the constructor + * \return Algorithmname + */ + std::string name() const { + return alg_name; + } + private: + /** + * Hash the provided data using the hash function specified in constructor + * \param data Data to hash + * \return Hash of data + */ + std::string generate_hash(const std::string& data) const { +#ifdef OPENSSL10 + std::unique_ptr ctx(EVP_MD_CTX_create(), &EVP_MD_CTX_destroy); +#else + std::unique_ptr ctx(EVP_MD_CTX_new(), EVP_MD_CTX_free); +#endif + if(EVP_DigestInit(ctx.get(), md()) == 0) + throw signature_generation_exception("EVP_DigestInit failed"); + if(EVP_DigestUpdate(ctx.get(), data.data(), data.size()) == 0) + throw signature_generation_exception("EVP_DigestUpdate failed"); + unsigned int len = 0; + std::string res; + res.resize(EVP_MD_CTX_size(ctx.get())); + if(EVP_DigestFinal(ctx.get(), (unsigned char*)res.data(), &len) == 0) + throw 
signature_generation_exception("EVP_DigestFinal failed"); + res.resize(len); + return res; + } + + /// OpenSSL structure containing keys + std::shared_ptr pkey; + /// Hash generator function + const EVP_MD*(*md)(); + /// Algorithmname + const std::string alg_name; + }; + + /** + * HS256 algorithm + */ + struct hs256 : public hmacsha { + /** + * Construct new instance of algorithm + * \param key HMAC signing key + */ + explicit hs256(std::string key) + : hmacsha(std::move(key), EVP_sha256, "HS256") + {} + }; + /** + * HS384 algorithm + */ + struct hs384 : public hmacsha { + /** + * Construct new instance of algorithm + * \param key HMAC signing key + */ + explicit hs384(std::string key) + : hmacsha(std::move(key), EVP_sha384, "HS384") + {} + }; + /** + * HS512 algorithm + */ + struct hs512 : public hmacsha { + /** + * Construct new instance of algorithm + * \param key HMAC signing key + */ + explicit hs512(std::string key) + : hmacsha(std::move(key), EVP_sha512, "HS512") + {} + }; + /** + * RS256 algorithm + */ + struct rs256 : public rsa { + /** + * Construct new instance of algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit rs256(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : rsa(public_key, private_key, public_key_password, private_key_password, EVP_sha256, "RS256") + {} + }; + /** + * RS384 algorithm + */ + struct rs384 : public rsa { + /** + * Construct new instance of algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. 
+ * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit rs384(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : rsa(public_key, private_key, public_key_password, private_key_password, EVP_sha384, "RS384") + {} + }; + /** + * RS512 algorithm + */ + struct rs512 : public rsa { + /** + * Construct new instance of algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit rs512(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : rsa(public_key, private_key, public_key_password, private_key_password, EVP_sha512, "RS512") + {} + }; + /** + * ES256 algorithm + */ + struct es256 : public ecdsa { + /** + * Construct new instance of algorithm + * \param public_key ECDSA public key in PEM format + * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. 
+ */ + explicit es256(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : ecdsa(public_key, private_key, public_key_password, private_key_password, EVP_sha256, "ES256", 64) + {} + }; + /** + * ES384 algorithm + */ + struct es384 : public ecdsa { + /** + * Construct new instance of algorithm + * \param public_key ECDSA public key in PEM format + * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit es384(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : ecdsa(public_key, private_key, public_key_password, private_key_password, EVP_sha384, "ES384", 96) + {} + }; + /** + * ES512 algorithm + */ + struct es512 : public ecdsa { + /** + * Construct new instance of algorithm + * \param public_key ECDSA public key in PEM format + * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit es512(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : ecdsa(public_key, private_key, public_key_password, private_key_password, EVP_sha512, "ES512", 132) + {} + }; + + /** + * PS256 algorithm + */ + struct ps256 : public pss { + /** + * Construct new instance of algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. 
+ * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit ps256(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : pss(public_key, private_key, public_key_password, private_key_password, EVP_sha256, "PS256") + {} + }; + /** + * PS384 algorithm + */ + struct ps384 : public pss { + /** + * Construct new instance of algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. + */ + explicit ps384(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : pss(public_key, private_key, public_key_password, private_key_password, EVP_sha384, "PS384") + {} + }; + /** + * PS512 algorithm + */ + struct ps512 : public pss { + /** + * Construct new instance of algorithm + * \param public_key RSA public key in PEM format + * \param private_key RSA private key or empty string if not available. If empty, signing will always fail. + * \param public_key_password Password to decrypt public key pem. + * \param privat_key_password Password to decrypt private key pem. 
+ */ + explicit ps512(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "") + : pss(public_key, private_key, public_key_password, private_key_password, EVP_sha512, "PS512") + {} + }; + } + + /** + * Convenience wrapper for JSON value + */ + class claim { + picojson::value val; + public: + enum class type { + null, + boolean, + number, + string, + array, + object, + int64 + }; + + claim() + : val() + {} +#if JWT_CLAIM_EXPLICIT + explicit claim(std::string s) + : val(std::move(s)) + {} + explicit claim(const date& s) + : val(int64_t(std::chrono::system_clock::to_time_t(s))) + {} + explicit claim(const std::set& s) + : val(picojson::array(s.cbegin(), s.cend())) + {} + explicit claim(const picojson::value& val) + : val(val) + {} +#else + claim(std::string s) + : val(std::move(s)) + {} + claim(const date& s) + : val(int64_t(std::chrono::system_clock::to_time_t(s))) + {} + claim(const std::set& s) + : val(picojson::array(s.cbegin(), s.cend())) + {} + claim(const picojson::value& val) + : val(val) + {} +#endif + + template + claim(Iterator start, Iterator end) + : val(picojson::array()) + { + auto& arr = val.get(); + for(; start != end; start++) { + arr.push_back(picojson::value(*start)); + } + } + + /** + * Get wrapped json object + * \return Wrapped json object + */ + picojson::value to_json() const { + return val; + } + + /** + * Get type of contained object + * \return Type + * \throws std::logic_error An internal error occured + */ + type get_type() const { + if (val.is()) return type::null; + else if (val.is()) return type::boolean; + else if (val.is()) return type::int64; + else if (val.is()) return type::number; + else if (val.is()) return type::string; + else if (val.is()) return type::array; + else if (val.is()) return type::object; + else throw std::logic_error("internal error"); + } + + /** + * Get the contained object as a string + * \return content as 
string + * \throws std::bad_cast Content was not a string + */ + const std::string& as_string() const { + if (!val.is()) + throw std::bad_cast(); + return val.get(); + } + /** + * Get the contained object as a date + * \return content as date + * \throws std::bad_cast Content was not a date + */ + date as_date() const { + return std::chrono::system_clock::from_time_t(as_int()); + } + /** + * Get the contained object as an array + * \return content as array + * \throws std::bad_cast Content was not an array + */ + const picojson::array& as_array() const { + if (!val.is()) + throw std::bad_cast(); + return val.get(); + } + /** + * Get the contained object as a set of strings + * \return content as set of strings + * \throws std::bad_cast Content was not a set + */ + const std::set as_set() const { + std::set res; + for(auto& e : as_array()) { + if(!e.is()) + throw std::bad_cast(); + res.insert(e.get()); + } + return res; + } + /** + * Get the contained object as an integer + * \return content as int + * \throws std::bad_cast Content was not an int + */ + int64_t as_int() const { + if (!val.is()) + throw std::bad_cast(); + return val.get(); + } + /** + * Get the contained object as a bool + * \return content as bool + * \throws std::bad_cast Content was not a bool + */ + bool as_bool() const { + if (!val.is()) + throw std::bad_cast(); + return val.get(); + } + /** + * Get the contained object as a number + * \return content as double + * \throws std::bad_cast Content was not a number + */ + double as_number() const { + if (!val.is()) + throw std::bad_cast(); + return val.get(); + } + /** + * Get the contained object as an object + * \return content as object + * \throws std::bad_cast Content was not an object + */ + const picojson::object& as_object() const { + if (!val.is()) + throw std::bad_cast(); + return val.get(); + } + }; + + /** + * Base class that represents a token payload. + * Contains Convenience accessors for common claims. 
+ */ + class payload { + protected: + std::unordered_map payload_claims; + public: + /** + * Check if issuer is present ("iss") + * \return true if present, false otherwise + */ + bool has_issuer() const noexcept { return has_payload_claim("iss"); } + /** + * Check if subject is present ("sub") + * \return true if present, false otherwise + */ + bool has_subject() const noexcept { return has_payload_claim("sub"); } + /** + * Check if audience is present ("aud") + * \return true if present, false otherwise + */ + bool has_audience() const noexcept { return has_payload_claim("aud"); } + /** + * Check if expires is present ("exp") + * \return true if present, false otherwise + */ + bool has_expires_at() const noexcept { return has_payload_claim("exp"); } + /** + * Check if not before is present ("nbf") + * \return true if present, false otherwise + */ + bool has_not_before() const noexcept { return has_payload_claim("nbf"); } + /** + * Check if issued at is present ("iat") + * \return true if present, false otherwise + */ + bool has_issued_at() const noexcept { return has_payload_claim("iat"); } + /** + * Check if token id is present ("jti") + * \return true if present, false otherwise + */ + bool has_id() const noexcept { return has_payload_claim("jti"); } + /** + * Get issuer claim + * \return issuer as string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_issuer() const { return get_payload_claim("iss").as_string(); } + /** + * Get subject claim + * \return subject as string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_subject() const { return get_payload_claim("sub").as_string(); } + /** + * Get audience claim + * \return audience as a set of strings + * \throws std::runtime_error If claim was 
not present + * \throws std::bad_cast Claim was present but not a set (Should not happen in a valid token) + */ + std::set get_audience() const { + auto aud = get_payload_claim("aud"); + if(aud.get_type() == jwt::claim::type::string) return { aud.as_string()}; + else return aud.as_set(); + } + /** + * Get expires claim + * \return expires as a date in utc + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a date (Should not happen in a valid token) + */ + const date get_expires_at() const { return get_payload_claim("exp").as_date(); } + /** + * Get not valid before claim + * \return nbf date in utc + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a date (Should not happen in a valid token) + */ + const date get_not_before() const { return get_payload_claim("nbf").as_date(); } + /** + * Get issued at claim + * \return issued at as date in utc + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a date (Should not happen in a valid token) + */ + const date get_issued_at() const { return get_payload_claim("iat").as_date(); } + /** + * Get id claim + * \return id as string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_id() const { return get_payload_claim("jti").as_string(); } + /** + * Check if a payload claim is present + * \return true if claim was present, false otherwise + */ + bool has_payload_claim(const std::string& name) const noexcept { return payload_claims.count(name) != 0; } + /** + * Get payload claim + * \return Requested claim + * \throws std::runtime_error If claim was not present + */ + const claim& get_payload_claim(const std::string& name) const { + if (!has_payload_claim(name)) + throw std::runtime_error("claim not found"); + return 
payload_claims.at(name); + } + /** + * Get all payload claims + * \return map of claims + */ + std::unordered_map get_payload_claims() const { return payload_claims; } + }; + + /** + * Base class that represents a token header. + * Contains Convenience accessors for common claims. + */ + class header { + protected: + std::unordered_map header_claims; + public: + /** + * Check if algortihm is present ("alg") + * \return true if present, false otherwise + */ + bool has_algorithm() const noexcept { return has_header_claim("alg"); } + /** + * Check if type is present ("typ") + * \return true if present, false otherwise + */ + bool has_type() const noexcept { return has_header_claim("typ"); } + /** + * Check if content type is present ("cty") + * \return true if present, false otherwise + */ + bool has_content_type() const noexcept { return has_header_claim("cty"); } + /** + * Check if key id is present ("kid") + * \return true if present, false otherwise + */ + bool has_key_id() const noexcept { return has_header_claim("kid"); } + /** + * Get algorithm claim + * \return algorithm as string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_algorithm() const { return get_header_claim("alg").as_string(); } + /** + * Get type claim + * \return type as a string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_type() const { return get_header_claim("typ").as_string(); } + /** + * Get content type claim + * \return content type as string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_content_type() const { return get_header_claim("cty").as_string(); } + /** + * Get key id claim 
+ * \return key id as string + * \throws std::runtime_error If claim was not present + * \throws std::bad_cast Claim was present but not a string (Should not happen in a valid token) + */ + const std::string& get_key_id() const { return get_header_claim("kid").as_string(); } + /** + * Check if a header claim is present + * \return true if claim was present, false otherwise + */ + bool has_header_claim(const std::string& name) const noexcept { return header_claims.count(name) != 0; } + /** + * Get header claim + * \return Requested claim + * \throws std::runtime_error If claim was not present + */ + const claim& get_header_claim(const std::string& name) const { + if (!has_header_claim(name)) + throw std::runtime_error("claim not found"); + return header_claims.at(name); + } + /** + * Get all header claims + * \return map of claims + */ + std::unordered_map get_header_claims() const { return header_claims; } + }; + + /** + * Class containing all information about a decoded token + */ + class decoded_jwt : public header, public payload { + protected: + /// Unmodifed token, as passed to constructor + const std::string token; + /// Header part decoded from base64 + std::string header; + /// Unmodified header part in base64 + std::string header_base64; + /// Payload part decoded from base64 + std::string payload; + /// Unmodified payload part in base64 + std::string payload_base64; + /// Signature part decoded from base64 + std::string signature; + /// Unmodified signature part in base64 + std::string signature_base64; + public: + /** + * Constructor + * Parses a given token + * \param token The token to parse + * \throws std::invalid_argument Token is not in correct format + * \throws std::runtime_error Base64 decoding failed or invalid json + */ + explicit decoded_jwt(const std::string& token) + : token(token) + { + auto hdr_end = token.find('.'); + if (hdr_end == std::string::npos) + throw std::invalid_argument("invalid token supplied"); + auto payload_end = 
token.find('.', hdr_end + 1); + if (payload_end == std::string::npos) + throw std::invalid_argument("invalid token supplied"); + header = header_base64 = token.substr(0, hdr_end); + payload = payload_base64 = token.substr(hdr_end + 1, payload_end - hdr_end - 1); + signature = signature_base64 = token.substr(payload_end + 1); + + // Fix padding: JWT requires padding to get removed + auto fix_padding = [](std::string& str) { + switch (str.size() % 4) { + case 1: + str += alphabet::base64url::fill(); +#ifdef __has_cpp_attribute +#if __has_cpp_attribute(fallthrough) + [[fallthrough]]; +#endif +#endif + case 2: + str += alphabet::base64url::fill(); +#ifdef __has_cpp_attribute +#if __has_cpp_attribute(fallthrough) + [[fallthrough]]; +#endif +#endif + case 3: + str += alphabet::base64url::fill(); +#ifdef __has_cpp_attribute +#if __has_cpp_attribute(fallthrough) + [[fallthrough]]; +#endif +#endif + default: + break; + } + }; + fix_padding(header); + fix_padding(payload); + fix_padding(signature); + + header = base::decode(header); + payload = base::decode(payload); + signature = base::decode(signature); + + auto parse_claims = [](const std::string& str) { + std::unordered_map res; + picojson::value val; + if (!picojson::parse(val, str).empty()) + throw std::runtime_error("Invalid json"); + + for (auto& e : val.get()) { res.insert({ e.first, claim(e.second) }); } + + return res; + }; + + header_claims = parse_claims(header); + payload_claims = parse_claims(payload); + } + + /** + * Get token string, as passed to constructor + * \return token as passed to constructor + */ + const std::string& get_token() const noexcept { return token; } + /** + * Get header part as json string + * \return header part after base64 decoding + */ + const std::string& get_header() const noexcept { return header; } + /** + * Get payload part as json string + * \return payload part after base64 decoding + */ + const std::string& get_payload() const noexcept { return payload; } + /** + * Get 
signature part as json string + * \return signature part after base64 decoding + */ + const std::string& get_signature() const noexcept { return signature; } + /** + * Get header part as base64 string + * \return header part before base64 decoding + */ + const std::string& get_header_base64() const noexcept { return header_base64; } + /** + * Get payload part as base64 string + * \return payload part before base64 decoding + */ + const std::string& get_payload_base64() const noexcept { return payload_base64; } + /** + * Get signature part as base64 string + * \return signature part before base64 decoding + */ + const std::string& get_signature_base64() const noexcept { return signature_base64; } + + }; + + /** + * Builder class to build and sign a new token + * Use jwt::create() to get an instance of this class. + */ + class builder { + std::unordered_map header_claims; + std::unordered_map payload_claims; + + builder() {} + friend builder create(); + public: + /** + * Set a header claim. + * \param id Name of the claim + * \param c Claim to add + * \return *this to allow for method chaining + */ + builder& set_header_claim(const std::string& id, claim c) { header_claims[id] = std::move(c); return *this; } + /** + * Set a payload claim. + * \param id Name of the claim + * \param c Claim to add + * \return *this to allow for method chaining + */ + builder& set_payload_claim(const std::string& id, claim c) { payload_claims[id] = std::move(c); return *this; } + /** + * Set algorithm claim + * You normally don't need to do this, as the algorithm is automatically set if you don't change it. 
+ * \param str Name of algorithm + * \return *this to allow for method chaining + */ + builder& set_algorithm(const std::string& str) { return set_header_claim("alg", claim(str)); } + /** + * Set type claim + * \param str Type to set + * \return *this to allow for method chaining + */ + builder& set_type(const std::string& str) { return set_header_claim("typ", claim(str)); } + /** + * Set content type claim + * \param str Type to set + * \return *this to allow for method chaining + */ + builder& set_content_type(const std::string& str) { return set_header_claim("cty", claim(str)); } + /** + * Set key id claim + * \param str Key id to set + * \return *this to allow for method chaining + */ + builder& set_key_id(const std::string& str) { return set_header_claim("kid", claim(str)); } + /** + * Set issuer claim + * \param str Issuer to set + * \return *this to allow for method chaining + */ + builder& set_issuer(const std::string& str) { return set_payload_claim("iss", claim(str)); } + /** + * Set subject claim + * \param str Subject to set + * \return *this to allow for method chaining + */ + builder& set_subject(const std::string& str) { return set_payload_claim("sub", claim(str)); } + /** + * Set audience claim + * \param l Audience set + * \return *this to allow for method chaining + */ + builder& set_audience(const std::set& l) { return set_payload_claim("aud", claim(l)); } + /** + * Set audience claim + * \param aud Single audience + * \return *this to allow for method chaining + */ + builder& set_audience(const std::string& aud) { return set_payload_claim("aud", claim(aud)); } + /** + * Set expires at claim + * \param d Expires time + * \return *this to allow for method chaining + */ + builder& set_expires_at(const date& d) { return set_payload_claim("exp", claim(d)); } + /** + * Set not before claim + * \param d First valid time + * \return *this to allow for method chaining + */ + builder& set_not_before(const date& d) { return set_payload_claim("nbf", 
claim(d)); } + /** + * Set issued at claim + * \param d Issued at time, should be current time + * \return *this to allow for method chaining + */ + builder& set_issued_at(const date& d) { return set_payload_claim("iat", claim(d)); } + /** + * Set id claim + * \param str ID to set + * \return *this to allow for method chaining + */ + builder& set_id(const std::string& str) { return set_payload_claim("jti", claim(str)); } + + /** + * Sign token and return result + * \param algo Instance of an algorithm to sign the token with + * \return Final token as a string + */ + template + std::string sign(const T& algo) const { + picojson::object obj_header; + obj_header["alg"] = picojson::value(algo.name()); + for (auto& e : header_claims) { + obj_header[e.first] = e.second.to_json(); + } + picojson::object obj_payload; + for (auto& e : payload_claims) { + obj_payload.insert({ e.first, e.second.to_json() }); + } + + auto encode = [](const std::string& data) { + auto base = base::encode(data); + auto pos = base.find(alphabet::base64url::fill()); + base = base.substr(0, pos); + return base; + }; + + std::string header = encode(picojson::value(obj_header).serialize()); + std::string payload = encode(picojson::value(obj_payload).serialize()); + + std::string token = header + "." + payload; + + return token + "." + encode(algo.sign(token)); + } + }; + + /** + * Verifier class used to check if a decoded token contains all claims required by your application and has a valid signature. 
+ */ + template + class verifier { + struct algo_base { + virtual ~algo_base() {} + virtual void verify(const std::string& data, const std::string& sig) = 0; + }; + template + struct algo : public algo_base { + T alg; + explicit algo(T a) : alg(a) {} + virtual void verify(const std::string& data, const std::string& sig) override { + alg.verify(data, sig); + } + }; + + /// Required claims + std::unordered_map claims; + /// Leeway time for exp, nbf and iat + size_t default_leeway = 0; + /// Instance of clock type + Clock clock; + /// Supported algorithms + std::unordered_map> algs; + public: + /** + * Constructor for building a new verifier instance + * \param c Clock instance + */ + explicit verifier(Clock c) : clock(c) {} + + /** + * Set default leeway to use. + * \param leeway Default leeway to use if not specified otherwise + * \return *this to allow chaining + */ + verifier& leeway(size_t leeway) { default_leeway = leeway; return *this; } + /** + * Set leeway for expires at. + * If not specified the default leeway will be used. + * \param leeway Set leeway to use for expires at. + * \return *this to allow chaining + */ + verifier& expires_at_leeway(size_t leeway) { return with_claim("exp", claim(std::chrono::system_clock::from_time_t(leeway))); } + /** + * Set leeway for not before. + * If not specified the default leeway will be used. + * \param leeway Set leeway to use for not before. + * \return *this to allow chaining + */ + verifier& not_before_leeway(size_t leeway) { return with_claim("nbf", claim(std::chrono::system_clock::from_time_t(leeway))); } + /** + * Set leeway for issued at. + * If not specified the default leeway will be used. + * \param leeway Set leeway to use for issued at. + * \return *this to allow chaining + */ + verifier& issued_at_leeway(size_t leeway) { return with_claim("iat", claim(std::chrono::system_clock::from_time_t(leeway))); } + /** + * Set an issuer to check for. + * Check is casesensitive. + * \param iss Issuer to check for. 
+ * \return *this to allow chaining + */ + verifier& with_issuer(const std::string& iss) { return with_claim("iss", claim(iss)); } + /** + * Set a subject to check for. + * Check is casesensitive. + * \param sub Subject to check for. + * \return *this to allow chaining + */ + verifier& with_subject(const std::string& sub) { return with_claim("sub", claim(sub)); } + /** + * Set an audience to check for. + * If any of the specified audiences is not present in the token the check fails. + * \param aud Audience to check for. + * \return *this to allow chaining + */ + verifier& with_audience(const std::set& aud) { return with_claim("aud", claim(aud)); } + /** + * Set an id to check for. + * Check is casesensitive. + * \param id ID to check for. + * \return *this to allow chaining + */ + verifier& with_id(const std::string& id) { return with_claim("jti", claim(id)); } + /** + * Specify a claim to check for. + * \param name Name of the claim to check for + * \param c Claim to check for + * \return *this to allow chaining + */ + verifier& with_claim(const std::string& name, claim c) { claims[name] = c; return *this; } + + /** + * Add an algorithm available for checking. + * \param alg Algorithm to allow + * \return *this to allow chaining + */ + template + verifier& allow_algorithm(Algorithm alg) { + algs[alg.name()] = std::make_shared>(alg); + return *this; + } + + /** + * Verify the given token. + * \param jwt Token to check + * \throws token_verification_exception Verification failed + */ + void verify(const decoded_jwt& jwt) const { + const std::string data = jwt.get_header_base64() + "." 
+ jwt.get_payload_base64(); + const std::string sig = jwt.get_signature(); + const std::string& algo = jwt.get_algorithm(); + if (algs.count(algo) == 0) + throw token_verification_exception("wrong algorithm"); + algs.at(algo)->verify(data, sig); + + auto assert_claim_eq = [](const decoded_jwt& jwt, const std::string& key, const claim& c) { + if (!jwt.has_payload_claim(key)) + throw token_verification_exception("decoded_jwt is missing " + key + " claim"); + auto& jc = jwt.get_payload_claim(key); + if (jc.get_type() != c.get_type()) + throw token_verification_exception("claim " + key + " type mismatch"); + if (c.get_type() == claim::type::int64) { + if (c.as_date() != jc.as_date()) + throw token_verification_exception("claim " + key + " does not match expected"); + } + else if (c.get_type() == claim::type::array) { + auto s1 = c.as_set(); + auto s2 = jc.as_set(); + if (s1.size() != s2.size()) + throw token_verification_exception("claim " + key + " does not match expected"); + auto it1 = s1.cbegin(); + auto it2 = s2.cbegin(); + while (it1 != s1.cend() && it2 != s2.cend()) { + if (*it1++ != *it2++) + throw token_verification_exception("claim " + key + " does not match expected"); + } + } + else if (c.get_type() == claim::type::string) { + if (c.as_string() != jc.as_string()) + throw token_verification_exception("claim " + key + " does not match expected"); + } + else throw token_verification_exception("internal error"); + }; + + auto time = clock.now(); + + if (jwt.has_expires_at()) { + auto leeway = claims.count("exp") == 1 ? std::chrono::system_clock::to_time_t(claims.at("exp").as_date()) : default_leeway; + auto exp = jwt.get_expires_at(); + if (time > exp + std::chrono::seconds(leeway)) + throw token_verification_exception("token expired"); + } + if (jwt.has_issued_at()) { + auto leeway = claims.count("iat") == 1 ? 
std::chrono::system_clock::to_time_t(claims.at("iat").as_date()) : default_leeway; + auto iat = jwt.get_issued_at(); + if (time < iat - std::chrono::seconds(leeway)) + throw token_verification_exception("token expired"); + } + if (jwt.has_not_before()) { + auto leeway = claims.count("nbf") == 1 ? std::chrono::system_clock::to_time_t(claims.at("nbf").as_date()) : default_leeway; + auto nbf = jwt.get_not_before(); + if (time < nbf - std::chrono::seconds(leeway)) + throw token_verification_exception("token expired"); + } + for (auto& c : claims) + { + if (c.first == "exp" || c.first == "iat" || c.first == "nbf") { + // Nothing to do here, already checked + } + else if (c.first == "aud") { + if (!jwt.has_audience()) + throw token_verification_exception("token doesn't contain the required audience"); + auto aud = jwt.get_audience(); + auto expected = c.second.as_set(); + for (auto& e : expected) + if (aud.count(e) == 0) + throw token_verification_exception("token doesn't contain the required audience"); + } + else { + assert_claim_eq(jwt, c.first, c.second); + } + } + } + }; + + /** + * Create a verifier using the given clock + * \param c Clock instance to use + * \return verifier instance + */ + template + verifier verify(Clock c) { + return verifier(c); + } + + /** + * Default clock class using std::chrono::system_clock as a backend. 
+ */ + struct default_clock { + std::chrono::system_clock::time_point now() const { + return std::chrono::system_clock::now(); + } + }; + + /** + * Create a verifier using the default clock + * \return verifier instance + */ + inline + verifier verify() { + return verify({}); + } + + /** + * Return a builder instance to create a new token + */ + inline + builder create() { + return builder(); + } + + /** + * Decode a token + * \param token Token to decode + * \return Decoded token + * \throws std::invalid_argument Token is not in correct format + * \throws std::runtime_error Base64 decoding failed or invalid json + */ + inline + decoded_jwt decode(const std::string& token) { + return decoded_jwt(token); + } +} diff --git a/src/rgw/librgw.cc b/src/rgw/librgw.cc new file mode 100644 index 000000000..bf6fc50d7 --- /dev/null +++ b/src/rgw/librgw.cc @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include + +#include "include/rados/librgw.h" + +#include "include/str_list.h" +#include "common/ceph_argparse.h" +#include "common/ceph_context.h" +#include "common/dout.h" + +#include "rgw_lib.h" + +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rgw + +namespace rgw { + +bool global_stop = false; +static std::mutex librgw_mtx; +static RGWLib rgwlib; + +} // namespace rgw + +extern "C" { + +int librgw_create(librgw_t* rgw, int argc, char **argv) +{ + using namespace rgw; + + int rc = -EINVAL; + + g_rgwlib = &rgwlib; + + if (! g_ceph_context) { + std::lock_guard lg(librgw_mtx); + if (! 
g_ceph_context) { + std::vector spl_args; + // last non-0 argument will be split and consumed + if (argc > 1) { + const std::string spl_arg{argv[(--argc)]}; + get_str_vec(spl_arg, " \t", spl_args); + } + auto args = argv_to_vec(argc, argv); + // append split args, if any + for (const auto& elt : spl_args) { + args.push_back(elt.c_str()); + } + rc = rgwlib.init(args); + } + } + + *rgw = g_ceph_context->get(); + + return rc; +} + +void librgw_shutdown(librgw_t rgw) +{ + using namespace rgw; + + CephContext* cct = static_cast(rgw); + rgwlib.stop(); + + dout(1) << "final shutdown" << dendl; + + cct->put(); +} + +} /* extern "C" */ diff --git a/src/rgw/picojson/picojson.h b/src/rgw/picojson/picojson.h new file mode 100644 index 000000000..ceaeb5ba8 --- /dev/null +++ b/src/rgw/picojson/picojson.h @@ -0,0 +1,1177 @@ +/* + * Copyright 2009-2010 Cybozu Labs, Inc. + * Copyright 2011-2014 Kazuho Oku + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef picojson_h +#define picojson_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// for isnan/isinf +#if __cplusplus >= 201103L +#include +#else +extern "C" { +#ifdef _MSC_VER +#include +#elif defined(__INTEL_COMPILER) +#include +#else +#include +#endif +} +#endif + +#ifndef PICOJSON_USE_RVALUE_REFERENCE +#if (defined(__cpp_rvalue_references) && __cpp_rvalue_references >= 200610) || (defined(_MSC_VER) && _MSC_VER >= 1600) +#define PICOJSON_USE_RVALUE_REFERENCE 1 +#else +#define PICOJSON_USE_RVALUE_REFERENCE 0 +#endif +#endif // PICOJSON_USE_RVALUE_REFERENCE + +#ifndef PICOJSON_NOEXCEPT +#if PICOJSON_USE_RVALUE_REFERENCE +#define PICOJSON_NOEXCEPT noexcept +#else +#define PICOJSON_NOEXCEPT throw() +#endif +#endif + +// experimental support for int64_t (see README.mkdn for detail) +#ifdef PICOJSON_USE_INT64 +//#define __STDC_FORMAT_MACROS +#include +#if __cplusplus >= 201103L +#include +#else +extern "C" { +#include +} +#endif +#endif + +// to disable the use of localeconv(3), set PICOJSON_USE_LOCALE to 0 +#ifndef PICOJSON_USE_LOCALE +#define PICOJSON_USE_LOCALE 1 +#endif +#if PICOJSON_USE_LOCALE +extern "C" { +#include +} +#endif + +#ifndef PICOJSON_ASSERT +#define PICOJSON_ASSERT(e) \ + do { \ + if (!(e)) \ + throw std::runtime_error(#e); \ + } while (0) +#endif + +#ifdef _MSC_VER +#define SNPRINTF _snprintf_s +#pragma warning(push) 
+#pragma warning(disable : 4244) // conversion from int to char +#pragma warning(disable : 4127) // conditional expression is constant +#pragma warning(disable : 4702) // unreachable code +#else +#define SNPRINTF snprintf +#endif + +namespace picojson { + +enum { + null_type, + boolean_type, + number_type, + string_type, + array_type, + object_type +#ifdef PICOJSON_USE_INT64 + , + int64_type +#endif +}; + +enum { INDENT_WIDTH = 2 }; + +struct null {}; + +class value { +public: + typedef std::vector array; + typedef std::map object; + union _storage { + bool boolean_; + double number_; +#ifdef PICOJSON_USE_INT64 + int64_t int64_; +#endif + std::string *string_; + array *array_; + object *object_; + }; + +protected: + int type_; + _storage u_; + +public: + value(); + value(int type, bool); + explicit value(bool b); +#ifdef PICOJSON_USE_INT64 + explicit value(int64_t i); +#endif + explicit value(double n); + explicit value(const std::string &s); + explicit value(const array &a); + explicit value(const object &o); +#if PICOJSON_USE_RVALUE_REFERENCE + explicit value(std::string &&s); + explicit value(array &&a); + explicit value(object &&o); +#endif + explicit value(const char *s); + value(const char *s, size_t len); + ~value(); + value(const value &x); + value &operator=(const value &x); +#if PICOJSON_USE_RVALUE_REFERENCE + value(value &&x) PICOJSON_NOEXCEPT; + value &operator=(value &&x) PICOJSON_NOEXCEPT; +#endif + void swap(value &x) PICOJSON_NOEXCEPT; + template bool is() const; + template const T &get() const; + template T &get(); + template void set(const T &); +#if PICOJSON_USE_RVALUE_REFERENCE + template void set(T &&); +#endif + bool evaluate_as_boolean() const; + const value &get(const size_t idx) const; + const value &get(const std::string &key) const; + value &get(const size_t idx); + value &get(const std::string &key); + + bool contains(const size_t idx) const; + bool contains(const std::string &key) const; + std::string to_str() const; + template void 
serialize(Iter os, bool prettify = false) const; + std::string serialize(bool prettify = false) const; + +private: + template value(const T *); // intentionally defined to block implicit conversion of pointer to bool + template static void _indent(Iter os, int indent); + template void _serialize(Iter os, int indent) const; + std::string _serialize(int indent) const; + void clear(); +}; + +typedef value::array array; +typedef value::object object; + +inline value::value() : type_(null_type), u_() { +} + +inline value::value(int type, bool) : type_(type), u_() { + switch (type) { +#define INIT(p, v) \ + case p##type: \ + u_.p = v; \ + break + INIT(boolean_, false); + INIT(number_, 0.0); +#ifdef PICOJSON_USE_INT64 + INIT(int64_, 0); +#endif + INIT(string_, new std::string()); + INIT(array_, new array()); + INIT(object_, new object()); +#undef INIT + default: + break; + } +} + +inline value::value(bool b) : type_(boolean_type), u_() { + u_.boolean_ = b; +} + +#ifdef PICOJSON_USE_INT64 +inline value::value(int64_t i) : type_(int64_type), u_() { + u_.int64_ = i; +} +#endif + +inline value::value(double n) : type_(number_type), u_() { + if ( +#ifdef _MSC_VER + !_finite(n) +#elif __cplusplus >= 201103L + std::isnan(n) || std::isinf(n) +#else + isnan(n) || isinf(n) +#endif + ) { + throw std::overflow_error(""); + } + u_.number_ = n; +} + +inline value::value(const std::string &s) : type_(string_type), u_() { + u_.string_ = new std::string(s); +} + +inline value::value(const array &a) : type_(array_type), u_() { + u_.array_ = new array(a); +} + +inline value::value(const object &o) : type_(object_type), u_() { + u_.object_ = new object(o); +} + +#if PICOJSON_USE_RVALUE_REFERENCE +inline value::value(std::string &&s) : type_(string_type), u_() { + u_.string_ = new std::string(std::move(s)); +} + +inline value::value(array &&a) : type_(array_type), u_() { + u_.array_ = new array(std::move(a)); +} + +inline value::value(object &&o) : type_(object_type), u_() { + u_.object_ = 
new object(std::move(o)); +} +#endif + +inline value::value(const char *s) : type_(string_type), u_() { + u_.string_ = new std::string(s); +} + +inline value::value(const char *s, size_t len) : type_(string_type), u_() { + u_.string_ = new std::string(s, len); +} + +inline void value::clear() { + switch (type_) { +#define DEINIT(p) \ + case p##type: \ + delete u_.p; \ + break + DEINIT(string_); + DEINIT(array_); + DEINIT(object_); +#undef DEINIT + default: + break; + } +} + +inline value::~value() { + clear(); +} + +inline value::value(const value &x) : type_(x.type_), u_() { + switch (type_) { +#define INIT(p, v) \ + case p##type: \ + u_.p = v; \ + break + INIT(string_, new std::string(*x.u_.string_)); + INIT(array_, new array(*x.u_.array_)); + INIT(object_, new object(*x.u_.object_)); +#undef INIT + default: + u_ = x.u_; + break; + } +} + +inline value &value::operator=(const value &x) { + if (this != &x) { + value t(x); + swap(t); + } + return *this; +} + +#if PICOJSON_USE_RVALUE_REFERENCE +inline value::value(value &&x) PICOJSON_NOEXCEPT : type_(null_type), u_() { + swap(x); +} +inline value &value::operator=(value &&x) PICOJSON_NOEXCEPT { + swap(x); + return *this; +} +#endif +inline void value::swap(value &x) PICOJSON_NOEXCEPT { + std::swap(type_, x.type_); + std::swap(u_, x.u_); +} + +#define IS(ctype, jtype) \ + template <> inline bool value::is() const { \ + return type_ == jtype##_type; \ + } +IS(null, null) +IS(bool, boolean) +#ifdef PICOJSON_USE_INT64 +IS(int64_t, int64) +#endif +IS(std::string, string) +IS(array, array) +IS(object, object) +#undef IS +template <> inline bool value::is() const { + return type_ == number_type +#ifdef PICOJSON_USE_INT64 + || type_ == int64_type +#endif + ; +} + +#define GET(ctype, var) \ + template <> inline const ctype &value::get() const { \ + PICOJSON_ASSERT("type mismatch! call is() before get()" && is()); \ + return var; \ + } \ + template <> inline ctype &value::get() { \ + PICOJSON_ASSERT("type mismatch! 
call is() before get()" && is()); \ + return var; \ + } +GET(bool, u_.boolean_) +GET(std::string, *u_.string_) +GET(array, *u_.array_) +GET(object, *u_.object_) +#ifdef PICOJSON_USE_INT64 +GET(double, + (type_ == int64_type && (const_cast(this)->type_ = number_type, (const_cast(this)->u_.number_ = u_.int64_)), + u_.number_)) +GET(int64_t, u_.int64_) +#else +GET(double, u_.number_) +#endif +#undef GET + +#define SET(ctype, jtype, setter) \ + template <> inline void value::set(const ctype &_val) { \ + clear(); \ + type_ = jtype##_type; \ + setter \ + } +SET(bool, boolean, u_.boolean_ = _val;) +SET(std::string, string, u_.string_ = new std::string(_val);) +SET(array, array, u_.array_ = new array(_val);) +SET(object, object, u_.object_ = new object(_val);) +SET(double, number, u_.number_ = _val;) +#ifdef PICOJSON_USE_INT64 +SET(int64_t, int64, u_.int64_ = _val;) +#endif +#undef SET + +#if PICOJSON_USE_RVALUE_REFERENCE +#define MOVESET(ctype, jtype, setter) \ + template <> inline void value::set(ctype && _val) { \ + clear(); \ + type_ = jtype##_type; \ + setter \ + } +MOVESET(std::string, string, u_.string_ = new std::string(std::move(_val));) +MOVESET(array, array, u_.array_ = new array(std::move(_val));) +MOVESET(object, object, u_.object_ = new object(std::move(_val));) +#undef MOVESET +#endif + +inline bool value::evaluate_as_boolean() const { + switch (type_) { + case null_type: + return false; + case boolean_type: + return u_.boolean_; + case number_type: + return u_.number_ != 0; +#ifdef PICOJSON_USE_INT64 + case int64_type: + return u_.int64_ != 0; +#endif + case string_type: + return !u_.string_->empty(); + default: + return true; + } +} + +inline const value &value::get(const size_t idx) const { + static value s_null; + PICOJSON_ASSERT(is()); + return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null; +} + +inline value &value::get(const size_t idx) { + static value s_null; + PICOJSON_ASSERT(is()); + return idx < u_.array_->size() ? 
(*u_.array_)[idx] : s_null; +} + +inline const value &value::get(const std::string &key) const { + static value s_null; + PICOJSON_ASSERT(is()); + object::const_iterator i = u_.object_->find(key); + return i != u_.object_->end() ? i->second : s_null; +} + +inline value &value::get(const std::string &key) { + static value s_null; + PICOJSON_ASSERT(is()); + object::iterator i = u_.object_->find(key); + return i != u_.object_->end() ? i->second : s_null; +} + +inline bool value::contains(const size_t idx) const { + PICOJSON_ASSERT(is()); + return idx < u_.array_->size(); +} + +inline bool value::contains(const std::string &key) const { + PICOJSON_ASSERT(is()); + object::const_iterator i = u_.object_->find(key); + return i != u_.object_->end(); +} + +inline std::string value::to_str() const { + switch (type_) { + case null_type: + return "null"; + case boolean_type: + return u_.boolean_ ? "true" : "false"; +#ifdef PICOJSON_USE_INT64 + case int64_type: { + char buf[sizeof("-9223372036854775808")]; + SNPRINTF(buf, sizeof(buf), "%" PRId64, u_.int64_); + return buf; + } +#endif + case number_type: { + char buf[256]; + double tmp; + SNPRINTF(buf, sizeof(buf), fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0 ? "%.f" : "%.17g", u_.number_); +#if PICOJSON_USE_LOCALE + char *decimal_point = localeconv()->decimal_point; + if (strcmp(decimal_point, ".") != 0) { + size_t decimal_point_len = strlen(decimal_point); + for (char *p = buf; *p != '\0'; ++p) { + if (strncmp(p, decimal_point, decimal_point_len) == 0) { + return std::string(buf, p) + "." 
+ (p + decimal_point_len); + } + } + } +#endif + return buf; + } + case string_type: + return *u_.string_; + case array_type: + return "array"; + case object_type: + return "object"; + default: + PICOJSON_ASSERT(0); +#ifdef _MSC_VER + __assume(0); +#endif + } + return std::string(); +} + +template void copy(const std::string &s, Iter oi) { + std::copy(s.begin(), s.end(), oi); +} + +template struct serialize_str_char { + Iter oi; + void operator()(char c) { + switch (c) { +#define MAP(val, sym) \ + case val: \ + copy(sym, oi); \ + break + MAP('"', "\\\""); + MAP('\\', "\\\\"); + MAP('/', "\\/"); + MAP('\b', "\\b"); + MAP('\f', "\\f"); + MAP('\n', "\\n"); + MAP('\r', "\\r"); + MAP('\t', "\\t"); +#undef MAP + default: + if (static_cast(c) < 0x20 || c == 0x7f) { + char buf[7]; + SNPRINTF(buf, sizeof(buf), "\\u%04x", c & 0xff); + copy(buf, buf + 6, oi); + } else { + *oi++ = c; + } + break; + } + } +}; + +template void serialize_str(const std::string &s, Iter oi) { + *oi++ = '"'; + serialize_str_char process_char = {oi}; + std::for_each(s.begin(), s.end(), process_char); + *oi++ = '"'; +} + +template void value::serialize(Iter oi, bool prettify) const { + return _serialize(oi, prettify ? 0 : -1); +} + +inline std::string value::serialize(bool prettify) const { + return _serialize(prettify ? 
0 : -1); +} + +template void value::_indent(Iter oi, int indent) { + *oi++ = '\n'; + for (int i = 0; i < indent * INDENT_WIDTH; ++i) { + *oi++ = ' '; + } +} + +template void value::_serialize(Iter oi, int indent) const { + switch (type_) { + case string_type: + serialize_str(*u_.string_, oi); + break; + case array_type: { + *oi++ = '['; + if (indent != -1) { + ++indent; + } + for (array::const_iterator i = u_.array_->begin(); i != u_.array_->end(); ++i) { + if (i != u_.array_->begin()) { + *oi++ = ','; + } + if (indent != -1) { + _indent(oi, indent); + } + i->_serialize(oi, indent); + } + if (indent != -1) { + --indent; + if (!u_.array_->empty()) { + _indent(oi, indent); + } + } + *oi++ = ']'; + break; + } + case object_type: { + *oi++ = '{'; + if (indent != -1) { + ++indent; + } + for (object::const_iterator i = u_.object_->begin(); i != u_.object_->end(); ++i) { + if (i != u_.object_->begin()) { + *oi++ = ','; + } + if (indent != -1) { + _indent(oi, indent); + } + serialize_str(i->first, oi); + *oi++ = ':'; + if (indent != -1) { + *oi++ = ' '; + } + i->second._serialize(oi, indent); + } + if (indent != -1) { + --indent; + if (!u_.object_->empty()) { + _indent(oi, indent); + } + } + *oi++ = '}'; + break; + } + default: + copy(to_str(), oi); + break; + } + if (indent == 0) { + *oi++ = '\n'; + } +} + +inline std::string value::_serialize(int indent) const { + std::string s; + _serialize(std::back_inserter(s), indent); + return s; +} + +template class input { +protected: + Iter cur_, end_; + bool consumed_; + int line_; + +public: + input(const Iter &first, const Iter &last) : cur_(first), end_(last), consumed_(false), line_(1) { + } + int getc() { + if (consumed_) { + if (*cur_ == '\n') { + ++line_; + } + ++cur_; + } + if (cur_ == end_) { + consumed_ = false; + return -1; + } + consumed_ = true; + return *cur_ & 0xff; + } + void ungetc() { + consumed_ = false; + } + Iter cur() const { + if (consumed_) { + input *self = const_cast *>(this); + self->consumed_ = false; 
+ ++self->cur_; + } + return cur_; + } + int line() const { + return line_; + } + void skip_ws() { + while (1) { + int ch = getc(); + if (!(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) { + ungetc(); + break; + } + } + } + bool picojson_expect(const int expected) { + skip_ws(); + if (getc() != expected) { + ungetc(); + return false; + } + return true; + } + bool match(const std::string &pattern) { + for (std::string::const_iterator pi(pattern.begin()); pi != pattern.end(); ++pi) { + if (getc() != *pi) { + ungetc(); + return false; + } + } + return true; + } +}; + +template inline int _parse_quadhex(input &in) { + int uni_ch = 0, hex; + for (int i = 0; i < 4; i++) { + if ((hex = in.getc()) == -1) { + return -1; + } + if ('0' <= hex && hex <= '9') { + hex -= '0'; + } else if ('A' <= hex && hex <= 'F') { + hex -= 'A' - 0xa; + } else if ('a' <= hex && hex <= 'f') { + hex -= 'a' - 0xa; + } else { + in.ungetc(); + return -1; + } + uni_ch = uni_ch * 16 + hex; + } + return uni_ch; +} + +template inline bool _parse_codepoint(String &out, input &in) { + int uni_ch; + if ((uni_ch = _parse_quadhex(in)) == -1) { + return false; + } + if (0xd800 <= uni_ch && uni_ch <= 0xdfff) { + if (0xdc00 <= uni_ch) { + // a second 16-bit of a surrogate pair appeared + return false; + } + // first 16-bit of surrogate pair, get the next one + if (in.getc() != '\\' || in.getc() != 'u') { + in.ungetc(); + return false; + } + int second = _parse_quadhex(in); + if (!(0xdc00 <= second && second <= 0xdfff)) { + return false; + } + uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff); + uni_ch += 0x10000; + } + if (uni_ch < 0x80) { + out.push_back(static_cast(uni_ch)); + } else { + if (uni_ch < 0x800) { + out.push_back(static_cast(0xc0 | (uni_ch >> 6))); + } else { + if (uni_ch < 0x10000) { + out.push_back(static_cast(0xe0 | (uni_ch >> 12))); + } else { + out.push_back(static_cast(0xf0 | (uni_ch >> 18))); + out.push_back(static_cast(0x80 | ((uni_ch >> 12) & 0x3f))); + } + 
out.push_back(static_cast(0x80 | ((uni_ch >> 6) & 0x3f))); + } + out.push_back(static_cast(0x80 | (uni_ch & 0x3f))); + } + return true; +} + +template inline bool _parse_string(String &out, input &in) { + while (1) { + int ch = in.getc(); + if (ch < ' ') { + in.ungetc(); + return false; + } else if (ch == '"') { + return true; + } else if (ch == '\\') { + if ((ch = in.getc()) == -1) { + return false; + } + switch (ch) { +#define MAP(sym, val) \ + case sym: \ + out.push_back(val); \ + break + MAP('"', '\"'); + MAP('\\', '\\'); + MAP('/', '/'); + MAP('b', '\b'); + MAP('f', '\f'); + MAP('n', '\n'); + MAP('r', '\r'); + MAP('t', '\t'); +#undef MAP + case 'u': + if (!_parse_codepoint(out, in)) { + return false; + } + break; + default: + return false; + } + } else { + out.push_back(static_cast(ch)); + } + } + return false; +} + +template inline bool _parse_array(Context &ctx, input &in) { + if (!ctx.parse_array_start()) { + return false; + } + size_t idx = 0; + if (in.picojson_expect(']')) { + return ctx.parse_array_stop(idx); + } + do { + if (!ctx.parse_array_item(in, idx)) { + return false; + } + idx++; + } while (in.picojson_expect(',')); + return in.picojson_expect(']') && ctx.parse_array_stop(idx); +} + +template inline bool _parse_object(Context &ctx, input &in) { + if (!ctx.parse_object_start()) { + return false; + } + if (in.picojson_expect('}')) { + return true; + } + do { + std::string key; + if (!in.picojson_expect('"') || !_parse_string(key, in) || !in.picojson_expect(':')) { + return false; + } + if (!ctx.parse_object_item(in, key)) { + return false; + } + } while (in.picojson_expect(',')); + return in.picojson_expect('}'); +} + +template inline std::string _parse_number(input &in) { + std::string num_str; + while (1) { + int ch = in.getc(); + if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == 'e' || ch == 'E') { + num_str.push_back(static_cast(ch)); + } else if (ch == '.') { +#if PICOJSON_USE_LOCALE + num_str += localeconv()->decimal_point; 
+#else + num_str.push_back('.'); +#endif + } else { + in.ungetc(); + break; + } + } + return num_str; +} + +template inline bool _parse(Context &ctx, input &in) { + in.skip_ws(); + int ch = in.getc(); + switch (ch) { +#define IS(ch, text, op) \ + case ch: \ + if (in.match(text) && op) { \ + return true; \ + } else { \ + return false; \ + } + IS('n', "ull", ctx.set_null()); + IS('f', "alse", ctx.set_bool(false)); + IS('t', "rue", ctx.set_bool(true)); +#undef IS + case '"': + return ctx.parse_string(in); + case '[': + return _parse_array(ctx, in); + case '{': + return _parse_object(ctx, in); + default: + if (('0' <= ch && ch <= '9') || ch == '-') { + double f; + char *endp; + in.ungetc(); + std::string num_str(_parse_number(in)); + if (num_str.empty()) { + return false; + } +#ifdef PICOJSON_USE_INT64 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wtautological-type-limit-compare" + { + errno = 0; + intmax_t ival = strtoimax(num_str.c_str(), &endp, 10); + if (errno == 0 && std::numeric_limits::min() <= ival && ival <= std::numeric_limits::max() && + endp == num_str.c_str() + num_str.size()) { + ctx.set_int64(ival); + return true; + } + } +#pragma clang diagnostic pop +#endif + f = strtod(num_str.c_str(), &endp); + if (endp == num_str.c_str() + num_str.size()) { + ctx.set_number(f); + return true; + } + return false; + } + break; + } + in.ungetc(); + return false; +} + +class deny_parse_context { +public: + bool set_null() { + return false; + } + bool set_bool(bool) { + return false; + } +#ifdef PICOJSON_USE_INT64 + bool set_int64(int64_t) { + return false; + } +#endif + bool set_number(double) { + return false; + } + template bool parse_string(input &) { + return false; + } + bool parse_array_start() { + return false; + } + template bool parse_array_item(input &, size_t) { + return false; + } + bool parse_array_stop(size_t) { + return false; + } + bool parse_object_start() { + return false; + } + template bool parse_object_item(input &, const 
std::string &) { + return false; + } +}; + +class default_parse_context { +protected: + value *out_; + +public: + default_parse_context(value *out) : out_(out) { + } + bool set_null() { + *out_ = value(); + return true; + } + bool set_bool(bool b) { + *out_ = value(b); + return true; + } +#ifdef PICOJSON_USE_INT64 + bool set_int64(int64_t i) { + *out_ = value(i); + return true; + } +#endif + bool set_number(double f) { + *out_ = value(f); + return true; + } + template bool parse_string(input &in) { + *out_ = value(string_type, false); + return _parse_string(out_->get(), in); + } + bool parse_array_start() { + *out_ = value(array_type, false); + return true; + } + template bool parse_array_item(input &in, size_t) { + array &a = out_->get(); + a.push_back(value()); + default_parse_context ctx(&a.back()); + return _parse(ctx, in); + } + bool parse_array_stop(size_t) { + return true; + } + bool parse_object_start() { + *out_ = value(object_type, false); + return true; + } + template bool parse_object_item(input &in, const std::string &key) { + object &o = out_->get(); + default_parse_context ctx(&o[key]); + return _parse(ctx, in); + } + +private: + default_parse_context(const default_parse_context &); + default_parse_context &operator=(const default_parse_context &); +}; + +class null_parse_context { +public: + struct dummy_str { + void push_back(int) { + } + }; + +public: + null_parse_context() { + } + bool set_null() { + return true; + } + bool set_bool(bool) { + return true; + } +#ifdef PICOJSON_USE_INT64 + bool set_int64(int64_t) { + return true; + } +#endif + bool set_number(double) { + return true; + } + template bool parse_string(input &in) { + dummy_str s; + return _parse_string(s, in); + } + bool parse_array_start() { + return true; + } + template bool parse_array_item(input &in, size_t) { + return _parse(*this, in); + } + bool parse_array_stop(size_t) { + return true; + } + bool parse_object_start() { + return true; + } + template bool parse_object_item(input 
&in, const std::string &) { + return _parse(*this, in); + } + +private: + null_parse_context(const null_parse_context &); + null_parse_context &operator=(const null_parse_context &); +}; + +// obsolete, use the version below +template inline std::string parse(value &out, Iter &pos, const Iter &last) { + std::string err; + pos = parse(out, pos, last, &err); + return err; +} + +template inline Iter _parse(Context &ctx, const Iter &first, const Iter &last, std::string *err) { + input in(first, last); + if (!_parse(ctx, in) && err != NULL) { + char buf[64]; + SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line()); + *err = buf; + while (1) { + int ch = in.getc(); + if (ch == -1 || ch == '\n') { + break; + } else if (ch >= ' ') { + err->push_back(static_cast(ch)); + } + } + } + return in.cur(); +} + +template inline Iter parse(value &out, const Iter &first, const Iter &last, std::string *err) { + default_parse_context ctx(&out); + return _parse(ctx, first, last, err); +} + +inline std::string parse(value &out, const std::string &s) { + std::string err; + parse(out, s.begin(), s.end(), &err); + return err; +} + +inline std::string parse(value &out, std::istream &is) { + std::string err; + parse(out, std::istreambuf_iterator(is.rdbuf()), std::istreambuf_iterator(), &err); + return err; +} + +template struct last_error_t { static std::string s; }; +template std::string last_error_t::s; + +inline void set_last_error(const std::string &s) { + last_error_t::s = s; +} + +inline const std::string &get_last_error() { + return last_error_t::s; +} + +inline bool operator==(const value &x, const value &y) { + if (x.is()) + return y.is(); +#define PICOJSON_CMP(type) \ + if (x.is()) \ + return y.is() && x.get() == y.get() + PICOJSON_CMP(bool); + PICOJSON_CMP(double); + PICOJSON_CMP(std::string); + PICOJSON_CMP(array); + PICOJSON_CMP(object); +#undef PICOJSON_CMP + PICOJSON_ASSERT(0); +#ifdef _MSC_VER + __assume(0); +#endif + return false; +} + +inline bool 
operator!=(const value &x, const value &y) { + return !(x == y); +} +} + +#if !PICOJSON_USE_RVALUE_REFERENCE +namespace std { +template <> inline void swap(picojson::value &x, picojson::value &y) { + x.swap(y); +} +} +#endif + +inline std::istream &operator>>(std::istream &is, picojson::value &x) { + picojson::set_last_error(std::string()); + const std::string err(picojson::parse(x, is)); + if (!err.empty()) { + picojson::set_last_error(err); + is.setstate(std::ios::failbit); + } + return is; +} + +inline std::ostream &operator<<(std::ostream &os, const picojson::value &x) { + x.serialize(std::ostream_iterator(os)); + return os; +} +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif diff --git a/src/rgw/rgw-gap-list b/src/rgw/rgw-gap-list new file mode 100755 index 000000000..5018cedd7 --- /dev/null +++ b/src/rgw/rgw-gap-list @@ -0,0 +1,456 @@ +#!/usr/bin/env bash + +# Last revision 2023-01-13 + +# NOTE: This script based based on rgw-orphan-list but doing the +# reverse calculation. + +# NOTE: The awk included in this script replaces the 'ceph-diff-sorted' +# utility but duplicates its functionality. This was done to minimize +# the number of times the massive data set must be iterated to complete +# the task. + +# IMPORTANT: Affects order produced by 'sort'. 
+export LC_ALL=C + +trap "exit 1" TERM +TOP_PID=$$ + +out_dir="$PWD" +timestamp=$(date -u +%Y%m%d%H%M) +lspools_err="${out_dir}/lspools-${timestamp}.error" +rados_out="${out_dir}/rados-${timestamp}.intermediate" +rados_err="${out_dir}/rados-${timestamp}.error" +rgwadmin_out="${out_dir}/radosgw-admin-${timestamp}.intermediate" +rgwadmin_err="${out_dir}/radosgw-admin-${timestamp}.error" +gap_out="${out_dir}/gap-list-${timestamp}.gap" + + +# field separator +# contains ascii 0xFE, designed to be a character that won't appear +# in normal output, can only be a single character due to use in the +# sort command +fs=$(echo -e "\xFE") + +log() { + echo $(date +%F\ %T) $(hostname -s) "$1" +} + +# +# checkReturn RETURNCODE MESSAGE TERMINATE +# RETURNCODE - ( usually $? ) of previous command +# MESSAGE - Message to print on non-zero return code +# TERMINATE - non-empty == terminate the script on non-zero return code +# +checkReturn() { + if [ $1 -ne 0 ]; then + error_addon="" + if [ ! -z "$3" ]; then + error_addon="; Terminating" + fi + log "ERROR: ${2} failed: returned ${1}${error_addon}" + if [ ! -z "$3" ]; then + >&2 echo + >&2 echo '***' + >&2 echo '*** WARNING: The results are incomplete. Do not use! ***' + >&2 echo '***' + kill -s TERM $TOP_PID + fi + fi +} + +prompt_pool() { + # note: all prompts go to stderr so stdout contains just the result + rados lspools >"$temp_file" 2>"$lspools_err" + checkReturn $? "Listing pools" 1 + + >&2 echo "" + >&2 echo "Available pools:" + >&2 sed 's/^/ /' "$temp_file" # list pools and indent + >&2 echo "" + >&2 echo "Which Rados Gateway Data pool do you want to search for gaps? " + >&2 echo "" + >&2 echo "NOTE: If your installation has multiple bucket data pools using " + >&2 echo " bucket placement policies, please enter a space separated " + >&2 echo " list of bucket data pools to enumerate." + >&2 echo "" + local mypool + read mypool + echo $mypool +} + +radosgw_radoslist() { + log "Running 'radosgw-admin bucket radoslist'." 
+ rm -f "$rgwadmin_flag" &> /dev/null + radosgw-admin bucket radoslist --rgw-obj-fs="$fs" >"$rgwadmin_out" 2>"$rgwadmin_err" + RETVAL=$? + if [ "$RETVAL" -ne 0 ] ;then + touch "$rgwadmin_flag" + fi + checkReturn $RETVAL "radosgw-admin radoslist" 1 + log "Completed 'radosgw-admin bucket radoslist'." + + log "Sorting 'radosgw-admin bucket radoslist' output." + sort -T ${temp_prefix} --field-separator="$fs" -k1,1 -u "$rgwadmin_out" > "$rgwadmin_temp" + checkReturn $? "Sorting 'radosgw-admin bucket radoslist' output" 1 + log "Completed sorting 'radosgw-admin bucket radoslist'." + + log "Moving 'radosgw-admin bucket radoslist' output." + mv -f "$rgwadmin_temp" "$rgwadmin_out" + checkReturn $? "Moving 'radosgw-admin bucket radoslist' output" 1 + log "Completed moving 'radosgw-admin bucket radoslist' output." +} + +rados_ls() { + log "Starting 'rados ls' function." + rm -f "$rados_flag" &> /dev/null + rm -f "$rados_out" &> /dev/null + local mypool + for mypool in $pool; do + log "Running 'rados ls' on pool ${mypool}." + rados ls --pool="$mypool" >>"$rados_out" 2>"$rados_err" + RETVAL=$? + if [ "$RETVAL" -ne 0 ] ;then + touch "$rados_flag" + fi + checkReturn $RETVAL "'rados ls' on pool ${mypool}" 1 + log "Completed 'rados ls' on pool ${mypool}." + done + if [ ! -e "$rados_flag" ]; then + log "Sorting 'rados ls' output(s)." + sort -T ${temp_prefix} -u "$rados_out" >"$rados_temp" + checkReturn $? "Sorting 'rados ls' output(s)" 1 + + log "Moving sorted output(s)." + mv -f "$rados_temp" "$rados_out" + checkReturn $? "Moving temp file to output file" 1 + log "Sorting 'rados ls' output(s) complete." + fi +} + +usage() { + >&2 cat << EOF + +WARNING WARNING WARNING WARNING WARNING WARNING WARNING +WARNING: +WARNING: Command option format has changed. Please check closely. +WARNING: +WARNING WARNING WARNING WARNING WARNING WARNING WARNING + +Usage: $0 [-m] [-p ] [-t ] + +Where: + -m Optionally, run the two listings in multiple threads. 
+                 --See NOTE below--
+
+   -p <pool>     The RGW bucket data pool name, if omitted, pool name
+                 will be prompted for during execution.
+                 Multiple pools can be supplied as a space separated
+                 double quoted list.
+
+   -t <temp_dir> Optionally, set the directory to use for temp space.
+                 This may be required if /tmp is low on space.
+
+NOTE: This tool is currently considered to be EXPERIMENTAL.
+
+NOTE: False positives are possible. False positives would likely
+      appear as objects that were never deleted and are fully
+      intact. All results should therefore be verified.
+
+NOTE: Multithread listing may increase performance but may also increase
+      the risk of false positives when the cluster is undergoing
+      modifications during the listing processes. In addition to the
+      above, false positives might also include objects that were
+      intentionally deleted.
+
+EOF
+  exit 1
+}
+
+# Parse command-line options. The leading ':' in the optstring selects
+# getopts "silent" error reporting: for an unknown option $o is '?', for
+# a missing option argument $o is ':', and in both cases OPTARG holds the
+# offending option letter.
+multithread=0
+error=0
+temp_prefix="/tmp"
+while getopts ":mp:t:" o; do
+  case "${o}" in
+    m)
+      multithread=1
+      ;;
+    p)
+      pool=${OPTARG}
+      ;;
+    t)
+      # quoted so that directories containing whitespace are handled
+      if [ -d "${OPTARG}" ]; then
+        temp_prefix=${OPTARG}
+      else
+        echo
+        echo "ERROR: Temporary directory does not exist: ${OPTARG}"
+        error=1
+      fi
+      ;;
+    :)
+      echo
+      echo "ERROR: Option -${OPTARG} requires an argument"
+      error=1
+      ;;
+    *)
+      # report the option the user typed, not getopts' '?' placeholder
+      echo
+      echo "ERROR: Unrecognized argument: -${OPTARG}"
+      error=1
+      ;;
+  esac
+done
+shift $((OPTIND-1))
+
+# Per-run scratch files; the PID suffix keeps concurrent runs apart.
+temp_file=${temp_prefix}/gap-tmp.$$
+rados_temp=${temp_prefix}/rados-tmp.$$
+rgwadmin_temp=${temp_prefix}/radosgw-admin-tmp.$$
+rados_flag=${temp_prefix}/rados-flag.$$
+rgwadmin_flag=${temp_prefix}/radosgw-admin-flag.$$
+incremental_grep_awk="${temp_prefix}/ig-${$}.awk"
+
+if [ $error -gt 0 ]; then
+  usage
+fi
+
+if [ -z "$pool" ]; then
+  pool="$(prompt_pool)"
+fi
+
+# Verify that every requested pool exists before doing any heavy work.
+error=0
+rados ${CEPH_ARGS} lspools > "${temp_file}"
+checkReturn $? "rados lspools" 1
+for mypool in $pool; do
+  # -F -x: compare the whole line literally; pool names normally contain
+  # '.' which would otherwise act as a regex wildcard
+  if [ $(grep -Fxc -- "${mypool}" "${temp_file}") -eq 0 ]; then
+    echo
+    echo "ERROR: Supplied pool does not exist: ${mypool}"
+    error=1
+  fi
+done
+
+if [ $error -gt 0 ]; then
+  exit 1
+fi
+
+log "Pool is \"$pool\"."
+log "Note: output files produced will be tagged with the current timestamp -- ${timestamp}."
+
+# Produce the two listings, either concurrently (-m) or one after the
+# other. Each listing function touches its flag file on failure:
+# $rados_flag for 'rados ls', $rgwadmin_flag for 'radosgw-admin'.
+if [ $multithread -eq 1 ] ;then
+  startsecs=$(date +%s)
+  log "Starting multithread tasks..."
+  rados_ls &
+  radosgw_radoslist &
+  jobs &> /dev/null # without this, the myjobs count always equals 1 (confused)
+  myjobs=$(jobs | wc -l)
+  while [ $myjobs -gt 0 ]; do
+    # provide minutely status update
+    if [ $(( ($(date +%s)-$startsecs) % 60 )) -eq 0 ]; then
+      echo
+      deltasecs=$(( $(date +%s)-$startsecs ))
+      log "Waiting for listing tasks to complete. Running ${myjobs} tasks for ${deltasecs} seconds."
+    fi
+    sleep 1
+    echo -n .
+    # the flag variable is $rgwadmin_flag; the previously tested
+    # $rgw_admin_flag was never defined, so this check could never fire
+    if [ -e "$rgwadmin_flag" ]; then
+      exit 1
+    fi
+    if [ -e "$rados_flag" ]; then
+      exit 2
+    fi
+    jobs &> /dev/null # without this, the myjobs count always equals 1 (confused)
+    myjobs=$(jobs | wc -l)
+  done
+  echo
+else
+  rados_ls
+  radosgw_radoslist
+fi
+
+# exit status 1: radosgw-admin listing failed; 2: rados listing failed
+if [ -e "$rgwadmin_flag" ]; then
+  exit 1
+fi
+
+if [ -e "$rados_flag" ]; then
+  exit 2
+fi
+
+# An empty listing means the comparison below would be meaningless.
+for myfile in "$rados_out" "$rgwadmin_out"; do
+  if [ ! -s "${myfile}" ]; then
+    log "ERROR: Empty file detected: ${myfile}"
+    log "ERROR: RESULTS ARE INCOMPLETE - DO NOT USE"
+    exit 1
+  fi
+done
+
+# Create an awk script in a file for parsing the two command outputs.
+log "Creating awk script for comparing outputs: ${incremental_grep_awk}"
+
+cat <<"EOF" >"$incremental_grep_awk"
+# This awk script is used by rgw-gap-list and will sequence through
+# each line in $rados_out and $rgwadmin_out exactly once.
+#
+# During this iteration:
+#  * The 1st column of $rgwadmin_out is compared to the line of
+#    $rados_out.
+#  * If they are equal, the next line of $rados_out is read in and the
+#    next line of $rgwadmin_out is provided via normal awk iteration.
+#  * If a value appears in $rgwadmin_out, but not $rados_out, this
+#    indicates a possible deleted tail object and the accompanying
+#    bucket / user object name is output, assuming it had not been
+#    previously identified.
+# - A map of outputed bucket / user object is maintained in memory +# * If a value appears in $rados_out, but not in $rgwadmin_out, the +# $rados_out file is iterated until the $rados_out line is equal +# or > (alphabetically) the value from the $rgwadmin_out file. + +function usage() { + print "Example Usage:">>"/dev/stderr" + print " # limit $fs to single char that will not appear in either output">>"/dev/stderr" + print " # The below is Octal 376, or Hex 0xFE">>"/dev/stderr" + print "">>"/dev/stderr" + print " $ fs=$(echo -e \"\\0376\") ">>"/dev/stderr" + print " $ rados ls -p default.rgw.buckets.data > rados_out.txt">>"/dev/stderr" + print " $ radosgw-admin bucket radoslist --rgw-obj-fs=\"$fs\" \\">>"/dev/stderr" + print " | sort --field-separator=\"$fs\" -k 1,1 > rgwadmin_out.txt">>"/dev/stderr" + print " ">>"/dev/stderr" + print " $ awk -F \"$fs\" \\">>"/dev/stderr" + print " -v filetwo=rados_out.txt \\">>"/dev/stderr" + print " -v map_out=MappedOutput.txt \\">>"/dev/stderr" + print " -f ig_awk \\">>"/dev/stderr" + print " rgwadmin_out.txt">>"/dev/stderr" + print "">>"/dev/stderr" + print " Result will be provided in the 'MappedOutput.txt' file in this">>"/dev/stderr" + print " example. If you'd prefer the output to be sorted, you can run">>"/dev/stderr" + print " $ sort MappedOutput.txt > SortedMappedOutput.txt">>"/dev/stderr" + print "">>"/dev/stderr" + print "">>"/dev/stderr" + exit 1 +} + +function get_date_time() { + dtstr="date +%F\\ %T" + dtstr | getline mydt + close(dtstr) + return mydt +} + +function status_out() { + printf("%s % 17d\t% 17d\t% 12d\n",get_date_time(),f1_count,f2_count,lineoutCount)>>"/dev/stderr" +} + +function advance_f2() { + if ((getline f2line>map_out + lastline=$2" "$NF + lineoutCount++ + } +} + +BEGIN { + if(filetwo==""||map_out=="") { + print "">>"/dev/stderr" + print "">>"/dev/stderr" + print "Missing parameter." 
+ print "">>"/dev/stderr" + print "">>"/dev/stderr" + usage() + } + status_delta=100000 + f1_count=0 + f2_count=0 + advance_f2() + printf("%s File 1 Line Count\tFile 2 Line Count\tPotentially Impacted Objects\n",get_date_time())>>"/dev/stderr" + for(n=0;n<256;n++) { + ord[sprintf("%c",n)]=n + } +} + +{ + f1_count++ + if(f2_eof==0) { + if(test_lines()==2) { + while ($1>b[1]) { + advance_f2() + } + test_lines() + } + } else { + # If EOF hit, dump all remaining lines since they're missing + # from filetwo + line_out() + } + if((f1_count % status_delta)==0) { + status_out() + } +} + +END { + if(f1_count>0) { + status_out() + } +} + +EOF + + +log "Begin identifying potentially impacted user object names." + +echo -n > "$temp_file" # Ensure the file is empty +awk -F "$fs" -v filetwo=$rados_out -v map_out=$temp_file -f $incremental_grep_awk $rgwadmin_out +checkReturn $? "Identifying potentially impacted user object names" 1 + +log "Begin sorting results." +sort -T ${temp_prefix} "$temp_file" > "$gap_out" +checkReturn $? "sorting results" 1 +rm -f "$temp_file" + +found=$(wc -l < "$gap_out") +mydate=$(date +%F\ %T) + +log "Done." + +cat << EOF + +Found $found *possible* gaps. +The results can be found in "${gap_out}". + +Intermediate files: "${rados_out}" and "${rgwadmin_out}". + +*** +*** WARNING: This is EXPERIMENTAL code and the results should be used +*** with CAUTION and VERIFIED. Not everything listed is an +*** actual gap. EXPECT false positives. Every result +*** produced should be verified. +*** +EOF diff --git a/src/rgw/rgw-gap-list-comparator b/src/rgw/rgw-gap-list-comparator new file mode 100755 index 000000000..c377fdaf8 --- /dev/null +++ b/src/rgw/rgw-gap-list-comparator @@ -0,0 +1,119 @@ +#!/usr/bin/awk -f + +# +# Version 1 +# +# This awk script takes two, similarly sorted lists and outputs +# only the lines which exist in both lists. 
The script takes +# three inputs: +# +# ./rgw-gap-list-comparator \ +# -v filetwo=gap-list-B.txt \ +# -v matchout=matched_lines.txt \ +# gap-list-A.txt +# + +function usage() { + print "">>"/dev/stderr" + print "">>"/dev/stderr" + print "The idea behind the script is to eliminate false positive hits">>"/dev/stderr" + print "from the rgw-gap-list tool which are due to upload timing of new">>"/dev/stderr" + print "objects during the tool's execution. To use the tool properly,">>"/dev/stderr" + print "the following process should be followed:">>"/dev/stderr" + print "">>"/dev/stderr" + print "">>"/dev/stderr" + print " 1: Run the 'rgw-gap-list' tool twice">>"/dev/stderr" + print "">>"/dev/stderr" + print " 2: Sort the resulting map files:">>"/dev/stderr" + print " $ export LC_ALL=C">>"/dev/stderr" + print " $ sort gap-list-A.gap > gap-list-A.sorted.gap">>"/dev/stderr" + print " $ sort gap-list-B.gap > gap-list.B.sorted.gap">>"/dev/stderr" + print " -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr" + print "">>"/dev/stderr" + print " 3: Run the 'same_lines_only.awk' script over the two files:">>"/dev/stderr" + print " $ rm matched_lines.txt">>"/dev/stderr" + print " $ ./rgw-gap-list-comparator -v filetwo=gap-list-B.sorted.gap -v matchout=matched_lines.txt gap-list-A.sorted.gap">>"/dev/stderr" + print " -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr" + print "">>"/dev/stderr" + print " The resulting 'matched_lines.txt' will be a high confidence list of impacted objects with little to no false positives.">>"/dev/stderr" + print "">>"/dev/stderr" + print "">>"/dev/stderr" + exit 1 +} + +function advance_f2() { + if ((getline f2line>matchout + lineoutcount++ + advance_f2() + return 0 + } else if ($0>f2line) { + return 2 + } else { + return 1 + } +} + +function status_out() { + printf("%s % 
17d\t% 17d\t% 12d\n",get_date_time(),f1_count,f2_count,lineoutcount)>>"/dev/stderr" +} + +function get_date_time() { + dtstr="date +%F\\ %T" + dtstr | getline mydt + close(dtstr) + return mydt +} + +BEGIN { + if(filetwo==""||matchout=="") { + print "">>"/dev/stderr" + print "">>"/dev/stderr" + print "Missing parameter." + print "">>"/dev/stderr" + print "">>"/dev/stderr" + usage() + } + + f1_count=0 + f2_count=0 + lineoutcount=0 + f2_eof=0 + statusevery=100000 + advance_f2() + printf("%s File 1 Line Count\tFile 2 Line Count\tPotentially Impacted Objects\n",get_date_time())>>"/dev/stderr" + status_out() +} + + +{ + f1_count++ + if(f2_eof==0) { + if(test_lines()==2) { + while($0>f2line && f2_eof==0) { + advance_f2() + } + test_lines() + } + } else { + exit 0 + } + if ((f1_count % statusevery)==0) { + status_out() + } +} + +END { + if(f1_count>0) { + status_out() + } +} + diff --git a/src/rgw/rgw-orphan-list b/src/rgw/rgw-orphan-list new file mode 100755 index 000000000..c8856e8ee --- /dev/null +++ b/src/rgw/rgw-orphan-list @@ -0,0 +1,278 @@ +#!/usr/bin/env bash + +# version 2023-01-11 + +# IMPORTANT: affects order produced by 'sort' and 'ceph-diff-sorted' +# relies on this ordering +export LC_ALL=C + +# If your ceph.conf is not in /etc/ceph, then set CEPH_CONF="-c /path/to/ceph.conf" + +trap "exit 1" TERM +TOP_PID=$$ + +out_dir="." 
+timestamp=$(date -u +%Y%m%d%H%M%S) +lspools_err="${out_dir}/lspools-${timestamp}.error" +rados_out="${out_dir}/rados-${timestamp}.intermediate" +rados_odd="${out_dir}/rados-${timestamp}.issues" +rados_err="${out_dir}/rados-${timestamp}.error" +rgwadmin_out="${out_dir}/radosgw-admin-${timestamp}.intermediate" +rgwadmin_err="${out_dir}/radosgw-admin-${timestamp}.error" +delta_out="${out_dir}/orphan-list-${timestamp}.out" + +log() { + echo $(date +%F\ %T) $(hostname -s) "$1" +} + +usage() { + >&2 cat << EOF + +Usage: $0 [-h] "" [] + +Where: + -h This help output + The RGW data pool name, if omitted, pool name will be + prompted for during execution. + If specifying multiple pools, please use space separated + list and wrap the entire list in quotes. + + Optionally, set the directory to use for temp space. + This may be required if /tmp is low on space. + +NOTES: + - This tool should be ran on a node with ceph-radosgw package installed. + Specifically, it needs the 'ceph-diff-tool' command from that package. + + - This tool is currently considered to be EXPERIMENTAL. + + - False positives are possible. False positives would likely + appear as objects that were never deleted and are fully + intact. All results should therefore be verified. + +WARNING: + - Indexless buckets will appear as 100% orphan objects. + - Therefore, this tool MUST NOT be used in environments with indexless + buckets. + +EOF + exit 1 +} + +# +# checkReturn RETURNCODE MESSAGE TERMINATE +# RETURNCODE - ( usually $? ) of previous command +# MESSAGE - Message to print on non-zero return code +# TERMINATE - non-empty == terminate the script on non-zero return code +# +checkReturn() { + if [ $1 -ne 0 ]; then + error_addon="" + if [ ! -z "$3" ]; then + error_addon="; Terminating" + fi + log "ERROR: ${2} failed: returned ${1}${error_addon}" + if [ ! -z "$3" ]; then + >&2 echo + >&2 echo '***' + >&2 echo '*** WARNING: The results are incomplete. Do not use! 
***' + >&2 echo '***' + kill -s TERM $TOP_PID + fi + fi +} + +prompt_pool() { + # note: all prompts go to stderr so stdout contains just the result + >&2 echo "Available pools:" + rados ${CEPH_CONF} lspools >"$temp_file" 2>"$lspools_err" + checkReturn $? "Listing pools failed" 1 + + >&2 sed 's/^/ /' "$temp_file" # list pools and indent + >&2 printf "Which pool do you want to search for orphans (for multiple, use space-separated list)? " + local mypool + read mypool + echo $mypool +} + +radosgw_radoslist() { + log "Running 'radosgw-admin bucket radoslist'." + rm -f "$rgwadmin_flag" &> /dev/null + radosgw-admin ${CEPH_CONF} bucket radoslist >"$rgwadmin_out" 2>"$rgwadmin_err" + RETVAL=$? + if [ "$RETVAL" -ne 0 ] ;then + touch "$rgwadmin_flag" + fi + checkReturn $RETVAL "radosgw-admin radoslist" 1 + log "Completed 'radosgw-admin bucket radoslist'." + + log "Sorting 'radosgw-admin bucket radoslist' output." + sort -T ${temp_prefix} -u "$rgwadmin_out" > "$rgwadmin_temp" + checkReturn $? "Sorting 'radosgw-admin bucket radoslist' output" 1 + log "Completed sorting 'radosgw-admin bucket radoslist'." + + log "Moving 'radosgw-admin bucket radoslist' output." + mv -f "$rgwadmin_temp" "$rgwadmin_out" + checkReturn $? "Moving 'radosgw-admin bucket radoslist' output" 1 + log "Completed moving 'radosgw-admin bucket radoslist' output." +} + +rados_ls() { + log "Starting 'rados ls' function." + rm -f "$rados_flag" &> /dev/null + rm -f "$rados_out" &> /dev/null + local mypool + for mypool in $pool; do + log "Running 'rados ls' on pool ${mypool}." + rados ${CEPH_CONF} ls --pool="$mypool" --all >>"$rados_out" 2>"$rados_err" + RETVAL=$? + if [ "$RETVAL" -ne 0 ] ;then + touch "$rados_flag" + fi + checkReturn $RETVAL "'rados ls' on pool ${mypool}" 1 + log "Completed 'rados ls' on pool ${mypool}." + done + if [ ! 
-e "$rados_flag" ]; then + # NOTE: Each entry (line of output) of `rados ls --all` should be in + # one of four formats depending on whether or not an entry has a + # namespace and/or locator: + # + # <TAB>oid + # <TAB>oid<TAB>locator + # namespace<TAB>oid + # namespace<TAB>oid<TAB>locator + # + # Any occurrences of the 2nd, 3rd, or 4th (i.e., existence of + # namespace and/or locator) should cause the creation of the "odd" file + # and an explanation in the output, and those entries will not be + # retained, and therefore they will not be called out as orphans. They + # will need special handling by the end-user as we do not expect + # namespaces or locators. + + # check for namespaces -- any line that does not begin with a tab + # indicates a namespace; add those to "odd" file and set flag; note: + # this also picks up entries with namespace and locator + log "Checking for namespaces" + grep --text $'^[^\t]' "$rados_out" >"$rados_odd" + if [ "${PIPESTATUS[0]}" -eq 0 ] ;then + log "Namespaces found" + namespace_found=1 + fi + + # check for locators (w/o namespace); we identify them by skipping + # past the empty namespace (i.e., one TAB), skipping past the oid, + # then looking for a TAB; note we use egrep to get the '+' character + # and the $ in front of the ' allows the \t to be interpreted as a TAB + log "Checking for locators" + egrep --text $'^\t[[:graph:]]+\t' "$rados_out" >>"$rados_odd" + if [ "${PIPESTATUS[0]}" -eq 0 ] ;then + log "Locator found" + locator_found=1 + fi + + # extract the entries that are just oids (i.e., no namespace or + # locator) for further processing; only look at lines that begin with + # a TAB and do not contain a second TAB, and then grab everything + # after the initial TAB + log "Generating final 'rados ls' output (without namespaces or locators)" + grep --text $'^\t' "$rados_out" | grep --text -v $'^\t.*\t' | sed -E 's/^\t//' >"$temp_file" + mv -f "$temp_file" "$rados_out" + + log "Sorting 'rados ls' output(s)."
+    sort -T "${temp_prefix}" -u "$rados_out" >"$temp_file"
+    checkReturn $? "Sorting 'rados ls' output(s)" 1
+    log "Sorting 'rados ls' output(s) complete."
+
+    log "Moving sorted output(s)."
+    mv -f "$temp_file" "$rados_out"
+    checkReturn $? "Moving temp file to output file" 1
+  fi
+}
+
+# Optional second argument: directory to use for temporary files.
+# usage() exits, so a bad directory never reaches the code below.
+temp_prefix="/tmp"
+if [ ! -z "$2" ]; then
+  if [ -d "$2" ]; then
+    temp_prefix="$2"
+  else
+    echo
+    echo "ERROR: Provided temp directory does not exist: ${2}"
+    usage
+  fi
+fi
+temp_file=${temp_prefix}/temp.$$
+rados_flag=${temp_prefix}/rados_flag.$$
+rgwadmin_flag=${temp_prefix}/rgwadmin_flag.$$
+rgwadmin_temp=${temp_prefix}/rgwadmin_temp.$$
+
+# First argument: pool name(s), or -h for help; prompt when absent.
+if [ $# -eq 0 ] ;then
+  pool="$(prompt_pool)"
+else
+  if [ "$1" == "-h" ]; then
+    usage
+  fi
+  pool="$1"
+fi
+
+# Verify that every requested pool exists before doing any heavy work.
+error=0
+rados ${CEPH_CONF} lspools > "$temp_file"
+for mypool in $pool; do
+  # -F -x: compare the whole line literally; pool names normally contain
+  # '.' which would otherwise act as a regex wildcard
+  if [ $(grep -Fxc -- "${mypool}" "${temp_file}") -eq 0 ]; then
+    echo
+    echo "ERROR: Supplied pool does not exist: ${mypool}"
+    error=1
+  fi
+done
+if [ $error -gt 0 ]; then
+  echo "Terminating"
+  exit 1
+fi
+
+log "Pool is \"$pool\"."
+log "Note: output files produced will be tagged with the current timestamp -- ${timestamp}."
+
+rados_ls
+radosgw_radoslist
+
+#
+# Check for any empty output files
+#
+
+for myfile in "$rados_out" "$rgwadmin_out"; do
+  if [ ! -s "${myfile}" ]; then
+    log "ERROR: Empty file detected: ${myfile}"
+    log "ERROR: RESULTS ARE INCOMPLETE - DO NOT USE"
+    exit 1
+  fi
+done
+
+log "Computing delta..."
+ceph-diff-sorted "$rados_out" "$rgwadmin_out" | grep --text "^<" | sed 's/^< *//' >"$delta_out"
+# use PIPESTATUS to get at exit status of first process in above pipe;
+# 0 means same, 1 means different, >1 means error.
+# It must be saved immediately: the '[' test below is itself a command
+# and overwrites PIPESTATUS, so re-reading it in the log line would
+# always report 0.
+diff_status=${PIPESTATUS[0]}
+if [ "$diff_status" -gt 1 ] ;then
+  log "ERROR: ceph-diff-sorted failed with status: ${diff_status}"
+  log "TERMINATING - Results are incomplete - DO NOT USE"
+  exit 1
+fi
+
+log "Computing results..."
+found=$(wc -l < "$delta_out") +possible=$(wc -l < "$rados_out") +percentage=0 +if [ $possible -ne 0 ] ;then + percentage=$(expr 100 \* $found / $possible) +fi + +echo "$found potential orphans found out of a possible $possible (${percentage}%)." +echo "The results can be found in '${delta_out}'." +echo " Intermediate files are '${rados_out}' and '${rgwadmin_out}'." +if [ -n "$namespace_found" -o -n "$locator_found" ] ;then + echo " Note: 'rados ls' found entries that might be in a namespace or might" + echo " have a locator; see '${rados_odd}' for those entries." +fi +echo "***" +echo "*** WARNING: This is EXPERIMENTAL code and the results should be used" +echo "*** only with CAUTION!" +echo "***" +echo "Done at $(date +%F\ %T)." diff --git a/src/rgw/rgw-restore-bucket-index b/src/rgw/rgw-restore-bucket-index new file mode 100755 index 000000000..056658119 --- /dev/null +++ b/src/rgw/rgw-restore-bucket-index @@ -0,0 +1,250 @@ +#!/usr/bin/env bash + +# version 2023-03-21 + +# rgw-restore-bucket-index is an EXPERIMENTAL tool to use in case +# bucket index entries for objects in the bucket are somehow lost. It +# is expected to be needed and used rarely. A bucket name is provided +# and the data pool for that bucket is scanned for all head objects +# matching the bucket's marker. The rgw object name is then extracted +# from the rados object name, and `radosgw-admin bucket reindex ...` +# is used to add the bucket index entry. +# +# Because this script must process json objects, the `jq` tool must be +# installed on the system. +# +# Usage: $0 [--proceed] [data-pool-name] +# +# This tool is designed to be interactive, allowing the user to +# examine the list of objects to be reindexed before +# proceeding. However, if the "--proceed" option is provided, the +# script will not prompt the user and simply proceed. 
+
+# super_exit() signals the top-level shell with TERM; this trap turns
+# that into temp-file cleanup and a failing exit status.
+trap "clean ; exit 1" TERM
+export TOP_PID=$$
+
+# IMPORTANT: affects order produced by 'sort' and 'ceph-diff-sorted'
+# relies on this ordering
+export LC_ALL=C
+
+# temporary files, made unique per run by the PID suffix
+export bkt_entry=/tmp/rgwrbi-bkt-entry.$$
+export bkt_inst=/tmp/rgwrbi-bkt-inst.$$
+export bkt_inst_new=/tmp/rgwrbi-bkt-inst-new.$$
+export obj_list=/tmp/rgwrbi-object-list.$$
+export zone_info=/tmp/rgwrbi-zone-info.$$
+# set to empty to keep the temporary files for debugging
+export clean_temps=1
+
+# number of seconds for a bucket index pending op to be completed via
+# dir_suggest mechanism
+pending_op_secs=120
+
+# make sure radosgw-admin is available; 'command -v' is the POSIX way
+# to probe for a command and does not depend on 'which' being installed
+if ! command -v radosgw-admin > /dev/null 2>&1 ;then
+  echo 'Error: must have command `radosgw-admin` installed and on $PATH for operation.'
+  exit 1
+fi
+
+# make sure jq is available
+if ! command -v jq > /dev/null 2>&1 ;then
+  echo 'Error: must have command `jq` installed and on $PATH for json parsing.'
+  exit 1
+fi
+
+# Removes the temporary files unless clean_temps has been emptied.
+clean() {
+  if [ -n "$clean_temps" ] ;then
+    rm -f "$bkt_entry" "$bkt_inst" "$bkt_inst_new" "$obj_list" "$zone_info"
+  fi
+}
+
+# Terminates the whole script, even when called from a subshell such
+# as a command substitution, by signalling the top-level shell.
+super_exit() {
+  kill -s TERM $TOP_PID
+}
+
+# Prints the usage text to stderr and terminates the script.
+usage() {
+  >&2 cat << EOF
+
+Usage: $0 [--proceed] <bucket-name> [data-pool-name]
+  NOTE: This tool is currently considered EXPERIMENTAL.
+  NOTE: If a data-pool-name is not supplied then it will be inferred from bucket and zone information.
+  NOTE: If --proceed is provided then user will not be prompted to proceed. Use with caution.
+EOF
+  super_exit
+}
+
+# strips the starting and ending double quotes from a string, so:
+#   "dog" -> dog
+#   "dog -> "dog
+#   d"o"g -> d"o"g
+#   "do"g" -> do"g
+strip_quotes() {
+  echo "$1" | sed 's/^"\(.*\)"$/\1/'
+}
+
+# Determines the name of the data pool. Expects the optional
+# command-line argument to appear as $1 if there is one. The
+# command-line has the highest priority, then the "explicit_placement"
+# in the bucket instance data, and finally the "placement_rule" in the
+# bucket instance data.
+get_pool() {
+  # NOTE: this function is invoked through command substitution, so
+  # 'exit' only leaves that subshell, stdout is the return value, and
+  # diagnostics must go to stderr to be seen.
+
+  # command-line
+  if [ -n "$1" ] ;then
+    echo "$1"
+    exit 0
+  fi
+
+  # explicit_placement
+  expl_pool=$(strip_quotes "$(jq '.data.bucket_info.bucket.explicit_placement.data_pool' "$bkt_inst")")
+  if [ -n "$expl_pool" ] ;then
+    echo "$expl_pool"
+    exit 0
+  fi
+
+  # placement_rule has the form "<placement-id>[/<storage-class>]"
+  plmt_rule=$(strip_quotes "$(jq '.data.bucket_info.placement_rule' "$bkt_inst")")
+  plmt_pool=$(echo "$plmt_rule" | awk -F / '{print $1}')
+  plmt_class=$(echo "$plmt_rule" | awk -F / '{print $2}')
+  if [ -z "$plmt_class" ] ;then
+    plmt_class=STANDARD
+  fi
+
+  # NOTE(review): 'contains' is a substring match, so a placement id
+  # that is a prefix of another could select more than one entry --
+  # confirm whether an exact match on .key is intended.
+  radosgw-admin zone get >"$zone_info" 2>/dev/null
+  pool=$(strip_quotes "$(jq ".placement_pools [] | select(.key | contains(\"${plmt_pool}\")) .val .storage_classes.${plmt_class}.data_pool" "$zone_info")")
+
+  if [ -z "$pool" ] ;then
+    # stderr: stdout is captured by the caller and would hide the message
+    >&2 echo ERROR: unable to determine pool.
+    super_exit
+  fi
+  echo "$pool"
+}
+
+# "$1" is quoted so the test stays well-formed when no arguments are
+# given (the argument-count check below then prints the usage)
+if [ "$1" == "--proceed" ] ;then
+  echo "NOTICE: This tool is currently considered EXPERIMENTAL."
+  proceed=1
+  shift
+fi
+
+# expect 1 or 2 arguments
+if [ $# -eq 0 -o $# -gt 2 ] ;then
+  usage
+fi
+
+bucket=$1
+
+# read bucket entry metadata
+radosgw-admin metadata get "bucket:$bucket" >"$bkt_entry" 2>/dev/null
+marker=$(strip_quotes "$(jq ".data.bucket.marker" "$bkt_entry")")
+bucket_id=$(strip_quotes "$(jq ".data.bucket.bucket_id" "$bkt_entry")")
+if [ -z "$marker" -o -z "$bucket_id" ] ;then
+  echo "ERROR: unable to read entry-point metadata for bucket \"$bucket\"."
+  clean
+  exit 1
+fi
+
+echo marker is $marker
+echo bucket_id is $bucket_id
+
+# read bucket instance metadata
+radosgw-admin metadata get "bucket.instance:${bucket}:$bucket_id" >"$bkt_inst" 2>/dev/null
+
+# handle versioned buckets
+bkt_flags=$(jq ".data.bucket_info.flags" "$bkt_inst")
+if [ -z "$bkt_flags" ] ;then
+  echo "ERROR: unable to read instance metadata for bucket \"$bucket\"."
+  # remove the temp files here too, as the other error exits do
+  clean
+  exit 1
+fi
+
+# mask bit indicating it's a versioned bucket
+is_versioned=$(( $bkt_flags & 2))
+if [ "$is_versioned" -ne 0 ] ;then
+  echo "Error: this bucket appears to be versioned, and this tool cannot work with versioned buckets."
+  clean
+  exit 1
+fi
+
+# examine number of bucket index shards
+num_shards=$(jq ".data.bucket_info.num_shards" "$bkt_inst")
+echo number of bucket index shards is $num_shards
+
+# determine data pool
+pool=$(get_pool $2)
+echo data pool is $pool
+
+# search the data pool for all of the head objects that begin with the
+# marker that are not in namespaces (indicated by an extra underscore)
+# and then strip away all but the rgw object name
+( rados -p "$pool" ls | grep "^${marker}_[^_]" | sed "s/^${marker}_\(.*\)/\1/" >"$obj_list" ) 2>/dev/null
+
+# handle the case where the resulting object list file is empty
+if [ -s "$obj_list" ] ;then
+  :
+else
+  echo "NOTICE: No head objects for bucket \"$bucket\" were found in pool \"$pool\", so nothing was recovered."
+  clean
+  exit 0
+fi
+
+if [ -z "$proceed" ] ;then
+  # warn user and get permission to proceed
+  echo "NOTICE: This tool is currently considered EXPERIMENTAL."
+  echo "The list of objects that we will attempt to restore can be found in \"$obj_list\"."
+  echo "Please review the object names in that file (either below or in another window/terminal) before proceeding."
+  while true ; do
+    # a failed read means stdin is closed; without this guard the loop
+    # would repeat the prompt forever
+    read -p "Type \"proceed!\" to proceed, \"view\" to view object list, or \"q\" to quit: " action || { echo ; clean ; exit 1 ; }
+    if [ "$action" == "q" ] ;then
+      echo "Exiting..."
+      clean
+      exit 0
+    elif [ "$action" == "view" ] ;then
+      echo "Viewing..."
+      less "$obj_list"
+    elif [ "$action" == "proceed!" ] ;then
+      echo "Proceeding..."
+      break
+    else
+      echo "Error: response \"$action\" is not understood."
+ fi + done +fi + +# execute object rewrite on all of the head objects +radosgw-admin object reindex --bucket=$bucket --objects-file=$obj_list 2>/dev/null +reindex_done=$(date +%s) + +# note: large is 2^30 +export large=1073741824 + +listcmd="radosgw-admin bucket list --bucket=$bucket --allow-unordered --max-entries=$large" + +if [ -n "$proceed" ] ;then + sleep $pending_op_secs + $listcmd >/dev/null 2>/dev/null +else + echo "NOTICE: Bucket stats are currently incorrect. They can be restored with the following command after 2 minutes:" + echo " $listcmd" + + while true ; do + read -p "Would you like to take the time to recalculate bucket stats now? [yes/no] " action + if [ "$action" == "no" ] ;then + break + elif [ "$action" == "yes" ] ;then + # make sure at least $pending_op_secs since reindex completed + now=$(date +%s) + sleep_time=$(expr $pending_op_secs - $now + $reindex_done) + if [ "$sleep_time" -gt 0 ] ;then + sleep $sleep_time + fi + + $listcmd >/dev/null 2>/dev/null + break + else + echo "Error: response \"$action\" is not understood." 
+ fi + done +fi + +clean +echo Done diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc new file mode 100644 index 000000000..f32a73f26 --- /dev/null +++ b/src/rgw/rgw_acl.cc @@ -0,0 +1,442 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include + +#include "include/types.h" + +#include "common/Formatter.h" + +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_user.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +bool operator==(const ACLPermission& lhs, const ACLPermission& rhs) { + return lhs.flags == rhs.flags; +} +bool operator!=(const ACLPermission& lhs, const ACLPermission& rhs) { + return !(lhs == rhs); +} + +bool operator==(const ACLGranteeType& lhs, const ACLGranteeType& rhs) { + return lhs.type == rhs.type; +} +bool operator!=(const ACLGranteeType& lhs, const ACLGranteeType& rhs) { + return lhs.type != rhs.type; +} + +bool operator==(const ACLGrant& lhs, const ACLGrant& rhs) { + return lhs.type == rhs.type && lhs.id == rhs.id + && lhs.email == rhs.email && lhs.permission == rhs.permission + && lhs.name == rhs.name && lhs.group == rhs.group + && lhs.url_spec == rhs.url_spec; +} +bool operator!=(const ACLGrant& lhs, const ACLGrant& rhs) { + return !(lhs == rhs); +} + +bool operator==(const ACLReferer& lhs, const ACLReferer& rhs) { + return lhs.url_spec == rhs.url_spec && lhs.perm == rhs.perm; +} +bool operator!=(const ACLReferer& lhs, const ACLReferer& rhs) { + return !(lhs == rhs); +} + +bool operator==(const RGWAccessControlList& lhs, + const RGWAccessControlList& rhs) { + return lhs.acl_user_map == rhs.acl_user_map + && lhs.acl_group_map == rhs.acl_group_map + && lhs.referer_list == rhs.referer_list + && lhs.grant_map == rhs.grant_map; +} +bool operator!=(const RGWAccessControlList& lhs, + const RGWAccessControlList& rhs) { + return !(lhs == rhs); +} + +bool operator==(const ACLOwner& lhs, const ACLOwner& rhs) { + return lhs.id == rhs.id 
&& lhs.display_name == rhs.display_name; +} +bool operator!=(const ACLOwner& lhs, const ACLOwner& rhs) { + return !(lhs == rhs); +} + +bool operator==(const RGWAccessControlPolicy& lhs, + const RGWAccessControlPolicy& rhs) { + return lhs.acl == rhs.acl && lhs.owner == rhs.owner; +} +bool operator!=(const RGWAccessControlPolicy& lhs, + const RGWAccessControlPolicy& rhs) { + return !(lhs == rhs); +} + +void RGWAccessControlList::_add_grant(ACLGrant *grant) +{ + ACLPermission& perm = grant->get_permission(); + ACLGranteeType& type = grant->get_type(); + switch (type.get_type()) { + case ACL_TYPE_REFERER: + referer_list.emplace_back(grant->get_referer(), perm.get_permissions()); + + /* We're specially handling the Swift's .r:* as the S3 API has a similar + * concept and thus we can have a small portion of compatibility here. */ + if (grant->get_referer() == RGW_REFERER_WILDCARD) { + acl_group_map[ACL_GROUP_ALL_USERS] |= perm.get_permissions(); + } + break; + case ACL_TYPE_GROUP: + acl_group_map[grant->get_group()] |= perm.get_permissions(); + break; + default: + { + rgw_user id; + if (!grant->get_id(id)) { + ldout(cct, 0) << "ERROR: grant->get_id() failed" << dendl; + } + acl_user_map[id.to_str()] |= perm.get_permissions(); + } + } +} + +void RGWAccessControlList::add_grant(ACLGrant *grant) +{ + rgw_user id; + grant->get_id(id); // note that this will return false for groups, but that's ok, we won't search groups + grant_map.insert(pair(id.to_str(), *grant)); + _add_grant(grant); +} + +void RGWAccessControlList::remove_canon_user_grant(rgw_user& user_id) +{ + auto multi_map_iter = grant_map.find(user_id.to_str()); + if(multi_map_iter != grant_map.end()) { + auto grants = grant_map.equal_range(user_id.to_str()); + grant_map.erase(grants.first, grants.second); + } + + auto map_iter = acl_user_map.find(user_id.to_str()); + if (map_iter != acl_user_map.end()){ + acl_user_map.erase(map_iter); + } +} + +uint32_t RGWAccessControlList::get_perm(const DoutPrefixProvider* dpp, +
const rgw::auth::Identity& auth_identity, + const uint32_t perm_mask) +{ + ldpp_dout(dpp, 5) << "Searching permissions for identity=" << auth_identity + << " mask=" << perm_mask << dendl; + + return perm_mask & auth_identity.get_perms_from_aclspec(dpp, acl_user_map); +} + +uint32_t RGWAccessControlList::get_group_perm(const DoutPrefixProvider *dpp, + ACLGroupTypeEnum group, + const uint32_t perm_mask) const +{ + ldpp_dout(dpp, 5) << "Searching permissions for group=" << (int)group + << " mask=" << perm_mask << dendl; + + const auto iter = acl_group_map.find((uint32_t)group); + if (iter != acl_group_map.end()) { + ldpp_dout(dpp, 5) << "Found permission: " << iter->second << dendl; + return iter->second & perm_mask; + } + ldpp_dout(dpp, 5) << "Permissions for group not found" << dendl; + return 0; +} + +uint32_t RGWAccessControlList::get_referer_perm(const DoutPrefixProvider *dpp, + const uint32_t current_perm, + const std::string http_referer, + const uint32_t perm_mask) +{ + ldpp_dout(dpp, 5) << "Searching permissions for referer=" << http_referer + << " mask=" << perm_mask << dendl; + + /* This function is basically a transformation from current perm to + * a new one that takes into consideration the Swift's HTTP referer- + * based ACLs. We need to go through all items to respect negative + * grants. 
*/ + uint32_t referer_perm = current_perm; + for (const auto& r : referer_list) { + if (r.is_match(http_referer)) { + referer_perm = r.perm; + } + } + + ldpp_dout(dpp, 5) << "Found referer permission=" << referer_perm << dendl; + return referer_perm & perm_mask; +} + +uint32_t RGWAccessControlPolicy::get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + const uint32_t perm_mask, + const char * const http_referer, + bool ignore_public_acls) +{ + ldpp_dout(dpp, 20) << "-- Getting permissions begin with perm_mask=" << perm_mask + << dendl; + + uint32_t perm = acl.get_perm(dpp, auth_identity, perm_mask); + + if (auth_identity.is_owner_of(owner.get_id())) { + perm |= perm_mask & (RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP); + } + + if (perm == perm_mask) { + return perm; + } + + /* should we continue looking up? */ + if (!ignore_public_acls && ((perm & perm_mask) != perm_mask)) { + perm |= acl.get_group_perm(dpp, ACL_GROUP_ALL_USERS, perm_mask); + + if (false == auth_identity.is_owner_of(rgw_user(RGW_USER_ANON_ID))) { + /* this is not the anonymous user */ + perm |= acl.get_group_perm(dpp, ACL_GROUP_AUTHENTICATED_USERS, perm_mask); + } + } + + /* Should we continue looking up even deeper? 
*/ + if (nullptr != http_referer && (perm & perm_mask) != perm_mask) { + perm = acl.get_referer_perm(dpp, perm, http_referer, perm_mask); + } + + ldpp_dout(dpp, 5) << "-- Getting permissions done for identity=" << auth_identity + << ", owner=" << owner.get_id() + << ", perm=" << perm << dendl; + + return perm; +} + +bool RGWAccessControlPolicy::verify_permission(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + const uint32_t user_perm_mask, + const uint32_t perm, + const char * const http_referer, + bool ignore_public_acls) +{ + uint32_t test_perm = perm | RGW_PERM_READ_OBJS | RGW_PERM_WRITE_OBJS; + + uint32_t policy_perm = get_perm(dpp, auth_identity, test_perm, http_referer, ignore_public_acls); + + /* the swift WRITE_OBJS perm is equivalent to the WRITE obj, just + convert those bits. Note that these bits will only be set on + buckets, so the swift READ permission on bucket will allow listing + the bucket content */ + if (policy_perm & RGW_PERM_WRITE_OBJS) { + policy_perm |= (RGW_PERM_WRITE | RGW_PERM_WRITE_ACP); + } + if (policy_perm & RGW_PERM_READ_OBJS) { + policy_perm |= (RGW_PERM_READ | RGW_PERM_READ_ACP); + } + + uint32_t acl_perm = policy_perm & perm & user_perm_mask; + + ldpp_dout(dpp, 10) << " identity=" << auth_identity + << " requested perm (type)=" << perm + << ", policy perm=" << policy_perm + << ", user_perm_mask=" << user_perm_mask + << ", acl perm=" << acl_perm << dendl; + + return (perm == acl_perm); +} + + +bool RGWAccessControlPolicy::is_public(const DoutPrefixProvider *dpp) const +{ + + static constexpr auto public_groups = {ACL_GROUP_ALL_USERS, + ACL_GROUP_AUTHENTICATED_USERS}; + return std::any_of(public_groups.begin(), public_groups.end(), + [&, dpp](ACLGroupTypeEnum g) { + auto p = acl.get_group_perm(dpp, g, RGW_PERM_FULL_CONTROL); + return (p != RGW_PERM_NONE) && (p != RGW_PERM_INVALID); + } + ); + +} + +void ACLPermission::generate_test_instances(list& o) +{ + ACLPermission *p = new ACLPermission; + 
p->set_permissions(RGW_PERM_WRITE_ACP); + o.push_back(p); + o.push_back(new ACLPermission); +} + +void ACLPermission::dump(Formatter *f) const +{ + f->dump_int("flags", flags); +} + +void ACLGranteeType::dump(Formatter *f) const +{ + f->dump_unsigned("type", type); +} + +void ACLGrant::dump(Formatter *f) const +{ + f->open_object_section("type"); + type.dump(f); + f->close_section(); + + f->dump_string("id", id.to_str()); + f->dump_string("email", email); + + f->open_object_section("permission"); + permission.dump(f); + f->close_section(); + + f->dump_string("name", name); + f->dump_int("group", (int)group); + f->dump_string("url_spec", url_spec); +} + +void ACLGrant::generate_test_instances(list& o) +{ + rgw_user id("rgw"); + string name, email; + name = "Mr. RGW"; + email = "r@gw"; + + ACLGrant *g1 = new ACLGrant; + g1->set_canon(id, name, RGW_PERM_READ); + g1->email = email; + o.push_back(g1); + + ACLGrant *g2 = new ACLGrant; + g2->set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_WRITE); /* configure g2 itself; g1 must stay the canonical-user grant */ + o.push_back(g2); + + o.push_back(new ACLGrant); +} + +void ACLGranteeType::generate_test_instances(list& o) +{ + ACLGranteeType *t = new ACLGranteeType; + t->set(ACL_TYPE_CANON_USER); + o.push_back(t); + o.push_back(new ACLGranteeType); +} + +void RGWAccessControlList::generate_test_instances(list& o) +{ + RGWAccessControlList *acl = new RGWAccessControlList(NULL); + + list glist; + list::iterator iter; + + ACLGrant::generate_test_instances(glist); + for (iter = glist.begin(); iter != glist.end(); ++iter) { + ACLGrant *grant = *iter; + acl->add_grant(grant); + + delete grant; + } + o.push_back(acl); + o.push_back(new RGWAccessControlList(NULL)); +} + +void ACLOwner::generate_test_instances(list& o) +{ + ACLOwner *owner = new ACLOwner; + owner->id = "rgw"; + owner->display_name = "Mr.
RGW"; + o.push_back(owner); + o.push_back(new ACLOwner); +} + +void RGWAccessControlPolicy::generate_test_instances(list& o) +{ + list acl_list; + list::iterator iter; + RGWAccessControlList::generate_test_instances(acl_list); + for (iter = acl_list.begin(); iter != acl_list.end(); ++iter) { + /* acl_list is filled before the loop so the body runs once per generated ACL */ + + RGWAccessControlPolicy *p = new RGWAccessControlPolicy(NULL); + RGWAccessControlList *l = *iter; + p->acl = *l; + + string name = "radosgw"; + rgw_user id("rgw"); + p->owner.set_name(name); + p->owner.set_id(id); + + o.push_back(p); + + delete l; + } + + o.push_back(new RGWAccessControlPolicy(NULL)); +} + +void RGWAccessControlList::dump(Formatter *f) const +{ + map::const_iterator acl_user_iter = acl_user_map.begin(); + f->open_array_section("acl_user_map"); + for (; acl_user_iter != acl_user_map.end(); ++acl_user_iter) { + f->open_object_section("entry"); + f->dump_string("user", acl_user_iter->first); + f->dump_int("acl", acl_user_iter->second); + f->close_section(); + } + f->close_section(); + + map::const_iterator acl_group_iter = acl_group_map.begin(); + f->open_array_section("acl_group_map"); + for (; acl_group_iter != acl_group_map.end(); ++acl_group_iter) { + f->open_object_section("entry"); + f->dump_unsigned("group", acl_group_iter->first); + f->dump_int("acl", acl_group_iter->second); + f->close_section(); + } + f->close_section(); + + multimap::const_iterator giter = grant_map.begin(); + f->open_array_section("grant_map"); + for (; giter != grant_map.end(); ++giter) { + f->open_object_section("entry"); + f->dump_string("id", giter->first); + f->open_object_section("grant"); + giter->second.dump(f); + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +void ACLOwner::dump(Formatter *f) const +{ + encode_json("id", id.to_str(), f); + encode_json("display_name", display_name, f); +} + +void ACLOwner::decode_json(JSONObj *obj) { + string id_str; + JSONDecoder::decode_json("id", id_str, obj); + id.from_str(id_str); +
JSONDecoder::decode_json("display_name", display_name, obj); +} + +void RGWAccessControlPolicy::dump(Formatter *f) const +{ + encode_json("acl", acl, f); + encode_json("owner", owner, f); +} + +ACLGroupTypeEnum ACLGrant::uri_to_group(string& uri) +{ + // this is required for backward compatibility + return ACLGrant_S3::uri_to_group(uri); +} + diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h new file mode 100644 index 000000000..c52050158 --- /dev/null +++ b/src/rgw/rgw_acl.h @@ -0,0 +1,414 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "common/debug.h" + +#include "rgw_basic_types.h" //includes rgw_acl_types.h + +class ACLGrant +{ +protected: + ACLGranteeType type; + rgw_user id; + std::string email; + mutable rgw_user email_id; + ACLPermission permission; + std::string name; + ACLGroupTypeEnum group; + std::string url_spec; + +public: + ACLGrant() : group(ACL_GROUP_NONE) {} + virtual ~ACLGrant() {} + + /* there's an assumption here that email/uri/id encodings are + different and there can't be any overlap */ + bool get_id(rgw_user& _id) const { + switch(type.get_type()) { + case ACL_TYPE_EMAIL_USER: + _id = email; // implies from_str() that parses the 't:u' syntax + return true; + case ACL_TYPE_GROUP: + case ACL_TYPE_REFERER: + return false; + default: + _id = id; + return true; + } + } + + const rgw_user* get_id() const { + switch(type.get_type()) { + case ACL_TYPE_EMAIL_USER: + email_id.from_str(email); + return &email_id; + case ACL_TYPE_GROUP: + case ACL_TYPE_REFERER: + return nullptr; + default: + return &id; + } + } + + ACLGranteeType& get_type() { return type; } + const ACLGranteeType& get_type() const { return type; } + ACLPermission& get_permission() { return permission; } + const ACLPermission& get_permission() const { return permission; } + ACLGroupTypeEnum get_group() const { return group; } + 
const std::string& get_referer() const { return url_spec; } + + void encode(bufferlist& bl) const { + ENCODE_START(5, 3, bl); + encode(type, bl); + std::string s; + id.to_str(s); + encode(s, bl); + std::string uri; + encode(uri, bl); + encode(email, bl); + encode(permission, bl); + encode(name, bl); + __u32 g = (__u32)group; + encode(g, bl); + encode(url_spec, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl); + decode(type, bl); + std::string s; + decode(s, bl); + id.from_str(s); + std::string uri; + decode(uri, bl); + decode(email, bl); + decode(permission, bl); + decode(name, bl); + if (struct_v > 1) { + __u32 g; + decode(g, bl); + group = (ACLGroupTypeEnum)g; + } else { + group = uri_to_group(uri); + } + if (struct_v >= 5) { + decode(url_spec, bl); + } else { + url_spec.clear(); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + ACLGroupTypeEnum uri_to_group(std::string& uri); + + void set_canon(const rgw_user& _id, const std::string& _name, const uint32_t perm) { + type.set(ACL_TYPE_CANON_USER); + id = _id; + name = _name; + permission.set_permissions(perm); + } + void set_group(ACLGroupTypeEnum _group, const uint32_t perm) { + type.set(ACL_TYPE_GROUP); + group = _group; + permission.set_permissions(perm); + } + void set_referer(const std::string& _url_spec, const uint32_t perm) { + type.set(ACL_TYPE_REFERER); + url_spec = _url_spec; + permission.set_permissions(perm); + } + + friend bool operator==(const ACLGrant& lhs, const ACLGrant& rhs); + friend bool operator!=(const ACLGrant& lhs, const ACLGrant& rhs); +}; +WRITE_CLASS_ENCODER(ACLGrant) + +struct ACLReferer { + std::string url_spec; + uint32_t perm; + + ACLReferer() : perm(0) {} + ACLReferer(const std::string& url_spec, + const uint32_t perm) + : url_spec(url_spec), + perm(perm) { + } + + bool is_match(std::string_view http_referer) const { + const auto http_host = 
get_http_host(http_referer); + if (!http_host || http_host->length() < url_spec.length()) { + return false; + } + + if ("*" == url_spec) { + return true; + } + + if (http_host->compare(url_spec) == 0) { + return true; + } + + if ('.' == url_spec[0]) { + /* Wildcard support: a referer matches the spec when its last char are + * perfectly equal to spec. */ + return boost::algorithm::ends_with(http_host.value(), url_spec); + } + + return false; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(url_spec, bl); + encode(perm, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + decode(url_spec, bl); + decode(perm, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + + friend bool operator==(const ACLReferer& lhs, const ACLReferer& rhs); + friend bool operator!=(const ACLReferer& lhs, const ACLReferer& rhs); + +private: + boost::optional get_http_host(const std::string_view url) const { + size_t pos = url.find("://"); + if (pos == std::string_view::npos || boost::algorithm::starts_with(url, "://") || + boost::algorithm::ends_with(url, "://") || boost::algorithm::ends_with(url, "@")) { + return boost::none; + } + std::string_view url_sub = url.substr(pos + strlen("://")); + pos = url_sub.find('@'); + if (pos != std::string_view::npos) { + url_sub = url_sub.substr(pos + 1); + } + pos = url_sub.find_first_of("/:"); + if (pos == std::string_view::npos) { + /* no port or path exists */ + return url_sub; + } + return url_sub.substr(0, pos); + } +}; +WRITE_CLASS_ENCODER(ACLReferer) + +namespace rgw { +namespace auth { + class Identity; +} +} + +using ACLGrantMap = std::multimap; + +class RGWAccessControlList +{ +protected: + CephContext *cct; + /* FIXME: in the feature we should consider switching to uint32_t also + * in data structures. 
*/ + std::map acl_user_map; + std::map acl_group_map; + std::list referer_list; + ACLGrantMap grant_map; + void _add_grant(ACLGrant *grant); +public: + explicit RGWAccessControlList(CephContext *_cct) : cct(_cct) {} + RGWAccessControlList() : cct(NULL) {} + + void set_ctx(CephContext *ctx) { + cct = ctx; + } + + virtual ~RGWAccessControlList() {} + + uint32_t get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + uint32_t perm_mask); + uint32_t get_group_perm(const DoutPrefixProvider *dpp, ACLGroupTypeEnum group, uint32_t perm_mask) const; + uint32_t get_referer_perm(const DoutPrefixProvider *dpp, uint32_t current_perm, + std::string http_referer, + uint32_t perm_mask); + void encode(bufferlist& bl) const { + ENCODE_START(4, 3, bl); + bool maps_initialized = true; + encode(maps_initialized, bl); + encode(acl_user_map, bl); + encode(grant_map, bl); + encode(acl_group_map, bl); + encode(referer_list, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + bool maps_initialized; + decode(maps_initialized, bl); + decode(acl_user_map, bl); + decode(grant_map, bl); + if (struct_v >= 2) { + decode(acl_group_map, bl); + } else if (!maps_initialized) { + ACLGrantMap::iterator iter; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant& grant = iter->second; + _add_grant(&grant); + } + } + if (struct_v >= 4) { + decode(referer_list, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + void add_grant(ACLGrant *grant); + void remove_canon_user_grant(rgw_user& user_id); + + ACLGrantMap& get_grant_map() { return grant_map; } + const ACLGrantMap& get_grant_map() const { return grant_map; } + + void create_default(const rgw_user& id, std::string name) { + acl_user_map.clear(); + acl_group_map.clear(); + referer_list.clear(); + + ACLGrant grant; + grant.set_canon(id, name, 
RGW_PERM_FULL_CONTROL); + add_grant(&grant); + } + + friend bool operator==(const RGWAccessControlList& lhs, const RGWAccessControlList& rhs); + friend bool operator!=(const RGWAccessControlList& lhs, const RGWAccessControlList& rhs); +}; +WRITE_CLASS_ENCODER(RGWAccessControlList) + +class ACLOwner +{ +protected: + rgw_user id; + std::string display_name; +public: + ACLOwner() {} + ACLOwner(const rgw_user& _id) : id(_id) {} + ~ACLOwner() {} + + void encode(bufferlist& bl) const { + ENCODE_START(3, 2, bl); + std::string s; + id.to_str(s); + encode(s, bl); + encode(display_name, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + std::string s; + decode(s, bl); + id.from_str(s); + decode(display_name, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); + void set_id(const rgw_user& _id) { id = _id; } + void set_name(const std::string& name) { display_name = name; } + + rgw_user& get_id() { return id; } + const rgw_user& get_id() const { return id; } + std::string& get_display_name() { return display_name; } + const std::string& get_display_name() const { return display_name; } + friend bool operator==(const ACLOwner& lhs, const ACLOwner& rhs); + friend bool operator!=(const ACLOwner& lhs, const ACLOwner& rhs); +}; +WRITE_CLASS_ENCODER(ACLOwner) + +class RGWAccessControlPolicy +{ +protected: + CephContext *cct; + RGWAccessControlList acl; + ACLOwner owner; + +public: + explicit RGWAccessControlPolicy(CephContext *_cct) : cct(_cct), acl(_cct) {} + RGWAccessControlPolicy() : cct(NULL), acl(NULL) {} + virtual ~RGWAccessControlPolicy() {} + + void set_ctx(CephContext *ctx) { + cct = ctx; + acl.set_ctx(ctx); + } + + uint32_t get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + uint32_t perm_mask, + const char * http_referer, + bool ignore_public_acls=false); + bool 
verify_permission(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + uint32_t user_perm_mask, + uint32_t perm, + const char * http_referer = nullptr, + bool ignore_public_acls=false); + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(owner, bl); + encode(acl, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(owner, bl); + decode(acl, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + void decode_owner(bufferlist::const_iterator& bl) { // sometimes we only need that, should be faster + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(owner, bl); + DECODE_FINISH(bl); + } + + void set_owner(ACLOwner& o) { owner = o; } + ACLOwner& get_owner() { + return owner; + } + + void create_default(const rgw_user& id, std::string& name) { + acl.create_default(id, name); + owner.set_id(id); + owner.set_name(name); + } + RGWAccessControlList& get_acl() { + return acl; + } + const RGWAccessControlList& get_acl() const { + return acl; + } + + virtual bool compare_group_name(std::string& id, ACLGroupTypeEnum group) { return false; } + bool is_public(const DoutPrefixProvider *dpp) const; + + friend bool operator==(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs); + friend bool operator!=(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs); +}; +WRITE_CLASS_ENCODER(RGWAccessControlPolicy) diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc new file mode 100644 index 000000000..9f71e3281 --- /dev/null +++ b/src/rgw/rgw_acl_s3.cc @@ -0,0 +1,643 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_acl_s3.h" +#include "rgw_user.h" +#include "rgw_sal.h" + +#define dout_subsys ceph_subsys_rgw + + + +#define 
RGW_URI_ALL_USERS "http://acs.amazonaws.com/groups/global/AllUsers" +#define RGW_URI_AUTH_USERS "http://acs.amazonaws.com/groups/global/AuthenticatedUsers" + +using namespace std; + +static string rgw_uri_all_users = RGW_URI_ALL_USERS; +static string rgw_uri_auth_users = RGW_URI_AUTH_USERS; + +void ACLPermission_S3::to_xml(ostream& out) +{ + if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) { + out << "FULL_CONTROL"; + } else { + if (flags & RGW_PERM_READ) + out << "READ"; + if (flags & RGW_PERM_WRITE) + out << "WRITE"; + if (flags & RGW_PERM_READ_ACP) + out << "READ_ACP"; + if (flags & RGW_PERM_WRITE_ACP) + out << "WRITE_ACP"; + } +} + +bool ACLPermission_S3:: +xml_end(const char *el) +{ + const char *s = data.c_str(); + if (strcasecmp(s, "READ") == 0) { + flags |= RGW_PERM_READ; + return true; + } else if (strcasecmp(s, "WRITE") == 0) { + flags |= RGW_PERM_WRITE; + return true; + } else if (strcasecmp(s, "READ_ACP") == 0) { + flags |= RGW_PERM_READ_ACP; + return true; + } else if (strcasecmp(s, "WRITE_ACP") == 0) { + flags |= RGW_PERM_WRITE_ACP; + return true; + } else if (strcasecmp(s, "FULL_CONTROL") == 0) { + flags |= RGW_PERM_FULL_CONTROL; + return true; + } + return false; +} + + +class ACLGranteeType_S3 { +public: + static const char *to_string(ACLGranteeType& type) { + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + return "CanonicalUser"; + case ACL_TYPE_EMAIL_USER: + return "AmazonCustomerByEmail"; + case ACL_TYPE_GROUP: + return "Group"; + default: + return "unknown"; + } + } + + static void set(const char *s, ACLGranteeType& type) { + if (!s) { + type.set(ACL_TYPE_UNKNOWN); + return; + } + if (strcmp(s, "CanonicalUser") == 0) + type.set(ACL_TYPE_CANON_USER); + else if (strcmp(s, "AmazonCustomerByEmail") == 0) + type.set(ACL_TYPE_EMAIL_USER); + else if (strcmp(s, "Group") == 0) + type.set(ACL_TYPE_GROUP); + else + type.set(ACL_TYPE_UNKNOWN); + } +}; + +class ACLID_S3 : public XMLObj +{ +public: + ACLID_S3() {} + ~ACLID_S3() override 
{} + string& to_str() { return data; } +}; + +class ACLURI_S3 : public XMLObj +{ +public: + ACLURI_S3() {} + ~ACLURI_S3() override {} +}; + +class ACLEmail_S3 : public XMLObj +{ +public: + ACLEmail_S3() {} + ~ACLEmail_S3() override {} +}; + +class ACLDisplayName_S3 : public XMLObj +{ +public: + ACLDisplayName_S3() {} + ~ACLDisplayName_S3() override {} +}; + +bool ACLOwner_S3::xml_end(const char *el) { + ACLID_S3 *acl_id = static_cast(find_first("ID")); + ACLID_S3 *acl_name = static_cast(find_first("DisplayName")); + + // ID is mandatory + if (!acl_id) + return false; + id = acl_id->get_data(); + + // DisplayName is optional + if (acl_name) + display_name = acl_name->get_data(); + else + display_name = ""; + + return true; +} + +void ACLOwner_S3::to_xml(ostream& out) { + string s; + id.to_str(s); + if (s.empty()) + return; + out << "" << "" << s << ""; + if (!display_name.empty()) + out << "" << display_name << ""; + out << ""; +} + +bool ACLGrant_S3::xml_end(const char *el) { + ACLGrantee_S3 *acl_grantee; + ACLID_S3 *acl_id; + ACLURI_S3 *acl_uri; + ACLEmail_S3 *acl_email; + ACLPermission_S3 *acl_permission; + ACLDisplayName_S3 *acl_name; + string uri; + + acl_grantee = static_cast(find_first("Grantee")); + if (!acl_grantee) + return false; + string type_str; + if (!acl_grantee->get_attr("xsi:type", type_str)) + return false; + ACLGranteeType_S3::set(type_str.c_str(), type); + + acl_permission = static_cast(find_first("Permission")); + if (!acl_permission) + return false; + + permission = *acl_permission; + + id.clear(); + name.clear(); + email.clear(); + + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + acl_id = static_cast(acl_grantee->find_first("ID")); + if (!acl_id) + return false; + id = acl_id->to_str(); + acl_name = static_cast(acl_grantee->find_first("DisplayName")); + if (acl_name) + name = acl_name->get_data(); + break; + case ACL_TYPE_GROUP: + acl_uri = static_cast(acl_grantee->find_first("URI")); + if (!acl_uri) + return false; + uri = 
acl_uri->get_data(); + group = uri_to_group(uri); + break; + case ACL_TYPE_EMAIL_USER: + acl_email = static_cast(acl_grantee->find_first("EmailAddress")); + if (!acl_email) + return false; + email = acl_email->get_data(); + break; + default: + // unknown user type + return false; + }; + return true; +} + +void ACLGrant_S3::to_xml(CephContext *cct, ostream& out) { + ACLPermission_S3& perm = static_cast(permission); + + /* only show s3 compatible permissions */ + if (!(perm.get_permissions() & RGW_PERM_ALL_S3)) + return; + + string uri; + + out << "" << + ""; + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + out << "" << id << ""; + if (name.size()) { + out << "" << name << ""; + } + break; + case ACL_TYPE_EMAIL_USER: + out << "" << email << ""; + break; + case ACL_TYPE_GROUP: + if (!group_to_uri(group, uri)) { + ldout(cct, 0) << "ERROR: group_to_uri failed with group=" << (int)group << dendl; + break; + } + out << "" << uri << ""; + break; + default: + break; + } + out << ""; + perm.to_xml(out); + out << ""; +} + +bool ACLGrant_S3::group_to_uri(ACLGroupTypeEnum group, string& uri) +{ + switch (group) { + case ACL_GROUP_ALL_USERS: + uri = rgw_uri_all_users; + return true; + case ACL_GROUP_AUTHENTICATED_USERS: + uri = rgw_uri_auth_users; + return true; + default: + return false; + } +} + +bool RGWAccessControlList_S3::xml_end(const char *el) { + XMLObjIter iter = find("Grant"); + ACLGrant_S3 *grant = static_cast(iter.get_next()); + while (grant) { + add_grant(grant); + grant = static_cast(iter.get_next()); + } + return true; +} + +void RGWAccessControlList_S3::to_xml(ostream& out) { + multimap::iterator iter; + out << ""; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant_S3& grant = static_cast(iter->second); + grant.to_xml(cct, out); + } + out << ""; +} + +struct s3_acl_header { + int rgw_perm; + const char *http_header; +}; + +static const char *get_acl_header(const RGWEnv *env, + const struct s3_acl_header *perm) +{ + const 
char *header = perm->http_header; + + return env->get(header, NULL); +} + +static int parse_grantee_str(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, string& grantee_str, + const struct s3_acl_header *perm, ACLGrant& grant) +{ + string id_type, id_val_quoted; + int rgw_perm = perm->rgw_perm; + int ret; + + ret = parse_key_value(grantee_str, id_type, id_val_quoted); + if (ret < 0) + return ret; + + string id_val = rgw_trim_quotes(id_val_quoted); + + if (strcasecmp(id_type.c_str(), "emailAddress") == 0) { + std::unique_ptr user; + ret = driver->get_user_by_email(dpp, id_val, null_yield, &user); + if (ret < 0) + return ret; + + grant.set_canon(user->get_id(), user->get_display_name(), rgw_perm); + } else if (strcasecmp(id_type.c_str(), "id") == 0) { + std::unique_ptr user = driver->get_user(rgw_user(id_val)); + ret = user->load_user(dpp, null_yield); + if (ret < 0) + return ret; + + grant.set_canon(user->get_id(), user->get_display_name(), rgw_perm); + } else if (strcasecmp(id_type.c_str(), "uri") == 0) { + ACLGroupTypeEnum gid = grant.uri_to_group(id_val); + if (gid == ACL_GROUP_NONE) + return -EINVAL; + + grant.set_group(gid, rgw_perm); + } else { + return -EINVAL; + } + + return 0; +} + +static int parse_acl_header(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + const RGWEnv *env, const struct s3_acl_header *perm, + std::list& _grants) +{ + std::list grantees; + std::string hacl_str; + + const char *hacl = get_acl_header(env, perm); + if (hacl == NULL) + return 0; + + hacl_str = hacl; + get_str_list(hacl_str, ",", grantees); + + for (list::iterator it = grantees.begin(); it != grantees.end(); ++it) { + ACLGrant grant; + int ret = parse_grantee_str(dpp, driver, *it, perm, grant); + if (ret < 0) + return ret; + + _grants.push_back(grant); + } + + return 0; +} + +int RGWAccessControlList_S3::create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl) +{ + acl_user_map.clear(); + grant_map.clear(); + + ACLGrant owner_grant; + 
+ rgw_user bid = bucket_owner.get_id(); + string bname = bucket_owner.get_display_name(); + + /* owner gets full control */ + owner_grant.set_canon(owner.get_id(), owner.get_display_name(), RGW_PERM_FULL_CONTROL); + add_grant(&owner_grant); + + if (canned_acl.size() == 0 || canned_acl.compare("private") == 0) { + return 0; + } + + ACLGrant bucket_owner_grant; + ACLGrant group_grant; + if (canned_acl.compare("public-read") == 0) { + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); + add_grant(&group_grant); + } else if (canned_acl.compare("public-read-write") == 0) { + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); + add_grant(&group_grant); + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_WRITE); + add_grant(&group_grant); + } else if (canned_acl.compare("authenticated-read") == 0) { + group_grant.set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_READ); + add_grant(&group_grant); + } else if (canned_acl.compare("bucket-owner-read") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_READ); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); + } else if (canned_acl.compare("bucket-owner-full-control") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_FULL_CONTROL); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); + } else { + return -EINVAL; + } + + return 0; +} + +int RGWAccessControlList_S3::create_from_grants(std::list& grants) +{ + if (grants.empty()) + return -EINVAL; + + acl_user_map.clear(); + grant_map.clear(); + + for (std::list::iterator it = grants.begin(); it != grants.end(); ++it) { + ACLGrant g = *it; + add_grant(&g); + } + + return 0; +} + +bool RGWAccessControlPolicy_S3::xml_end(const char *el) { + RGWAccessControlList_S3 *s3acl = + static_cast(find_first("AccessControlList")); + if (!s3acl) + return false; + + acl = *s3acl; + + ACLOwner *owner_p = static_cast(find_first("Owner")); + if (!owner_p) + return false; + owner = *owner_p; + return true; +} + +void 
RGWAccessControlPolicy_S3::to_xml(ostream& out) { + out << ""; + ACLOwner_S3& _owner = static_cast(owner); + RGWAccessControlList_S3& _acl = static_cast(acl); + _owner.to_xml(out); + _acl.to_xml(out); + out << ""; +} + +static const s3_acl_header acl_header_perms[] = { + {RGW_PERM_READ, "HTTP_X_AMZ_GRANT_READ"}, + {RGW_PERM_WRITE, "HTTP_X_AMZ_GRANT_WRITE"}, + {RGW_PERM_READ_ACP,"HTTP_X_AMZ_GRANT_READ_ACP"}, + {RGW_PERM_WRITE_ACP, "HTTP_X_AMZ_GRANT_WRITE_ACP"}, + {RGW_PERM_FULL_CONTROL, "HTTP_X_AMZ_GRANT_FULL_CONTROL"}, + {0, NULL} +}; + +int RGWAccessControlPolicy_S3::create_from_headers(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const RGWEnv *env, ACLOwner& _owner) +{ + std::list grants; + int r = 0; + + for (const struct s3_acl_header *p = acl_header_perms; p->rgw_perm; p++) { + r = parse_acl_header(dpp, driver, env, p, grants); + if (r < 0) { + return r; + } + } + + RGWAccessControlList_S3& _acl = static_cast(acl); + r = _acl.create_from_grants(grants); + + owner = _owner; + + return r; +} + +/* + can only be called on object that was parsed + */ +int RGWAccessControlPolicy_S3::rebuild(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, ACLOwner *owner, + RGWAccessControlPolicy& dest, std::string &err_msg) +{ + if (!owner) + return -EINVAL; + + ACLOwner *requested_owner = static_cast(find_first("Owner")); + if (requested_owner) { + rgw_user& requested_id = requested_owner->get_id(); + if (!requested_id.empty() && requested_id.compare(owner->get_id()) != 0) + return -EPERM; + } + + std::unique_ptr user = driver->get_user(owner->get_id()); + if (user->load_user(dpp, null_yield) < 0) { + ldpp_dout(dpp, 10) << "owner info does not exist" << dendl; + err_msg = "Invalid id"; + return -EINVAL; + } + ACLOwner& dest_owner = dest.get_owner(); + dest_owner.set_id(owner->get_id()); + dest_owner.set_name(user->get_display_name()); + + ldpp_dout(dpp, 20) << "owner id=" << owner->get_id() << dendl; + ldpp_dout(dpp, 20) << "dest owner id=" << 
dest.get_owner().get_id() << dendl; + + RGWAccessControlList& dst_acl = dest.get_acl(); + + multimap& grant_map = acl.get_grant_map(); + multimap::iterator iter; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant& src_grant = iter->second; + ACLGranteeType& type = src_grant.get_type(); + ACLGrant new_grant; + bool grant_ok = false; + rgw_user uid; + RGWUserInfo grant_user; + switch (type.get_type()) { + case ACL_TYPE_EMAIL_USER: + { + string email; + rgw_user u; + if (!src_grant.get_id(u)) { + ldpp_dout(dpp, 0) << "ERROR: src_grant.get_id() failed" << dendl; + return -EINVAL; + } + email = u.id; /* the e-mail address is read from the grant's id field */ + ldpp_dout(dpp, 10) << "grant user email=" << email << dendl; + if (driver->get_user_by_email(dpp, email, null_yield, &user) < 0) { + ldpp_dout(dpp, 10) << "grant user email not found or other error" << dendl; + err_msg = "The e-mail address you provided does not match any account on record."; + return -ERR_UNRESOLVABLE_EMAIL; + } + grant_user = user->get_info(); + uid = grant_user.user_id; + } /* no break here: control continues into the ACL_TYPE_CANON_USER block, which re-checks the type before its own id lookup (presumably intentional) */ + case ACL_TYPE_CANON_USER: + { + if (type.get_type() == ACL_TYPE_CANON_USER) { + if (!src_grant.get_id(uid)) { + ldpp_dout(dpp, 0) << "ERROR: src_grant.get_id() failed" << dendl; + err_msg = "Invalid id"; + return -EINVAL; + } + } + + if (grant_user.user_id.empty()) { /* still unset unless the e-mail lookup above filled it in */ + user = driver->get_user(uid); + if (user->load_user(dpp, null_yield) < 0) { + ldpp_dout(dpp, 10) << "grant user does not exist:" << uid << dendl; + err_msg = "Invalid id"; + return -EINVAL; + } else { + grant_user = user->get_info(); + } + } + ACLPermission& perm = src_grant.get_permission(); + new_grant.set_canon(uid, grant_user.display_name, perm.get_permissions()); + grant_ok = true; + rgw_user new_id; + new_grant.get_id(new_id); + ldpp_dout(dpp, 10) << "new grant: " << new_id << ":" << grant_user.display_name << dendl; + } + break; + case ACL_TYPE_GROUP: + { + string uri; + if (ACLGrant_S3::group_to_uri(src_grant.get_group(), uri)) { + new_grant = src_grant; + grant_ok = true; + 
ldpp_dout(dpp, 10) << "new grant: " << uri << dendl; + } else { + ldpp_dout(dpp, 10) << "bad grant group:" << (int)src_grant.get_group() << dendl; + err_msg = "Invalid group uri"; + return -EINVAL; + } + } + default: + break; + } + if (grant_ok) { + dst_acl.add_grant(&new_grant); + } + } + + return 0; +} + +bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum group) +{ + switch (group) { + case ACL_GROUP_ALL_USERS: + return (id.compare(RGW_USER_ANON_ID) == 0); + case ACL_GROUP_AUTHENTICATED_USERS: + return (id.compare(rgw_uri_auth_users) == 0); + default: + return id.empty(); + } + + // shouldn't get here + return false; +} + +XMLObj *RGWACLXMLParser_S3::alloc_obj(const char *el) +{ + XMLObj * obj = NULL; + if (strcmp(el, "AccessControlPolicy") == 0) { + obj = new RGWAccessControlPolicy_S3(cct); + } else if (strcmp(el, "Owner") == 0) { + obj = new ACLOwner_S3(); + } else if (strcmp(el, "AccessControlList") == 0) { + obj = new RGWAccessControlList_S3(cct); + } else if (strcmp(el, "ID") == 0) { + obj = new ACLID_S3(); + } else if (strcmp(el, "DisplayName") == 0) { + obj = new ACLDisplayName_S3(); + } else if (strcmp(el, "Grant") == 0) { + obj = new ACLGrant_S3(); + } else if (strcmp(el, "Grantee") == 0) { + obj = new ACLGrantee_S3(); + } else if (strcmp(el, "Permission") == 0) { + obj = new ACLPermission_S3(); + } else if (strcmp(el, "URI") == 0) { + obj = new ACLURI_S3(); + } else if (strcmp(el, "EmailAddress") == 0) { + obj = new ACLEmail_S3(); + } + + return obj; +} + +ACLGroupTypeEnum ACLGrant_S3::uri_to_group(string& uri) +{ + if (uri.compare(rgw_uri_all_users) == 0) + return ACL_GROUP_ALL_USERS; + else if (uri.compare(rgw_uri_auth_users) == 0) + return ACL_GROUP_AUTHENTICATED_USERS; + + return ACL_GROUP_NONE; +} + diff --git a/src/rgw/rgw_acl_s3.h b/src/rgw/rgw_acl_s3.h new file mode 100644 index 000000000..c234d722b --- /dev/null +++ b/src/rgw/rgw_acl_s3.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include "include/str_list.h" +#include "rgw_xml.h" +#include "rgw_acl.h" +#include "rgw_sal_fwd.h" + +class RGWUserCtl; + +class ACLPermission_S3 : public ACLPermission, public XMLObj +{ +public: + ACLPermission_S3() {} + virtual ~ACLPermission_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(std::ostream& out); +}; + +class ACLGrantee_S3 : public ACLGrantee, public XMLObj +{ +public: + ACLGrantee_S3() {} + virtual ~ACLGrantee_S3() override {} + + bool xml_start(const char *el, const char **attr); +}; + + +class ACLGrant_S3 : public ACLGrant, public XMLObj +{ +public: + ACLGrant_S3() {} + virtual ~ACLGrant_S3() override {} + + void to_xml(CephContext *cct, std::ostream& out); + bool xml_end(const char *el) override; + bool xml_start(const char *el, const char **attr); + + static ACLGroupTypeEnum uri_to_group(std::string& uri); + static bool group_to_uri(ACLGroupTypeEnum group, std::string& uri); +}; + +class RGWAccessControlList_S3 : public RGWAccessControlList, public XMLObj +{ +public: + explicit RGWAccessControlList_S3(CephContext *_cct) : RGWAccessControlList(_cct) {} + virtual ~RGWAccessControlList_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(std::ostream& out); + + int create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const std::string& canned_acl); + int create_from_grants(std::list& grants); +}; + +class ACLOwner_S3 : public ACLOwner, public XMLObj +{ +public: + ACLOwner_S3() {} + virtual ~ACLOwner_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(std::ostream& out); +}; + +class RGWEnv; + +class RGWAccessControlPolicy_S3 : public RGWAccessControlPolicy, public XMLObj +{ +public: + explicit RGWAccessControlPolicy_S3(CephContext *_cct) : RGWAccessControlPolicy(_cct) {} + virtual ~RGWAccessControlPolicy_S3() override {} + + bool xml_end(const char *el) 
override; + + void to_xml(std::ostream& out); + int rebuild(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, ACLOwner *owner, + RGWAccessControlPolicy& dest, std::string &err_msg); + bool compare_group_name(std::string& id, ACLGroupTypeEnum group) override; + + virtual int create_canned(ACLOwner& _owner, ACLOwner& bucket_owner, const std::string& canned_acl) { + RGWAccessControlList_S3& _acl = static_cast(acl); + if (_owner.get_id() == rgw_user("anonymous")) { + owner = bucket_owner; + } else { + owner = _owner; + } + int ret = _acl.create_canned(owner, bucket_owner, canned_acl); + return ret; + } + int create_from_headers(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + const RGWEnv *env, ACLOwner& _owner); +}; + +/** + * Interfaces with the webserver's XML handling code + * to parse it in a way that makes sense for the rgw. + */ +class RGWACLXMLParser_S3 : public RGWXMLParser +{ + CephContext *cct; + + XMLObj *alloc_obj(const char *el) override; +public: + explicit RGWACLXMLParser_S3(CephContext *_cct) : cct(_cct) {} +}; diff --git a/src/rgw/rgw_acl_swift.cc b/src/rgw/rgw_acl_swift.cc new file mode 100644 index 000000000..f1ca68d63 --- /dev/null +++ b/src/rgw/rgw_acl_swift.cc @@ -0,0 +1,438 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include + +#include + +#include "common/ceph_json.h" +#include "rgw_common.h" +#include "rgw_user.h" +#include "rgw_acl_swift.h" +#include "rgw_sal.h" + +#define dout_subsys ceph_subsys_rgw + + +#define SWIFT_PERM_READ RGW_PERM_READ_OBJS +#define SWIFT_PERM_WRITE RGW_PERM_WRITE_OBJS +/* FIXME: do we really need separate RW? 
*/ +#define SWIFT_PERM_RWRT (SWIFT_PERM_READ | SWIFT_PERM_WRITE) +#define SWIFT_PERM_ADMIN RGW_PERM_FULL_CONTROL + +#define SWIFT_GROUP_ALL_USERS ".r:*" + +using namespace std; + +/* Split a Swift ACL list on spaces and commas and append every token to uids. + * uid_list must not be null. Returns 0, or -ENOMEM when the scratch copy that + * strtok_r() modifies cannot be allocated. */ +static int parse_list(const char* uid_list, + std::vector& uids) /* out */ +{ + char *s = strdup(uid_list); + if (!s) { + return -ENOMEM; + } + + char *tokctx; + const char *p = strtok_r(s, " ,", &tokctx); + while (p) { + if (*p) { + string acl = p; + uids.push_back(acl); + } + p = strtok_r(NULL, " ,", &tokctx); + } + free(s); + return 0; +} + +/* True when designator is one of the referrer spellings accepted here. */ +static bool is_referrer(const std::string& designator) +{ + return designator.compare(".r") == 0 || + designator.compare(".ref") == 0 || + designator.compare(".referer") == 0 || + designator.compare(".referrer") == 0; +} + +/* True when uid is a referrer designator followed by ":*" (for example + * ".r:*"), i.e. the form that is mapped to the all-users group. */ +static bool uid_is_public(const string& uid) +{ + /* Every accepted designator starts with ".r"; check the length first so + * neither index can go past the terminator. */ + if (uid.size() < 2 || uid[0] != '.' || uid[1] != 'r') + return false; + + /* Keep find()'s result as size_t and test it against npos instead of + * narrowing it to int. */ + const size_t pos = uid.find(':'); + if (pos == string::npos) + return false; + + /* Whatever follows the first ':' has to be exactly "*". */ + if (uid.compare(pos + 1, string::npos, "*") != 0) + return false; + + return is_referrer(uid.substr(0, pos)); +} + +static boost::optional referrer_to_grant(std::string url_spec, + const uint32_t perm) +{ + /* This function takes url_spec as non-ref std::string because of the trim + * operation that is essential to preserve compliance with Swift. It can't + * be easily accomplished with std::string_view. 
*/ + try { + bool is_negative; + ACLGrant grant; + + if ('-' == url_spec[0]) { + url_spec = url_spec.substr(1); + boost::algorithm::trim(url_spec); + + is_negative = true; + } else { + is_negative = false; + } + + if (url_spec != RGW_REFERER_WILDCARD) { + if ('*' == url_spec[0]) { + url_spec = url_spec.substr(1); + boost::algorithm::trim(url_spec); + } + + if (url_spec.empty() || url_spec == ".") { + return boost::none; + } + } else { + /* Please be aware we're specially handling the .r:* in _add_grant() + * of RGWAccessControlList as the S3 API has a similar concept, and + * thus we can have a small portion of compatibility. */ + } + + grant.set_referer(url_spec, is_negative ? 0 : perm); + return grant; + } catch (const std::out_of_range&) { + return boost::none; + } +} + +static ACLGrant user_to_grant(const DoutPrefixProvider *dpp, + CephContext* const cct, + rgw::sal::Driver* driver, + const std::string& uid, + const uint32_t perm) +{ + RGWUserInfo grant_user; + ACLGrant grant; + std::unique_ptr user; + + user = driver->get_user(rgw_user(uid)); + if (user->load_user(dpp, null_yield) < 0) { + ldpp_dout(dpp, 10) << "grant user does not exist: " << uid << dendl; + /* not treated as an error: the grant is still created, with an empty display name */ + grant.set_canon(user->get_id(), std::string(), perm); + } else { + grant.set_canon(user->get_id(), user->get_display_name(), perm); + } + + return grant; +} + +int RGWAccessControlPolicy_SWIFT::add_grants(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const std::vector& uids, + const uint32_t perm) +{ + for (const auto& uid : uids) { + boost::optional grant; + ldpp_dout(dpp, 20) << "trying to add grant for ACL uid=" << uid << dendl; + + /* Let's check whether the item has a separator potentially indicating + * a special meaning (like an HTTP referrer-based grant). */ + const size_t pos = uid.find(':'); + if (std::string::npos == pos) { + /* No, it doesn't have one -- we've got just a regular user identifier. 
*/ + grant = user_to_grant(dpp, cct, driver, uid, perm); + } else { + /* Yes, *potentially* an HTTP referral. */ + auto designator = uid.substr(0, pos); + auto designatee = uid.substr(pos + 1); + + /* Swift strips whitespaces at both beginning and end. */ + boost::algorithm::trim(designator); + boost::algorithm::trim(designatee); + + if (! boost::algorithm::starts_with(designator, ".")) { + grant = user_to_grant(dpp, cct, driver, uid, perm); + } else if ((perm & SWIFT_PERM_WRITE) == 0 && is_referrer(designator)) { + /* HTTP referrer-based ACLs aren't acceptable for writes. */ + grant = referrer_to_grant(designatee, perm); + } + } + + if (grant) { + acl.add_grant(&*grant); + } else { + return -EINVAL; + } + } + + return 0; +} + + +int RGWAccessControlPolicy_SWIFT::create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const rgw_user& id, + const std::string& name, + const char* read_list, + const char* write_list, + uint32_t& rw_mask) +{ + acl.create_default(id, name); + owner.set_id(id); + owner.set_name(name); + rw_mask = 0; + + if (read_list) { + std::vector uids; + int r = parse_list(read_list, uids); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: parse_list for read returned r=" + << r << dendl; + return r; + } + + r = add_grants(dpp, driver, uids, SWIFT_PERM_READ); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: add_grants for read returned r=" + << r << dendl; + return r; + } + rw_mask |= SWIFT_PERM_READ; + } + if (write_list) { + std::vector uids; + int r = parse_list(write_list, uids); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: parse_list for write returned r=" + << r << dendl; + return r; + } + + r = add_grants(dpp, driver, uids, SWIFT_PERM_WRITE); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: add_grants for write returned r=" + << r << dendl; + return r; + } + rw_mask |= SWIFT_PERM_WRITE; + } + return 0; +} + +void RGWAccessControlPolicy_SWIFT::filter_merge(uint32_t rw_mask, + RGWAccessControlPolicy_SWIFT *old) +{ + /* rw_mask&SWIFT_PERM_READ 
=> setting read acl, + * rw_mask&SWIFT_PERM_WRITE => setting write acl + * when bit is cleared, copy matching elements from old. + */ + if (rw_mask == (SWIFT_PERM_READ|SWIFT_PERM_WRITE)) { + return; + } + rw_mask ^= (SWIFT_PERM_READ|SWIFT_PERM_WRITE); + for (auto &iter: old->acl.get_grant_map()) { + ACLGrant& grant = iter.second; + uint32_t perm = grant.get_permission().get_permissions(); + rgw_user id; + string url_spec; + if (!grant.get_id(id)) { + if (grant.get_group() != ACL_GROUP_ALL_USERS) { + url_spec = grant.get_referer(); + if (url_spec.empty()) { + continue; + } + if (perm == 0) { + /* We need to carry also negative, HTTP referrer-based ACLs. */ + perm = SWIFT_PERM_READ; + } + } + } + if (perm & rw_mask) { + acl.add_grant(&grant); + } + } +} + +void RGWAccessControlPolicy_SWIFT::to_str(string& read, string& write) +{ + multimap& m = acl.get_grant_map(); + multimap::iterator iter; + + for (iter = m.begin(); iter != m.end(); ++iter) { + ACLGrant& grant = iter->second; + const uint32_t perm = grant.get_permission().get_permissions(); + rgw_user id; + string url_spec; + if (!grant.get_id(id)) { + if (grant.get_group() == ACL_GROUP_ALL_USERS) { + id = SWIFT_GROUP_ALL_USERS; + } else { + url_spec = grant.get_referer(); + if (url_spec.empty()) { + continue; + } + id = (perm != 0) ? 
".r:" + url_spec : ".r:-" + url_spec; + } + } + if (perm & SWIFT_PERM_READ) { + if (!read.empty()) { + read.append(","); + } + read.append(id.to_str()); + } else if (perm & SWIFT_PERM_WRITE) { + if (!write.empty()) { + write.append(","); + } + write.append(id.to_str()); + } else if (perm == 0 && !url_spec.empty()) { + /* only X-Container-Read headers support referers */ + if (!read.empty()) { + read.append(","); + } + read.append(id.to_str()); + } + } +} + +void RGWAccessControlPolicy_SWIFTAcct::add_grants(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const std::vector& uids, + const uint32_t perm) +{ + for (const auto& uid : uids) { + ACLGrant grant; + + if (uid_is_public(uid)) { + grant.set_group(ACL_GROUP_ALL_USERS, perm); + acl.add_grant(&grant); + } else { + std::unique_ptr user = driver->get_user(rgw_user(uid)); + + if (user->load_user(dpp, null_yield) < 0) { + ldpp_dout(dpp, 10) << "grant user does not exist:" << uid << dendl; + /* skipping silently */ + grant.set_canon(user->get_id(), std::string(), perm); + acl.add_grant(&grant); + } else { + grant.set_canon(user->get_id(), user->get_display_name(), perm); + acl.add_grant(&grant); + } + } + } +} + +bool RGWAccessControlPolicy_SWIFTAcct::create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const rgw_user& id, + const std::string& name, + const std::string& acl_str) +{ + acl.create_default(id, name); + owner.set_id(id); + owner.set_name(name); + + JSONParser parser; + + if (!parser.parse(acl_str.c_str(), acl_str.length())) { + ldpp_dout(dpp, 0) << "ERROR: JSONParser::parse returned error=" << dendl; + return false; + } + + JSONObjIter iter = parser.find_first("admin"); + if (!iter.end() && (*iter)->is_array()) { + std::vector admin; + decode_json_obj(admin, *iter); + ldpp_dout(dpp, 0) << "admins: " << admin << dendl; + + add_grants(dpp, driver, admin, SWIFT_PERM_ADMIN); + } + + iter = parser.find_first("read-write"); + if (!iter.end() && (*iter)->is_array()) { + std::vector 
readwrite; + decode_json_obj(readwrite, *iter); + ldpp_dout(dpp, 0) << "read-write: " << readwrite << dendl; + + add_grants(dpp, driver, readwrite, SWIFT_PERM_RWRT); + } + + iter = parser.find_first("read-only"); + if (!iter.end() && (*iter)->is_array()) { + std::vector readonly; + decode_json_obj(readonly, *iter); + ldpp_dout(dpp, 0) << "read-only: " << readonly << dendl; + + add_grants(dpp, driver, readonly, SWIFT_PERM_READ); + } + + return true; +} + +boost::optional RGWAccessControlPolicy_SWIFTAcct::to_str() const +{ + std::vector admin; + std::vector readwrite; + std::vector readonly; + + /* Partition the grant map into three non-overlapping groups. */ + for (const auto& item : get_acl().get_grant_map()) { + const ACLGrant& grant = item.second; + const uint32_t perm = grant.get_permission().get_permissions(); + + rgw_user id; + if (!grant.get_id(id)) { + if (grant.get_group() != ACL_GROUP_ALL_USERS) { + continue; /* a grant without a user id is kept only for the all-users group */ + } + id = SWIFT_GROUP_ALL_USERS; + } else if (owner.get_id() == id) { + continue; /* the grant held by the policy owner is not listed */ + } + + if (SWIFT_PERM_ADMIN == (perm & SWIFT_PERM_ADMIN)) { + admin.insert(admin.end(), id.to_str()); + } else if (SWIFT_PERM_RWRT == (perm & SWIFT_PERM_RWRT)) { + readwrite.insert(readwrite.end(), id.to_str()); + } else if (SWIFT_PERM_READ == (perm & SWIFT_PERM_READ)) { + readonly.insert(readonly.end(), id.to_str()); + } else { + // FIXME: print a warning + } + } + + /* If there is no grant to serialize, let's exit earlier to not return + * an empty JSON object which breaks the functional tests of Swift. */ + if (admin.empty() && readwrite.empty() && readonly.empty()) { + return boost::none; + } + + /* Serialize the groups. 
*/ + JSONFormatter formatter; + + formatter.open_object_section("acl"); + if (!readonly.empty()) { + encode_json("read-only", readonly, &formatter); + } + if (!readwrite.empty()) { + encode_json("read-write", readwrite, &formatter); + } + if (!admin.empty()) { + encode_json("admin", admin, &formatter); + } + formatter.close_section(); + + std::ostringstream oss; + formatter.flush(oss); + + return oss.str(); +} diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h new file mode 100644 index 000000000..4cb1e4b8f --- /dev/null +++ b/src/rgw/rgw_acl_swift.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include + +#include "rgw_acl.h" + +class RGWUserCtl; + +class RGWAccessControlPolicy_SWIFT : public RGWAccessControlPolicy +{ + int add_grants(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + const std::vector& uids, + uint32_t perm); + +public: + explicit RGWAccessControlPolicy_SWIFT(CephContext* const cct) + : RGWAccessControlPolicy(cct) { + } + ~RGWAccessControlPolicy_SWIFT() override = default; + + int create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const rgw_user& id, + const std::string& name, + const char* read_list, + const char* write_list, + uint32_t& rw_mask); + void filter_merge(uint32_t mask, RGWAccessControlPolicy_SWIFT *policy); + void to_str(std::string& read, std::string& write); +}; + +class RGWAccessControlPolicy_SWIFTAcct : public RGWAccessControlPolicy +{ +public: + explicit RGWAccessControlPolicy_SWIFTAcct(CephContext * const cct) + : RGWAccessControlPolicy(cct) { + } + ~RGWAccessControlPolicy_SWIFTAcct() override {} + + void add_grants(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const std::vector& uids, + uint32_t perm); + bool create(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const rgw_user& id, + const std::string& name, + const 
std::string& acl_str); + boost::optional to_str() const; +}; diff --git a/src/rgw/rgw_acl_types.h b/src/rgw/rgw_acl_types.h new file mode 100644 index 000000000..af256b1b5 --- /dev/null +++ b/src/rgw/rgw_acl_types.h @@ -0,0 +1,213 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. Do not + * introduce changes or include files which can only be compiled in + * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h) + */ + +#pragma once + +#include +#include +#include + +#include "include/types.h" +#include "common/Formatter.h" + +#define RGW_PERM_NONE 0x00 +#define RGW_PERM_READ 0x01 +#define RGW_PERM_WRITE 0x02 +#define RGW_PERM_READ_ACP 0x04 +#define RGW_PERM_WRITE_ACP 0x08 +#define RGW_PERM_READ_OBJS 0x10 +#define RGW_PERM_WRITE_OBJS 0x20 +#define RGW_PERM_FULL_CONTROL ( RGW_PERM_READ | RGW_PERM_WRITE | \ + RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP ) +#define RGW_PERM_ALL_S3 RGW_PERM_FULL_CONTROL +#define RGW_PERM_INVALID 0xFF00 + +static constexpr char RGW_REFERER_WILDCARD[] = "*"; + +struct RGWAccessKey { + std::string id; // AccessKey + std::string key; // SecretKey + std::string subuser; + + RGWAccessKey() {} + RGWAccessKey(std::string _id, std::string _key) + : id(std::move(_id)), key(std::move(_key)) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(id, bl); + encode(key, bl); + encode(subuser, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(id, bl); + decode(key, bl); + decode(subuser, bl); + DECODE_FINISH(bl); 
+ } + void dump(Formatter *f) const; + void dump_plain(Formatter *f) const; + void dump(Formatter *f, const std::string& user, bool swift) const; + static void generate_test_instances(std::list& o); + + void decode_json(JSONObj *obj); + void decode_json(JSONObj *obj, bool swift); +}; +WRITE_CLASS_ENCODER(RGWAccessKey) + +struct RGWSubUser { + std::string name; + uint32_t perm_mask; + + RGWSubUser() : perm_mask(0) {} + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(name, bl); + encode(perm_mask, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(name, bl); + decode(perm_mask, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void dump(Formatter *f, const std::string& user) const; + static void generate_test_instances(std::list& o); + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWSubUser) + +class RGWUserCaps +{ + std::map caps; + + int get_cap(const std::string& cap, std::string& type, uint32_t *perm); + int add_cap(const std::string& cap); + int remove_cap(const std::string& cap); +public: + static int parse_cap_perm(const std::string& str, uint32_t *perm); + int add_from_string(const std::string& str); + int remove_from_string(const std::string& str); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(caps, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(caps, bl); + DECODE_FINISH(bl); + } + int check_cap(const std::string& cap, uint32_t perm) const; + bool is_valid_cap_type(const std::string& tp); + void dump(Formatter *f) const; + void dump(Formatter *f, const char *name) const; + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWUserCaps) + +enum ACLGranteeTypeEnum { +/* numbers are encoded, should not change */ + ACL_TYPE_CANON_USER = 0, + ACL_TYPE_EMAIL_USER = 1, + ACL_TYPE_GROUP = 2, + ACL_TYPE_UNKNOWN = 3, + 
ACL_TYPE_REFERER = 4, +}; + +enum ACLGroupTypeEnum { +/* numbers are encoded should not change */ + ACL_GROUP_NONE = 0, + ACL_GROUP_ALL_USERS = 1, + ACL_GROUP_AUTHENTICATED_USERS = 2, +}; + +class ACLPermission +{ +protected: + int flags; +public: + ACLPermission() : flags(0) {} + ~ACLPermission() {} + uint32_t get_permissions() const { return flags; } + void set_permissions(uint32_t perm) { flags = perm; } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(flags, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(flags, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + friend bool operator==(const ACLPermission& lhs, const ACLPermission& rhs); + friend bool operator!=(const ACLPermission& lhs, const ACLPermission& rhs); +}; +WRITE_CLASS_ENCODER(ACLPermission) + +class ACLGranteeType +{ +protected: + __u32 type; +public: + ACLGranteeType() : type(ACL_TYPE_UNKNOWN) {} + virtual ~ACLGranteeType() {} +// virtual const char *to_string() = 0; + ACLGranteeTypeEnum get_type() const { return (ACLGranteeTypeEnum)type; } + void set(ACLGranteeTypeEnum t) { type = t; } +// virtual void set(const char *s) = 0; + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(type, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(type, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + friend bool operator==(const ACLGranteeType& lhs, const ACLGranteeType& rhs); + friend bool operator!=(const ACLGranteeType& lhs, const ACLGranteeType& rhs); +}; +WRITE_CLASS_ENCODER(ACLGranteeType) + +class ACLGrantee +{ +public: + ACLGrantee() {} + ~ACLGrantee() {} +}; diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc new file mode 100644 index 
000000000..73b0736b1 --- /dev/null +++ b/src/rgw/rgw_admin.cc @@ -0,0 +1,10799 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include +#include + +extern "C" { +#include +} + +#include + +#include "auth/Crypto.h" +#include "compressor/Compressor.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "common/fault_injector.h" + +#include "include/util.h" + +#include "cls/rgw/cls_rgw_types.h" +#include "cls/rgw/cls_rgw_client.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_otp.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_datalog.h" +#include "rgw_lc.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_orphan.h" +#include "rgw_sync.h" +#include "rgw_trim_bilog.h" +#include "rgw_trim_datalog.h" +#include "rgw_trim_mdlog.h" +#include "rgw_data_sync.h" +#include "rgw_rest_conn.h" +#include "rgw_realm_watcher.h" +#include "rgw_role.h" +#include "rgw_reshard.h" +#include "rgw_http_client_curl.h" +#include "rgw_zone.h" +#include "rgw_pubsub.h" +#include "rgw_bucket_sync.h" +#include "rgw_sync_checkpoint.h" +#include "rgw_lua.h" +#include "rgw_sal.h" +#include "rgw_sal_config.h" + +#include "services/svc_sync_modules.h" +#include "services/svc_cls.h" +#include "services/svc_bilog_rados.h" +#include "services/svc_mdlog.h" +#include "services/svc_meta_be_otp.h" +#include "services/svc_user.h" +#include "services/svc_zone.h" + +#include "driver/rados/rgw_bucket.h" +#include "driver/rados/rgw_sal_rados.h" + +#define dout_context g_ceph_context + +#define SECRET_KEY_LEN 40 +#define PUBLIC_ID_LEN 20 + +using namespace std; + +static rgw::sal::Driver* driver = NULL; +static 
constexpr auto dout_subsys = ceph_subsys_rgw; + + +static const DoutPrefixProvider* dpp() { + struct GlobalPrefix : public DoutPrefixProvider { + CephContext *get_cct() const override { return dout_context; } + unsigned get_subsys() const override { return dout_subsys; } + std::ostream& gen_prefix(std::ostream& out) const override { return out; } + }; + static GlobalPrefix global_dpp; + return &global_dpp; +} + +#define CHECK_TRUE(x, msg, err) \ + do { \ + if (!x) { \ + cerr << msg << std::endl; \ + return err; \ + } \ + } while (0) + +#define CHECK_SUCCESS(x, msg) \ + do { \ + int _x_val = (x); \ + if (_x_val < 0) { \ + cerr << msg << ": " << cpp_strerror(-_x_val) << std::endl; \ + return _x_val; \ + } \ + } while (0) + +static inline int posix_errortrans(int r) +{ + switch(r) { + case ERR_NO_SUCH_BUCKET: + r = ENOENT; + break; + default: + break; + } + return r; +} + + +static const std::string LUA_CONTEXT_LIST("prerequest, postrequest, background, getdata, putdata"); + +void usage() +{ + cout << "usage: radosgw-admin [options...]" << std::endl; + cout << "commands:\n"; + cout << " user create create a new user\n" ; + cout << " user modify modify user\n"; + cout << " user info get user info\n"; + cout << " user rename rename user\n"; + cout << " user rm remove user\n"; + cout << " user suspend suspend a user\n"; + cout << " user enable re-enable user after suspension\n"; + cout << " user check check user info\n"; + cout << " user stats show user stats as accounted by quota subsystem\n"; + cout << " user list list users\n"; + cout << " caps add add user capabilities\n"; + cout << " caps rm remove user capabilities\n"; + cout << " subuser create create a new subuser\n" ; + cout << " subuser modify modify subuser\n"; + cout << " subuser rm remove subuser\n"; + cout << " key create create access key\n"; + cout << " key rm remove access key\n"; + cout << " bucket list list buckets (specify --allow-unordered for\n"; + cout << " faster, unsorted listing)\n"; + cout << " 
bucket limit check show bucket sharding stats\n"; + cout << " bucket link link bucket to specified user\n"; + cout << " bucket unlink unlink bucket from specified user\n"; + cout << " bucket stats returns bucket statistics\n"; + cout << " bucket rm remove bucket\n"; + cout << " bucket check check bucket index by verifying size and object count stats\n"; + cout << " bucket check olh check for olh index entries and objects that are pending removal\n"; + cout << " bucket check unlinked check for object versions that are not visible in a bucket listing \n"; + cout << " bucket chown link bucket to specified user and update its object ACLs\n"; + cout << " bucket reshard reshard bucket\n"; + cout << " bucket rewrite rewrite all objects in the specified bucket\n"; + cout << " bucket sync checkpoint poll a bucket's sync status until it catches up to its remote\n"; + cout << " bucket sync disable disable bucket sync\n"; + cout << " bucket sync enable enable bucket sync\n"; + cout << " bucket radoslist list rados objects backing bucket's objects\n"; + cout << " bi get retrieve bucket index object entries\n"; + cout << " bi put store bucket index object entries\n"; + cout << " bi list list raw bucket index entries\n"; + cout << " bi purge purge bucket index entries\n"; + cout << " object rm remove object\n"; + cout << " object put put object\n"; + cout << " object stat stat an object for its metadata\n"; + cout << " object unlink unlink object from bucket index\n"; + cout << " object rewrite rewrite the specified object\n"; + cout << " object reindex reindex the object(s) indicated by --bucket and either --object or --objects-file\n"; + cout << " objects expire run expired objects cleanup\n"; + cout << " objects expire-stale list list stale expired objects (caused by reshard)\n"; + cout << " objects expire-stale rm remove stale expired objects\n"; + cout << " period rm remove a period\n"; + cout << " period get get period info\n"; + cout << " period get-current get current 
period info\n"; + cout << " period pull pull a period\n"; + cout << " period push push a period\n"; + cout << " period list list all periods\n"; + cout << " period update update the staging period\n"; + cout << " period commit commit the staging period\n"; + cout << " quota set set quota params\n"; + cout << " quota enable enable quota\n"; + cout << " quota disable disable quota\n"; + cout << " ratelimit get get ratelimit params\n"; + cout << " ratelimit set set ratelimit params\n"; + cout << " ratelimit enable enable ratelimit\n"; + cout << " ratelimit disable disable ratelimit\n"; + cout << " global quota get view global quota params\n"; + cout << " global quota set set global quota params\n"; + cout << " global quota enable enable a global quota\n"; + cout << " global quota disable disable a global quota\n"; + cout << " global ratelimit get view global ratelimit params\n"; + cout << " global ratelimit set set global ratelimit params\n"; + cout << " global ratelimit enable enable a ratelimit quota\n"; + cout << " global ratelimit disable disable a ratelimit quota\n"; + cout << " realm create create a new realm\n"; + cout << " realm rm remove a realm\n"; + cout << " realm get show realm info\n"; + cout << " realm get-default get default realm name\n"; + cout << " realm list list realms\n"; + cout << " realm list-periods list all realm periods\n"; + cout << " realm rename rename a realm\n"; + cout << " realm set set realm info (requires infile)\n"; + cout << " realm default set realm as default\n"; + cout << " realm pull pull a realm and its current period\n"; + cout << " zonegroup add add a zone to a zonegroup\n"; + cout << " zonegroup create create a new zone group info\n"; + cout << " zonegroup default set default zone group\n"; + cout << " zonegroup delete delete a zone group info\n"; + cout << " zonegroup get show zone group info\n"; + cout << " zonegroup modify modify an existing zonegroup\n"; + cout << " zonegroup set set zone group info (requires 
infile)\n"; + cout << " zonegroup rm remove a zone from a zonegroup\n"; + cout << " zonegroup rename rename a zone group\n"; + cout << " zonegroup list list all zone groups set on this cluster\n"; + cout << " zonegroup placement list list zonegroup's placement targets\n"; + cout << " zonegroup placement get get a placement target of a specific zonegroup\n"; + cout << " zonegroup placement add add a placement target id to a zonegroup\n"; + cout << " zonegroup placement modify modify a placement target of a specific zonegroup\n"; + cout << " zonegroup placement rm remove a placement target from a zonegroup\n"; + cout << " zonegroup placement default set a zonegroup's default placement target\n"; + cout << " zone create create a new zone\n"; + cout << " zone rm remove a zone\n"; + cout << " zone get show zone cluster params\n"; + cout << " zone modify modify an existing zone\n"; + cout << " zone set set zone cluster params (requires infile)\n"; + cout << " zone list list all zones set on this cluster\n"; + cout << " zone rename rename a zone\n"; + cout << " zone placement list list zone's placement targets\n"; + cout << " zone placement get get a zone placement target\n"; + cout << " zone placement add add a zone placement target\n"; + cout << " zone placement modify modify a zone placement target\n"; + cout << " zone placement rm remove a zone placement target\n"; + cout << " metadata sync status get metadata sync status\n"; + cout << " metadata sync init init metadata sync\n"; + cout << " metadata sync run run metadata sync\n"; + cout << " data sync status get data sync status of the specified source zone\n"; + cout << " data sync init init data sync for the specified source zone\n"; + cout << " data sync run run data sync for the specified source zone\n"; + cout << " pool add add an existing pool for data placement\n"; + cout << " pool rm remove an existing pool from data placement set\n"; + cout << " pools list list placement active set\n"; + cout << " policy read 
bucket/object policy\n"; + cout << " log list list log objects\n"; + cout << " log show dump a log from specific object or (bucket + date\n"; + cout << " + bucket-id)\n"; + cout << " (NOTE: required to specify formatting of date\n"; + cout << " to \"YYYY-MM-DD-hh\")\n"; + cout << " log rm remove log object\n"; + cout << " usage show show usage (by user, by bucket, date range)\n"; + cout << " usage trim trim usage (by user, by bucket, date range)\n"; + cout << " usage clear reset all the usage stats for the cluster\n"; + cout << " gc list dump expired garbage collection objects (specify\n"; + cout << " --include-all to list all entries, including unexpired)\n"; + cout << " gc process manually process garbage (specify\n"; + cout << " --include-all to process all entries, including unexpired)\n"; + cout << " lc list list all bucket lifecycle progress\n"; + cout << " lc get get a lifecycle bucket configuration\n"; + cout << " lc process manually process lifecycle\n"; + cout << " lc reshard fix fix LC for a resharded bucket\n"; + cout << " metadata get get metadata info\n"; + cout << " metadata put put metadata info\n"; + cout << " metadata rm remove metadata info\n"; + cout << " metadata list list metadata info\n"; + cout << " mdlog list list metadata log\n"; + cout << " mdlog autotrim auto trim metadata log\n"; + cout << " mdlog trim trim metadata log (use marker)\n"; + cout << " mdlog status read metadata log status\n"; + cout << " bilog list list bucket index log\n"; + cout << " bilog trim trim bucket index log (use start-marker, end-marker)\n"; + cout << " bilog status read bucket index log status\n"; + cout << " bilog autotrim auto trim bucket index log\n"; + cout << " datalog list list data log\n"; + cout << " datalog trim trim data log\n"; + cout << " datalog status read data log status\n"; + cout << " datalog type change datalog type to --log_type={fifo,omap}\n"; + cout << " orphans find deprecated -- init and run search for leaked rados objects (use job-id, 
pool)\n"; + cout << " orphans finish deprecated -- clean up search for leaked rados objects\n"; + cout << " orphans list-jobs deprecated -- list the current job-ids for orphans search\n"; + cout << " * the three 'orphans' sub-commands are now deprecated; consider using the `rgw-orphan-list` tool\n"; + cout << " role create create a AWS role for use with STS\n"; + cout << " role delete remove a role\n"; + cout << " role get get a role\n"; + cout << " role list list roles with specified path prefix\n"; + cout << " role-trust-policy modify modify the assume role policy of an existing role\n"; + cout << " role-policy put add/update permission policy to role\n"; + cout << " role-policy list list policies attached to a role\n"; + cout << " role-policy get get the specified inline policy document embedded with the given role\n"; + cout << " role-policy delete remove policy attached to a role\n"; + cout << " role update update max_session_duration of a role\n"; + cout << " reshard add schedule a resharding of a bucket\n"; + cout << " reshard list list all bucket resharding or scheduled to be resharded\n"; + cout << " reshard status read bucket resharding status\n"; + cout << " reshard process process of scheduled reshard jobs\n"; + cout << " reshard cancel cancel resharding a bucket\n"; + cout << " reshard stale-instances list list stale-instances from bucket resharding\n"; + cout << " reshard stale-instances delete cleanup stale-instances from bucket resharding\n"; + cout << " sync error list list sync error\n"; + cout << " sync error trim trim sync error\n"; + cout << " mfa create create a new MFA TOTP token\n"; + cout << " mfa list list MFA TOTP tokens\n"; + cout << " mfa get show MFA TOTP token\n"; + cout << " mfa remove delete MFA TOTP token\n"; + cout << " mfa check check MFA TOTP token\n"; + cout << " mfa resync re-sync MFA TOTP token\n"; + cout << " topic list list bucket notifications topics\n"; + cout << " topic get get a bucket notifications topic\n"; + cout << 
" topic rm remove a bucket notifications topic\n"; + cout << " script put upload a lua script to a context\n"; + cout << " script get get the lua script of a context\n"; + cout << " script rm remove the lua scripts of a context\n"; + cout << " script-package add add a lua package to the scripts allowlist\n"; + cout << " script-package rm remove a lua package from the scripts allowlist\n"; + cout << " script-package list get the lua packages allowlist\n"; + cout << " notification list list bucket notifications configuration\n"; + cout << " notification get get a bucket notifications configuration\n"; + cout << " notification rm remove a bucket notifications configuration\n"; + cout << "options:\n"; + cout << " --tenant= tenant name\n"; + cout << " --user_ns= namespace of user (oidc in case of users authenticated with oidc provider)\n"; + cout << " --uid= user id\n"; + cout << " --new-uid= new user id\n"; + cout << " --subuser= subuser name\n"; + cout << " --access-key= S3 access key\n"; + cout << " --email= user's email address\n"; + cout << " --secret/--secret-key=\n"; + cout << " specify secret key\n"; + cout << " --gen-access-key generate random access key (for S3)\n"; + cout << " --gen-secret generate random secret key\n"; + cout << " --key-type= key type, options are: swift, s3\n"; + cout << " --temp-url-key[-2]= temp url key\n"; + cout << " --access= Set access permissions for sub-user, should be one\n"; + cout << " of read, write, readwrite, full\n"; + cout << " --display-name= user's display name\n"; + cout << " --max-buckets max number of buckets for a user\n"; + cout << " --admin set the admin flag on the user\n"; + cout << " --system set the system flag on the user\n"; + cout << " --op-mask set the op mask on the user\n"; + cout << " --bucket= Specify the bucket name. Also used by the quota command.\n"; + cout << " --pool= Specify the pool name. 
Also used to scan for leaked rados objects.\n"; + cout << " --object= object name\n"; + cout << " --objects-file= file containing a list of object names to process\n"; + cout << " --object-version= object version\n"; + cout << " --date= date in the format yyyy-mm-dd\n"; + cout << " --start-date= start date in the format yyyy-mm-dd\n"; + cout << " --end-date= end date in the format yyyy-mm-dd\n"; + cout << " --bucket-id= bucket id\n"; + cout << " --bucket-new-name=\n"; + cout << " for bucket link: optional new name\n"; + cout << " --shard-id= optional for: \n"; + cout << " mdlog list\n"; + cout << " data sync status\n"; + cout << " required for: \n"; + cout << " mdlog trim\n"; + cout << " --gen= optional for: \n"; + cout << " bilog list\n"; + cout << " bilog trim\n"; + cout << " bilog status\n"; + cout << " --max-entries= max entries for listing operations\n"; + cout << " --metadata-key= key to retrieve metadata from with metadata get\n"; + cout << " --remote= zone or zonegroup id of remote gateway\n"; + cout << " --period= period id\n"; + cout << " --url= url for pushing/pulling period/realm\n"; + cout << " --epoch= period epoch\n"; + cout << " --commit commit the period during 'period update'\n"; + cout << " --staging get staging period info\n"; + cout << " --master set as master\n"; + cout << " --master-zone= master zone id\n"; + cout << " --rgw-realm= realm name\n"; + cout << " --realm-id= realm id\n"; + cout << " --realm-new-name= realm new name\n"; + cout << " --rgw-zonegroup= zonegroup name\n"; + cout << " --zonegroup-id= zonegroup id\n"; + cout << " --zonegroup-new-name=\n"; + cout << " zonegroup new name\n"; + cout << " --rgw-zone= name of zone in which radosgw is running\n"; + cout << " --zone-id= zone id\n"; + cout << " --zone-new-name= zone new name\n"; + cout << " --source-zone specify the source zone (for data sync)\n"; + cout << " --default set entity (realm, zonegroup, zone) as default\n"; + cout << " --read-only set zone as read-only (when adding to 
zonegroup)\n"; + cout << " --redirect-zone specify zone id to redirect when response is 404 (not found)\n"; + cout << " --placement-id placement id for zonegroup placement commands\n"; + cout << " --storage-class storage class for zonegroup placement commands\n"; + cout << " --tags= list of tags for zonegroup placement add and modify commands\n"; + cout << " --tags-add= list of tags to add for zonegroup placement modify command\n"; + cout << " --tags-rm= list of tags to remove for zonegroup placement modify command\n"; + cout << " --endpoints= zone endpoints\n"; + cout << " --index-pool= placement target index pool\n"; + cout << " --data-pool= placement target data pool\n"; + cout << " --data-extra-pool= placement target data extra (non-ec) pool\n"; + cout << " --placement-index-type=\n"; + cout << " placement target index type (normal, indexless, or #id)\n"; + cout << " --placement-inline-data=\n"; + cout << " set whether the placement target is configured to store a data\n"; + cout << " chunk inline in head objects\n"; + cout << " --compression= placement target compression type (plugin name or empty/none)\n"; + cout << " --tier-type= zone tier type\n"; + cout << " --tier-config==[,...]\n"; + cout << " set zone tier config keys, values\n"; + cout << " --tier-config-rm=[,...]\n"; + cout << " unset zone tier config keys\n"; + cout << " --sync-from-all[=false] set/reset whether zone syncs from all zonegroup peers\n"; + cout << " --sync-from=[zone-name][,...]\n"; + cout << " set list of zones to sync from\n"; + cout << " --sync-from-rm=[zone-name][,...]\n"; + cout << " remove zones from list of zones to sync from\n"; + cout << " --bucket-index-max-shards override a zone/zonegroup's default bucket index shard count\n"; + cout << " --fix besides checking bucket index, will also fix it\n"; + cout << " --check-objects bucket check: rebuilds bucket index according to\n"; + cout << " actual objects state\n"; + cout << " --format= specify output format for certain 
operations: xml,\n"; + cout << " json\n"; + cout << " --purge-data when specified, user removal will also purge all the\n"; + cout << " user data\n"; + cout << " --purge-keys when specified, subuser removal will also purge all the\n"; + cout << " subuser keys\n"; + cout << " --purge-objects remove a bucket's objects before deleting it\n"; + cout << " (NOTE: required to delete a non-empty bucket)\n"; + cout << " --sync-stats option to 'user stats', update user stats with current\n"; + cout << " stats reported by user's buckets indexes\n"; + cout << " --reset-stats option to 'user stats', reset stats in accordance with user buckets\n"; + cout << " --show-config show configuration\n"; + cout << " --show-log-entries= enable/disable dump of log entries on log show\n"; + cout << " --show-log-sum= enable/disable dump of log summation on log show\n"; + cout << " --skip-zero-entries log show only dumps entries that don't have zero value\n"; + cout << " in one of the numeric field\n"; + cout << " --infile= file to read in when setting data\n"; + cout << " --categories= comma separated list of categories, used in usage show\n"; + cout << " --caps= list of caps (e.g., \"usage=read, write; user=read\")\n"; + cout << " --op-mask= permission of user's operations (e.g., \"read, write, delete, *\")\n"; + cout << " --yes-i-really-mean-it required for certain operations\n"; + cout << " --warnings-only when specified with bucket limit check, list\n"; + cout << " only buckets nearing or over the current max\n"; + cout << " objects per shard value\n"; + cout << " --bypass-gc when specified with bucket deletion, triggers\n"; + cout << " object deletions by not involving GC\n"; + cout << " --inconsistent-index when specified with bucket deletion and bypass-gc set to true,\n"; + cout << " ignores bucket index consistency\n"; + cout << " --min-rewrite-size min object size for bucket rewrite (default 4M)\n"; + cout << " --max-rewrite-size max object size for bucket rewrite (default 
ULLONG_MAX)\n"; + cout << " --min-rewrite-stripe-size min stripe size for object rewrite (default 0)\n"; + cout << " --trim-delay-ms time interval in msec to limit the frequency of sync error log entries trimming operations,\n"; + cout << " the trimming process will sleep the specified msec for every 1000 entries trimmed\n"; + cout << " --max-concurrent-ios maximum concurrent ios for bucket operations (default: 32)\n"; + cout << " --enable-feature enable a zone/zonegroup feature\n"; + cout << " --disable-feature disable a zone/zonegroup feature\n"; + cout << "\n"; + cout << " := \"YYYY-MM-DD[ hh:mm:ss]\"\n"; + cout << "\nQuota options:\n"; + cout << " --max-objects specify max objects (negative value to disable)\n"; + cout << " --max-size specify max size (in B/K/M/G/T, negative value to disable)\n"; + cout << " --quota-scope scope of quota (bucket, user)\n"; + cout << "\nRate limiting options:\n"; + cout << " --max-read-ops specify max requests per minute for READ ops per RGW (GET and HEAD request methods), 0 means unlimited\n"; + cout << " --max-read-bytes specify max bytes per minute for READ ops per RGW (GET and HEAD request methods), 0 means unlimited\n"; + cout << " --max-write-ops specify max requests per minute for WRITE ops per RGW (Not GET or HEAD request methods), 0 means unlimited\n"; + cout << " --max-write-bytes specify max bytes per minute for WRITE ops per RGW (Not GET or HEAD request methods), 0 means unlimited\n"; + cout << " --ratelimit-scope scope of rate limiting: bucket, user, anonymous\n"; + cout << " anonymous can be configured only with global rate limit\n"; + cout << "\nOrphans search options:\n"; + cout << " --num-shards num of shards to use for keeping the temporary scan info\n"; + cout << " --orphan-stale-secs num of seconds to wait before declaring an object to be an orphan (default: 86400)\n"; + cout << " --job-id set the job id (for orphans find)\n"; + cout << " --detail detailed mode, log and stat head objects as well\n"; + cout << 
"\nOrphans list-jobs options:\n"; + cout << " --extra-info provide extra info in job list\n"; + cout << "\nRole options:\n"; + cout << " --role-name name of the role to create\n"; + cout << " --path path to the role\n"; + cout << " --assume-role-policy-doc the trust relationship policy document that grants an entity permission to assume the role\n"; + cout << " --policy-name name of the policy document\n"; + cout << " --policy-doc permission policy document\n"; + cout << " --path-prefix path prefix for filtering roles\n"; + cout << "\nMFA options:\n"; + cout << " --totp-serial a string that represents the ID of a TOTP token\n"; + cout << " --totp-seed the secret seed that is used to calculate the TOTP\n"; + cout << " --totp-seconds the time resolution that is being used for TOTP generation\n"; + cout << " --totp-window the number of TOTP tokens that are checked before and after the current token when validating token\n"; + cout << " --totp-pin the valid value of a TOTP token at a certain time\n"; + cout << "\nBucket notifications options:\n"; + cout << " --topic bucket notifications topic name\n"; + cout << " --notification-id bucket notifications id\n"; + cout << "\nScript options:\n"; + cout << " --context context in which the script runs. 
one of: "+LUA_CONTEXT_LIST+"\n"; + cout << " --package name of the lua package that should be added/removed to/from the allowlist\n"; + cout << " --allow-compilation package is allowed to compile C code as part of its installation\n"; + cout << "\nBucket check olh/unlinked options:\n"; + cout << " --min-age-hours minimum age of unlinked objects to consider for bucket check unlinked (default: 1)\n"; + cout << " --dump-keys when specified, all keys identified as problematic are printed to stdout\n"; + cout << " --hide-progress when specified, per-shard progress details are not printed to stderr\n"; + cout << "\nradoslist options:\n"; + cout << " --rgw-obj-fs the field separator that will separate the rados\n"; + cout << " object name from the rgw object name;\n"; + cout << " additionally rados objects for incomplete\n"; + cout << " multipart uploads will not be output\n"; + cout << "\n"; + generic_client_usage(); +} + + +class SimpleCmd { +public: + struct Def { + string cmd; + std::any opt; + }; + + using Aliases = std::vector >; + using Commands = std::vector; + +private: + struct Node { + map next; + set expected; /* separate un-normalized list */ + std::any opt; + }; + + Node cmd_root; + map alias_map; + + string normalize_alias(const string& s) const { + auto iter = alias_map.find(s); + if (iter == alias_map.end()) { + return s; + } + + return iter->second; + } + void init_alias_map(Aliases& aliases) { + for (auto& alias_set : aliases) { + std::optional first; + + for (auto& alias : alias_set) { + if (!first) { + first = alias; + } else { + alias_map[alias] = *first; + } + } + } + } + + bool gen_next_expected(Node *node, vector *expected, bool ret) { + for (auto& next_cmd : node->expected) { + expected->push_back(next_cmd); + } + return ret; + } + + Node root; + +public: + SimpleCmd() {} + + SimpleCmd(std::optional cmds, + std::optional aliases) { + if (aliases) { + add_aliases(*aliases); + } + + if (cmds) { + add_commands(*cmds); + } + } + + void 
add_aliases(Aliases& aliases) { + init_alias_map(aliases); + } + + void add_commands(std::vector& cmds) { + for (auto& cmd : cmds) { + vector words; + get_str_vec(cmd.cmd, " ", words); + + auto node = &cmd_root; + for (auto& word : words) { + auto norm = normalize_alias(word); + auto parent = node; + + node->expected.insert(word); + + node = &node->next[norm]; + + if (norm == "[*]") { /* optional param at the end */ + parent->next["*"] = *node; /* can be also looked up by '*' */ + parent->opt = cmd.opt; + } + } + + node->opt = cmd.opt; + } + } + + template + bool find_command(Container& args, + std::any *opt_cmd, + vector *extra_args, + string *error, + vector *expected) { + auto node = &cmd_root; + + std::optional found_opt; + + for (auto& arg : args) { + string norm = normalize_alias(arg); + auto iter = node->next.find(norm); + if (iter == node->next.end()) { + iter = node->next.find("*"); + if (iter == node->next.end()) { + *error = string("ERROR: Unrecognized argument: '") + arg + "'"; + return gen_next_expected(node, expected, false); + } + extra_args->push_back(arg); + if (!found_opt) { + found_opt = node->opt; + } + } + node = &(iter->second); + } + + *opt_cmd = found_opt.value_or(node->opt); + + if (!opt_cmd->has_value()) { + *error ="ERROR: Unknown command"; + return gen_next_expected(node, expected, false); + } + + return true; + } +}; + + +namespace rgw_admin { + +enum class OPT { + NO_CMD, + USER_CREATE, + USER_INFO, + USER_MODIFY, + USER_RENAME, + USER_RM, + USER_SUSPEND, + USER_ENABLE, + USER_CHECK, + USER_STATS, + USER_LIST, + SUBUSER_CREATE, + SUBUSER_MODIFY, + SUBUSER_RM, + KEY_CREATE, + KEY_RM, + BUCKETS_LIST, + BUCKET_LIMIT_CHECK, + BUCKET_LINK, + BUCKET_UNLINK, + BUCKET_LAYOUT, + BUCKET_STATS, + BUCKET_CHECK, + BUCKET_CHECK_OLH, + BUCKET_CHECK_UNLINKED, + BUCKET_SYNC_CHECKPOINT, + BUCKET_SYNC_INFO, + BUCKET_SYNC_STATUS, + BUCKET_SYNC_MARKERS, + BUCKET_SYNC_INIT, + BUCKET_SYNC_RUN, + BUCKET_SYNC_DISABLE, + BUCKET_SYNC_ENABLE, + BUCKET_RM, + 
BUCKET_REWRITE, + BUCKET_RESHARD, + BUCKET_CHOWN, + BUCKET_RADOS_LIST, + BUCKET_SHARD_OBJECTS, + BUCKET_OBJECT_SHARD, + BUCKET_RESYNC_ENCRYPTED_MULTIPART, + POLICY, + POOL_ADD, + POOL_RM, + POOLS_LIST, + LOG_LIST, + LOG_SHOW, + LOG_RM, + USAGE_SHOW, + USAGE_TRIM, + USAGE_CLEAR, + OBJECT_PUT, + OBJECT_RM, + OBJECT_UNLINK, + OBJECT_STAT, + OBJECT_REWRITE, + OBJECT_REINDEX, + OBJECTS_EXPIRE, + OBJECTS_EXPIRE_STALE_LIST, + OBJECTS_EXPIRE_STALE_RM, + BI_GET, + BI_PUT, + BI_LIST, + BI_PURGE, + OLH_GET, + OLH_READLOG, + QUOTA_SET, + QUOTA_ENABLE, + QUOTA_DISABLE, + GC_LIST, + GC_PROCESS, + LC_LIST, + LC_GET, + LC_PROCESS, + LC_RESHARD_FIX, + ORPHANS_FIND, + ORPHANS_FINISH, + ORPHANS_LIST_JOBS, + RATELIMIT_GET, + RATELIMIT_SET, + RATELIMIT_ENABLE, + RATELIMIT_DISABLE, + ZONEGROUP_ADD, + ZONEGROUP_CREATE, + ZONEGROUP_DEFAULT, + ZONEGROUP_DELETE, + ZONEGROUP_GET, + ZONEGROUP_MODIFY, + ZONEGROUP_SET, + ZONEGROUP_LIST, + ZONEGROUP_REMOVE, + ZONEGROUP_RENAME, + ZONEGROUP_PLACEMENT_ADD, + ZONEGROUP_PLACEMENT_MODIFY, + ZONEGROUP_PLACEMENT_RM, + ZONEGROUP_PLACEMENT_LIST, + ZONEGROUP_PLACEMENT_GET, + ZONEGROUP_PLACEMENT_DEFAULT, + ZONE_CREATE, + ZONE_DELETE, + ZONE_GET, + ZONE_MODIFY, + ZONE_SET, + ZONE_LIST, + ZONE_RENAME, + ZONE_DEFAULT, + ZONE_PLACEMENT_ADD, + ZONE_PLACEMENT_MODIFY, + ZONE_PLACEMENT_RM, + ZONE_PLACEMENT_LIST, + ZONE_PLACEMENT_GET, + CAPS_ADD, + CAPS_RM, + METADATA_GET, + METADATA_PUT, + METADATA_RM, + METADATA_LIST, + METADATA_SYNC_STATUS, + METADATA_SYNC_INIT, + METADATA_SYNC_RUN, + MDLOG_LIST, + MDLOG_AUTOTRIM, + MDLOG_TRIM, + MDLOG_FETCH, + MDLOG_STATUS, + SYNC_ERROR_LIST, + SYNC_ERROR_TRIM, + SYNC_GROUP_CREATE, + SYNC_GROUP_MODIFY, + SYNC_GROUP_GET, + SYNC_GROUP_REMOVE, + SYNC_GROUP_FLOW_CREATE, + SYNC_GROUP_FLOW_REMOVE, + SYNC_GROUP_PIPE_CREATE, + SYNC_GROUP_PIPE_MODIFY, + SYNC_GROUP_PIPE_REMOVE, + SYNC_POLICY_GET, + BILOG_LIST, + BILOG_TRIM, + BILOG_STATUS, + BILOG_AUTOTRIM, + DATA_SYNC_STATUS, + DATA_SYNC_INIT, + DATA_SYNC_RUN, + DATALOG_LIST, + 
DATALOG_STATUS, + DATALOG_AUTOTRIM, + DATALOG_TRIM, + DATALOG_TYPE, + DATALOG_PRUNE, + REALM_CREATE, + REALM_DELETE, + REALM_GET, + REALM_GET_DEFAULT, + REALM_LIST, + REALM_LIST_PERIODS, + REALM_RENAME, + REALM_SET, + REALM_DEFAULT, + REALM_PULL, + PERIOD_DELETE, + PERIOD_GET, + PERIOD_GET_CURRENT, + PERIOD_PULL, + PERIOD_PUSH, + PERIOD_LIST, + PERIOD_UPDATE, + PERIOD_COMMIT, + GLOBAL_QUOTA_GET, + GLOBAL_QUOTA_SET, + GLOBAL_QUOTA_ENABLE, + GLOBAL_QUOTA_DISABLE, + GLOBAL_RATELIMIT_GET, + GLOBAL_RATELIMIT_SET, + GLOBAL_RATELIMIT_ENABLE, + GLOBAL_RATELIMIT_DISABLE, + SYNC_INFO, + SYNC_STATUS, + ROLE_CREATE, + ROLE_DELETE, + ROLE_GET, + ROLE_TRUST_POLICY_MODIFY, + ROLE_LIST, + ROLE_POLICY_PUT, + ROLE_POLICY_LIST, + ROLE_POLICY_GET, + ROLE_POLICY_DELETE, + ROLE_UPDATE, + RESHARD_ADD, + RESHARD_LIST, + RESHARD_STATUS, + RESHARD_PROCESS, + RESHARD_CANCEL, + MFA_CREATE, + MFA_REMOVE, + MFA_GET, + MFA_LIST, + MFA_CHECK, + MFA_RESYNC, + RESHARD_STALE_INSTANCES_LIST, + RESHARD_STALE_INSTANCES_DELETE, + PUBSUB_TOPIC_LIST, + PUBSUB_TOPIC_GET, + PUBSUB_TOPIC_RM, + PUBSUB_NOTIFICATION_LIST, + PUBSUB_NOTIFICATION_GET, + PUBSUB_NOTIFICATION_RM, + SCRIPT_PUT, + SCRIPT_GET, + SCRIPT_RM, + SCRIPT_PACKAGE_ADD, + SCRIPT_PACKAGE_RM, + SCRIPT_PACKAGE_LIST +}; + +} + +using namespace rgw_admin; + +static SimpleCmd::Commands all_cmds = { + { "user create", OPT::USER_CREATE }, + { "user info", OPT::USER_INFO }, + { "user modify", OPT::USER_MODIFY }, + { "user rename", OPT::USER_RENAME }, + { "user rm", OPT::USER_RM }, + { "user suspend", OPT::USER_SUSPEND }, + { "user enable", OPT::USER_ENABLE }, + { "user check", OPT::USER_CHECK }, + { "user stats", OPT::USER_STATS }, + { "user list", OPT::USER_LIST }, + { "subuser create", OPT::SUBUSER_CREATE }, + { "subuser modify", OPT::SUBUSER_MODIFY }, + { "subuser rm", OPT::SUBUSER_RM }, + { "key create", OPT::KEY_CREATE }, + { "key rm", OPT::KEY_RM }, + { "buckets list", OPT::BUCKETS_LIST }, + { "bucket list", OPT::BUCKETS_LIST }, + { "bucket limit 
check", OPT::BUCKET_LIMIT_CHECK }, + { "bucket link", OPT::BUCKET_LINK }, + { "bucket unlink", OPT::BUCKET_UNLINK }, + { "bucket layout", OPT::BUCKET_LAYOUT }, + { "bucket stats", OPT::BUCKET_STATS }, + { "bucket check", OPT::BUCKET_CHECK }, + { "bucket check olh", OPT::BUCKET_CHECK_OLH }, + { "bucket check unlinked", OPT::BUCKET_CHECK_UNLINKED }, + { "bucket sync checkpoint", OPT::BUCKET_SYNC_CHECKPOINT }, + { "bucket sync info", OPT::BUCKET_SYNC_INFO }, + { "bucket sync status", OPT::BUCKET_SYNC_STATUS }, + { "bucket sync markers", OPT::BUCKET_SYNC_MARKERS }, + { "bucket sync init", OPT::BUCKET_SYNC_INIT }, + { "bucket sync run", OPT::BUCKET_SYNC_RUN }, + { "bucket sync disable", OPT::BUCKET_SYNC_DISABLE }, + { "bucket sync enable", OPT::BUCKET_SYNC_ENABLE }, + { "bucket rm", OPT::BUCKET_RM }, + { "bucket rewrite", OPT::BUCKET_REWRITE }, + { "bucket reshard", OPT::BUCKET_RESHARD }, + { "bucket chown", OPT::BUCKET_CHOWN }, + { "bucket radoslist", OPT::BUCKET_RADOS_LIST }, + { "bucket rados list", OPT::BUCKET_RADOS_LIST }, + { "bucket shard objects", OPT::BUCKET_SHARD_OBJECTS }, + { "bucket shard object", OPT::BUCKET_SHARD_OBJECTS }, + { "bucket object shard", OPT::BUCKET_OBJECT_SHARD }, + { "bucket resync encrypted multipart", OPT::BUCKET_RESYNC_ENCRYPTED_MULTIPART }, + { "policy", OPT::POLICY }, + { "pool add", OPT::POOL_ADD }, + { "pool rm", OPT::POOL_RM }, + { "pool list", OPT::POOLS_LIST }, + { "pools list", OPT::POOLS_LIST }, + { "log list", OPT::LOG_LIST }, + { "log show", OPT::LOG_SHOW }, + { "log rm", OPT::LOG_RM }, + { "usage show", OPT::USAGE_SHOW }, + { "usage trim", OPT::USAGE_TRIM }, + { "usage clear", OPT::USAGE_CLEAR }, + { "object put", OPT::OBJECT_PUT }, + { "object rm", OPT::OBJECT_RM }, + { "object unlink", OPT::OBJECT_UNLINK }, + { "object stat", OPT::OBJECT_STAT }, + { "object rewrite", OPT::OBJECT_REWRITE }, + { "object reindex", OPT::OBJECT_REINDEX }, + { "objects expire", OPT::OBJECTS_EXPIRE }, + { "objects expire-stale list", 
OPT::OBJECTS_EXPIRE_STALE_LIST }, + { "objects expire-stale rm", OPT::OBJECTS_EXPIRE_STALE_RM }, + { "bi get", OPT::BI_GET }, + { "bi put", OPT::BI_PUT }, + { "bi list", OPT::BI_LIST }, + { "bi purge", OPT::BI_PURGE }, + { "olh get", OPT::OLH_GET }, + { "olh readlog", OPT::OLH_READLOG }, + { "quota set", OPT::QUOTA_SET }, + { "quota enable", OPT::QUOTA_ENABLE }, + { "quota disable", OPT::QUOTA_DISABLE }, + { "ratelimit get", OPT::RATELIMIT_GET }, + { "ratelimit set", OPT::RATELIMIT_SET }, + { "ratelimit enable", OPT::RATELIMIT_ENABLE }, + { "ratelimit disable", OPT::RATELIMIT_DISABLE }, + { "gc list", OPT::GC_LIST }, + { "gc process", OPT::GC_PROCESS }, + { "lc list", OPT::LC_LIST }, + { "lc get", OPT::LC_GET }, + { "lc process", OPT::LC_PROCESS }, + { "lc reshard fix", OPT::LC_RESHARD_FIX }, + { "orphans find", OPT::ORPHANS_FIND }, + { "orphans finish", OPT::ORPHANS_FINISH }, + { "orphans list jobs", OPT::ORPHANS_LIST_JOBS }, + { "orphans list-jobs", OPT::ORPHANS_LIST_JOBS }, + { "zonegroup add", OPT::ZONEGROUP_ADD }, + { "zonegroup create", OPT::ZONEGROUP_CREATE }, + { "zonegroup default", OPT::ZONEGROUP_DEFAULT }, + { "zonegroup delete", OPT::ZONEGROUP_DELETE }, + { "zonegroup get", OPT::ZONEGROUP_GET }, + { "zonegroup modify", OPT::ZONEGROUP_MODIFY }, + { "zonegroup set", OPT::ZONEGROUP_SET }, + { "zonegroup list", OPT::ZONEGROUP_LIST }, + { "zonegroups list", OPT::ZONEGROUP_LIST }, + { "zonegroup remove", OPT::ZONEGROUP_REMOVE }, + { "zonegroup remove zone", OPT::ZONEGROUP_REMOVE }, + { "zonegroup rename", OPT::ZONEGROUP_RENAME }, + { "zonegroup placement add", OPT::ZONEGROUP_PLACEMENT_ADD }, + { "zonegroup placement modify", OPT::ZONEGROUP_PLACEMENT_MODIFY }, + { "zonegroup placement rm", OPT::ZONEGROUP_PLACEMENT_RM }, + { "zonegroup placement list", OPT::ZONEGROUP_PLACEMENT_LIST }, + { "zonegroup placement get", OPT::ZONEGROUP_PLACEMENT_GET }, + { "zonegroup placement default", OPT::ZONEGROUP_PLACEMENT_DEFAULT }, + { "zone create", OPT::ZONE_CREATE }, + { 
"zone delete", OPT::ZONE_DELETE }, + { "zone get", OPT::ZONE_GET }, + { "zone modify", OPT::ZONE_MODIFY }, + { "zone set", OPT::ZONE_SET }, + { "zone list", OPT::ZONE_LIST }, + { "zones list", OPT::ZONE_LIST }, + { "zone rename", OPT::ZONE_RENAME }, + { "zone default", OPT::ZONE_DEFAULT }, + { "zone placement add", OPT::ZONE_PLACEMENT_ADD }, + { "zone placement modify", OPT::ZONE_PLACEMENT_MODIFY }, + { "zone placement rm", OPT::ZONE_PLACEMENT_RM }, + { "zone placement list", OPT::ZONE_PLACEMENT_LIST }, + { "zone placement get", OPT::ZONE_PLACEMENT_GET }, + { "caps add", OPT::CAPS_ADD }, + { "caps rm", OPT::CAPS_RM }, + { "metadata get [*]", OPT::METADATA_GET }, + { "metadata put [*]", OPT::METADATA_PUT }, + { "metadata rm [*]", OPT::METADATA_RM }, + { "metadata list [*]", OPT::METADATA_LIST }, + { "metadata sync status", OPT::METADATA_SYNC_STATUS }, + { "metadata sync init", OPT::METADATA_SYNC_INIT }, + { "metadata sync run", OPT::METADATA_SYNC_RUN }, + { "mdlog list", OPT::MDLOG_LIST }, + { "mdlog autotrim", OPT::MDLOG_AUTOTRIM }, + { "mdlog trim", OPT::MDLOG_TRIM }, + { "mdlog fetch", OPT::MDLOG_FETCH }, + { "mdlog status", OPT::MDLOG_STATUS }, + { "sync error list", OPT::SYNC_ERROR_LIST }, + { "sync error trim", OPT::SYNC_ERROR_TRIM }, + { "sync policy get", OPT::SYNC_POLICY_GET }, + { "sync group create", OPT::SYNC_GROUP_CREATE }, + { "sync group modify", OPT::SYNC_GROUP_MODIFY }, + { "sync group get", OPT::SYNC_GROUP_GET }, + { "sync group remove", OPT::SYNC_GROUP_REMOVE }, + { "sync group flow create", OPT::SYNC_GROUP_FLOW_CREATE }, + { "sync group flow remove", OPT::SYNC_GROUP_FLOW_REMOVE }, + { "sync group pipe create", OPT::SYNC_GROUP_PIPE_CREATE }, + { "sync group pipe modify", OPT::SYNC_GROUP_PIPE_MODIFY }, + { "sync group pipe remove", OPT::SYNC_GROUP_PIPE_REMOVE }, + { "bilog list", OPT::BILOG_LIST }, + { "bilog trim", OPT::BILOG_TRIM }, + { "bilog status", OPT::BILOG_STATUS }, + { "bilog autotrim", OPT::BILOG_AUTOTRIM }, + { "data sync status", 
OPT::DATA_SYNC_STATUS }, + { "data sync init", OPT::DATA_SYNC_INIT }, + { "data sync run", OPT::DATA_SYNC_RUN }, + { "datalog list", OPT::DATALOG_LIST }, + { "datalog status", OPT::DATALOG_STATUS }, + { "datalog autotrim", OPT::DATALOG_AUTOTRIM }, + { "datalog trim", OPT::DATALOG_TRIM }, + { "datalog type", OPT::DATALOG_TYPE }, + { "datalog prune", OPT::DATALOG_PRUNE }, + { "realm create", OPT::REALM_CREATE }, + { "realm rm", OPT::REALM_DELETE }, + { "realm get", OPT::REALM_GET }, + { "realm get default", OPT::REALM_GET_DEFAULT }, + { "realm get-default", OPT::REALM_GET_DEFAULT }, + { "realm list", OPT::REALM_LIST }, + { "realm list periods", OPT::REALM_LIST_PERIODS }, + { "realm list-periods", OPT::REALM_LIST_PERIODS }, + { "realm rename", OPT::REALM_RENAME }, + { "realm set", OPT::REALM_SET }, + { "realm default", OPT::REALM_DEFAULT }, + { "realm pull", OPT::REALM_PULL }, + { "period delete", OPT::PERIOD_DELETE }, + { "period get", OPT::PERIOD_GET }, + { "period get-current", OPT::PERIOD_GET_CURRENT }, + { "period get current", OPT::PERIOD_GET_CURRENT }, + { "period pull", OPT::PERIOD_PULL }, + { "period push", OPT::PERIOD_PUSH }, + { "period list", OPT::PERIOD_LIST }, + { "period update", OPT::PERIOD_UPDATE }, + { "period commit", OPT::PERIOD_COMMIT }, + { "global quota get", OPT::GLOBAL_QUOTA_GET }, + { "global quota set", OPT::GLOBAL_QUOTA_SET }, + { "global quota enable", OPT::GLOBAL_QUOTA_ENABLE }, + { "global quota disable", OPT::GLOBAL_QUOTA_DISABLE }, + { "global ratelimit get", OPT::GLOBAL_RATELIMIT_GET }, + { "global ratelimit set", OPT::GLOBAL_RATELIMIT_SET }, + { "global ratelimit enable", OPT::GLOBAL_RATELIMIT_ENABLE }, + { "global ratelimit disable", OPT::GLOBAL_RATELIMIT_DISABLE }, + { "sync info", OPT::SYNC_INFO }, + { "sync status", OPT::SYNC_STATUS }, + { "role create", OPT::ROLE_CREATE }, + { "role delete", OPT::ROLE_DELETE }, + { "role get", OPT::ROLE_GET }, + { "role-trust-policy modify", OPT::ROLE_TRUST_POLICY_MODIFY }, + { "role list", 
OPT::ROLE_LIST }, + { "role policy put", OPT::ROLE_POLICY_PUT }, + { "role-policy put", OPT::ROLE_POLICY_PUT }, + { "role policy list", OPT::ROLE_POLICY_LIST }, + { "role-policy list", OPT::ROLE_POLICY_LIST }, + { "role policy get", OPT::ROLE_POLICY_GET }, + { "role-policy get", OPT::ROLE_POLICY_GET }, + { "role policy delete", OPT::ROLE_POLICY_DELETE }, + { "role-policy delete", OPT::ROLE_POLICY_DELETE }, + { "role update", OPT::ROLE_UPDATE }, + { "reshard bucket", OPT::BUCKET_RESHARD }, + { "reshard add", OPT::RESHARD_ADD }, + { "reshard list", OPT::RESHARD_LIST }, + { "reshard status", OPT::RESHARD_STATUS }, + { "reshard process", OPT::RESHARD_PROCESS }, + { "reshard cancel", OPT::RESHARD_CANCEL }, + { "mfa create", OPT::MFA_CREATE }, + { "mfa remove", OPT::MFA_REMOVE }, + { "mfa get", OPT::MFA_GET }, + { "mfa list", OPT::MFA_LIST }, + { "mfa check", OPT::MFA_CHECK }, + { "mfa resync", OPT::MFA_RESYNC }, + { "reshard stale-instances list", OPT::RESHARD_STALE_INSTANCES_LIST }, + { "reshard stale list", OPT::RESHARD_STALE_INSTANCES_LIST }, + { "reshard stale-instances delete", OPT::RESHARD_STALE_INSTANCES_DELETE }, + { "reshard stale delete", OPT::RESHARD_STALE_INSTANCES_DELETE }, + { "topic list", OPT::PUBSUB_TOPIC_LIST }, + { "topic get", OPT::PUBSUB_TOPIC_GET }, + { "topic rm", OPT::PUBSUB_TOPIC_RM }, + { "notification list", OPT::PUBSUB_NOTIFICATION_LIST }, + { "notification get", OPT::PUBSUB_NOTIFICATION_GET }, + { "notification rm", OPT::PUBSUB_NOTIFICATION_RM }, + { "script put", OPT::SCRIPT_PUT }, + { "script get", OPT::SCRIPT_GET }, + { "script rm", OPT::SCRIPT_RM }, + { "script-package add", OPT::SCRIPT_PACKAGE_ADD }, + { "script-package rm", OPT::SCRIPT_PACKAGE_RM }, + { "script-package list", OPT::SCRIPT_PACKAGE_LIST }, +}; + +static SimpleCmd::Aliases cmd_aliases = { + { "delete", "del" }, + { "remove", "rm" }, + { "rename", "mv" }, +}; + + + +BIIndexType get_bi_index_type(const string& type_str) { + if (type_str == "plain") + return 
BIIndexType::Plain; + if (type_str == "instance") + return BIIndexType::Instance; + if (type_str == "olh") + return BIIndexType::OLH; + + return BIIndexType::Invalid; +} + +log_type get_log_type(const string& type_str) { + if (strcasecmp(type_str.c_str(), "fifo") == 0) + return log_type::fifo; + if (strcasecmp(type_str.c_str(), "omap") == 0) + return log_type::omap; + + return static_cast(0xff); +} + +void dump_bi_entry(bufferlist& bl, BIIndexType index_type, Formatter *formatter) +{ + auto iter = bl.cbegin(); + switch (index_type) { + case BIIndexType::Plain: + case BIIndexType::Instance: + { + rgw_bucket_dir_entry entry; + decode(entry, iter); + encode_json("entry", entry, formatter); + } + break; + case BIIndexType::OLH: + { + rgw_bucket_olh_entry entry; + decode(entry, iter); + encode_json("entry", entry, formatter); + } + break; + default: + ceph_abort(); + break; + } +} + +static void show_user_info(RGWUserInfo& info, Formatter *formatter) +{ + encode_json("user_info", info, formatter); + formatter->flush(cout); + cout << std::endl; +} + +static void show_perm_policy(string perm_policy, Formatter* formatter) +{ + formatter->open_object_section("role"); + formatter->dump_string("Permission policy", perm_policy); + formatter->close_section(); + formatter->flush(cout); +} + +static void show_policy_names(std::vector policy_names, Formatter* formatter) +{ + formatter->open_array_section("PolicyNames"); + for (const auto& it : policy_names) { + formatter->dump_string("policyname", it); + } + formatter->close_section(); + formatter->flush(cout); +} + +static void show_role_info(rgw::sal::RGWRole* role, Formatter* formatter) +{ + formatter->open_object_section("role"); + role->dump(formatter); + formatter->close_section(); + formatter->flush(cout); +} + +static void show_roles_info(vector>& roles, Formatter* formatter) +{ + formatter->open_array_section("Roles"); + for (const auto& it : roles) { + formatter->open_object_section("role"); + it->dump(formatter); + 
formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); +} + +static void show_reshard_status( + const list& status, Formatter *formatter) +{ + formatter->open_array_section("status"); + for (const auto& entry : status) { + formatter->open_object_section("entry"); + formatter->dump_string("reshard_status", to_string(entry.reshard_status)); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); +} + +class StoreDestructor { + rgw::sal::Driver* driver; +public: + explicit StoreDestructor(rgw::sal::Driver* _s) : driver(_s) {} + ~StoreDestructor() { + DriverManager::close_storage(driver); + rgw_http_client_cleanup(); + } +}; + +static int init_bucket(rgw::sal::User* user, const rgw_bucket& b, + std::unique_ptr* bucket) +{ + return driver->get_bucket(dpp(), user, b, bucket, null_yield); +} + +static int init_bucket(rgw::sal::User* user, + const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + std::unique_ptr* bucket) +{ + rgw_bucket b{tenant_name, bucket_name, bucket_id}; + return init_bucket(user, b, bucket); +} + +static int read_input(const string& infile, bufferlist& bl) +{ + int fd = 0; + if (infile.size()) { + fd = open(infile.c_str(), O_RDONLY); + if (fd < 0) { + int err = -errno; + cerr << "error reading input file " << infile << std::endl; + return err; + } + } + +#define READ_CHUNK 8196 + int r; + int err; + + do { + char buf[READ_CHUNK]; + + r = safe_read(fd, buf, READ_CHUNK); + if (r < 0) { + err = -errno; + cerr << "error while reading input" << std::endl; + goto out; + } + bl.append(buf, r); + } while (r > 0); + err = 0; + + out: + if (infile.size()) { + close(fd); + } + return err; +} + +template +static int read_decode_json(const string& infile, T& t) +{ + bufferlist bl; + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + return ret; + } + JSONParser p; + if (!p.parse(bl.c_str(), 
bl.length())) { + cout << "failed to parse JSON" << std::endl; + return -EINVAL; + } + + try { + decode_json_obj(t, &p); + } catch (const JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -EINVAL; + } + return 0; +} + +template +static int read_decode_json(const string& infile, T& t, K *k) +{ + bufferlist bl; + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + return ret; + } + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + cout << "failed to parse JSON" << std::endl; + return -EINVAL; + } + + try { + t.decode_json(&p, k); + } catch (const JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -EINVAL; + } + return 0; +} + +template +static bool decode_dump(const char *field_name, bufferlist& bl, Formatter *f) +{ + T t; + + auto iter = bl.cbegin(); + + try { + decode(t, iter); + } catch (buffer::error& err) { + return false; + } + + encode_json(field_name, t, f); + + return true; +} + +static bool dump_string(const char *field_name, bufferlist& bl, Formatter *f) +{ + string val = bl.to_str(); + f->dump_string(field_name, val.c_str() /* hide encoded null termination chars */); + + return true; +} + +bool set_ratelimit_info(RGWRateLimitInfo& ratelimit, OPT opt_cmd, int64_t max_read_ops, int64_t max_write_ops, + int64_t max_read_bytes, int64_t max_write_bytes, + bool have_max_read_ops, bool have_max_write_ops, + bool have_max_read_bytes, bool have_max_write_bytes) +{ + bool ratelimit_configured = true; + switch (opt_cmd) { + case OPT::RATELIMIT_ENABLE: + case OPT::GLOBAL_RATELIMIT_ENABLE: + ratelimit.enabled = true; + break; + + case OPT::RATELIMIT_SET: + case OPT::GLOBAL_RATELIMIT_SET: + ratelimit_configured = false; + if (have_max_read_ops) { + if (max_read_ops >= 0) { + ratelimit.max_read_ops = max_read_ops; + ratelimit_configured = true; + } + } + if (have_max_write_ops) { + if 
(max_write_ops >= 0) { + ratelimit.max_write_ops = max_write_ops; + ratelimit_configured = true; + } + } + if (have_max_read_bytes) { + if (max_read_bytes >= 0) { + ratelimit.max_read_bytes = max_read_bytes; + ratelimit_configured = true; + } + } + if (have_max_write_bytes) { + if (max_write_bytes >= 0) { + ratelimit.max_write_bytes = max_write_bytes; + ratelimit_configured = true; + } + } + break; + case OPT::RATELIMIT_DISABLE: + case OPT::GLOBAL_RATELIMIT_DISABLE: + ratelimit.enabled = false; + break; + default: + break; + } + return ratelimit_configured; +} + +void set_quota_info(RGWQuotaInfo& quota, OPT opt_cmd, int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + switch (opt_cmd) { + case OPT::QUOTA_ENABLE: + case OPT::GLOBAL_QUOTA_ENABLE: + quota.enabled = true; + + // falling through on purpose + + case OPT::QUOTA_SET: + case OPT::GLOBAL_QUOTA_SET: + if (have_max_objects) { + if (max_objects < 0) { + quota.max_objects = -1; + } else { + quota.max_objects = max_objects; + } + } + if (have_max_size) { + if (max_size < 0) { + quota.max_size = -1; + } else { + quota.max_size = rgw_rounded_kb(max_size) * 1024; + } + } + break; + case OPT::QUOTA_DISABLE: + case OPT::GLOBAL_QUOTA_DISABLE: + quota.enabled = false; + break; + default: + break; + } +} + +int set_bucket_quota(rgw::sal::Driver* driver, OPT opt_cmd, + const string& tenant_name, const string& bucket_name, + int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + std::unique_ptr bucket; + int r = driver->get_bucket(dpp(), nullptr, tenant_name, bucket_name, &bucket, null_yield); + if (r < 0) { + cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + + set_quota_info(bucket->get_info().quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects); + + r = bucket->put_info(dpp(), false, real_time()); + if (r < 0) { + cerr << "ERROR: failed writing bucket instance 
info: " << cpp_strerror(-r) << std::endl; + return -r; + } + return 0; +} + +int set_bucket_ratelimit(rgw::sal::Driver* driver, OPT opt_cmd, + const string& tenant_name, const string& bucket_name, + int64_t max_read_ops, int64_t max_write_ops, + int64_t max_read_bytes, int64_t max_write_bytes, + bool have_max_read_ops, bool have_max_write_ops, + bool have_max_read_bytes, bool have_max_write_bytes) +{ + std::unique_ptr bucket; + int r = driver->get_bucket(dpp(), nullptr, tenant_name, bucket_name, &bucket, null_yield); + if (r < 0) { + cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + RGWRateLimitInfo ratelimit_info; + auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT); + if(iter != bucket->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl; + return -EIO; + } + } + bool ratelimit_configured = set_ratelimit_info(ratelimit_info, opt_cmd, max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + if (!ratelimit_configured) { + ldpp_dout(dpp(), 0) << "ERROR: no rate limit values have been specified" << dendl; + return -EINVAL; + } + bufferlist bl; + ratelimit_info.encode(bl); + rgw::sal::Attrs attr; + attr[RGW_ATTR_RATELIMIT] = bl; + r = bucket->merge_and_store_attrs(dpp(), attr, null_yield); + if (r < 0) { + cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl; + return -r; + } + return 0; +} + +int set_user_ratelimit(OPT opt_cmd, std::unique_ptr& user, + int64_t max_read_ops, int64_t max_write_ops, + int64_t max_read_bytes, int64_t max_write_bytes, + bool have_max_read_ops, bool have_max_write_ops, + bool have_max_read_bytes, bool have_max_write_bytes) +{ + RGWRateLimitInfo ratelimit_info; 
+ user->load_user(dpp(), null_yield); + auto iter = user->get_attrs().find(RGW_ATTR_RATELIMIT); + if(iter != user->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl; + return -EIO; + } + } + bool ratelimit_configured = set_ratelimit_info(ratelimit_info, opt_cmd, max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + if (!ratelimit_configured) { + ldpp_dout(dpp(), 0) << "ERROR: no rate limit values have been specified" << dendl; + return -EINVAL; + } + bufferlist bl; + ratelimit_info.encode(bl); + rgw::sal::Attrs attr; + attr[RGW_ATTR_RATELIMIT] = bl; + int r = user->merge_and_store_attrs(dpp(), attr, null_yield); + if (r < 0) { + cerr << "ERROR: failed writing user instance info: " << cpp_strerror(-r) << std::endl; + return -r; + } + return 0; +} + +int show_user_ratelimit(std::unique_ptr& user, Formatter *formatter) +{ + RGWRateLimitInfo ratelimit_info; + user->load_user(dpp(), null_yield); + auto iter = user->get_attrs().find(RGW_ATTR_RATELIMIT); + if(iter != user->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl; + return -EIO; + } + } + formatter->open_object_section("user_ratelimit"); + encode_json("user_ratelimit", ratelimit_info, formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + return 0; +} + +int show_bucket_ratelimit(rgw::sal::Driver* driver, const string& tenant_name, + const string& bucket_name, Formatter *formatter) +{ + std::unique_ptr bucket; + int r = driver->get_bucket(dpp(), nullptr, tenant_name, bucket_name, &bucket, null_yield); + if (r < 0) { + cerr << "could 
not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + RGWRateLimitInfo ratelimit_info; + auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT); + if (iter != bucket->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl; + return -EIO; + } + } + formatter->open_object_section("bucket_ratelimit"); + encode_json("bucket_ratelimit", ratelimit_info, formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + return 0; +} +int set_user_bucket_quota(OPT opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + RGWUserInfo& user_info = op_state.get_user_info(); + + set_quota_info(user_info.quota.bucket_quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects); + + op_state.set_bucket_quota(user_info.quota.bucket_quota); + + string err; + int r = user.modify(dpp(), op_state, null_yield, &err); + if (r < 0) { + cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + return 0; +} + +int set_user_quota(OPT opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + RGWUserInfo& user_info = op_state.get_user_info(); + + set_quota_info(user_info.quota.user_quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects); + + op_state.set_user_quota(user_info.quota.user_quota); + + string err; + int r = user.modify(dpp(), op_state, null_yield, &err); + if (r < 0) { + cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + return 0; +} + +int check_min_obj_stripe_size(rgw::sal::Driver* driver, rgw::sal::Object* obj, 
uint64_t min_stripe_size, bool *need_rewrite) +{ + int ret = obj->get_obj_attrs(null_yield, dpp()); + if (ret < 0) { + ldpp_dout(dpp(), -1) << "ERROR: failed to stat object, returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + + map::iterator iter; + iter = obj->get_attrs().find(RGW_ATTR_MANIFEST); + if (iter == obj->get_attrs().end()) { + *need_rewrite = (obj->get_obj_size() >= min_stripe_size); + return 0; + } + + RGWObjManifest manifest; + + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(manifest, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp(), 0) << "ERROR: failed to decode manifest" << dendl; + return -EIO; + } + + map& objs = manifest.get_explicit_objs(); + map::iterator oiter; + for (oiter = objs.begin(); oiter != objs.end(); ++oiter) { + RGWObjManifestPart& part = oiter->second; + + if (part.size >= min_stripe_size) { + *need_rewrite = true; + return 0; + } + } + *need_rewrite = false; + + return 0; +} + + +int check_obj_locator_underscore(rgw::sal::Object* obj, bool fix, bool remove_bad, Formatter *f) { + f->open_object_section("object"); + f->open_object_section("key"); + f->dump_string("type", "head"); + f->dump_string("name", obj->get_name()); + f->dump_string("instance", obj->get_instance()); + f->close_section(); + + string oid; + string locator; + + get_obj_bucket_and_oid_loc(obj->get_obj(), oid, locator); + + f->dump_string("oid", oid); + f->dump_string("locator", locator); + + std::unique_ptr read_op = obj->get_read_op(); + + int ret = read_op->prepare(null_yield, dpp()); + bool needs_fixing = (ret == -ENOENT); + + f->dump_bool("needs_fixing", needs_fixing); + + string status = (needs_fixing ? 
"needs_fixing" : "ok"); + + if ((needs_fixing || remove_bad) && fix) { + ret = static_cast(driver)->getRados()->fix_head_obj_locator(dpp(), obj->get_bucket()->get_info(), needs_fixing, remove_bad, obj->get_key()); + if (ret < 0) { + cerr << "ERROR: fix_head_object_locator() returned ret=" << ret << std::endl; + goto done; + } + status = "fixed"; + } + +done: + f->dump_string("status", status); + + f->close_section(); + + return 0; +} + +int check_obj_tail_locator_underscore(RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, Formatter *f) { + f->open_object_section("object"); + f->open_object_section("key"); + f->dump_string("type", "tail"); + f->dump_string("name", key.name); + f->dump_string("instance", key.instance); + f->close_section(); + + bool needs_fixing; + string status; + + int ret = static_cast(driver)->getRados()->fix_tail_obj_locator(dpp(), bucket_info, key, fix, &needs_fixing, null_yield); + if (ret < 0) { + cerr << "ERROR: fix_tail_object_locator_underscore() returned ret=" << ret << std::endl; + status = "failed"; + } else { + status = (needs_fixing && !fix ? 
"needs_fixing" : "ok"); + } + + f->dump_bool("needs_fixing", needs_fixing); + f->dump_string("status", status); + + f->close_section(); + + return 0; +} + +int do_check_object_locator(const string& tenant_name, const string& bucket_name, + bool fix, bool remove_bad, Formatter *f) +{ + if (remove_bad && !fix) { + cerr << "ERROR: can't have remove_bad specified without fix" << std::endl; + return -EINVAL; + } + + std::unique_ptr bucket; + string bucket_id; + + f->open_object_section("bucket"); + f->dump_string("bucket", bucket_name); + int ret = init_bucket(nullptr, tenant_name, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return ret; + } + int count = 0; + + int max_entries = 1000; + + string prefix; + string delim; + string marker; + vector result; + string ns; + + rgw::sal::Bucket::ListParams params; + rgw::sal::Bucket::ListResults results; + + params.prefix = prefix; + params.delim = delim; + params.marker = rgw_obj_key(marker); + params.ns = ns; + params.enforce_ns = true; + params.list_versions = true; + + f->open_array_section("check_objects"); + do { + ret = bucket->list(dpp(), params, max_entries - count, results, null_yield); + if (ret < 0) { + cerr << "ERROR: driver->list_objects(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += results.objs.size(); + + for (vector::iterator iter = results.objs.begin(); iter != results.objs.end(); ++iter) { + std::unique_ptr obj = bucket->get_object(iter->key); + + if (obj->get_name()[0] == '_') { + ret = check_obj_locator_underscore(obj.get(), fix, remove_bad, f); + + if (ret >= 0) { + ret = check_obj_tail_locator_underscore(bucket->get_info(), obj->get_key(), fix, f); + if (ret < 0) { + cerr << "ERROR: check_obj_tail_locator_underscore(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + } + f->flush(cout); + } while (results.is_truncated && count < max_entries); + f->close_section(); + 
f->close_section(); + + f->flush(cout); + + return 0; +} + +/// search for a matching zone/zonegroup id and return a connection if found +static boost::optional get_remote_conn(rgw::sal::RadosStore* driver, + const RGWZoneGroup& zonegroup, + const std::string& remote) +{ + boost::optional conn; + if (remote == zonegroup.get_id()) { + conn.emplace(driver->ctx(), driver, remote, zonegroup.endpoints, zonegroup.api_name); + } else { + for (const auto& z : zonegroup.zones) { + const auto& zone = z.second; + if (remote == zone.id) { + conn.emplace(driver->ctx(), driver, remote, zone.endpoints, zonegroup.api_name); + break; + } + } + } + return conn; +} + +/// search each zonegroup for a connection +static boost::optional get_remote_conn(rgw::sal::RadosStore* driver, + const RGWPeriodMap& period_map, + const std::string& remote) +{ + boost::optional conn; + for (const auto& zg : period_map.zonegroups) { + conn = get_remote_conn(driver, zg.second, remote); + if (conn) { + break; + } + } + return conn; +} + +// we expect a very small response +static constexpr size_t MAX_REST_RESPONSE = 128 * 1024; + +static int send_to_remote_gateway(RGWRESTConn* conn, req_info& info, + bufferlist& in_data, JSONParser& parser) +{ + if (!conn) { + return -EINVAL; + } + + ceph::bufferlist response; + rgw_user user; + int ret = conn->forward(dpp(), user, info, nullptr, MAX_REST_RESPONSE, &in_data, &response, null_yield); + + int parse_ret = parser.parse(response.c_str(), response.length()); + if (parse_ret < 0) { + cerr << "failed to parse response" << std::endl; + return parse_ret; + } + return ret; +} + +static int send_to_url(const string& url, + std::optional opt_region, + const string& access, + const string& secret, req_info& info, + bufferlist& in_data, JSONParser& parser) +{ + if (access.empty() || secret.empty()) { + cerr << "An --access-key and --secret must be provided with --url." 
<< std::endl; + return -EINVAL; + } + RGWAccessKey key; + key.id = access; + key.key = secret; + + param_vec_t params; + RGWRESTSimpleRequest req(g_ceph_context, info.method, url, NULL, ¶ms, opt_region); + + bufferlist response; + int ret = req.forward_request(dpp(), key, info, MAX_REST_RESPONSE, &in_data, &response, null_yield); + + int parse_ret = parser.parse(response.c_str(), response.length()); + if (parse_ret < 0) { + cout << "failed to parse response" << std::endl; + return parse_ret; + } + return ret; +} + +static int send_to_remote_or_url(RGWRESTConn *conn, const string& url, + std::optional opt_region, + const string& access, const string& secret, + req_info& info, bufferlist& in_data, + JSONParser& parser) +{ + if (url.empty()) { + return send_to_remote_gateway(conn, info, in_data, parser); + } + return send_to_url(url, opt_region, access, secret, info, in_data, parser); +} + +static int commit_period(rgw::sal::ConfigStore* cfgstore, + RGWRealm& realm, rgw::sal::RealmWriter& realm_writer, + RGWPeriod& period, string remote, const string& url, + std::optional opt_region, + const string& access, const string& secret, + bool force) +{ + auto& master_zone = period.get_master_zone().id; + if (master_zone.empty()) { + cerr << "cannot commit period: period does not have a master zone of a master zonegroup" << std::endl; + return -EINVAL; + } + // are we the period's master zone? 
+ if (driver->get_zone()->get_id() == master_zone) { + // read the current period + RGWPeriod current_period; + int ret = cfgstore->read_period(dpp(), null_yield, realm.current_period, + std::nullopt, current_period); + if (ret < 0) { + cerr << "failed to load current period: " << cpp_strerror(ret) << std::endl; + return ret; + } + // the master zone can commit locally + ret = rgw::commit_period(dpp(), null_yield, cfgstore, driver, + realm, realm_writer, current_period, + period, cerr, force); + if (ret < 0) { + cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl; + } + return ret; + } + + if (remote.empty() && url.empty()) { + // use the new master zone's connection + remote = master_zone; + cerr << "Sending period to new master zone " << remote << std::endl; + } + boost::optional conn; + RGWRESTConn *remote_conn = nullptr; + if (!remote.empty()) { + conn = get_remote_conn(static_cast(driver), period.get_map(), remote); + if (!conn) { + cerr << "failed to find a zone or zonegroup for remote " + << remote << std::endl; + return -ENOENT; + } + remote_conn = &*conn; + } + + // push period to the master with an empty period id + period.set_id(string()); + + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "POST"; + info.request_uri = "/admin/realm/period"; + + // json format into a bufferlist + JSONFormatter jf(false); + encode_json("period", period, &jf); + bufferlist bl; + jf.flush(bl); + + JSONParser p; + int ret = send_to_remote_or_url(remote_conn, url, opt_region, access, secret, info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + + // did we parse an error message? 
+ auto message = p.find_obj("Message"); + if (message) { + cerr << "Reason: " << message->get_data() << std::endl; + } + return ret; + } + + // decode the response and driver it back + try { + decode_json_obj(period, &p); + } catch (const JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -EINVAL; + } + if (period.get_id().empty()) { + cerr << "Period commit got back an empty period id" << std::endl; + return -EINVAL; + } + // the master zone gave us back the period that it committed, so it's + // safe to save it as our latest epoch + constexpr bool exclusive = false; + ret = cfgstore->create_period(dpp(), null_yield, exclusive, period); + if (ret < 0) { + cerr << "Error storing committed period " << period.get_id() << ": " + << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rgw::reflect_period(dpp(), null_yield, cfgstore, period); + if (ret < 0) { + cerr << "Error updating local objects: " << cpp_strerror(ret) << std::endl; + return ret; + } + (void) cfgstore->realm_notify_new_period(dpp(), null_yield, period); + return ret; +} + +static int update_period(rgw::sal::ConfigStore* cfgstore, + const string& realm_id, const string& realm_name, + const string& period_epoch, bool commit, + const string& remote, const string& url, + std::optional opt_region, + const string& access, const string& secret, + Formatter *formatter, bool force) +{ + RGWRealm realm; + std::unique_ptr realm_writer; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore, + realm_id, realm_name, + realm, &realm_writer); + if (ret < 0) { + cerr << "failed to load realm " << cpp_strerror(-ret) << std::endl; + return ret; + } + std::optional epoch; + if (!period_epoch.empty()) { + epoch = atoi(period_epoch.c_str()); + } + RGWPeriod period; + ret = cfgstore->read_period(dpp(), null_yield, realm.current_period, + epoch, period); + if (ret < 0) { + cerr << "failed to load current period: " << cpp_strerror(-ret) << std::endl; + return ret; + } + 
// convert to the realm's staging period + rgw::fork_period(dpp(), period); + // update the staging period with all of the realm's zonegroups + ret = rgw::update_period(dpp(), null_yield, cfgstore, period); + if (ret < 0) { + return ret; + } + + constexpr bool exclusive = false; + ret = cfgstore->create_period(dpp(), null_yield, exclusive, period); + if (ret < 0) { + cerr << "failed to driver period: " << cpp_strerror(-ret) << std::endl; + return ret; + } + if (commit) { + ret = commit_period(cfgstore, realm, *realm_writer, period, remote, url, + opt_region, access, secret, force); + if (ret < 0) { + cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl; + return ret; + } + } + encode_json("period", period, formatter); + formatter->flush(cout); + return 0; +} + +static int init_bucket_for_sync(rgw::sal::User* user, + const string& tenant, const string& bucket_name, + const string& bucket_id, + std::unique_ptr* bucket) +{ + int ret = init_bucket(user, tenant, bucket_name, bucket_id, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + return 0; +} + +static int do_period_pull(rgw::sal::ConfigStore* cfgstore, + RGWRESTConn *remote_conn, const string& url, + std::optional opt_region, + const string& access_key, const string& secret_key, + const string& realm_id, const string& realm_name, + const string& period_id, const string& period_epoch, + RGWPeriod *period) +{ + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "GET"; + info.request_uri = "/admin/realm/period"; + + map ¶ms = info.args.get_params(); + if (!realm_id.empty()) + params["realm_id"] = realm_id; + if (!realm_name.empty()) + params["realm_name"] = realm_name; + if (!period_id.empty()) + params["period_id"] = period_id; + if (!period_epoch.empty()) + params["epoch"] = period_epoch; + + bufferlist bl; + JSONParser p; + int ret = send_to_remote_or_url(remote_conn, url, opt_region, access_key, 
secret_key, + info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + return ret; + } + try { + decode_json_obj(*period, &p); + } catch (const JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -EINVAL; + } + constexpr bool exclusive = false; + ret = cfgstore->create_period(dpp(), null_yield, exclusive, *period); + if (ret < 0) { + cerr << "Error storing period " << period->get_id() << ": " << cpp_strerror(ret) << std::endl; + } + return 0; +} + +void flush_ss(stringstream& ss, list& l) +{ + if (!ss.str().empty()) { + l.push_back(ss.str()); + } + ss.str(""); +} + +stringstream& push_ss(stringstream& ss, list& l, int tab = 0) +{ + flush_ss(ss, l); + if (tab > 0) { + ss << setw(tab) << "" << setw(1); + } + return ss; +} + +static void get_md_sync_status(list& status) +{ + RGWMetaSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor()); + + int ret = sync.init(dpp()); + if (ret < 0) { + status.push_back(string("failed to retrieve sync info: sync.init() failed: ") + cpp_strerror(-ret)); + return; + } + + rgw_meta_sync_status sync_status; + ret = sync.read_sync_status(dpp(), &sync_status); + if (ret < 0) { + status.push_back(string("failed to read sync status: ") + cpp_strerror(-ret)); + return; + } + + string status_str; + switch (sync_status.sync_info.state) { + case rgw_meta_sync_info::StateInit: + status_str = "init"; + break; + case rgw_meta_sync_info::StateBuildingFullSyncMaps: + status_str = "preparing for full sync"; + break; + case rgw_meta_sync_info::StateSync: + status_str = "syncing"; + break; + default: + status_str = "unknown"; + } + + status.push_back(status_str); + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + int num_full = 0; + int num_inc = 0; + int total_shards = 0; + set shards_behind_set; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + 
total_shards++; + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) { + num_full++; + full_complete += marker_iter.second.pos; + int shard_id = marker_iter.first; + shards_behind_set.insert(shard_id); + } else { + full_complete += marker_iter.second.total_entries; + } + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync) { + num_inc++; + } + } + + stringstream ss; + push_ss(ss, status) << "full sync: " << num_full << "/" << total_shards << " shards"; + + if (num_full > 0) { + push_ss(ss, status) << "full sync: " << full_total - full_complete << " entries to sync"; + } + + push_ss(ss, status) << "incremental sync: " << num_inc << "/" << total_shards << " shards"; + + map master_shards_info; + string master_period = static_cast(driver)->svc()->zone->get_current_period_id(); + + ret = sync.read_master_log_shards_info(dpp(), master_period, &master_shards_info); + if (ret < 0) { + status.push_back(string("failed to fetch master sync status: ") + cpp_strerror(-ret)); + return; + } + + map shards_behind; + if (sync_status.sync_info.period != master_period) { + status.push_back(string("master is on a different period: master_period=" + + master_period + " local_period=" + sync_status.sync_info.period)); + } else { + for (auto local_iter : sync_status.sync_markers) { + int shard_id = local_iter.first; + auto iter = master_shards_info.find(shard_id); + + if (iter == master_shards_info.end()) { + /* huh? 
*/ + derr << "ERROR: could not find remote sync shard status for shard_id=" << shard_id << dendl; + continue; + } + auto master_marker = iter->second.marker; + if (local_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync && + master_marker > local_iter.second.marker) { + shards_behind[shard_id] = local_iter.second.marker; + shards_behind_set.insert(shard_id); + } + } + } + + // fetch remote log entries to determine the oldest change + std::optional> oldest; + if (!shards_behind.empty()) { + map master_pos; + ret = sync.read_master_log_shards_next(dpp(), sync_status.sync_info.period, shards_behind, &master_pos); + if (ret < 0) { + derr << "ERROR: failed to fetch master next positions (" << cpp_strerror(-ret) << ")" << dendl; + } else { + for (auto iter : master_pos) { + rgw_mdlog_shard_data& shard_data = iter.second; + + if (shard_data.entries.empty()) { + // there aren't any entries in this shard, so we're not really behind + shards_behind.erase(iter.first); + shards_behind_set.erase(iter.first); + } else { + rgw_mdlog_entry& entry = shard_data.entries.front(); + if (!oldest) { + oldest.emplace(iter.first, entry.timestamp); + } else if (!ceph::real_clock::is_zero(entry.timestamp) && entry.timestamp < oldest->second) { + oldest.emplace(iter.first, entry.timestamp); + } + } + } + } + } + + int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc); + if (total_behind == 0) { + push_ss(ss, status) << "metadata is caught up with master"; + } else { + push_ss(ss, status) << "metadata is behind on " << total_behind << " shards"; + push_ss(ss, status) << "behind shards: " << "[" << shards_behind_set << "]"; + if (oldest) { + push_ss(ss, status) << "oldest incremental change not applied: " + << oldest->second << " [" << oldest->first << ']'; + } + } + + flush_ss(ss, status); +} + +static void get_data_sync_status(const rgw_zone_id& source_zone, list& status, int tab) +{ + stringstream ss; + + RGWZone *sz; + + if (!(sz = 
static_cast(driver)->svc()->zone->find_zone(source_zone))) { + push_ss(ss, status, tab) << string("zone not found"); + flush_ss(ss, status); + return; + } + + if (!static_cast(driver)->svc()->zone->zone_syncs_from(*sz)) { + push_ss(ss, status, tab) << string("not syncing from zone"); + flush_ss(ss, status); + return; + } + RGWDataSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor(), source_zone, nullptr); + + int ret = sync.init(dpp()); + if (ret < 0) { + push_ss(ss, status, tab) << string("failed to retrieve sync info: ") + cpp_strerror(-ret); + flush_ss(ss, status); + return; + } + + rgw_data_sync_status sync_status; + ret = sync.read_sync_status(dpp(), &sync_status); + if (ret < 0 && ret != -ENOENT) { + push_ss(ss, status, tab) << string("failed read sync status: ") + cpp_strerror(-ret); + return; + } + + set recovering_shards; + ret = sync.read_recovering_shards(dpp(), sync_status.sync_info.num_shards, recovering_shards); + if (ret < 0 && ret != ENOENT) { + push_ss(ss, status, tab) << string("failed read recovering shards: ") + cpp_strerror(-ret); + return; + } + + string status_str; + switch (sync_status.sync_info.state) { + case rgw_data_sync_info::StateInit: + status_str = "init"; + break; + case rgw_data_sync_info::StateBuildingFullSyncMaps: + status_str = "preparing for full sync"; + break; + case rgw_data_sync_info::StateSync: + status_str = "syncing"; + break; + default: + status_str = "unknown"; + } + + push_ss(ss, status, tab) << status_str; + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + int num_full = 0; + int num_inc = 0; + int total_shards = 0; + set shards_behind_set; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + total_shards++; + if (marker_iter.second.state == rgw_data_sync_marker::SyncState::FullSync) { + num_full++; + full_complete += marker_iter.second.pos; + int shard_id = marker_iter.first; + 
shards_behind_set.insert(shard_id); + } else { + full_complete += marker_iter.second.total_entries; + } + if (marker_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync) { + num_inc++; + } + } + + push_ss(ss, status, tab) << "full sync: " << num_full << "/" << total_shards << " shards"; + + if (num_full > 0) { + push_ss(ss, status, tab) << "full sync: " << full_total - full_complete << " buckets to sync"; + } + + push_ss(ss, status, tab) << "incremental sync: " << num_inc << "/" << total_shards << " shards"; + + map source_shards_info; + + ret = sync.read_source_log_shards_info(dpp(), &source_shards_info); + if (ret < 0) { + push_ss(ss, status, tab) << string("failed to fetch source sync status: ") + cpp_strerror(-ret); + return; + } + + map shards_behind; + + for (auto local_iter : sync_status.sync_markers) { + int shard_id = local_iter.first; + auto iter = source_shards_info.find(shard_id); + + if (iter == source_shards_info.end()) { + /* huh? */ + derr << "ERROR: could not find remote sync shard status for shard_id=" << shard_id << dendl; + continue; + } + auto master_marker = iter->second.marker; + if (local_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync && + master_marker > local_iter.second.marker) { + shards_behind[shard_id] = local_iter.second.marker; + shards_behind_set.insert(shard_id); + } + } + + std::optional> oldest; + if (!shards_behind.empty()) { + map master_pos; + ret = sync.read_source_log_shards_next(dpp(), shards_behind, &master_pos); + + if (ret < 0) { + derr << "ERROR: failed to fetch next positions (" << cpp_strerror(-ret) << ")" << dendl; + } else { + for (auto iter : master_pos) { + rgw_datalog_shard_data& shard_data = iter.second; + if (shard_data.entries.empty()) { + // there aren't any entries in this shard, so we're not really behind + shards_behind.erase(iter.first); + shards_behind_set.erase(iter.first); + } else { + rgw_datalog_entry& entry = shard_data.entries.front(); + if (!oldest) { + 
oldest.emplace(iter.first, entry.timestamp); + } else if (!ceph::real_clock::is_zero(entry.timestamp) && entry.timestamp < oldest->second) { + oldest.emplace(iter.first, entry.timestamp); + } + } + } + } + } + + int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc); + int total_recovering = recovering_shards.size(); + + if (total_behind == 0 && total_recovering == 0) { + push_ss(ss, status, tab) << "data is caught up with source"; + } else if (total_behind > 0) { + push_ss(ss, status, tab) << "data is behind on " << total_behind << " shards"; + push_ss(ss, status, tab) << "behind shards: " << "[" << shards_behind_set << "]" ; + if (oldest) { + push_ss(ss, status, tab) << "oldest incremental change not applied: " + << oldest->second << " [" << oldest->first << ']'; + } + } + + if (total_recovering > 0) { + push_ss(ss, status, tab) << total_recovering << " shards are recovering"; + push_ss(ss, status, tab) << "recovering shards: " << "[" << recovering_shards << "]"; + } + + flush_ss(ss, status); +} + +static void tab_dump(const string& header, int width, const list& entries) +{ + string s = header; + + for (auto e : entries) { + cout << std::setw(width) << s << std::setw(1) << " " << e << std::endl; + s.clear(); + } +} + +// return features that are supported but not enabled +static auto get_disabled_features(const rgw::zone_features::set& enabled) { + auto features = rgw::zone_features::set{rgw::zone_features::supported.begin(), + rgw::zone_features::supported.end()}; + for (const auto& feature : enabled) { + features.erase(feature); + } + return features; +} + + +static void sync_status(Formatter *formatter) +{ + const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup(); + rgw::sal::Zone* zone = driver->get_zone(); + + int width = 15; + + cout << std::setw(width) << "realm" << std::setw(1) << " " << zone->get_realm_id() << " (" << zone->get_realm_name() << ")" << std::endl; + cout << std::setw(width) << "zonegroup" 
<< std::setw(1) << " " << zonegroup.get_id() << " (" << zonegroup.get_name() << ")" << std::endl; + cout << std::setw(width) << "zone" << std::setw(1) << " " << zone->get_id() << " (" << zone->get_name() << ")" << std::endl; + cout << std::setw(width) << "current time" << std::setw(1) << " " + << to_iso_8601(ceph::real_clock::now(), iso_8601_format::YMDhms) << std::endl; + + const auto& rzg = + static_cast(zonegroup).get_group(); + + cout << std::setw(width) << "zonegroup features enabled: " << rzg.enabled_features << std::endl; + if (auto d = get_disabled_features(rzg.enabled_features); !d.empty()) { + cout << std::setw(width) << " disabled: " << d << std::endl; + } + + list md_status; + + if (driver->is_meta_master()) { + md_status.push_back("no sync (zone is master)"); + } else { + get_md_sync_status(md_status); + } + + tab_dump("metadata sync", width, md_status); + + list data_status; + + auto& zone_conn_map = static_cast(driver)->svc()->zone->get_zone_conn_map(); + + for (auto iter : zone_conn_map) { + const rgw_zone_id& source_id = iter.first; + string source_str = "source: "; + string s = source_str + source_id.id; + std::unique_ptr sz; + if (driver->get_zone()->get_zonegroup().get_zone_by_id(source_id.id, &sz) == 0) { + s += string(" (") + sz->get_name() + ")"; + } + data_status.push_back(s); + get_data_sync_status(source_id, data_status, source_str.size()); + } + + tab_dump("data sync", width, data_status); +} + +struct indented { + int w; // indent width + std::string_view header; + indented(int w, std::string_view header = "") : w(w), header(header) {} +}; +std::ostream& operator<<(std::ostream& out, const indented& h) { + return out << std::setw(h.w) << h.header << std::setw(1) << ' '; +} + +static int bucket_source_sync_status(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* driver, const RGWZone& zone, + const RGWZone& source, RGWRESTConn *conn, + const RGWBucketInfo& bucket_info, + rgw_sync_bucket_pipe pipe, + int width, std::ostream& out) +{ + 
out << indented{width, "source zone"} << source.id << " (" << source.name << ")" << std::endl; + + // syncing from this zone? + if (!driver->svc()->zone->zone_syncs_from(zone, source)) { + out << indented{width} << "does not sync from zone\n"; + return 0; + } + + if (!pipe.source.bucket) { + ldpp_dout(dpp, -1) << __func__ << "(): missing source bucket" << dendl; + return -EINVAL; + } + + std::unique_ptr source_bucket; + int r = init_bucket(nullptr, *pipe.source.bucket, &source_bucket); + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to read source bucket info: " << cpp_strerror(r) << dendl; + return r; + } + + out << indented{width, "source bucket"} << source_bucket->get_key() << std::endl; + pipe.source.bucket = source_bucket->get_key(); + + pipe.dest.bucket = bucket_info.bucket; + + uint64_t gen = 0; + std::vector shard_status; + + // check for full sync status + rgw_bucket_sync_status full_status; + r = rgw_read_bucket_full_sync_status(dpp, driver, pipe, &full_status, null_yield); + if (r >= 0) { + if (full_status.state == BucketSyncState::Init) { + out << indented{width} << "init: bucket sync has not started\n"; + return 0; + } + if (full_status.state == BucketSyncState::Stopped) { + out << indented{width} << "stopped: bucket sync is disabled\n"; + return 0; + } + if (full_status.state == BucketSyncState::Full) { + out << indented{width} << "full sync: " << full_status.full.count << " objects completed\n"; + return 0; + } + gen = full_status.incremental_gen; + shard_status.resize(full_status.shards_done_with_gen.size()); + } else if (r == -ENOENT) { + // no full status, but there may be per-shard status from before upgrade + const auto& logs = source_bucket->get_info().layout.logs; + if (logs.empty()) { + out << indented{width} << "init: bucket sync has not started\n"; + return 0; + } + const auto& log = logs.front(); + if (log.gen > 0) { + // this isn't the backward-compatible case, so we just haven't started yet + out << indented{width} << "init: bucket sync 
has not started\n"; + return 0; + } + if (log.layout.type != rgw::BucketLogType::InIndex) { + ldpp_dout(dpp, -1) << "unrecognized log layout type " << log.layout.type << dendl; + return -EINVAL; + } + // use shard count from our log gen=0 + shard_status.resize(rgw::num_shards(log.layout.in_index)); + } else { + lderr(driver->ctx()) << "failed to read bucket full sync status: " << cpp_strerror(r) << dendl; + return r; + } + + r = rgw_read_bucket_inc_sync_status(dpp, driver, pipe, gen, &shard_status); + if (r < 0) { + lderr(driver->ctx()) << "failed to read bucket incremental sync status: " << cpp_strerror(r) << dendl; + return r; + } + + const int total_shards = shard_status.size(); + + out << indented{width} << "incremental sync on " << total_shards << " shards\n"; + + rgw_bucket_index_marker_info remote_info; + BucketIndexShardsManager remote_markers; + r = rgw_read_remote_bilog_info(dpp, conn, source_bucket->get_key(), + remote_info, remote_markers, null_yield); + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to read remote log: " << cpp_strerror(r) << dendl; + return r; + } + + std::set shards_behind; + for (const auto& r : remote_markers.get()) { + auto shard_id = r.first; + if (r.second.empty()) { + continue; // empty bucket index shard + } + if (shard_id >= total_shards) { + // unexpected shard id. 
we don't have status for it, so we're behind + shards_behind.insert(shard_id); + continue; + } + auto& m = shard_status[shard_id]; + const auto pos = BucketIndexShardsManager::get_shard_marker(m.inc_marker.position); + if (pos < r.second) { + shards_behind.insert(shard_id); + } + } + if (!shards_behind.empty()) { + out << indented{width} << "bucket is behind on " << shards_behind.size() << " shards\n"; + out << indented{width} << "behind shards: [" << shards_behind << "]\n" ; + } else { + out << indented{width} << "bucket is caught up with source\n"; + } + return 0; +} + +void encode_json(const char *name, const RGWBucketSyncFlowManager::pipe_set& pset, Formatter *f) +{ + Formatter::ObjectSection top_section(*f, name); + Formatter::ArraySection as(*f, "entries"); + + for (auto& pipe_handler : pset) { + Formatter::ObjectSection hs(*f, "handler"); + encode_json("source", pipe_handler.source, f); + encode_json("dest", pipe_handler.dest, f); + } +} + +static std::vector convert_bucket_set_to_str_vec(const std::set& bs) +{ + std::vector result; + result.reserve(bs.size()); + for (auto& b : bs) { + result.push_back(b.get_key()); + } + return result; +} + +static void get_hint_entities(const std::set& zones, const std::set& buckets, + std::set *hint_entities) +{ + for (auto& zone_id : zones) { + for (auto& b : buckets) { + std::unique_ptr hint_bucket; + int ret = init_bucket(nullptr, b, &hint_bucket); + if (ret < 0) { + ldpp_dout(dpp(), 20) << "could not init bucket info for hint bucket=" << b << " ... 
skipping" << dendl; + continue; + } + + hint_entities->insert(rgw_sync_bucket_entity(zone_id, hint_bucket->get_key())); + } + } +} + +static rgw_zone_id resolve_zone_id(const string& s) +{ + std::unique_ptr zone; + int ret = driver->get_zone()->get_zonegroup().get_zone_by_id(s, &zone); + if (ret < 0) + ret = driver->get_zone()->get_zonegroup().get_zone_by_name(s, &zone); + if (ret < 0) + return rgw_zone_id(s); + + return rgw_zone_id(zone->get_id()); +} + +rgw_zone_id validate_zone_id(const rgw_zone_id& zone_id) +{ + return resolve_zone_id(zone_id.id); +} + +static int sync_info(std::optional opt_target_zone, std::optional opt_bucket, Formatter *formatter) +{ + rgw_zone_id zone_id = opt_target_zone.value_or(driver->get_zone()->get_id()); + + auto zone_policy_handler = driver->get_zone()->get_sync_policy_handler(); + + RGWBucketSyncPolicyHandlerRef bucket_handler; + + std::optional eff_bucket = opt_bucket; + + auto handler = zone_policy_handler; + + if (eff_bucket) { + std::unique_ptr bucket; + + int ret = init_bucket(nullptr, *eff_bucket, &bucket); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: init_bucket failed: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + if (ret >= 0) { + rgw::sal::Attrs attrs = bucket->get_attrs(); + bucket_handler.reset(handler->alloc_child(bucket->get_info(), std::move(attrs))); + } else { + cerr << "WARNING: bucket not found, simulating result" << std::endl; + bucket_handler.reset(handler->alloc_child(*eff_bucket, nullopt)); + } + + ret = bucket_handler->init(dpp(), null_yield); + if (ret < 0) { + cerr << "ERROR: failed to init bucket sync policy handler: " << cpp_strerror(-ret) << " (ret=" << ret << ")" << std::endl; + return ret; + } + + handler = bucket_handler; + } + + std::set sources; + std::set dests; + + handler->get_pipes(&sources, &dests, std::nullopt); + + auto source_hints_vec = convert_bucket_set_to_str_vec(handler->get_source_hints()); + auto target_hints_vec = 
convert_bucket_set_to_str_vec(handler->get_target_hints()); + + std::set resolved_sources; + std::set resolved_dests; + + rgw_sync_bucket_entity self_entity(zone_id, opt_bucket); + + set source_zones; + set target_zones; + + zone_policy_handler->reflect(dpp(), nullptr, nullptr, + nullptr, nullptr, + &source_zones, + &target_zones, + false); /* relaxed: also get all zones that we allow to sync to/from */ + + std::set hint_entities; + + get_hint_entities(source_zones, handler->get_source_hints(), &hint_entities); + get_hint_entities(target_zones, handler->get_target_hints(), &hint_entities); + + for (auto& hint_entity : hint_entities) { + if (!hint_entity.zone || + !hint_entity.bucket) { + continue; /* shouldn't really happen */ + } + + auto zid = validate_zone_id(*hint_entity.zone); + auto& hint_bucket = *hint_entity.bucket; + + RGWBucketSyncPolicyHandlerRef hint_bucket_handler; + int r = driver->get_sync_policy_handler(dpp(), zid, hint_bucket, &hint_bucket_handler, null_yield); + if (r < 0) { + ldpp_dout(dpp(), 20) << "could not get bucket sync policy handler for hint bucket=" << hint_bucket << " ... 
skipping" << dendl; + continue; + } + + hint_bucket_handler->get_pipes(&resolved_dests, + &resolved_sources, + self_entity); /* flipping resolved dests and sources as these are + relative to the remote entity */ + } + + { + Formatter::ObjectSection os(*formatter, "result"); + encode_json("sources", sources, formatter); + encode_json("dests", dests, formatter); + { + Formatter::ObjectSection hints_section(*formatter, "hints"); + encode_json("sources", source_hints_vec, formatter); + encode_json("dests", target_hints_vec, formatter); + } + { + Formatter::ObjectSection resolved_hints_section(*formatter, "resolved-hints-1"); + encode_json("sources", resolved_sources, formatter); + encode_json("dests", resolved_dests, formatter); + } + { + Formatter::ObjectSection resolved_hints_section(*formatter, "resolved-hints"); + encode_json("sources", handler->get_resolved_source_hints(), formatter); + encode_json("dests", handler->get_resolved_dest_hints(), formatter); + } + } + + formatter->flush(cout); + + return 0; +} + +static int bucket_sync_info(rgw::sal::Driver* driver, const RGWBucketInfo& info, + std::ostream& out) +{ + const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup(); + rgw::sal::Zone* zone = driver->get_zone(); + constexpr int width = 15; + + out << indented{width, "realm"} << zone->get_realm_id() << " (" << zone->get_realm_name() << ")\n"; + out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n"; + out << indented{width, "zone"} << zone->get_id() << " (" << zone->get_name() << ")\n"; + out << indented{width, "bucket"} << info.bucket << "\n\n"; + + if (!static_cast(driver)->ctl()->bucket->bucket_imports_data(info.bucket, null_yield, dpp())) { + out << "Sync is disabled for bucket " << info.bucket.name << '\n'; + return 0; + } + + RGWBucketSyncPolicyHandlerRef handler; + + int r = driver->get_sync_policy_handler(dpp(), std::nullopt, info.bucket, &handler, null_yield); + if (r < 0) { + 
ldpp_dout(dpp(), -1) << "ERROR: failed to get policy handler for bucket (" << info.bucket << "): r=" << r << ": " << cpp_strerror(-r) << dendl; + return r; + } + + auto& sources = handler->get_sources(); + + for (auto& m : sources) { + auto& zone = m.first; + out << indented{width, "source zone"} << zone << std::endl; + for (auto& pipe_handler : m.second) { + out << indented{width, "bucket"} << *pipe_handler.source.bucket << std::endl; + } + } + + return 0; +} + +static int bucket_sync_status(rgw::sal::Driver* driver, const RGWBucketInfo& info, + const rgw_zone_id& source_zone_id, + std::optional& opt_source_bucket, + std::ostream& out) +{ + const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup(); + rgw::sal::Zone* zone = driver->get_zone(); + constexpr int width = 15; + + out << indented{width, "realm"} << zone->get_realm_id() << " (" << zone->get_realm_name() << ")\n"; + out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n"; + out << indented{width, "zone"} << zone->get_id() << " (" << zone->get_name() << ")\n"; + out << indented{width, "bucket"} << info.bucket << "\n"; + out << indented{width, "current time"} + << to_iso_8601(ceph::real_clock::now(), iso_8601_format::YMDhms) << "\n\n"; + + + if (!static_cast(driver)->ctl()->bucket->bucket_imports_data(info.bucket, null_yield, dpp())) { + out << "Sync is disabled for bucket " << info.bucket.name << " or bucket has no sync sources" << std::endl; + return 0; + } + + RGWBucketSyncPolicyHandlerRef handler; + + int r = driver->get_sync_policy_handler(dpp(), std::nullopt, info.bucket, &handler, null_yield); + if (r < 0) { + ldpp_dout(dpp(), -1) << "ERROR: failed to get policy handler for bucket (" << info.bucket << "): r=" << r << ": " << cpp_strerror(-r) << dendl; + return r; + } + + auto sources = handler->get_all_sources(); + + auto& zone_conn_map = static_cast(driver)->svc()->zone->get_zone_conn_map(); + set zone_ids; + + if (!source_zone_id.empty()) { 
+ std::unique_ptr zone; + int ret = driver->get_zone()->get_zonegroup().get_zone_by_id(source_zone_id.id, &zone); + if (ret < 0) { + ldpp_dout(dpp(), -1) << "Source zone not found in zonegroup " + << zonegroup.get_name() << dendl; + return -EINVAL; + } + auto c = zone_conn_map.find(source_zone_id); + if (c == zone_conn_map.end()) { + ldpp_dout(dpp(), -1) << "No connection to zone " << zone->get_name() << dendl; + return -EINVAL; + } + zone_ids.insert(source_zone_id); + } else { + std::list ids; + int ret = driver->get_zone()->get_zonegroup().list_zones(ids); + if (ret == 0) { + for (const auto& entry : ids) { + zone_ids.insert(entry); + } + } + } + + for (auto& zone_id : zone_ids) { + auto z = static_cast(driver)->svc()->zone->get_zonegroup().zones.find(zone_id.id); + if (z == static_cast(driver)->svc()->zone->get_zonegroup().zones.end()) { /* should't happen */ + continue; + } + auto c = zone_conn_map.find(zone_id.id); + if (c == zone_conn_map.end()) { /* should't happen */ + continue; + } + + for (auto& entry : sources) { + auto& pipe = entry.second; + if (opt_source_bucket && + pipe.source.bucket != opt_source_bucket) { + continue; + } + if (pipe.source.zone.value_or(rgw_zone_id()) == z->second.id) { + bucket_source_sync_status(dpp(), static_cast(driver), static_cast(driver)->svc()->zone->get_zone(), z->second, + c->second, + info, pipe, + width, out); + } + } + } + + return 0; +} + +static void parse_tier_config_param(const string& s, map& out) +{ + int level = 0; + string cur_conf; + list confs; + for (auto c : s) { + if (c == ',') { + if (level == 0) { + confs.push_back(cur_conf); + cur_conf.clear(); + continue; + } + } + if (c == '{') { + ++level; + } else if (c == '}') { + --level; + } + cur_conf += c; + } + if (!cur_conf.empty()) { + confs.push_back(cur_conf); + } + + for (auto c : confs) { + ssize_t pos = c.find("="); + if (pos < 0) { + out[c] = ""; + } else { + out[c.substr(0, pos)] = c.substr(pos + 1); + } + } +} + +static int 
check_pool_support_omap(const rgw_pool& pool) +{ + librados::IoCtx io_ctx; + int ret = static_cast(driver)->getRados()->get_rados_handle()->ioctx_create(pool.to_str().c_str(), io_ctx); + if (ret < 0) { + // the pool may not exist at this moment, we have no way to check if it supports omap. + return 0; + } + + ret = io_ctx.omap_clear("__omap_test_not_exist_oid__"); + if (ret == -EOPNOTSUPP) { + io_ctx.close(); + return ret; + } + io_ctx.close(); + return 0; +} + +int check_reshard_bucket_params(rgw::sal::Driver* driver, + const string& bucket_name, + const string& tenant, + const string& bucket_id, + bool num_shards_specified, + int num_shards, + int yes_i_really_mean_it, + std::unique_ptr* bucket) +{ + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return -EINVAL; + } + + if (!num_shards_specified) { + cerr << "ERROR: --num-shards not specified" << std::endl; + return -EINVAL; + } + + if (num_shards > (int)static_cast(driver)->getRados()->get_max_bucket_shards()) { + cerr << "ERROR: num_shards too high, max value: " << static_cast(driver)->getRados()->get_max_bucket_shards() << std::endl; + return -EINVAL; + } + + if (num_shards < 0) { + cerr << "ERROR: num_shards must be non-negative integer" << std::endl; + return -EINVAL; + } + + int ret = init_bucket(nullptr, tenant, bucket_name, bucket_id, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + int num_source_shards = rgw::current_num_shards((*bucket)->get_info().layout); + + if (num_shards <= num_source_shards && !yes_i_really_mean_it) { + cerr << "num shards is less or equal to current shards count" << std::endl + << "do you really mean it? 
(requires --yes-i-really-mean-it)" << std::endl; + return -EINVAL; + } + return 0; +} + +static int scan_totp(CephContext *cct, ceph::real_time& now, rados::cls::otp::otp_info_t& totp, vector& pins, + time_t *pofs) +{ +#define MAX_TOTP_SKEW_HOURS (24 * 7) + time_t start_time = ceph::real_clock::to_time_t(now); + time_t time_ofs = 0, time_ofs_abs = 0; + time_t step_size = totp.step_size; + if (step_size == 0) { + step_size = OATH_TOTP_DEFAULT_TIME_STEP_SIZE; + } + uint32_t count = 0; + int sign = 1; + + uint32_t max_skew = MAX_TOTP_SKEW_HOURS * 3600; + + while (time_ofs_abs < max_skew) { + int rc = oath_totp_validate2(totp.seed_bin.c_str(), totp.seed_bin.length(), + start_time, + step_size, + time_ofs, + 1, + nullptr, + pins[0].c_str()); + if (rc != OATH_INVALID_OTP) { + rc = oath_totp_validate2(totp.seed_bin.c_str(), totp.seed_bin.length(), + start_time, + step_size, + time_ofs - step_size, /* smaller time_ofs moves time forward */ + 1, + nullptr, + pins[1].c_str()); + if (rc != OATH_INVALID_OTP) { + *pofs = time_ofs - step_size + step_size * totp.window / 2; + ldpp_dout(dpp(), 20) << "found at time=" << start_time - time_ofs << " time_ofs=" << time_ofs << dendl; + return 0; + } + } + sign = -sign; + time_ofs_abs = (++count) * step_size; + time_ofs = sign * time_ofs_abs; + } + + return -ENOENT; +} + +static int trim_sync_error_log(int shard_id, const string& marker, int delay_ms) +{ + auto oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX, + shard_id); + // call cls_log_trim() until it returns -ENODATA + for (;;) { + int ret = static_cast(driver)->svc()->cls->timelog.trim(dpp(), oid, {}, {}, {}, marker, nullptr, + null_yield); + if (ret == -ENODATA) { + return 0; + } + if (ret < 0) { + return ret; + } + if (delay_ms) { + std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms)); + } + } + // unreachable +} + +static bool symmetrical_flow_opt(const string& opt) +{ + return (opt == "symmetrical" || opt == "symmetric"); +} + +static bool 
directional_flow_opt(const string& opt) +{ + return (opt == "directional" || opt == "direction"); +} + +template +static bool require_opt(std::optional opt, bool extra_check = true) +{ + if (!opt || !extra_check) { + return false; + } + return true; +} + +template +static bool require_non_empty_opt(std::optional opt, bool extra_check = true) +{ + if (!opt || opt->empty() || !extra_check) { + return false; + } + return true; +} + +template +static void show_result(T& obj, + Formatter *formatter, + ostream& os) +{ + encode_json("obj", obj, formatter); + + formatter->flush(cout); +} + +void init_optional_bucket(std::optional& opt_bucket, + std::optional& opt_tenant, + std::optional& opt_bucket_name, + std::optional& opt_bucket_id) +{ + if (opt_tenant || opt_bucket_name || opt_bucket_id) { + opt_bucket.emplace(); + if (opt_tenant) { + opt_bucket->tenant = *opt_tenant; + } + if (opt_bucket_name) { + opt_bucket->name = *opt_bucket_name; + } + if (opt_bucket_id) { + opt_bucket->bucket_id = *opt_bucket_id; + } + } +} + +class SyncPolicyContext +{ + rgw::sal::ConfigStore* cfgstore; + RGWZoneGroup zonegroup; + std::unique_ptr zonegroup_writer; + + std::optional b; + std::unique_ptr bucket; + + rgw_sync_policy_info *policy{nullptr}; + + std::optional owner; + +public: + SyncPolicyContext(rgw::sal::ConfigStore* cfgstore, + std::optional _bucket) + : cfgstore(cfgstore), b(std::move(_bucket)) {} + + int init(const string& zonegroup_id, const string& zonegroup_name) { + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore, + zonegroup_id, zonegroup_name, + zonegroup, &zonegroup_writer); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + if (!b) { + policy = &zonegroup.sync_policy; + return 0; + } + + ret = init_bucket(nullptr, *b, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + owner = bucket->get_info().owner; + + if 
(!bucket->get_info().sync_policy) { + rgw_sync_policy_info new_policy; + bucket->get_info().set_sync_policy(std::move(new_policy)); + } + + policy = &(*bucket->get_info().sync_policy); + + return 0; + } + + int write_policy() { + if (!b) { + int ret = zonegroup_writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + return 0; + } + + int ret = bucket->put_info(dpp(), false, real_time()); + if (ret < 0) { + cerr << "failed to driver bucket info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + return 0; + } + + rgw_sync_policy_info& get_policy() { + return *policy; + } + + std::optional& get_owner() { + return owner; + } +}; + +void resolve_zone_id_opt(std::optional& zone_name, std::optional& zone_id) +{ + if (!zone_name || zone_id) { + return; + } + zone_id.emplace(); + std::unique_ptr zone; + int ret = driver->get_zone()->get_zonegroup().get_zone_by_name(*zone_name, &zone); + if (ret < 0) { + cerr << "WARNING: cannot find source zone id for name=" << *zone_name << std::endl; + zone_id = rgw_zone_id(*zone_name); + } else { + zone_id->id = zone->get_id(); + } +} +void resolve_zone_ids_opt(std::optional >& names, std::optional >& ids) +{ + if (!names || ids) { + return; + } + ids.emplace(); + for (auto& name : *names) { + rgw_zone_id zid; + std::unique_ptr zone; + int ret = driver->get_zone()->get_zonegroup().get_zone_by_name(name, &zone); + if (ret < 0) { + cerr << "WARNING: cannot find source zone id for name=" << name << std::endl; + zid = rgw_zone_id(name); + } else { + zid.id = zone->get_id(); + } + ids->push_back(zid); + } +} + +static vector zone_ids_from_str(const string& val) +{ + vector result; + vector v; + get_str_vec(val, v); + for (auto& z : v) { + result.push_back(rgw_zone_id(z)); + } + return result; +} + +class JSONFormatter_PrettyZone : public JSONFormatter { + class Handler : public JSONEncodeFilter::Handler { + void encode_json(const 
char *name, const void *pval, ceph::Formatter *f) const override { + auto zone_id = *(static_cast(pval)); + string zone_name; + std::unique_ptr zone; + if (driver->get_zone()->get_zonegroup().get_zone_by_id(zone_id.id, &zone) == 0) { + zone_name = zone->get_name(); + } else { + cerr << "WARNING: cannot find zone name for id=" << zone_id << std::endl; + zone_name = zone_id.id; + } + + ::encode_json(name, zone_name, f); + } + } zone_id_type_handler; + + JSONEncodeFilter encode_filter; +public: + JSONFormatter_PrettyZone(bool pretty_format) : JSONFormatter(pretty_format) { + encode_filter.register_type(&zone_id_type_handler); + } + + void *get_external_feature_handler(const std::string& feature) override { + if (feature != "JSONEncodeFilter") { + return nullptr; + } + return &encode_filter; + } +}; + +void init_realm_param(CephContext *cct, string& var, std::optional& opt_var, const string& conf_name) +{ + var = cct->_conf.get_val(conf_name); + if (!var.empty()) { + opt_var = var; + } +} + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = rgw_global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + + // for region -> zonegroup conversion (must happen before common_init_finish()) + if (!g_conf()->rgw_region.empty() && g_conf()->rgw_zonegroup.empty()) { + g_conf().set_val_or_die("rgw_zonegroup", g_conf()->rgw_region.c_str()); + } + + rgw_user user_id_arg; + std::unique_ptr user; + string tenant; + string user_ns; + rgw_user new_user_id; + std::string access_key, secret_key, user_email, display_name; + std::string bucket_name, pool_name, object; + rgw_pool pool; + std::string date, subuser, access, format; + std::string start_date, end_date; + std::string key_type_str; + std::string period_id, period_epoch, remote, url; + std::optional 
opt_region; + std::string master_zone; + std::string realm_name, realm_id, realm_new_name; + std::optional opt_realm_name, opt_realm_id; + std::string zone_name, zone_id, zone_new_name; + std::optional opt_zone_name, opt_zone_id; + std::string zonegroup_name, zonegroup_id, zonegroup_new_name; + std::optional opt_zonegroup_name, opt_zonegroup_id; + std::string api_name; + std::string role_name, path, assume_role_doc, policy_name, perm_policy_doc, path_prefix, max_session_duration; + std::string redirect_zone; + bool redirect_zone_set = false; + list endpoints; + int tmp_int; + int sync_from_all_specified = false; + bool sync_from_all = false; + list sync_from; + list sync_from_rm; + int is_master_int; + int set_default = 0; + bool is_master = false; + bool is_master_set = false; + int read_only_int; + bool read_only = false; + int is_read_only_set = false; + int commit = false; + int staging = false; + int key_type = KEY_TYPE_UNDEFINED; + std::unique_ptr bucket; + uint32_t perm_mask = 0; + RGWUserInfo info; + OPT opt_cmd = OPT::NO_CMD; + int gen_access_key = 0; + int gen_secret_key = 0; + bool set_perm = false; + bool set_temp_url_key = false; + map temp_url_keys; + string bucket_id; + string new_bucket_name; + std::unique_ptr formatter; + std::unique_ptr zone_formatter; + int purge_data = false; + int pretty_format = false; + int show_log_entries = true; + int show_log_sum = true; + int skip_zero_entries = false; // log show + int purge_keys = false; + int yes_i_really_mean_it = false; + int delete_child_objects = false; + int fix = false; + int remove_bad = false; + int check_head_obj_locator = false; + int max_buckets = -1; + bool max_buckets_specified = false; + map categories; + string caps; + int check_objects = false; + RGWBucketAdminOpState bucket_op; + string infile; + string metadata_key; + RGWObjVersionTracker objv_tracker; + string marker; + string start_marker; + string end_marker; + int max_entries = -1; + bool max_entries_specified = false; + int 
admin = false; + bool admin_specified = false; + int system = false; + bool system_specified = false; + int shard_id = -1; + bool specified_shard_id = false; + string client_id; + string op_id; + string op_mask_str; + string quota_scope; + string ratelimit_scope; + std::string objects_file; + string object_version; + string placement_id; + std::optional opt_storage_class; + list tags; + list tags_add; + list tags_rm; + int placement_inline_data = true; + bool placement_inline_data_specified = false; + + int64_t max_objects = -1; + int64_t max_size = -1; + int64_t max_read_ops = 0; + int64_t max_write_ops = 0; + int64_t max_read_bytes = 0; + int64_t max_write_bytes = 0; + bool have_max_objects = false; + bool have_max_size = false; + bool have_max_write_ops = false; + bool have_max_read_ops = false; + bool have_max_write_bytes = false; + bool have_max_read_bytes = false; + int include_all = false; + int allow_unordered = false; + + int sync_stats = false; + int reset_stats = false; + int bypass_gc = false; + int warnings_only = false; + int inconsistent_index = false; + + int verbose = false; + + int extra_info = false; + + uint64_t min_rewrite_size = 4 * 1024 * 1024; + uint64_t max_rewrite_size = ULLONG_MAX; + uint64_t min_rewrite_stripe_size = 0; + + BIIndexType bi_index_type = BIIndexType::Plain; + std::optional opt_log_type; + + string job_id; + int num_shards = 0; + bool num_shards_specified = false; + std::optional bucket_index_max_shards; + + int max_concurrent_ios = 32; + ceph::timespan min_age = std::chrono::hours(1); + bool hide_progress = false; + bool dump_keys = false; + uint64_t orphan_stale_secs = (24 * 3600); + int detail = false; + + std::string val; + std::ostringstream errs; + string err; + + string source_zone_name; + rgw_zone_id source_zone; /* zone id */ + + string tier_type; + bool tier_type_specified = false; + + map tier_config_add; + map tier_config_rm; + + boost::optional index_pool; + boost::optional data_pool; + boost::optional 
data_extra_pool; + rgw::BucketIndexType placement_index_type = rgw::BucketIndexType::Normal; + bool index_type_specified = false; + + boost::optional compression_type; + + string totp_serial; + string totp_seed; + string totp_seed_type = "hex"; + vector totp_pin; + int totp_seconds = 0; + int totp_window = 0; + int trim_delay_ms = 0; + + string topic_name; + string notification_id; + string sub_name; + string event_id; + + std::optional gen; + std::optional str_script_ctx; + std::optional script_package; + int allow_compilation = false; + + std::optional opt_group_id; + std::optional opt_status; + std::optional opt_flow_type; + std::optional > opt_zone_names; + std::optional > opt_zone_ids; + std::optional opt_flow_id; + std::optional opt_source_zone_name; + std::optional opt_source_zone_id; + std::optional opt_dest_zone_name; + std::optional opt_dest_zone_id; + std::optional > opt_source_zone_names; + std::optional > opt_source_zone_ids; + std::optional > opt_dest_zone_names; + std::optional > opt_dest_zone_ids; + std::optional opt_pipe_id; + std::optional opt_bucket; + std::optional opt_tenant; + std::optional opt_bucket_name; + std::optional opt_bucket_id; + std::optional opt_source_bucket; + std::optional opt_source_tenant; + std::optional opt_source_bucket_name; + std::optional opt_source_bucket_id; + std::optional opt_dest_bucket; + std::optional opt_dest_tenant; + std::optional opt_dest_bucket_name; + std::optional opt_dest_bucket_id; + std::optional opt_effective_zone_name; + std::optional opt_effective_zone_id; + + std::optional opt_prefix; + std::optional opt_prefix_rm; + + std::optional opt_priority; + std::optional opt_mode; + std::optional opt_dest_owner; + ceph::timespan opt_retry_delay_ms = std::chrono::milliseconds(2000); + ceph::timespan opt_timeout_sec = std::chrono::seconds(60); + + std::optional inject_error_at; + std::optional inject_error_code; + std::optional inject_abort_at; + std::optional inject_delay_at; + ceph::timespan inject_delay = 
std::chrono::milliseconds(2000); + + rgw::zone_features::set enable_features; + rgw::zone_features::set disable_features; + + SimpleCmd cmd(all_cmds, cmd_aliases); + bool raw_storage_op = false; + + std::optional rgw_obj_fs; // radoslist field separator + + init_realm_param(cct.get(), realm_id, opt_realm_id, "rgw_realm_id"); + init_realm_param(cct.get(), zonegroup_id, opt_zonegroup_id, "rgw_zonegroup_id"); + init_realm_param(cct.get(), zone_id, opt_zone_id, "rgw_zone_id"); + + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "-i", "--uid", (char*)NULL)) { + user_id_arg.from_str(val); + if (user_id_arg.empty()) { + cerr << "no value for uid" << std::endl; + exit(1); + } + } else if (ceph_argparse_witharg(args, i, &val, "--new-uid", (char*)NULL)) { + new_user_id.from_str(val); + } else if (ceph_argparse_witharg(args, i, &val, "--tenant", (char*)NULL)) { + tenant = val; + opt_tenant = val; + } else if (ceph_argparse_witharg(args, i, &val, "--user_ns", (char*)NULL)) { + user_ns = val; + } else if (ceph_argparse_witharg(args, i, &val, "--access-key", (char*)NULL)) { + access_key = val; + } else if (ceph_argparse_witharg(args, i, &val, "--subuser", (char*)NULL)) { + subuser = val; + } else if (ceph_argparse_witharg(args, i, &val, "--secret", "--secret-key", (char*)NULL)) { + secret_key = val; + } else if (ceph_argparse_witharg(args, i, &val, "-e", "--email", (char*)NULL)) { + user_email = val; + } else if (ceph_argparse_witharg(args, i, &val, "-n", "--display-name", (char*)NULL)) { + display_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "-b", "--bucket", (char*)NULL)) { + bucket_name = val; + opt_bucket_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) { + pool_name = val; + pool = rgw_pool(pool_name); + } else if (ceph_argparse_witharg(args, i, &val, "-o", "--object", (char*)NULL)) { + object = 
val; + } else if (ceph_argparse_witharg(args, i, &val, "--objects-file", (char*)NULL)) { + objects_file = val; + } else if (ceph_argparse_witharg(args, i, &val, "--object-version", (char*)NULL)) { + object_version = val; + } else if (ceph_argparse_witharg(args, i, &val, "--client-id", (char*)NULL)) { + client_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--op-id", (char*)NULL)) { + op_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--op-mask", (char*)NULL)) { + op_mask_str = val; + } else if (ceph_argparse_witharg(args, i, &val, "--key-type", (char*)NULL)) { + key_type_str = val; + if (key_type_str.compare("swift") == 0) { + key_type = KEY_TYPE_SWIFT; + } else if (key_type_str.compare("s3") == 0) { + key_type = KEY_TYPE_S3; + } else { + cerr << "bad key type: " << key_type_str << std::endl; + exit(1); + } + } else if (ceph_argparse_witharg(args, i, &val, "--job-id", (char*)NULL)) { + job_id = val; + } else if (ceph_argparse_binary_flag(args, i, &gen_access_key, NULL, "--gen-access-key", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &gen_secret_key, NULL, "--gen-secret", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &show_log_entries, NULL, "--show-log-entries", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &show_log_sum, NULL, "--show-log-sum", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &skip_zero_entries, NULL, "--skip-zero-entries", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &admin, NULL, "--admin", (char*)NULL)) { + admin_specified = true; + } else if (ceph_argparse_binary_flag(args, i, &system, NULL, "--system", (char*)NULL)) { + system_specified = true; + } else if (ceph_argparse_binary_flag(args, i, &verbose, NULL, "--verbose", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &staging, NULL, "--staging", (char*)NULL)) { + // do 
nothing + } else if (ceph_argparse_binary_flag(args, i, &commit, NULL, "--commit", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-size", (char*)NULL)) { + min_rewrite_size = (uint64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-rewrite-size", (char*)NULL)) { + max_rewrite_size = (uint64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-stripe-size", (char*)NULL)) { + min_rewrite_stripe_size = (uint64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-buckets", (char*)NULL)) { + max_buckets = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max buckets: " << err << std::endl; + return EINVAL; + } + max_buckets_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) { + max_entries = (int)strict_strtol(val.c_str(), 10, &err); + max_entries_specified = true; + if (!err.empty()) { + cerr << "ERROR: failed to parse max entries: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) { + max_size = strict_iec_cast(val, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max size: " << err << std::endl; + return EINVAL; + } + have_max_size = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) { + max_objects = (int64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max objects: " << err << std::endl; + return EINVAL; + } + have_max_objects = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-read-ops", (char*)NULL)) { + max_read_ops = (int64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max read requests: " << err << std::endl; + return EINVAL; + } + have_max_read_ops = true; + } else if (ceph_argparse_witharg(args, 
i, &val, "--max-write-ops", (char*)NULL)) { + max_write_ops = (int64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max write requests: " << err << std::endl; + return EINVAL; + } + have_max_write_ops = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-read-bytes", (char*)NULL)) { + max_read_bytes = (int64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max read bytes: " << err << std::endl; + return EINVAL; + } + have_max_read_bytes = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-write-bytes", (char*)NULL)) { + max_write_bytes = (int64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max write bytes: " << err << std::endl; + return EINVAL; + } + have_max_write_bytes = true; + } else if (ceph_argparse_witharg(args, i, &val, "--date", "--time", (char*)NULL)) { + date = val; + if (end_date.empty()) + end_date = date; + } else if (ceph_argparse_witharg(args, i, &val, "--start-date", "--start-time", (char*)NULL)) { + start_date = val; + } else if (ceph_argparse_witharg(args, i, &val, "--end-date", "--end-time", (char*)NULL)) { + end_date = val; + } else if (ceph_argparse_witharg(args, i, &val, "--num-shards", (char*)NULL)) { + num_shards = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse num shards: " << err << std::endl; + return EINVAL; + } + num_shards_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--bucket-index-max-shards", (char*)NULL)) { + bucket_index_max_shards = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse bucket-index-max-shards: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--max-concurrent-ios", (char*)NULL)) { + max_concurrent_ios = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << 
"ERROR: failed to parse max concurrent ios: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--min-age-hours", (char*)NULL)) { + min_age = std::chrono::hours(atoi(val.c_str())); + } else if (ceph_argparse_witharg(args, i, &val, "--orphan-stale-secs", (char*)NULL)) { + orphan_stale_secs = (uint64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse orphan stale secs: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--shard-id", (char*)NULL)) { + shard_id = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse shard id: " << err << std::endl; + return EINVAL; + } + specified_shard_id = true; + } else if (ceph_argparse_witharg(args, i, &val, "--gen", (char*)NULL)) { + gen = strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse gen id: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--access", (char*)NULL)) { + access = val; + perm_mask = rgw_str_to_perm(access.c_str()); + set_perm = true; + } else if (ceph_argparse_witharg(args, i, &val, "--temp-url-key", (char*)NULL)) { + temp_url_keys[0] = val; + set_temp_url_key = true; + } else if (ceph_argparse_witharg(args, i, &val, "--temp-url-key2", "--temp-url-key-2", (char*)NULL)) { + temp_url_keys[1] = val; + set_temp_url_key = true; + } else if (ceph_argparse_witharg(args, i, &val, "--bucket-id", (char*)NULL)) { + bucket_id = val; + opt_bucket_id = val; + if (bucket_id.empty()) { + cerr << "no value for bucket-id" << std::endl; + exit(1); + } + } else if (ceph_argparse_witharg(args, i, &val, "--bucket-new-name", (char*)NULL)) { + new_bucket_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) { + format = val; + } else if (ceph_argparse_witharg(args, i, &val, "--categories", (char*)NULL)) { + string cat_str = val; + list 
cat_list; + list::iterator iter; + get_str_list(cat_str, cat_list); + for (iter = cat_list.begin(); iter != cat_list.end(); ++iter) { + categories[*iter] = true; + } + } else if (ceph_argparse_binary_flag(args, i, &delete_child_objects, NULL, "--purge-objects", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &pretty_format, NULL, "--pretty-format", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &purge_data, NULL, "--purge-data", (char*)NULL)) { + delete_child_objects = purge_data; + } else if (ceph_argparse_binary_flag(args, i, &purge_keys, NULL, "--purge-keys", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &yes_i_really_mean_it, NULL, "--yes-i-really-mean-it", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &fix, NULL, "--fix", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &remove_bad, NULL, "--remove-bad", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &check_head_obj_locator, NULL, "--check-head-obj-locator", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &check_objects, NULL, "--check-objects", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &sync_stats, NULL, "--sync-stats", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &reset_stats, NULL, "--reset-stats", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &allow_unordered, NULL, "--allow-unordered", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &extra_info, NULL, "--extra-info", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &bypass_gc, NULL, "--bypass-gc", (char*)NULL)) { + // do nothing + } else if 
(ceph_argparse_binary_flag(args, i, &warnings_only, NULL, "--warnings-only", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &inconsistent_index, NULL, "--inconsistent-index", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_flag(args, i, "--hide-progress", (char*)NULL)) { + hide_progress = true; + } else if (ceph_argparse_flag(args, i, "--dump-keys", (char*)NULL)) { + dump_keys = true; + } else if (ceph_argparse_binary_flag(args, i, &placement_inline_data, NULL, "--placement-inline-data", (char*)NULL)) { + placement_inline_data_specified = true; + // do nothing + } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) { + caps = val; + } else if (ceph_argparse_witharg(args, i, &val, "--infile", (char*)NULL)) { + infile = val; + } else if (ceph_argparse_witharg(args, i, &val, "--metadata-key", (char*)NULL)) { + metadata_key = val; + } else if (ceph_argparse_witharg(args, i, &val, "--marker", (char*)NULL)) { + marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--start-marker", (char*)NULL)) { + start_marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--end-marker", (char*)NULL)) { + end_marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--quota-scope", (char*)NULL)) { + quota_scope = val; + } else if (ceph_argparse_witharg(args, i, &val, "--ratelimit-scope", (char*)NULL)) { + ratelimit_scope = val; + } else if (ceph_argparse_witharg(args, i, &val, "--index-type", (char*)NULL)) { + string index_type_str = val; + bi_index_type = get_bi_index_type(index_type_str); + if (bi_index_type == BIIndexType::Invalid) { + cerr << "ERROR: invalid bucket index entry type" << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--log-type", (char*)NULL)) { + string log_type_str = val; + auto l = get_log_type(log_type_str); + if (l == static_cast(0xff)) { + cerr << "ERROR: invalid log type" << std::endl; + return EINVAL; + } + opt_log_type = l; + } else 
if (ceph_argparse_binary_flag(args, i, &is_master_int, NULL, "--master", (char*)NULL)) { + is_master = (bool)is_master_int; + is_master_set = true; + } else if (ceph_argparse_binary_flag(args, i, &set_default, NULL, "--default", (char*)NULL)) { + /* do nothing */ + } else if (ceph_argparse_witharg(args, i, &val, "--redirect-zone", (char*)NULL)) { + redirect_zone = val; + redirect_zone_set = true; + } else if (ceph_argparse_binary_flag(args, i, &read_only_int, NULL, "--read-only", (char*)NULL)) { + read_only = (bool)read_only_int; + is_read_only_set = true; + } else if (ceph_argparse_witharg(args, i, &val, "--master-zone", (char*)NULL)) { + master_zone = val; + } else if (ceph_argparse_witharg(args, i, &val, "--period", (char*)NULL)) { + period_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--epoch", (char*)NULL)) { + period_epoch = val; + } else if (ceph_argparse_witharg(args, i, &val, "--remote", (char*)NULL)) { + remote = val; + } else if (ceph_argparse_witharg(args, i, &val, "--url", (char*)NULL)) { + url = val; + } else if (ceph_argparse_witharg(args, i, &val, "--region", (char*)NULL)) { + opt_region = val; + } else if (ceph_argparse_witharg(args, i, &val, "--realm-id", (char*)NULL)) { + realm_id = val; + opt_realm_id = val; + g_conf().set_val("rgw_realm_id", val); + } else if (ceph_argparse_witharg(args, i, &val, "--realm-new-name", (char*)NULL)) { + realm_new_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zonegroup-id", (char*)NULL)) { + zonegroup_id = val; + opt_zonegroup_id = val; + g_conf().set_val("rgw_zonegroup_id", val); + } else if (ceph_argparse_witharg(args, i, &val, "--zonegroup-new-name", (char*)NULL)) { + zonegroup_new_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--placement-id", (char*)NULL)) { + placement_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--storage-class", (char*)NULL)) { + opt_storage_class = val; + } else if (ceph_argparse_witharg(args, i, &val, "--tags", 
(char*)NULL)) { + get_str_list(val, ",", tags); + } else if (ceph_argparse_witharg(args, i, &val, "--tags-add", (char*)NULL)) { + get_str_list(val, ",", tags_add); + } else if (ceph_argparse_witharg(args, i, &val, "--tags-rm", (char*)NULL)) { + get_str_list(val, ",", tags_rm); + } else if (ceph_argparse_witharg(args, i, &val, "--api-name", (char*)NULL)) { + api_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zone-id", (char*)NULL)) { + zone_id = val; + opt_zone_id = val; + g_conf().set_val("rgw_zone_id", val); + } else if (ceph_argparse_witharg(args, i, &val, "--zone-new-name", (char*)NULL)) { + zone_new_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--endpoints", (char*)NULL)) { + get_str_list(val, endpoints); + } else if (ceph_argparse_witharg(args, i, &val, "--sync-from", (char*)NULL)) { + get_str_list(val, sync_from); + } else if (ceph_argparse_witharg(args, i, &val, "--sync-from-rm", (char*)NULL)) { + get_str_list(val, sync_from_rm); + } else if (ceph_argparse_binary_flag(args, i, &tmp_int, NULL, "--sync-from-all", (char*)NULL)) { + sync_from_all = (bool)tmp_int; + sync_from_all_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--source-zone", (char*)NULL)) { + source_zone_name = val; + opt_source_zone_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--source-zone-id", (char*)NULL)) { + opt_source_zone_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--dest-zone", (char*)NULL)) { + opt_dest_zone_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--dest-zone-id", (char*)NULL)) { + opt_dest_zone_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--tier-type", (char*)NULL)) { + tier_type = val; + tier_type_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--tier-config", (char*)NULL)) { + parse_tier_config_param(val, tier_config_add); + } else if (ceph_argparse_witharg(args, i, &val, "--tier-config-rm", (char*)NULL)) { + 
parse_tier_config_param(val, tier_config_rm); + } else if (ceph_argparse_witharg(args, i, &val, "--index-pool", (char*)NULL)) { + index_pool = val; + } else if (ceph_argparse_witharg(args, i, &val, "--data-pool", (char*)NULL)) { + data_pool = val; + } else if (ceph_argparse_witharg(args, i, &val, "--data-extra-pool", (char*)NULL)) { + data_extra_pool = val; + } else if (ceph_argparse_witharg(args, i, &val, "--placement-index-type", (char*)NULL)) { + if (val == "normal") { + placement_index_type = rgw::BucketIndexType::Normal; + } else if (val == "indexless") { + placement_index_type = rgw::BucketIndexType::Indexless; + } else { + placement_index_type = (rgw::BucketIndexType)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse index type index: " << err << std::endl; + return EINVAL; + } + } + index_type_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--compression", (char*)NULL)) { + compression_type = val; + } else if (ceph_argparse_witharg(args, i, &val, "--role-name", (char*)NULL)) { + role_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) { + path = val; + } else if (ceph_argparse_witharg(args, i, &val, "--assume-role-policy-doc", (char*)NULL)) { + assume_role_doc = val; + } else if (ceph_argparse_witharg(args, i, &val, "--policy-name", (char*)NULL)) { + policy_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--policy-doc", (char*)NULL)) { + perm_policy_doc = val; + } else if (ceph_argparse_witharg(args, i, &val, "--path-prefix", (char*)NULL)) { + path_prefix = val; + } else if (ceph_argparse_witharg(args, i, &val, "--max-session-duration", (char*)NULL)) { + max_session_duration = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-serial", (char*)NULL)) { + totp_serial = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-pin", (char*)NULL)) { + totp_pin.push_back(val); + } else if (ceph_argparse_witharg(args, i, &val, 
"--totp-seed", (char*)NULL)) { + totp_seed = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-seed-type", (char*)NULL)) { + totp_seed_type = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-seconds", (char*)NULL)) { + totp_seconds = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--totp-window", (char*)NULL)) { + totp_window = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--trim-delay-ms", (char*)NULL)) { + trim_delay_ms = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--topic", (char*)NULL)) { + topic_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--notification-id", (char*)NULL)) { + notification_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--subscription", (char*)NULL)) { + sub_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--event-id", (char*)NULL)) { + event_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--group-id", (char*)NULL)) { + opt_group_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--status", (char*)NULL)) { + opt_status = val; + } else if (ceph_argparse_witharg(args, i, &val, "--flow-type", (char*)NULL)) { + opt_flow_type = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zones", "--zone-names", (char*)NULL)) { + vector v; + get_str_vec(val, v); + opt_zone_names = std::move(v); + } else if (ceph_argparse_witharg(args, i, &val, "--zone-ids", (char*)NULL)) { + opt_zone_ids = zone_ids_from_str(val); + } else if (ceph_argparse_witharg(args, i, &val, "--source-zones", "--source-zone-names", (char*)NULL)) { + vector v; + get_str_vec(val, v); + opt_source_zone_names = std::move(v); + } else if (ceph_argparse_witharg(args, i, &val, "--source-zone-ids", (char*)NULL)) { + opt_source_zone_ids = zone_ids_from_str(val); + } else if (ceph_argparse_witharg(args, i, &val, "--dest-zones", "--dest-zone-names", (char*)NULL)) { + vector v; + get_str_vec(val, v); + opt_dest_zone_names = 
std::move(v); + } else if (ceph_argparse_witharg(args, i, &val, "--dest-zone-ids", (char*)NULL)) { + opt_dest_zone_ids = zone_ids_from_str(val); + } else if (ceph_argparse_witharg(args, i, &val, "--flow-id", (char*)NULL)) { + opt_flow_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--pipe-id", (char*)NULL)) { + opt_pipe_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--source-tenant", (char*)NULL)) { + opt_source_tenant = val; + } else if (ceph_argparse_witharg(args, i, &val, "--source-bucket", (char*)NULL)) { + opt_source_bucket_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--source-bucket-id", (char*)NULL)) { + opt_source_bucket_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--dest-tenant", (char*)NULL)) { + opt_dest_tenant = val; + } else if (ceph_argparse_witharg(args, i, &val, "--dest-bucket", (char*)NULL)) { + opt_dest_bucket_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--dest-bucket-id", (char*)NULL)) { + opt_dest_bucket_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--effective-zone-name", "--effective-zone", (char*)NULL)) { + opt_effective_zone_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--effective-zone-id", (char*)NULL)) { + opt_effective_zone_id = rgw_zone_id(val); + } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) { + opt_prefix = val; + } else if (ceph_argparse_witharg(args, i, &val, "--prefix-rm", (char*)NULL)) { + opt_prefix_rm = val; + } else if (ceph_argparse_witharg(args, i, &val, "--priority", (char*)NULL)) { + opt_priority = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) { + opt_mode = val; + } else if (ceph_argparse_witharg(args, i, &val, "--dest-owner", (char*)NULL)) { + opt_dest_owner.emplace(val); + opt_dest_owner = val; + } else if (ceph_argparse_witharg(args, i, &val, "--retry-delay-ms", (char*)NULL)) { + opt_retry_delay_ms = 
std::chrono::milliseconds(atoi(val.c_str())); + } else if (ceph_argparse_witharg(args, i, &val, "--timeout-sec", (char*)NULL)) { + opt_timeout_sec = std::chrono::seconds(atoi(val.c_str())); + } else if (ceph_argparse_witharg(args, i, &val, "--inject-error-at", (char*)NULL)) { + inject_error_at = val; + } else if (ceph_argparse_witharg(args, i, &val, "--inject-error-code", (char*)NULL)) { + inject_error_code = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--inject-abort-at", (char*)NULL)) { + inject_abort_at = val; + } else if (ceph_argparse_witharg(args, i, &val, "--inject-delay-at", (char*)NULL)) { + inject_delay_at = val; + } else if (ceph_argparse_witharg(args, i, &val, "--inject-delay-ms", (char*)NULL)) { + inject_delay = std::chrono::milliseconds(atoi(val.c_str())); + } else if (ceph_argparse_binary_flag(args, i, &detail, NULL, "--detail", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_witharg(args, i, &val, "--context", (char*)NULL)) { + str_script_ctx = val; + } else if (ceph_argparse_witharg(args, i, &val, "--package", (char*)NULL)) { + script_package = val; + } else if (ceph_argparse_binary_flag(args, i, &allow_compilation, NULL, "--allow-compilation", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_witharg(args, i, &val, "--rgw-obj-fs", (char*)NULL)) { + rgw_obj_fs = val; + } else if (ceph_argparse_witharg(args, i, &val, "--enable-feature", (char*)NULL)) { + if (!rgw::zone_features::supports(val)) { + std::cerr << "ERROR: Cannot enable unrecognized zone feature \"" << val << "\"" << std::endl; + return EINVAL; + } + enable_features.insert(val); + } else if (ceph_argparse_witharg(args, i, &val, "--disable-feature", (char*)NULL)) { + disable_features.insert(val); + } else if (strncmp(*i, "-", 1) == 0) { + cerr << "ERROR: invalid flag " << *i << std::endl; + return EINVAL; + } else { + ++i; + } + } + + /* common_init_finish needs to be called after g_conf().set_val() */ + common_init_finish(g_ceph_context); + + 
std::unique_ptr cfgstore; + + if (args.empty()) { + usage(); + exit(1); + } + else { + std::vector extra_args; + std::vector expected; + + std::any _opt_cmd; + + if (!cmd.find_command(args, &_opt_cmd, &extra_args, &err, &expected)) { + if (!expected.empty()) { + cerr << err << std::endl; + cerr << "Expected one of the following:" << std::endl; + for (auto& exp : expected) { + if (exp == "*" || exp == "[*]") { + continue; + } + cerr << " " << exp << std::endl; + } + } else { + cerr << "Command not found:"; + for (auto& arg : args) { + cerr << " " << arg; + } + cerr << std::endl; + } + exit(1); + } + + opt_cmd = std::any_cast(_opt_cmd); + + /* some commands may have an optional extra param */ + if (!extra_args.empty()) { + switch (opt_cmd) { + case OPT::METADATA_GET: + case OPT::METADATA_PUT: + case OPT::METADATA_RM: + case OPT::METADATA_LIST: + metadata_key = extra_args[0]; + break; + default: + break; + } + } + + // not a raw op if 'period update' needs to commit to master + bool raw_period_update = opt_cmd == OPT::PERIOD_UPDATE && !commit; + // not a raw op if 'period pull' needs to read zone/period configuration + bool raw_period_pull = opt_cmd == OPT::PERIOD_PULL && !url.empty(); + + std::set raw_storage_ops_list = {OPT::ZONEGROUP_ADD, OPT::ZONEGROUP_CREATE, + OPT::ZONEGROUP_DELETE, + OPT::ZONEGROUP_GET, OPT::ZONEGROUP_LIST, + OPT::ZONEGROUP_SET, OPT::ZONEGROUP_DEFAULT, + OPT::ZONEGROUP_RENAME, OPT::ZONEGROUP_MODIFY, + OPT::ZONEGROUP_REMOVE, + OPT::ZONEGROUP_PLACEMENT_ADD, OPT::ZONEGROUP_PLACEMENT_RM, + OPT::ZONEGROUP_PLACEMENT_MODIFY, OPT::ZONEGROUP_PLACEMENT_LIST, + OPT::ZONEGROUP_PLACEMENT_GET, + OPT::ZONEGROUP_PLACEMENT_DEFAULT, + OPT::ZONE_CREATE, OPT::ZONE_DELETE, + OPT::ZONE_GET, OPT::ZONE_SET, OPT::ZONE_RENAME, + OPT::ZONE_LIST, OPT::ZONE_MODIFY, OPT::ZONE_DEFAULT, + OPT::ZONE_PLACEMENT_ADD, OPT::ZONE_PLACEMENT_RM, + OPT::ZONE_PLACEMENT_MODIFY, OPT::ZONE_PLACEMENT_LIST, + OPT::ZONE_PLACEMENT_GET, + OPT::REALM_CREATE, + OPT::PERIOD_DELETE, 
OPT::PERIOD_GET, + OPT::PERIOD_GET_CURRENT, OPT::PERIOD_LIST, + OPT::GLOBAL_QUOTA_GET, OPT::GLOBAL_QUOTA_SET, + OPT::GLOBAL_QUOTA_ENABLE, OPT::GLOBAL_QUOTA_DISABLE, + OPT::GLOBAL_RATELIMIT_GET, OPT::GLOBAL_RATELIMIT_SET, + OPT::GLOBAL_RATELIMIT_ENABLE, OPT::GLOBAL_RATELIMIT_DISABLE, + OPT::REALM_DELETE, OPT::REALM_GET, OPT::REALM_LIST, + OPT::REALM_LIST_PERIODS, + OPT::REALM_GET_DEFAULT, + OPT::REALM_RENAME, OPT::REALM_SET, + OPT::REALM_DEFAULT, OPT::REALM_PULL}; + + std::set readonly_ops_list = { + OPT::USER_INFO, + OPT::USER_STATS, + OPT::BUCKETS_LIST, + OPT::BUCKET_LIMIT_CHECK, + OPT::BUCKET_LAYOUT, + OPT::BUCKET_STATS, + OPT::BUCKET_SYNC_CHECKPOINT, + OPT::BUCKET_SYNC_INFO, + OPT::BUCKET_SYNC_STATUS, + OPT::BUCKET_SYNC_MARKERS, + OPT::BUCKET_SHARD_OBJECTS, + OPT::BUCKET_OBJECT_SHARD, + OPT::LOG_LIST, + OPT::LOG_SHOW, + OPT::USAGE_SHOW, + OPT::OBJECT_STAT, + OPT::BI_GET, + OPT::BI_LIST, + OPT::OLH_GET, + OPT::OLH_READLOG, + OPT::GC_LIST, + OPT::LC_LIST, + OPT::ORPHANS_LIST_JOBS, + OPT::ZONEGROUP_GET, + OPT::ZONEGROUP_LIST, + OPT::ZONEGROUP_PLACEMENT_LIST, + OPT::ZONEGROUP_PLACEMENT_GET, + OPT::ZONE_GET, + OPT::ZONE_LIST, + OPT::ZONE_PLACEMENT_LIST, + OPT::ZONE_PLACEMENT_GET, + OPT::METADATA_GET, + OPT::METADATA_LIST, + OPT::METADATA_SYNC_STATUS, + OPT::MDLOG_LIST, + OPT::MDLOG_STATUS, + OPT::SYNC_ERROR_LIST, + OPT::SYNC_GROUP_GET, + OPT::SYNC_POLICY_GET, + OPT::BILOG_LIST, + OPT::BILOG_STATUS, + OPT::DATA_SYNC_STATUS, + OPT::DATALOG_LIST, + OPT::DATALOG_STATUS, + OPT::REALM_GET, + OPT::REALM_GET_DEFAULT, + OPT::REALM_LIST, + OPT::REALM_LIST_PERIODS, + OPT::PERIOD_GET, + OPT::PERIOD_GET_CURRENT, + OPT::PERIOD_LIST, + OPT::GLOBAL_QUOTA_GET, + OPT::GLOBAL_RATELIMIT_GET, + OPT::SYNC_INFO, + OPT::SYNC_STATUS, + OPT::ROLE_GET, + OPT::ROLE_LIST, + OPT::ROLE_POLICY_LIST, + OPT::ROLE_POLICY_GET, + OPT::RESHARD_LIST, + OPT::RESHARD_STATUS, + OPT::PUBSUB_TOPIC_LIST, + OPT::PUBSUB_NOTIFICATION_LIST, + OPT::PUBSUB_TOPIC_GET, + OPT::PUBSUB_NOTIFICATION_GET, + OPT::SCRIPT_GET, 
+ }; + + std::set gc_ops_list = { + OPT::GC_LIST, + OPT::GC_PROCESS, + OPT::OBJECT_RM, + OPT::BUCKET_RM, // --purge-objects + OPT::USER_RM, // --purge-data + OPT::OBJECTS_EXPIRE, + OPT::OBJECTS_EXPIRE_STALE_RM, + OPT::LC_PROCESS, + OPT::BUCKET_SYNC_RUN, + OPT::DATA_SYNC_RUN, + OPT::BUCKET_REWRITE, + OPT::OBJECT_REWRITE + }; + + raw_storage_op = (raw_storage_ops_list.find(opt_cmd) != raw_storage_ops_list.end() || + raw_period_update || raw_period_pull); + bool need_cache = readonly_ops_list.find(opt_cmd) == readonly_ops_list.end(); + bool need_gc = (gc_ops_list.find(opt_cmd) != gc_ops_list.end()) && !bypass_gc; + + DriverManager::Config cfg = DriverManager::get_config(true, g_ceph_context); + + auto config_store_type = g_conf().get_val("rgw_config_store"); + cfgstore = DriverManager::create_config_store(dpp(), config_store_type); + if (!cfgstore) { + cerr << "couldn't init config storage provider" << std::endl; + return EIO; + } + + if (raw_storage_op) { + driver = DriverManager::get_raw_storage(dpp(), + g_ceph_context, + cfg); + } else { + driver = DriverManager::get_storage(dpp(), + g_ceph_context, + cfg, + false, + false, + false, + false, + false, + need_cache && g_conf()->rgw_cache_enabled, + need_gc); + } + if (!driver) { + cerr << "couldn't init storage provider" << std::endl; + return EIO; + } + + /* Needs to be after the driver is initialized. Note, user could be empty here. 
*/ + user = driver->get_user(user_id_arg); + + init_optional_bucket(opt_bucket, opt_tenant, + opt_bucket_name, opt_bucket_id); + init_optional_bucket(opt_source_bucket, opt_source_tenant, + opt_source_bucket_name, opt_source_bucket_id); + init_optional_bucket(opt_dest_bucket, opt_dest_tenant, + opt_dest_bucket_name, opt_dest_bucket_id); + + if (tenant.empty()) { + tenant = user->get_tenant(); + } else { + if (rgw::sal::User::empty(user) && opt_cmd != OPT::ROLE_CREATE + && opt_cmd != OPT::ROLE_DELETE + && opt_cmd != OPT::ROLE_GET + && opt_cmd != OPT::ROLE_TRUST_POLICY_MODIFY + && opt_cmd != OPT::ROLE_LIST + && opt_cmd != OPT::ROLE_POLICY_PUT + && opt_cmd != OPT::ROLE_POLICY_LIST + && opt_cmd != OPT::ROLE_POLICY_GET + && opt_cmd != OPT::ROLE_POLICY_DELETE + && opt_cmd != OPT::ROLE_UPDATE + && opt_cmd != OPT::RESHARD_ADD + && opt_cmd != OPT::RESHARD_CANCEL + && opt_cmd != OPT::RESHARD_STATUS + && opt_cmd != OPT::PUBSUB_TOPIC_LIST + && opt_cmd != OPT::PUBSUB_NOTIFICATION_LIST + && opt_cmd != OPT::PUBSUB_TOPIC_GET + && opt_cmd != OPT::PUBSUB_NOTIFICATION_GET + && opt_cmd != OPT::PUBSUB_TOPIC_RM + && opt_cmd != OPT::PUBSUB_NOTIFICATION_RM) { + cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl; + return EINVAL; + } + user->set_tenant(tenant); + } + if (user_ns.empty()) { + user_ns = user->get_id().ns; + } else { + user->set_ns(user_ns); + } + + if (!new_user_id.empty() && !tenant.empty()) { + new_user_id.tenant = tenant; + } + + /* check key parameter conflict */ + if ((!access_key.empty()) && gen_access_key) { + cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl; + return EINVAL; + } + if ((!secret_key.empty()) && gen_secret_key) { + cerr << "ERROR: key parameter conflict, --secret & --gen-secret" << std::endl; + return EINVAL; + } + } + + // default to pretty json + if (format.empty()) { + format = "json"; + pretty_format = true; + } + + if (format == "xml") + formatter = make_unique(new 
XMLFormatter(pretty_format)); + else if (format == "json") + formatter = make_unique(new JSONFormatter(pretty_format)); + else { + cerr << "unrecognized format: " << format << std::endl; + exit(1); + } + + zone_formatter = std::make_unique(pretty_format); + + realm_name = g_conf()->rgw_realm; + zone_name = g_conf()->rgw_zone; + zonegroup_name = g_conf()->rgw_zonegroup; + + if (!realm_name.empty()) { + opt_realm_name = realm_name; + } + + if (!zone_name.empty()) { + opt_zone_name = zone_name; + } + + if (!zonegroup_name.empty()) { + opt_zonegroup_name = zonegroup_name; + } + + RGWStreamFlusher stream_flusher(formatter.get(), cout); + + RGWUserAdminOpState user_op(driver); + if (!user_email.empty()) { + user_op.user_email_specified=true; + } + + if (!source_zone_name.empty()) { + std::unique_ptr zone; + if (driver->get_zone()->get_zonegroup().get_zone_by_name(source_zone_name, &zone) < 0) { + cerr << "WARNING: cannot find source zone id for name=" << source_zone_name << std::endl; + source_zone = source_zone_name; + } else { + source_zone.id = zone->get_id(); + } + } + + rgw_http_client_init(g_ceph_context); + + struct rgw_curl_setup { + rgw_curl_setup() { + rgw::curl::setup_curl(boost::none); + } + ~rgw_curl_setup() { + rgw::curl::cleanup_curl(); + } + } curl_cleanup; + + oath_init(); + + StoreDestructor store_destructor(driver); + + if (raw_storage_op) { + switch (opt_cmd) { + case OPT::PERIOD_DELETE: + { + if (period_id.empty()) { + cerr << "missing period id" << std::endl; + return EINVAL; + } + int ret = cfgstore->delete_period(dpp(), null_yield, period_id); + if (ret < 0) { + cerr << "ERROR: couldn't delete period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + } + break; + case OPT::PERIOD_GET: + { + std::optional epoch; + if (!period_epoch.empty()) { + epoch = atoi(period_epoch.c_str()); + } + if (staging) { + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 ) { + cerr 
<< "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + realm_id = realm.get_id(); + realm_name = realm.get_name(); + period_id = RGWPeriod::get_staging_id(realm_id); + epoch = 1; + } + if (period_id.empty()) { + // use realm's current period + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 ) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + period_id = realm.current_period; + } + + RGWPeriod period; + int ret = cfgstore->read_period(dpp(), null_yield, period_id, + epoch, period); + if (ret < 0) { + cerr << "failed to load period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("period", period, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::PERIOD_GET_CURRENT: + { + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0) { + std::cerr << "failed to load realm: " << cpp_strerror(ret) << std::endl; + return -ret; + } + + formatter->open_object_section("period_get_current"); + encode_json("current_period", realm.current_period, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + break; + case OPT::PERIOD_LIST: + { + Formatter::ObjectSection periods_list{*formatter, "periods_list"}; + Formatter::ArraySection periods{*formatter, "periods"}; + rgw::sal::ListResult listing; + std::array period_ids; // list in pages of 1000 + do { + int ret = cfgstore->list_period_ids(dpp(), null_yield, listing.next, + period_ids, listing); + if (ret < 0) { + std::cerr << "failed to list periods: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + for (const auto& id : listing.entries) { + encode_json("id", id, formatter.get()); + } + } while (!listing.next.empty()); + } // close sections periods and periods_list + formatter->flush(cout); + break; + case OPT::PERIOD_UPDATE: + { + int ret = 
update_period(cfgstore.get(), realm_id, realm_name, + period_epoch, commit, remote, url, + opt_region, access_key, secret_key, + formatter.get(), yes_i_really_mean_it); + if (ret < 0) { + return -ret; + } + } + break; + case OPT::PERIOD_PULL: + { + boost::optional conn; + RGWRESTConn *remote_conn = nullptr; + if (url.empty()) { + // load current period for endpoints + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 ) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + period_id = realm.current_period; + + RGWPeriod current_period; + ret = cfgstore->read_period(dpp(), null_yield, period_id, + std::nullopt, current_period); + if (ret < 0) { + cerr << "failed to load current period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (remote.empty()) { + // use realm master zone as remote + remote = current_period.get_master_zone().id; + } + conn = get_remote_conn(static_cast(driver), current_period.get_map(), remote); + if (!conn) { + cerr << "failed to find a zone or zonegroup for remote " + << remote << std::endl; + return -ENOENT; + } + remote_conn = &*conn; + } + + RGWPeriod period; + int ret = do_period_pull(cfgstore.get(), remote_conn, url, + opt_region, access_key, secret_key, + realm_id, realm_name, period_id, period_epoch, + &period); + if (ret < 0) { + cerr << "period pull failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("period", period, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::GLOBAL_RATELIMIT_GET: + case OPT::GLOBAL_RATELIMIT_SET: + case OPT::GLOBAL_RATELIMIT_ENABLE: + case OPT::GLOBAL_RATELIMIT_DISABLE: + { + if (realm_id.empty()) { + if (!realm_name.empty()) { + // look up realm_id for the given realm_name + int ret = cfgstore->read_realm_id(dpp(), null_yield, + realm_name, realm_id); + if (ret < 0) { + cerr << "ERROR: failed to read realm for " << realm_name + << ": 
" << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + // use default realm_id when none is given + int ret = cfgstore->read_default_realm_id(dpp(), null_yield, + realm_id); + if (ret < 0 && ret != -ENOENT) { // on ENOENT, use empty realm_id + cerr << "ERROR: failed to read default realm: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + RGWPeriodConfig period_config; + int ret = cfgstore->read_period_config(dpp(), null_yield, realm_id, + period_config); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: failed to read period config: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + bool ratelimit_configured = true; + formatter->open_object_section("period_config"); + if (ratelimit_scope == "bucket") { + ratelimit_configured = set_ratelimit_info(period_config.bucket_ratelimit, opt_cmd, + max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + encode_json("bucket_ratelimit", period_config.bucket_ratelimit, formatter.get()); + } else if (ratelimit_scope == "user") { + ratelimit_configured = set_ratelimit_info(period_config.user_ratelimit, opt_cmd, + max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + encode_json("user_ratelimit", period_config.user_ratelimit, formatter.get()); + } else if (ratelimit_scope == "anonymous") { + ratelimit_configured = set_ratelimit_info(period_config.anon_ratelimit, opt_cmd, + max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + encode_json("anonymous_ratelimit", period_config.anon_ratelimit, formatter.get()); + } else if (ratelimit_scope.empty() && opt_cmd == OPT::GLOBAL_RATELIMIT_GET) { + // if no scope is given for GET, print both + encode_json("bucket_ratelimit", 
period_config.bucket_ratelimit, formatter.get()); + encode_json("user_ratelimit", period_config.user_ratelimit, formatter.get()); + encode_json("anonymous_ratelimit", period_config.anon_ratelimit, formatter.get()); + } else { + cerr << "ERROR: invalid rate limit scope specification. Please specify " + "either --ratelimit-scope=bucket, or --ratelimit-scope=user or --ratelimit-scope=anonymous" << std::endl; + return EINVAL; + } + if (!ratelimit_configured) { + cerr << "ERROR: no rate limit values have been specified" << std::endl; + return EINVAL; + } + + formatter->close_section(); + + if (opt_cmd != OPT::GLOBAL_RATELIMIT_GET) { + // write the modified period config + constexpr bool exclusive = false; + ret = cfgstore->write_period_config(dpp(), null_yield, exclusive, + realm_id, period_config); + if (ret < 0) { + cerr << "ERROR: failed to write period config: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (!realm_id.empty()) { + cout << "Global ratelimit changes saved. Use 'period update' to apply " + "them to the staging period, and 'period commit' to commit the " + "new period." << std::endl; + } else { + cout << "Global ratelimit changes saved. They will take effect as " + "the gateways are restarted." 
<< std::endl; + } + } + + formatter->flush(cout); + } + break; + case OPT::GLOBAL_QUOTA_GET: + case OPT::GLOBAL_QUOTA_SET: + case OPT::GLOBAL_QUOTA_ENABLE: + case OPT::GLOBAL_QUOTA_DISABLE: + { + if (realm_id.empty()) { + if (!realm_name.empty()) { + // look up realm_id for the given realm_name + int ret = cfgstore->read_realm_id(dpp(), null_yield, + realm_name, realm_id); + if (ret < 0) { + cerr << "ERROR: failed to read realm for " << realm_name + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + // use default realm_id when none is given + int ret = cfgstore->read_default_realm_id(dpp(), null_yield, + realm_id); + if (ret < 0 && ret != -ENOENT) { // on ENOENT, use empty realm_id + cerr << "ERROR: failed to read default realm: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + RGWPeriodConfig period_config; + int ret = cfgstore->read_period_config(dpp(), null_yield, realm_id, + period_config); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: failed to read period config: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + + formatter->open_object_section("period_config"); + if (quota_scope == "bucket") { + set_quota_info(period_config.quota.bucket_quota, opt_cmd, + max_size, max_objects, + have_max_size, have_max_objects); + encode_json("bucket quota", period_config.quota.bucket_quota, formatter.get()); + } else if (quota_scope == "user") { + set_quota_info(period_config.quota.user_quota, opt_cmd, + max_size, max_objects, + have_max_size, have_max_objects); + encode_json("user quota", period_config.quota.user_quota, formatter.get()); + } else if (quota_scope.empty() && opt_cmd == OPT::GLOBAL_QUOTA_GET) { + // if no scope is given for GET, print both + encode_json("bucket quota", period_config.quota.bucket_quota, formatter.get()); + encode_json("user quota", period_config.quota.user_quota, formatter.get()); + } else { + cerr << "ERROR: invalid quota scope specification. 
Please specify " + "either --quota-scope=bucket, or --quota-scope=user" << std::endl; + return EINVAL; + } + formatter->close_section(); + + if (opt_cmd != OPT::GLOBAL_QUOTA_GET) { + // write the modified period config + constexpr bool exclusive = false; + ret = cfgstore->write_period_config(dpp(), null_yield, exclusive, + realm_id, period_config); + if (ret < 0) { + cerr << "ERROR: failed to write period config: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (!realm_id.empty()) { + cout << "Global quota changes saved. Use 'period update' to apply " + "them to the staging period, and 'period commit' to commit the " + "new period." << std::endl; + } else { + cout << "Global quota changes saved. They will take effect as " + "the gateways are restarted." << std::endl; + } + } + + formatter->flush(cout); + } + break; + case OPT::REALM_CREATE: + { + if (realm_name.empty()) { + cerr << "missing realm name" << std::endl; + return EINVAL; + } + + RGWRealm realm; + realm.name = realm_name; + + constexpr bool exclusive = true; + int ret = rgw::create_realm(dpp(), null_yield, cfgstore.get(), + exclusive, realm); + if (ret < 0) { + cerr << "ERROR: couldn't create realm " << realm_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm); + if (ret < 0) { + cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("realm", realm, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::REALM_DELETE: + { + if (realm_id.empty() && realm_name.empty()) { + cerr << "missing realm name or id" << std::endl; + return EINVAL; + } + RGWRealm realm; + std::unique_ptr writer; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm, &writer); + if (ret < 0) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = 
writer->remove(dpp(), null_yield); + if (ret < 0) { + cerr << "failed to remove realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + } + break; + case OPT::REALM_GET: + { + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0) { + if (ret == -ENOENT && realm_name.empty() && realm_id.empty()) { + cerr << "missing realm name or id, or default realm not found" << std::endl; + } else { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + encode_json("realm", realm, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::REALM_GET_DEFAULT: + { + string default_id; + int ret = cfgstore->read_default_realm_id(dpp(), null_yield, default_id); + if (ret == -ENOENT) { + cout << "No default realm is set" << std::endl; + return -ret; + } else if (ret < 0) { + cerr << "Error reading default realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + cout << "default realm: " << default_id << std::endl; + } + break; + case OPT::REALM_LIST: + { + std::string default_id; + int ret = cfgstore->read_default_realm_id(dpp(), null_yield, + default_id); + if (ret < 0 && ret != -ENOENT) { + cerr << "could not determine default realm: " << cpp_strerror(-ret) << std::endl; + } + + Formatter::ObjectSection realms_list{*formatter, "realms_list"}; + encode_json("default_info", default_id, formatter.get()); + + Formatter::ArraySection realms{*formatter, "realms"}; + rgw::sal::ListResult listing; + std::array names; // list in pages of 1000 + do { + ret = cfgstore->list_realm_names(dpp(), null_yield, listing.next, + names, listing); + if (ret < 0) { + std::cerr << "failed to list realms: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + for (const auto& name : listing.entries) { + encode_json("name", name, formatter.get()); + } + } while (!listing.next.empty()); + } // close sections realms and realms_list + formatter->flush(cout); + break; 
+ case OPT::REALM_LIST_PERIODS: + { + // use realm's current period + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + period_id = realm.current_period; + + Formatter::ObjectSection periods_list{*formatter, "realm_periods_list"}; + encode_json("current_period", period_id, formatter.get()); + + Formatter::ArraySection periods{*formatter, "periods"}; + + while (!period_id.empty()) { + RGWPeriod period; + ret = cfgstore->read_period(dpp(), null_yield, period_id, + std::nullopt, period); + if (ret < 0) { + cerr << "failed to load period id " << period_id + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("id", period_id, formatter.get()); + period_id = period.predecessor_uuid; + } + } // close sections periods and realm_periods_list + formatter->flush(cout); + break; + + case OPT::REALM_RENAME: + { + if (realm_new_name.empty()) { + cerr << "missing realm new name" << std::endl; + return EINVAL; + } + if (realm_name.empty() && realm_id.empty()) { + cerr << "missing realm name or id" << std::endl; + return EINVAL; + } + + RGWRealm realm; + std::unique_ptr writer; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm, &writer); + if (ret < 0) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = writer->rename(dpp(), null_yield, realm, realm_new_name); + if (ret < 0) { + cerr << "rename failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + cout << "Realm name updated. Note that this change only applies to " + "the current cluster, so this command must be run separately " + "on each of the realm's other clusters." 
<< std::endl; + } + break; + case OPT::REALM_SET: + { + if (realm_id.empty() && realm_name.empty()) { + cerr << "no realm name or id provided" << std::endl; + return EINVAL; + } + bool new_realm = false; + RGWRealm realm; + std::unique_ptr writer; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm, &writer); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } else if (ret == -ENOENT) { + new_realm = true; + } + ret = read_decode_json(infile, realm); + if (ret < 0) { + return 1; + } + if (!realm_name.empty() && realm.get_name() != realm_name) { + cerr << "mismatch between --rgw-realm " << realm_name << " and json input file name " << + realm.get_name() << std::endl; + return EINVAL; + } + /* new realm */ + if (new_realm) { + cout << "clearing period and epoch for new realm" << std::endl; + realm.clear_current_period_and_epoch(); + constexpr bool exclusive = true; + ret = rgw::create_realm(dpp(), null_yield, cfgstore.get(), + exclusive, realm); + if (ret < 0) { + cerr << "ERROR: couldn't create new realm: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } else { + ret = writer->write(dpp(), null_yield, realm); + if (ret < 0) { + cerr << "ERROR: couldn't driver realm info: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (set_default) { + ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm); + if (ret < 0) { + cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + encode_json("realm", realm, formatter.get()); + formatter->flush(cout); + } + break; + + case OPT::REALM_DEFAULT: + { + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), 
realm); + if (ret < 0) { + cerr << "failed to set realm as default: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::REALM_PULL: + { + if (url.empty()) { + cerr << "A --url must be provided." << std::endl; + return EINVAL; + } + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "GET"; + info.request_uri = "/admin/realm"; + + map ¶ms = info.args.get_params(); + if (!realm_id.empty()) + params["id"] = realm_id; + if (!realm_name.empty()) + params["name"] = realm_name; + + bufferlist bl; + JSONParser p; + int ret = send_to_url(url, opt_region, access_key, secret_key, info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + if (ret == -EACCES) { + cerr << "If the realm has been changed on the master zone, the " + "master zone's gateway may need to be restarted to recognize " + "this user." << std::endl; + } + return -ret; + } + RGWRealm realm; + try { + decode_json_obj(realm, &p); + } catch (const JSONDecoder::err& e) { + cerr << "failed to decode JSON response: " << e.what() << std::endl; + return EINVAL; + } + RGWPeriod period; + auto& current_period = realm.get_current_period(); + if (!current_period.empty()) { + // pull the latest epoch of the realm's current period + ret = do_period_pull(cfgstore.get(), nullptr, url, opt_region, + access_key, secret_key, + realm_id, realm_name, current_period, "", + &period); + if (ret < 0) { + cerr << "could not fetch period " << current_period << std::endl; + return -ret; + } + } + constexpr bool exclusive = false; + ret = rgw::create_realm(dpp(), null_yield, cfgstore.get(), + exclusive, realm); + if (ret < 0) { + cerr << "Error storing realm " << realm.get_id() << ": " + << cpp_strerror(ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm); + if (ret < 0) { + cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl; + 
} + } + + encode_json("realm", realm, formatter.get()); + formatter->flush(cout); + } + break; + + case OPT::ZONEGROUP_ADD: + { + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + + // load the zonegroup and zone params + RGWZoneGroup zonegroup; + std::unique_ptr zonegroup_writer; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &zonegroup_writer); + if (ret < 0) { + cerr << "failed to load zonegroup " << zonegroup_name << " id " + << zonegroup_id << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneParams zone_params; + std::unique_ptr zone_writer; + ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone_params, &zone_writer); + if (ret < 0) { + cerr << "unable to load zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + // update zone_params if necessary + bool need_zone_update = false; + + if (zone_params.realm_id != zonegroup.realm_id) { + if (!zone_params.realm_id.empty()) { + cerr << "WARNING: overwriting zone realm_id=" << zone_params.realm_id + << " to match zonegroup realm_id=" << zonegroup.realm_id << std::endl; + } + zone_params.realm_id = zonegroup.realm_id; + need_zone_update = true; + } + + for (auto a : tier_config_add) { + ret = zone_params.tier_config.set(a.first, a.second); + if (ret < 0) { + cerr << "ERROR: failed to set configurable: " << a << std::endl; + return EINVAL; + } + need_zone_update = true; + } + + if (need_zone_update) { + ret = zone_writer->write(dpp(), null_yield, zone_params); + if (ret < 0) { + cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + const bool *pis_master = (is_master_set ? &is_master : nullptr); + const bool *pread_only = (is_read_only_set ? &read_only : nullptr); + const bool *psync_from_all = (sync_from_all_specified ? 
&sync_from_all : nullptr); + const string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr); + + // validate --tier-type if specified + const string *ptier_type = (tier_type_specified ? &tier_type : nullptr); + if (ptier_type) { + auto sync_mgr = static_cast(driver)->svc()->sync_modules->get_manager(); + if (!sync_mgr->get_module(*ptier_type, nullptr)) { + ldpp_dout(dpp(), -1) << "ERROR: could not find sync module: " + << *ptier_type << ", valid sync modules: " + << sync_mgr->get_registered_module_names() << dendl; + return EINVAL; + } + } + + if (enable_features.empty()) { // enable all features by default + enable_features.insert(rgw::zone_features::supported.begin(), + rgw::zone_features::supported.end()); + } + + // add/update the public zone information stored in the zonegroup + ret = rgw::add_zone_to_group(dpp(), zonegroup, zone_params, + pis_master, pread_only, endpoints, + ptier_type, psync_from_all, + sync_from, sync_from_rm, + predirect_zone, bucket_index_max_shards, + enable_features, disable_features); + if (ret < 0) { + return -ret; + } + + // write the updated zonegroup + ret = zonegroup_writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to write updated zonegroup " << zonegroup.get_name() + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zonegroup", zonegroup, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_CREATE: + { + if (zonegroup_name.empty()) { + cerr << "Missing zonegroup name" << std::endl; + return EINVAL; + } + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneGroup zonegroup; + zonegroup.name = zonegroup_name; + zonegroup.is_master = is_master; + zonegroup.realm_id = realm.get_id(); + zonegroup.endpoints = endpoints; + zonegroup.api_name = (api_name.empty() ? 
zonegroup_name : api_name); + + zonegroup.enabled_features = enable_features; + if (zonegroup.enabled_features.empty()) { // enable features by default + zonegroup.enabled_features.insert(rgw::zone_features::enabled.begin(), + rgw::zone_features::enabled.end()); + } + for (const auto& feature : disable_features) { + auto i = zonegroup.enabled_features.find(feature); + if (i == zonegroup.enabled_features.end()) { + ldout(cct, 1) << "WARNING: zone feature \"" << feature + << "\" was not enabled in zonegroup " << zonegroup_name << dendl; + continue; + } + zonegroup.enabled_features.erase(i); + } + + constexpr bool exclusive = true; + ret = rgw::create_zonegroup(dpp(), null_yield, cfgstore.get(), + exclusive, zonegroup); + if (ret < 0) { + cerr << "failed to create zonegroup " << zonegroup_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup); + if (ret < 0) { + cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zonegroup", zonegroup, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_DEFAULT: + { + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + + RGWZoneGroup zonegroup; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup); + if (ret < 0) { + cerr << "failed to set zonegroup as default: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::ZONEGROUP_DELETE: + { + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + 
return EINVAL; + } + RGWZoneGroup zonegroup; + std::unique_ptr writer; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &writer); + if (ret < 0) { + cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = writer->remove(dpp(), null_yield); + if (ret < 0) { + cerr << "ERROR: couldn't delete zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::ZONEGROUP_GET: + { + RGWZoneGroup zonegroup; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, zonegroup); + if (ret < 0) { + cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zonegroup", zonegroup, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_LIST: + { + RGWZoneGroup default_zonegroup; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + {}, {}, default_zonegroup); + if (ret < 0 && ret != -ENOENT) { + cerr << "could not determine default zonegroup: " << cpp_strerror(-ret) << std::endl; + } + + Formatter::ObjectSection zonegroups_list{*formatter, "zonegroups_list"}; + encode_json("default_info", default_zonegroup.id, formatter.get()); + + Formatter::ArraySection zonegroups{*formatter, "zonegroups"}; + rgw::sal::ListResult listing; + std::array names; // list in pages of 1000 + do { + ret = cfgstore->list_zonegroup_names(dpp(), null_yield, listing.next, + names, listing); + if (ret < 0) { + std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + for (const auto& name : listing.entries) { + encode_json("name", name, formatter.get()); + } + } while (!listing.next.empty()); + } // close sections zonegroups and zonegroups_list + formatter->flush(cout); + break; + case OPT::ZONEGROUP_MODIFY: + { + RGWZoneGroup zonegroup; + std::unique_ptr writer; + int ret = rgw::read_zonegroup(dpp(), 
null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &writer); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bool need_update = false; + + if (!master_zone.empty()) { + zonegroup.master_zone = master_zone; + need_update = true; + } + + if (is_master_set) { + zonegroup.is_master = is_master; + need_update = true; + } + + if (!endpoints.empty()) { + zonegroup.endpoints = endpoints; + need_update = true; + } + + if (!api_name.empty()) { + zonegroup.api_name = api_name; + need_update = true; + } + + if (!realm_id.empty()) { + zonegroup.realm_id = realm_id; + need_update = true; + } else if (!realm_name.empty()) { + // get realm id from name + ret = cfgstore->read_realm_id(dpp(), null_yield, realm_name, + zonegroup.realm_id); + if (ret < 0) { + cerr << "failed to find realm by name " << realm_name << std::endl; + return -ret; + } + need_update = true; + } + + if (bucket_index_max_shards) { + for (auto& [name, zone] : zonegroup.zones) { + zone.bucket_index_max_shards = *bucket_index_max_shards; + } + need_update = true; + } + + for (const auto& feature : enable_features) { + zonegroup.enabled_features.insert(feature); + need_update = true; + } + for (const auto& feature : disable_features) { + auto i = zonegroup.enabled_features.find(feature); + if (i == zonegroup.enabled_features.end()) { + ldout(cct, 1) << "WARNING: zone feature \"" << feature + << "\" was not enabled in zonegroup " + << zonegroup.get_name() << dendl; + continue; + } + zonegroup.enabled_features.erase(i); + need_update = true; + } + + if (need_update) { + ret = writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (set_default) { + ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup); + if (ret < 0) { + cerr << "failed to set zonegroup " << zonegroup_name << " as default: " 
<< cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zonegroup", zonegroup, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_SET: + { + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + bool default_realm_not_exist = (ret == -ENOENT && realm_id.empty() && realm_name.empty()); + + if (ret < 0 && !default_realm_not_exist) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneGroup zonegroup; + ret = read_decode_json(infile, zonegroup); + if (ret < 0) { + return 1; + } + if (zonegroup.realm_id.empty() && !default_realm_not_exist) { + zonegroup.realm_id = realm.get_id(); + } + // validate zonegroup features + for (const auto& feature : zonegroup.enabled_features) { + if (!rgw::zone_features::supports(feature)) { + std::cerr << "ERROR: Unrecognized zonegroup feature \"" + << feature << "\"" << std::endl; + return EINVAL; + } + } + for (const auto& [name, zone] : zonegroup.zones) { + // validate zone features + for (const auto& feature : zone.supported_features) { + if (!rgw::zone_features::supports(feature)) { + std::cerr << "ERROR: Unrecognized zone feature \"" + << feature << "\" in zone " << zone.name << std::endl; + return EINVAL; + } + } + // zone must support everything zonegroup does + for (const auto& feature : zonegroup.enabled_features) { + if (!zone.supports(feature)) { + std::cerr << "ERROR: Zone " << name << " does not support feature \"" + << feature << "\" required by zonegroup" << std::endl; + return EINVAL; + } + } + } + + // create/overwrite the zonegroup info + constexpr bool exclusive = false; + ret = rgw::create_zonegroup(dpp(), null_yield, cfgstore.get(), + exclusive, zonegroup); + if (ret < 0) { + cerr << "ERROR: couldn't create zonegroup info: " << cpp_strerror(-ret) << std::endl; + return 1; + } + + if (set_default) { + ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(), + 
zonegroup); + if (ret < 0) { + cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zonegroup", zonegroup, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_REMOVE: + { + RGWZoneGroup zonegroup; + std::unique_ptr writer; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &writer); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (zone_id.empty()) { + if (zone_name.empty()) { + cerr << "no --zone-id or --rgw-zone name provided" << std::endl; + return EINVAL; + } + // look up zone id by name + for (auto& z : zonegroup.zones) { + if (zone_name == z.second.name) { + zone_id = z.second.id; + break; + } + } + if (zone_id.empty()) { + cerr << "zone name " << zone_name << " not found in zonegroup " + << zonegroup.get_name() << std::endl; + return ENOENT; + } + } + + ret = rgw::remove_zone_from_group(dpp(), zonegroup, zone_id); + if (ret < 0) { + cerr << "failed to remove zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to write zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zonegroup", zonegroup, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_RENAME: + { + if (zonegroup_new_name.empty()) { + cerr << " missing zonegroup new name" << std::endl; + return EINVAL; + } + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + RGWZoneGroup zonegroup; + std::unique_ptr writer; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &writer); + if (ret < 0) { + cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl; + return 
-ret; + } + ret = writer->rename(dpp(), null_yield, zonegroup, zonegroup_new_name); + if (ret < 0) { + cerr << "failed to rename zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::ZONEGROUP_PLACEMENT_LIST: + { + RGWZoneGroup zonegroup; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, zonegroup); + if (ret < 0) { + cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("placement_targets", zonegroup.placement_targets, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_PLACEMENT_GET: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + + RGWZoneGroup zonegroup; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, zonegroup); + if (ret < 0) { + cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto p = zonegroup.placement_targets.find(placement_id); + if (p == zonegroup.placement_targets.end()) { + cerr << "failed to find a zonegroup placement target named '" << placement_id << "'" << std::endl; + return -ENOENT; + } + encode_json("placement_targets", p->second, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONEGROUP_PLACEMENT_ADD: + case OPT::ZONEGROUP_PLACEMENT_MODIFY: + case OPT::ZONEGROUP_PLACEMENT_RM: + case OPT::ZONEGROUP_PLACEMENT_DEFAULT: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + + rgw_placement_rule rule; + rule.from_str(placement_id); + + if (!rule.storage_class.empty() && opt_storage_class && + rule.storage_class != *opt_storage_class) { + cerr << "ERROR: provided contradicting storage class configuration" << std::endl; + return EINVAL; + } else if (rule.storage_class.empty()) { + rule.storage_class = 
opt_storage_class.value_or(string()); + } + + RGWZoneGroup zonegroup; + std::unique_ptr writer; + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &writer); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (opt_cmd == OPT::ZONEGROUP_PLACEMENT_ADD || + opt_cmd == OPT::ZONEGROUP_PLACEMENT_MODIFY) { + RGWZoneGroupPlacementTarget& target = zonegroup.placement_targets[placement_id]; + if (!tags.empty()) { + target.tags.clear(); + for (auto& t : tags) { + target.tags.insert(t); + } + } + + target.name = placement_id; + for (auto& t : tags_rm) { + target.tags.erase(t); + } + for (auto& t : tags_add) { + target.tags.insert(t); + } + target.storage_classes.insert(rule.get_storage_class()); + + /* Tier options */ + bool tier_class = false; + std::string storage_class = rule.get_storage_class(); + RGWZoneGroupPlacementTier t{storage_class}; + RGWZoneGroupPlacementTier *pt = &t; + + auto ptiter = target.tier_targets.find(storage_class); + if (ptiter != target.tier_targets.end()) { + pt = &ptiter->second; + tier_class = true; + } else if (tier_type_specified) { + if (tier_type == "cloud-s3") { + /* we support only cloud-s3 tier-type for now. + * Once set cant be reset. 
*/ + tier_class = true; + pt->tier_type = tier_type; + pt->storage_class = storage_class; + } else { + cerr << "ERROR: Invalid tier-type specified" << std::endl; + return EINVAL; + } + } + + if (tier_class) { + if (tier_config_add.size() > 0) { + JSONFormattable tconfig; + for (auto add : tier_config_add) { + int r = tconfig.set(add.first, add.second); + if (r < 0) { + cerr << "ERROR: failed to set configurable: " << add << std::endl; + return EINVAL; + } + } + int r = pt->update_params(tconfig); + if (r < 0) { + cerr << "ERROR: failed to update tier_config options"<< std::endl; + } + } + if (tier_config_rm.size() > 0) { + JSONFormattable tconfig; + for (auto add : tier_config_rm) { + int r = tconfig.set(add.first, add.second); + if (r < 0) { + cerr << "ERROR: failed to set configurable: " << add << std::endl; + return EINVAL; + } + } + int r = pt->clear_params(tconfig); + if (r < 0) { + cerr << "ERROR: failed to update tier_config options"<< std::endl; + } + } + + target.tier_targets.emplace(std::make_pair(storage_class, *pt)); + } + + if (zonegroup.default_placement.empty()) { + zonegroup.default_placement.init(rule.name, RGW_STORAGE_CLASS_STANDARD); + } + } else if (opt_cmd == OPT::ZONEGROUP_PLACEMENT_RM) { + if (!opt_storage_class || opt_storage_class->empty()) { + zonegroup.placement_targets.erase(placement_id); + if (zonegroup.default_placement.name == placement_id) { + // clear default placement + zonegroup.default_placement.clear(); + } + } else { + auto iter = zonegroup.placement_targets.find(placement_id); + if (iter != zonegroup.placement_targets.end()) { + RGWZoneGroupPlacementTarget& info = zonegroup.placement_targets[placement_id]; + info.storage_classes.erase(*opt_storage_class); + + if (zonegroup.default_placement == rule) { + // clear default storage class + zonegroup.default_placement.storage_class.clear(); + } + + auto ptiter = info.tier_targets.find(*opt_storage_class); + if (ptiter != info.tier_targets.end()) { + 
info.tier_targets.erase(ptiter); + } + } + } + } else if (opt_cmd == OPT::ZONEGROUP_PLACEMENT_DEFAULT) { + if (!zonegroup.placement_targets.count(placement_id)) { + cerr << "failed to find a zonegroup placement target named '" + << placement_id << "'" << std::endl; + return -ENOENT; + } + zonegroup.default_placement = rule; + } + + ret = writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("placement_targets", zonegroup.placement_targets, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_CREATE: + { + if (zone_name.empty()) { + cerr << "zone name not provided" << std::endl; + return EINVAL; + } + + RGWZoneGroup zonegroup; + std::unique_ptr zonegroup_writer; + /* if the user didn't provide zonegroup info , create stand alone zone */ + if (!zonegroup_id.empty() || !zonegroup_name.empty()) { + int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &zonegroup_writer); + if (ret < 0) { + cerr << "failed to load zonegroup " << zonegroup_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (realm_id.empty() && realm_name.empty()) { + realm_id = zonegroup.realm_id; + } + } + + // create the local zone params + RGWZoneParams zone_params; + zone_params.id = zone_id; + zone_params.name = zone_name; + + zone_params.system_key.id = access_key; + zone_params.system_key.key = secret_key; + zone_params.realm_id = realm_id; + for (const auto& a : tier_config_add) { + int r = zone_params.tier_config.set(a.first, a.second); + if (r < 0) { + cerr << "ERROR: failed to set configurable: " << a << std::endl; + return EINVAL; + } + } + + if (zone_params.realm_id.empty()) { + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to load realm: " << 
cpp_strerror(-ret) << std::endl; + return -ret; + } + zone_params.realm_id = realm.id; + cerr << "NOTICE: set zone's realm_id=" << realm.id << std::endl; + } + + constexpr bool exclusive = true; + int ret = rgw::create_zone(dpp(), null_yield, cfgstore.get(), + exclusive, zone_params); + if (ret < 0) { + cerr << "failed to create zone " << zone_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (zonegroup_writer) { + const bool *pis_master = (is_master_set ? &is_master : nullptr); + const bool *pread_only = (is_read_only_set ? &read_only : nullptr); + const bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr); + const string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr); + + // validate --tier-type if specified + const string *ptier_type = (tier_type_specified ? &tier_type : nullptr); + if (ptier_type) { + auto sync_mgr = static_cast(driver)->svc()->sync_modules->get_manager(); + if (!sync_mgr->get_module(*ptier_type, nullptr)) { + ldpp_dout(dpp(), -1) << "ERROR: could not find sync module: " + << *ptier_type << ", valid sync modules: " + << sync_mgr->get_registered_module_names() << dendl; + return EINVAL; + } + } + + if (enable_features.empty()) { // enable all features by default + enable_features.insert(rgw::zone_features::supported.begin(), + rgw::zone_features::supported.end()); + } + + // add/update the public zone information stored in the zonegroup + ret = rgw::add_zone_to_group(dpp(), zonegroup, zone_params, + pis_master, pread_only, endpoints, + ptier_type, psync_from_all, + sync_from, sync_from_rm, + predirect_zone, bucket_index_max_shards, + enable_features, disable_features); + if (ret < 0) { + return -ret; + } + + // write the updated zonegroup + ret = zonegroup_writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to add zone " << zone_name << " to zonegroup " << zonegroup.get_name() + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if 
(set_default) { + ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(), + zone_params); + if (ret < 0) { + cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zone", zone_params, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_DEFAULT: + { + if (zone_id.empty() && zone_name.empty()) { + cerr << "no zone name or id provided" << std::endl; + return EINVAL; + } + RGWZoneParams zone_params; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone_params); + if (ret < 0) { + cerr << "unable to load zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(), + zone_params); + if (ret < 0) { + cerr << "failed to set zone as default: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::ZONE_DELETE: + { + if (zone_id.empty() && zone_name.empty()) { + cerr << "no zone name or id provided" << std::endl; + return EINVAL; + } + RGWZoneParams zone_params; + std::unique_ptr writer; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone_params, &writer); + if (ret < 0) { + cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = rgw::delete_zone(dpp(), null_yield, cfgstore.get(), + zone_params, *writer); + if (ret < 0) { + cerr << "failed to delete zone " << zone_params.get_name() + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::ZONE_GET: + { + RGWZoneParams zone_params; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone_params); + if (ret < 0) { + cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("zone", zone_params, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_SET: + { + RGWZoneParams zone; + std::unique_ptr writer; + 
int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone, &writer); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to load zone: " << cpp_strerror(ret) << std::endl; + return -ret; + } + + string orig_id = zone.get_id(); + + ret = read_decode_json(infile, zone); + if (ret < 0) { + return 1; + } + + if (zone.realm_id.empty()) { + RGWRealm realm; + ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + zone.realm_id = realm.get_id(); + cerr << "NOTICE: set zone's realm_id=" << zone.realm_id << std::endl; + } + + if (!zone_name.empty() && !zone.get_name().empty() && zone.get_name() != zone_name) { + cerr << "Error: zone name " << zone_name << " is different than the zone name " << zone.get_name() << " in the provided json " << std::endl; + return EINVAL; + } + + if (zone.get_name().empty()) { + zone.set_name(zone_name); + if (zone.get_name().empty()) { + cerr << "no zone name specified" << std::endl; + return EINVAL; + } + } + + zone_name = zone.get_name(); + + if (zone.get_id().empty()) { + zone.set_id(orig_id); + } + + constexpr bool exclusive = false; + ret = rgw::create_zone(dpp(), null_yield, cfgstore.get(), + exclusive, zone); + if (ret < 0) { + cerr << "ERROR: couldn't create zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(), zone); + if (ret < 0) { + cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zone", zone, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_LIST: + { + RGWZoneParams default_zone_params; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + {}, {}, default_zone_params); + if (ret < 0 && ret != -ENOENT) { + cerr << "could not determine default 
zone: " << cpp_strerror(-ret) << std::endl; + } + + Formatter::ObjectSection zones_list{*formatter, "zones_list"}; + encode_json("default_info", default_zone_params.id, formatter.get()); + + Formatter::ArraySection zones{*formatter, "zones"}; + rgw::sal::ListResult listing; + std::array names; // list in pages of 1000 + do { + ret = cfgstore->list_zone_names(dpp(), null_yield, listing.next, + names, listing); + if (ret < 0) { + std::cerr << "failed to list zones: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + for (const auto& name : listing.entries) { + encode_json("name", name, formatter.get()); + } + } while (!listing.next.empty()); + } // close sections zones and zones_list + formatter->flush(cout); + break; + case OPT::ZONE_MODIFY: + { + RGWZoneParams zone_params; + std::unique_ptr zone_writer; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone_params, &zone_writer); + if (ret < 0) { + cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bool need_zone_update = false; + if (!access_key.empty()) { + zone_params.system_key.id = access_key; + need_zone_update = true; + } + + if (!secret_key.empty()) { + zone_params.system_key.key = secret_key; + need_zone_update = true; + } + + if (!realm_id.empty()) { + zone_params.realm_id = realm_id; + need_zone_update = true; + } else if (!realm_name.empty()) { + // get realm id from name + ret = cfgstore->read_realm_id(dpp(), null_yield, + realm_name, zone_params.realm_id); + if (ret < 0) { + cerr << "failed to find realm by name " << realm_name << std::endl; + return -ret; + } + need_zone_update = true; + } + + for (const auto& add : tier_config_add) { + ret = zone_params.tier_config.set(add.first, add.second); + if (ret < 0) { + cerr << "ERROR: failed to set configurable: " << add << std::endl; + return EINVAL; + } + need_zone_update = true; + } + + for (const auto& rm : tier_config_rm) { + if (!rm.first.empty()) { /* otherwise will 
remove the entire config */ + zone_params.tier_config.erase(rm.first); + need_zone_update = true; + } + } + + if (need_zone_update) { + ret = zone_writer->write(dpp(), null_yield, zone_params); + if (ret < 0) { + cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + RGWZoneGroup zonegroup; + std::unique_ptr zonegroup_writer; + ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &zonegroup_writer); + if (ret < 0) { + cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + const bool *pis_master = (is_master_set ? &is_master : nullptr); + const bool *pread_only = (is_read_only_set ? &read_only : nullptr); + const bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr); + const string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr); + + // validate --tier-type if specified + const string *ptier_type = (tier_type_specified ? 
&tier_type : nullptr); + if (ptier_type) { + auto sync_mgr = static_cast(driver)->svc()->sync_modules->get_manager(); + if (!sync_mgr->get_module(*ptier_type, nullptr)) { + ldpp_dout(dpp(), -1) << "ERROR: could not find sync module: " + << *ptier_type << ", valid sync modules: " + << sync_mgr->get_registered_module_names() << dendl; + return EINVAL; + } + } + + if (enable_features.empty()) { // enable all features by default + enable_features.insert(rgw::zone_features::supported.begin(), + rgw::zone_features::supported.end()); + } + + // add/update the public zone information stored in the zonegroup + ret = rgw::add_zone_to_group(dpp(), zonegroup, zone_params, + pis_master, pread_only, endpoints, + ptier_type, psync_from_all, + sync_from, sync_from_rm, + predirect_zone, bucket_index_max_shards, + enable_features, disable_features); + if (ret < 0) { + return -ret; + } + + // write the updated zonegroup + ret = zonegroup_writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(), + zone_params); + if (ret < 0) { + cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zone", zone_params, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_RENAME: + { + if (zone_new_name.empty()) { + cerr << " missing zone new name" << std::endl; + return EINVAL; + } + if (zone_id.empty() && zone_name.empty()) { + cerr << "no zone name or id provided" << std::endl; + return EINVAL; + } + + RGWZoneParams zone_params; + std::unique_ptr zone_writer; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone_params, &zone_writer); + if (ret < 0) { + cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = zone_writer->rename(dpp(), null_yield, 
zone_params, zone_new_name); + if (ret < 0) { + cerr << "failed to rename zone " << zone_name << " to " << zone_new_name << ": " << cpp_strerror(-ret) + << std::endl; + return -ret; + } + + RGWZoneGroup zonegroup; + std::unique_ptr zonegroup_writer; + ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, + zonegroup, &zonegroup_writer); + if (ret < 0) { + cerr << "WARNING: failed to load zonegroup " << zonegroup_name << std::endl; + return EXIT_SUCCESS; + } + + auto z = zonegroup.zones.find(zone_params.id); + if (z == zonegroup.zones.end()) { + return EXIT_SUCCESS; + } + z->second.name = zone_params.name; + + ret = zonegroup_writer->write(dpp(), null_yield, zonegroup); + if (ret < 0) { + cerr << "Error in zonegroup rename for " << zone_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT::ZONE_PLACEMENT_ADD: + case OPT::ZONE_PLACEMENT_MODIFY: + case OPT::ZONE_PLACEMENT_RM: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + // validate compression type + if (compression_type && *compression_type != "random" + && !Compressor::get_comp_alg_type(*compression_type)) { + std::cerr << "Unrecognized compression type" << std::endl; + return EINVAL; + } + + RGWZoneParams zone; + std::unique_ptr writer; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone, &writer); + if (ret < 0) { + cerr << "failed to init zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (opt_cmd == OPT::ZONE_PLACEMENT_ADD || + opt_cmd == OPT::ZONE_PLACEMENT_MODIFY) { + RGWZoneGroup zonegroup; + ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(), + zonegroup_id, zonegroup_name, zonegroup); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto ptiter = zonegroup.placement_targets.find(placement_id); + if (ptiter == 
zonegroup.placement_targets.end()) { + cerr << "ERROR: placement id '" << placement_id << "' is not configured in zonegroup placement targets" << std::endl; + return EINVAL; + } + + string storage_class = rgw_placement_rule::get_canonical_storage_class(opt_storage_class.value_or(string())); + if (ptiter->second.storage_classes.find(storage_class) == ptiter->second.storage_classes.end()) { + cerr << "ERROR: storage class '" << storage_class << "' is not defined in zonegroup '" << placement_id << "' placement target" << std::endl; + return EINVAL; + } + if (ptiter->second.tier_targets.find(storage_class) != ptiter->second.tier_targets.end()) { + cerr << "ERROR: storage class '" << storage_class << "' is of tier type in zonegroup '" << placement_id << "' placement target" << std::endl; + return EINVAL; + } + + RGWZonePlacementInfo& info = zone.placement_pools[placement_id]; + + string opt_index_pool = index_pool.value_or(string()); + string opt_data_pool = data_pool.value_or(string()); + + if (!opt_index_pool.empty()) { + info.index_pool = opt_index_pool; + } + + if (info.index_pool.empty()) { + cerr << "ERROR: index pool not configured, need to specify --index-pool" << std::endl; + return EINVAL; + } + + if (opt_data_pool.empty()) { + const RGWZoneStorageClass *porig_sc{nullptr}; + if (info.storage_classes.find(storage_class, &porig_sc)) { + if (porig_sc->data_pool) { + opt_data_pool = porig_sc->data_pool->to_str(); + } + } + if (opt_data_pool.empty()) { + cerr << "ERROR: data pool not configured, need to specify --data-pool" << std::endl; + return EINVAL; + } + } + + rgw_pool dp = opt_data_pool; + info.storage_classes.set_storage_class(storage_class, &dp, compression_type.get_ptr()); + + if (data_extra_pool) { + info.data_extra_pool = *data_extra_pool; + } + if (index_type_specified) { + info.index_type = placement_index_type; + } + if (placement_inline_data_specified) { + info.inline_data = placement_inline_data; + } + + ret = 
check_pool_support_omap(info.get_data_extra_pool()); + if (ret < 0) { + cerr << "ERROR: the data extra (non-ec) pool '" << info.get_data_extra_pool() + << "' does not support omap" << std::endl; + return ret; + } + } else if (opt_cmd == OPT::ZONE_PLACEMENT_RM) { + if (!opt_storage_class || + opt_storage_class->empty()) { + zone.placement_pools.erase(placement_id); + } else { + auto iter = zone.placement_pools.find(placement_id); + if (iter != zone.placement_pools.end()) { + RGWZonePlacementInfo& info = zone.placement_pools[placement_id]; + info.storage_classes.remove_storage_class(*opt_storage_class); + } + } + } + + ret = writer->write(dpp(), null_yield, zone); + if (ret < 0) { + cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zone", zone, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_PLACEMENT_LIST: + { + RGWZoneParams zone; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("placement_pools", zone.placement_pools, formatter.get()); + formatter->flush(cout); + } + break; + case OPT::ZONE_PLACEMENT_GET: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + + RGWZoneParams zone; + int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(), + zone_id, zone_name, zone); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + auto p = zone.placement_pools.find(placement_id); + if (p == zone.placement_pools.end()) { + cerr << "ERROR: zone placement target '" << placement_id << "' not found" << std::endl; + return ENOENT; + } + encode_json("placement_pools", p->second, formatter.get()); + formatter->flush(cout); + } + default: + break; + } + return 0; + } + + 
resolve_zone_id_opt(opt_effective_zone_name, opt_effective_zone_id); + resolve_zone_id_opt(opt_source_zone_name, opt_source_zone_id); + resolve_zone_id_opt(opt_dest_zone_name, opt_dest_zone_id); + resolve_zone_ids_opt(opt_zone_names, opt_zone_ids); + resolve_zone_ids_opt(opt_source_zone_names, opt_source_zone_ids); + resolve_zone_ids_opt(opt_dest_zone_names, opt_dest_zone_ids); + + bool non_master_cmd = (!driver->is_meta_master() && !yes_i_really_mean_it); + std::set non_master_ops_list = {OPT::USER_CREATE, OPT::USER_RM, + OPT::USER_MODIFY, OPT::USER_ENABLE, + OPT::USER_SUSPEND, OPT::SUBUSER_CREATE, + OPT::SUBUSER_MODIFY, OPT::SUBUSER_RM, + OPT::BUCKET_LINK, OPT::BUCKET_UNLINK, + OPT::BUCKET_RM, + OPT::BUCKET_CHOWN, OPT::METADATA_PUT, + OPT::METADATA_RM, OPT::MFA_CREATE, + OPT::MFA_REMOVE, OPT::MFA_RESYNC, + OPT::CAPS_ADD, OPT::CAPS_RM, + OPT::ROLE_CREATE, OPT::ROLE_DELETE, + OPT::ROLE_POLICY_PUT, OPT::ROLE_POLICY_DELETE}; + + bool print_warning_message = (non_master_ops_list.find(opt_cmd) != non_master_ops_list.end() && + non_master_cmd); + + if (print_warning_message) { + cerr << "Please run the command on master zone. Performing this operation on non-master zone leads to inconsistent metadata between zones" << std::endl; + cerr << "Are you sure you want to go ahead? 
(requires --yes-i-really-mean-it)" << std::endl; + return EINVAL; + } + + if (!rgw::sal::User::empty(user)) { + user_op.set_user_id(user->get_id()); + bucket_op.set_user_id(user->get_id()); + } + + if (!display_name.empty()) + user_op.set_display_name(display_name); + + if (!user_email.empty()) + user_op.set_user_email(user_email); + + if (!rgw::sal::User::empty(user)) { + user_op.set_new_user_id(new_user_id); + } + + if (!access_key.empty()) + user_op.set_access_key(access_key); + + if (!secret_key.empty()) + user_op.set_secret_key(secret_key); + + if (!subuser.empty()) + user_op.set_subuser(subuser); + + if (!caps.empty()) + user_op.set_caps(caps); + + user_op.set_purge_data(purge_data); + + if (purge_keys) + user_op.set_purge_keys(); + + if (gen_access_key) + user_op.set_generate_key(); + + if (gen_secret_key) + user_op.set_gen_secret(); // assume that a key pair should be created + + if (max_buckets_specified) + user_op.set_max_buckets(max_buckets); + + if (admin_specified) + user_op.set_admin(admin); + + if (system_specified) + user_op.set_system(system); + + if (set_perm) + user_op.set_perm(perm_mask); + + if (set_temp_url_key) { + map::iterator iter = temp_url_keys.begin(); + for (; iter != temp_url_keys.end(); ++iter) { + user_op.set_temp_url_key(iter->second, iter->first); + } + } + + if (!op_mask_str.empty()) { + uint32_t op_mask; + int ret = rgw_parse_op_type_list(op_mask_str, &op_mask); + if (ret < 0) { + cerr << "failed to parse op_mask: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + user_op.set_op_mask(op_mask); + } + + if (key_type != KEY_TYPE_UNDEFINED) + user_op.set_key_type(key_type); + + // set suspension operation parameters + if (opt_cmd == OPT::USER_ENABLE) + user_op.set_suspension(false); + else if (opt_cmd == OPT::USER_SUSPEND) + user_op.set_suspension(true); + + if (!placement_id.empty()) { + rgw_placement_rule target_rule; + target_rule.name = placement_id; + target_rule.storage_class = opt_storage_class.value_or(""); + if 
(!driver->valid_placement(target_rule)) { + cerr << "NOTICE: invalid dest placement: " << target_rule.to_str() << std::endl; + return EINVAL; + } + user_op.set_default_placement(target_rule); + } + + if (!tags.empty()) { + user_op.set_placement_tags(tags); + } + + // RGWUser to use for user operations + RGWUser ruser; + int ret = 0; + if (!(rgw::sal::User::empty(user) && access_key.empty()) || !subuser.empty()) { + ret = ruser.init(dpp(), driver, user_op, null_yield); + if (ret < 0) { + cerr << "user.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + /* populate bucket operation */ + bucket_op.set_bucket_name(bucket_name); + bucket_op.set_object(object); + bucket_op.set_check_objects(check_objects); + bucket_op.set_delete_children(delete_child_objects); + bucket_op.set_fix_index(fix); + bucket_op.set_max_aio(max_concurrent_ios); + bucket_op.set_min_age(min_age); + bucket_op.set_dump_keys(dump_keys); + bucket_op.set_hide_progress(hide_progress); + + // required to gather errors from operations + std::string err_msg; + + bool output_user_info = true; + + switch (opt_cmd) { + case OPT::USER_INFO: + if (rgw::sal::User::empty(user) && access_key.empty()) { + cerr << "ERROR: --uid or --access-key required" << std::endl; + return EINVAL; + } + break; + case OPT::USER_CREATE: + if (!user_op.has_existing_user()) { + user_op.set_generate_key(); // generate a new key by default + } + ret = ruser.add(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not create user: " << err_msg << std::endl; + if (ret == -ERR_INVALID_TENANT_NAME) + ret = -EINVAL; + + return -ret; + } + if (!subuser.empty()) { + ret = ruser.subusers.add(dpp(),user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not create subuser: " << err_msg << std::endl; + return -ret; + } + } + break; + case OPT::USER_RM: + ret = ruser.remove(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not remove user: " << err_msg << std::endl; + 
return -ret; + } + + output_user_info = false; + break; + case OPT::USER_RENAME: + if (yes_i_really_mean_it) { + user_op.set_overwrite_new_user(true); + } + ret = ruser.rename(user_op, null_yield, dpp(), &err_msg); + if (ret < 0) { + if (ret == -EEXIST) { + err_msg += ". to overwrite this user, add --yes-i-really-mean-it"; + } + cerr << "could not rename user: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::USER_ENABLE: + case OPT::USER_SUSPEND: + case OPT::USER_MODIFY: + ret = ruser.modify(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not modify user: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::SUBUSER_CREATE: + ret = ruser.subusers.add(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not create subuser: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::SUBUSER_MODIFY: + ret = ruser.subusers.modify(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not modify subuser: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::SUBUSER_RM: + ret = ruser.subusers.remove(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not remove subuser: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::CAPS_ADD: + ret = ruser.caps.add(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not add caps: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::CAPS_RM: + ret = ruser.caps.remove(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not remove caps: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::KEY_CREATE: + ret = ruser.keys.add(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not create key: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT::KEY_RM: + ret = ruser.keys.remove(dpp(), user_op, null_yield, &err_msg); + if (ret < 0) { + cerr << "could not remove key: " << 
err_msg << std::endl; + return -ret; + } + break; + case OPT::PERIOD_PUSH: + { + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "POST"; + info.request_uri = "/admin/realm/period"; + + map ¶ms = info.args.get_params(); + if (!realm_id.empty()) + params["realm_id"] = realm_id; + if (!realm_name.empty()) + params["realm_name"] = realm_name; + if (!period_id.empty()) + params["period_id"] = period_id; + if (!period_epoch.empty()) + params["epoch"] = period_epoch; + + // load the period + RGWPeriod period; + int ret = cfgstore->read_period(dpp(), null_yield, period_id, + std::nullopt, period); + if (ret < 0) { + cerr << "failed to load period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + // json format into a bufferlist + JSONFormatter jf(false); + encode_json("period", period, &jf); + bufferlist bl; + jf.flush(bl); + + JSONParser p; + ret = send_to_remote_or_url(nullptr, url, opt_region, + access_key, secret_key, + info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + return 0; + case OPT::PERIOD_UPDATE: + { + int ret = update_period(cfgstore.get(), realm_id, realm_name, + period_epoch, commit, remote, url, + opt_region, access_key, secret_key, + formatter.get(), yes_i_really_mean_it); + if (ret < 0) { + return -ret; + } + } + return 0; + case OPT::PERIOD_COMMIT: + { + // read realm and staging period + RGWRealm realm; + std::unique_ptr realm_writer; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, + realm, &realm_writer); + if (ret < 0) { + cerr << "Error initializing realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + period_id = rgw::get_staging_period_id(realm.id); + epoch_t epoch = 1; + + RGWPeriod period; + ret = cfgstore->read_period(dpp(), null_yield, period_id, epoch, period); + if (ret < 0) { + cerr << "failed to load period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = 
commit_period(cfgstore.get(), realm, *realm_writer, period, + remote, url, opt_region, access_key, secret_key, + yes_i_really_mean_it); + if (ret < 0) { + cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("period", period, formatter.get()); + formatter->flush(cout); + } + return 0; + case OPT::ROLE_CREATE: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (assume_role_doc.empty()) { + cerr << "ERROR: assume role policy document is empty" << std::endl; + return -EINVAL; + } + bufferlist bl = bufferlist::static_from_string(assume_role_doc); + try { + const rgw::IAM::Policy p( + g_ceph_context, tenant, bl, + g_ceph_context->_conf.get_val( + "rgw_policy_reject_invalid_principals")); + } catch (rgw::IAM::PolicyParseException& e) { + cerr << "failed to parse policy: " << e.what() << std::endl; + return -EINVAL; + } + std::unique_ptr role = driver->get_role(role_name, tenant, path, assume_role_doc); + ret = role->create(dpp(), true, "", null_yield); + if (ret < 0) { + return -ret; + } + show_role_info(role.get(), formatter.get()); + return 0; + } + case OPT::ROLE_DELETE: + { + if (role_name.empty()) { + cerr << "ERROR: empty role name" << std::endl; + return -EINVAL; + } + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->delete_obj(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + cout << "role: " << role_name << " successfully deleted" << std::endl; + return 0; + } + case OPT::ROLE_GET: + { + if (role_name.empty()) { + cerr << "ERROR: empty role name" << std::endl; + return -EINVAL; + } + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + show_role_info(role.get(), formatter.get()); + return 0; + } + case OPT::ROLE_TRUST_POLICY_MODIFY: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } 
+ + if (assume_role_doc.empty()) { + cerr << "ERROR: assume role policy document is empty" << std::endl; + return -EINVAL; + } + + bufferlist bl = bufferlist::static_from_string(assume_role_doc); + try { + const rgw::IAM::Policy p(g_ceph_context, tenant, bl, + g_ceph_context->_conf.get_val( + "rgw_policy_reject_invalid_principals")); + } catch (rgw::IAM::PolicyParseException& e) { + cerr << "failed to parse policy: " << e.what() << std::endl; + return -EINVAL; + } + + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + role->update_trust_policy(assume_role_doc); + ret = role->update(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + cout << "Assume role policy document updated successfully for role: " << role_name << std::endl; + return 0; + } + case OPT::ROLE_LIST: + { + vector> result; + ret = driver->get_roles(dpp(), null_yield, path_prefix, tenant, result); + if (ret < 0) { + return -ret; + } + show_roles_info(result, formatter.get()); + return 0; + } + case OPT::ROLE_POLICY_PUT: + { + if (role_name.empty()) { + cerr << "role name is empty" << std::endl; + return -EINVAL; + } + + if (policy_name.empty()) { + cerr << "policy name is empty" << std::endl; + return -EINVAL; + } + + if (perm_policy_doc.empty() && infile.empty()) { + cerr << "permission policy document is empty" << std::endl; + return -EINVAL; + } + + bufferlist bl; + if (!infile.empty()) { + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input policy document: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + perm_policy_doc = bl.to_str(); + } else { + bl = bufferlist::static_from_string(perm_policy_doc); + } + try { + const rgw::IAM::Policy p(g_ceph_context, tenant, bl, + g_ceph_context->_conf.get_val( + "rgw_policy_reject_invalid_principals")); + } catch (rgw::IAM::PolicyParseException& e) { + cerr << "failed to parse perm policy: " << e.what() << std::endl; + 
return -EINVAL; + } + + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + role->set_perm_policy(policy_name, perm_policy_doc); + ret = role->update(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + cout << "Permission policy attached successfully" << std::endl; + return 0; + } + case OPT::ROLE_POLICY_LIST: + { + if (role_name.empty()) { + cerr << "ERROR: Role name is empty" << std::endl; + return -EINVAL; + } + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + std::vector policy_names = role->get_role_policy_names(); + show_policy_names(policy_names, formatter.get()); + return 0; + } + case OPT::ROLE_POLICY_GET: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (policy_name.empty()) { + cerr << "ERROR: policy name is empty" << std::endl; + return -EINVAL; + } + std::unique_ptr role = driver->get_role(role_name, tenant); + int ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + string perm_policy; + ret = role->get_role_policy(dpp(), policy_name, perm_policy); + if (ret < 0) { + return -ret; + } + show_perm_policy(perm_policy, formatter.get()); + return 0; + } + case OPT::ROLE_POLICY_DELETE: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (policy_name.empty()) { + cerr << "ERROR: policy name is empty" << std::endl; + return -EINVAL; + } + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + ret = role->delete_policy(dpp(), policy_name); + if (ret < 0) { + return -ret; + } + ret = role->update(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + cout << "Policy: " << policy_name << " successfully deleted for role: " + << role_name << std::endl; + return 0; + 
} + case OPT::ROLE_UPDATE: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + std::unique_ptr role = driver->get_role(role_name, tenant); + ret = role->get(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + if (!role->validate_max_session_duration(dpp())) { + ret = -EINVAL; + return ret; + } + role->update_max_session_duration(max_session_duration); + ret = role->update(dpp(), null_yield); + if (ret < 0) { + return -ret; + } + cout << "Max session duration updated successfully for role: " << role_name << std::endl; + return 0; + } + default: + output_user_info = false; + } + + // output the result of a user operation + if (output_user_info) { + ret = ruser.info(info, &err_msg); + if (ret < 0) { + cerr << "could not fetch user info: " << err_msg << std::endl; + return -ret; + } + show_user_info(info, formatter.get()); + } + + if (opt_cmd == OPT::POLICY) { + if (format == "xml") { + int ret = RGWBucketAdminOp::dump_s3_policy(driver, bucket_op, cout, dpp()); + if (ret < 0) { + cerr << "ERROR: failed to get policy: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + int ret = RGWBucketAdminOp::get_policy(driver, bucket_op, stream_flusher, dpp()); + if (ret < 0) { + cerr << "ERROR: failed to get policy: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + if (opt_cmd == OPT::BUCKET_LIMIT_CHECK) { + void *handle; + std::list user_ids; + metadata_key = "user"; + int max = 1000; + + bool truncated; + + if (!rgw::sal::User::empty(user)) { + user_ids.push_back(user->get_id().id); + ret = + RGWBucketAdminOp::limit_check(driver, bucket_op, user_ids, stream_flusher, + null_yield, dpp(), warnings_only); + } else { + /* list users in groups of max-keys, then perform user-bucket + * limit-check on each group */ + ret = driver->meta_list_keys_init(dpp(), metadata_key, string(), &handle); + if (ret < 0) { + cerr << "ERROR: buckets limit check can't get user metadata_key: " + << 
cpp_strerror(-ret) << std::endl; + return -ret; + } + + do { + ret = driver->meta_list_keys_next(dpp(), handle, max, user_ids, + &truncated); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: buckets limit check lists_keys_next(): " + << cpp_strerror(-ret) << std::endl; + break; + } else { + /* ok, do the limit checks for this group */ + ret = + RGWBucketAdminOp::limit_check(driver, bucket_op, user_ids, stream_flusher, + null_yield, dpp(), warnings_only); + if (ret < 0) + break; + } + user_ids.clear(); + } while (truncated); + driver->meta_list_keys_complete(handle); + } + return -ret; + } /* OPT::BUCKET_LIMIT_CHECK */ + + if (opt_cmd == OPT::BUCKETS_LIST) { + if (bucket_name.empty()) { + if (!rgw::sal::User::empty(user)) { + if (!user_op.has_existing_user()) { + cerr << "ERROR: could not find user: " << user << std::endl; + return -ENOENT; + } + } + RGWBucketAdminOp::info(driver, bucket_op, stream_flusher, null_yield, dpp()); + } else { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_array_section("entries"); + + int count = 0; + + static constexpr int MAX_PAGINATE_SIZE = 10000; + static constexpr int DEFAULT_MAX_ENTRIES = 1000; + + if (max_entries < 0) { + max_entries = DEFAULT_MAX_ENTRIES; + } + const int paginate_size = std::min(max_entries, MAX_PAGINATE_SIZE); + + string prefix; + string delim; + string ns; + + rgw::sal::Bucket::ListParams params; + rgw::sal::Bucket::ListResults results; + + params.prefix = prefix; + params.delim = delim; + params.marker = rgw_obj_key(marker); + params.ns = ns; + params.enforce_ns = false; + params.list_versions = true; + params.allow_unordered = bool(allow_unordered); + + do { + const int remaining = max_entries - count; + ret = bucket->list(dpp(), params, std::min(remaining, paginate_size), results, + null_yield); + if (ret < 0) { + cerr << "ERROR: 
driver->list_objects(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ldpp_dout(dpp(), 20) << "INFO: " << __func__ << + ": list() returned without error; results.objs.sizie()=" << + results.objs.size() << "results.is_truncated=" << results.is_truncated << ", marker=" << + params.marker << dendl; + + count += results.objs.size(); + + for (const auto& entry : results.objs) { + encode_json("entry", entry, formatter.get()); + } + formatter->flush(cout); + } while (results.is_truncated && count < max_entries); + ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": done" << dendl; + + formatter->close_section(); + formatter->flush(cout); + } /* have bucket_name */ + } /* OPT::BUCKETS_LIST */ + + if (opt_cmd == OPT::BUCKET_RADOS_LIST) { + RGWRadosList lister(static_cast(driver), + max_concurrent_ios, orphan_stale_secs, tenant); + if (rgw_obj_fs) { + lister.set_field_separator(*rgw_obj_fs); + } + + if (bucket_name.empty()) { + // yes_i_really_mean_it means continue with listing even if + // there are indexless buckets + ret = lister.run(dpp(), yes_i_really_mean_it); + } else { + ret = lister.run(dpp(), bucket_name); + } + + if (ret < 0) { + std::cerr << + "ERROR: bucket radoslist failed to finish before " << + "encountering error: " << cpp_strerror(-ret) << std::endl; + std::cerr << "************************************" + "************************************" << std::endl; + std::cerr << "WARNING: THE RESULTS ARE NOT RELIABLE AND SHOULD NOT " << + "BE USED IN DELETING ORPHANS" << std::endl; + std::cerr << "************************************" + "************************************" << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BUCKET_LAYOUT) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + const auto& bucket_info = bucket->get_info(); + 
formatter->open_object_section("layout"); + encode_json("layout", bucket_info.layout, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BUCKET_STATS) { + if (bucket_name.empty() && !bucket_id.empty()) { + rgw_bucket bucket; + if (!rgw_find_bucket_by_id(dpp(), driver->ctx(), driver, marker, bucket_id, &bucket)) { + cerr << "failure: no such bucket id" << std::endl; + return -ENOENT; + } + bucket_op.set_tenant(bucket.tenant); + bucket_op.set_bucket_name(bucket.name); + } + bucket_op.set_fetch_stats(true); + + int r = RGWBucketAdminOp::info(driver, bucket_op, stream_flusher, null_yield, dpp()); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl; + return posix_errortrans(-r); + } + } + + if (opt_cmd == OPT::BUCKET_LINK) { + bucket_op.set_bucket_id(bucket_id); + bucket_op.set_new_bucket_name(new_bucket_name); + string err; + int r = RGWBucketAdminOp::link(driver, bucket_op, dpp(), &err); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + } + + if (opt_cmd == OPT::BUCKET_UNLINK) { + int r = RGWBucketAdminOp::unlink(driver, bucket_op, dpp()); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << std::endl; + return -r; + } + } + + if (opt_cmd == OPT::BUCKET_SHARD_OBJECTS) { + const auto prefix = opt_prefix ? *opt_prefix : "obj"s; + if (!num_shards_specified) { + cerr << "ERROR: num-shards must be specified." + << std::endl; + return EINVAL; + } + + if (specified_shard_id) { + if (shard_id >= num_shards) { + cerr << "ERROR: shard-id must be less than num-shards." 
+ << std::endl; + return EINVAL; + } + std::string obj; + uint64_t ctr = 0; + int shard; + do { + obj = fmt::format("{}{:0>20}", prefix, ctr); + shard = RGWSI_BucketIndex_RADOS::bucket_shard_index(obj, num_shards); + ++ctr; + } while (shard != shard_id); + + formatter->open_object_section("shard_obj"); + encode_json("obj", obj, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } else { + std::vector objs(num_shards); + for (uint64_t ctr = 0, shardsleft = num_shards; shardsleft > 0; ++ctr) { + auto key = fmt::format("{}{:0>20}", prefix, ctr); + auto shard = RGWSI_BucketIndex_RADOS::bucket_shard_index(key, num_shards); + if (objs[shard].empty()) { + objs[shard] = std::move(key); + --shardsleft; + } + } + + formatter->open_object_section("shard_objs"); + encode_json("objs", objs, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + } + + if (opt_cmd == OPT::BUCKET_OBJECT_SHARD) { + if (!num_shards_specified || object.empty()) { + cerr << "ERROR: num-shards and object must be specified." + << std::endl; + return EINVAL; + } + auto shard = RGWSI_BucketIndex_RADOS::bucket_shard_index(object, num_shards); + formatter->open_object_section("obj_shard"); + encode_json("shard", shard, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BUCKET_RESYNC_ENCRYPTED_MULTIPART) { + // repair logic for replication of encrypted multipart uploads: + // https://tracker.ceph.com/issues/46062 + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + + auto rados_driver = dynamic_cast(driver); + if (!rados_driver) { + cerr << "ERROR: this command can only work when the cluster " + "has a RADOS backing store." 
<< std::endl; + return EPERM; + } + + // fail if recovery wouldn't generate replication log entries + if (!rados_driver->svc()->zone->need_to_log_data() && !yes_i_really_mean_it) { + cerr << "This command is only necessary for replicated buckets." << std::endl; + cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl; + return EPERM; + } + + formatter->open_object_section("modified"); + encode_json("bucket", bucket->get_name(), formatter.get()); + encode_json("bucket_id", bucket->get_bucket_id(), formatter.get()); + + ret = rados_driver->getRados()->bucket_resync_encrypted_multipart( + dpp(), null_yield, rados_driver, bucket->get_info(), + marker, stream_flusher); + if (ret < 0) { + return -ret; + } + formatter->close_section(); + formatter->flush(cout); + return 0; + } + + if (opt_cmd == OPT::BUCKET_CHOWN) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + + bucket_op.set_bucket_name(bucket_name); + bucket_op.set_new_bucket_name(new_bucket_name); + string err; + + int r = RGWBucketAdminOp::chown(driver, bucket_op, marker, dpp(), &err); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + } + + if (opt_cmd == OPT::LOG_LIST) { + // filter by date? + if (date.size() && date.size() != 10) { + cerr << "bad date format for '" << date << "', expect YYYY-MM-DD" << std::endl; + return EINVAL; + } + + formatter->reset(); + formatter->open_array_section("logs"); + RGWAccessHandle h; + int r = static_cast(driver)->getRados()->log_list_init(dpp(), date, &h); + if (r == -ENOENT) { + // no logs. 
+ } else { + if (r < 0) { + cerr << "log list: error " << r << std::endl; + return -r; + } + while (true) { + string name; + int r = static_cast(driver)->getRados()->log_list_next(h, &name); + if (r == -ENOENT) + break; + if (r < 0) { + cerr << "log list: error " << r << std::endl; + return -r; + } + formatter->dump_string("object", name); + } + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } + + if (opt_cmd == OPT::LOG_SHOW || opt_cmd == OPT::LOG_RM) { + if (object.empty() && (date.empty() || bucket_name.empty() || bucket_id.empty())) { + cerr << "specify an object or a date, bucket and bucket-id" << std::endl; + exit(1); + } + + string oid; + if (!object.empty()) { + oid = object; + } else { + oid = date; + oid += "-"; + oid += bucket_id; + oid += "-"; + oid += bucket_name; + } + + if (opt_cmd == OPT::LOG_SHOW) { + RGWAccessHandle h; + + int r = static_cast(driver)->getRados()->log_show_init(dpp(), oid, &h); + if (r < 0) { + cerr << "error opening log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + + formatter->reset(); + formatter->open_object_section("log"); + + struct rgw_log_entry entry; + + // peek at first entry to get bucket metadata + r = static_cast(driver)->getRados()->log_show_next(dpp(), h, &entry); + if (r < 0) { + cerr << "error reading log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + formatter->dump_string("bucket_id", entry.bucket_id); + formatter->dump_string("bucket_owner", entry.bucket_owner.to_str()); + formatter->dump_string("bucket", entry.bucket); + + uint64_t agg_time = 0; + uint64_t agg_bytes_sent = 0; + uint64_t agg_bytes_received = 0; + uint64_t total_entries = 0; + + if (show_log_entries) + formatter->open_array_section("log_entries"); + + do { + using namespace std::chrono; + uint64_t total_time = duration_cast(entry.total_time).count(); + + agg_time += total_time; + agg_bytes_sent += entry.bytes_sent; + agg_bytes_received += entry.bytes_received; + 
total_entries++; + + if (skip_zero_entries && entry.bytes_sent == 0 && + entry.bytes_received == 0) + goto next; + + if (show_log_entries) { + + rgw_format_ops_log_entry(entry, formatter.get()); + formatter->flush(cout); + } +next: + r = static_cast(driver)->getRados()->log_show_next(dpp(), h, &entry); + } while (r > 0); + + if (r < 0) { + cerr << "error reading log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + if (show_log_entries) + formatter->close_section(); + + if (show_log_sum) { + formatter->open_object_section("log_sum"); + formatter->dump_int("bytes_sent", agg_bytes_sent); + formatter->dump_int("bytes_received", agg_bytes_received); + formatter->dump_int("total_time", agg_time); + formatter->dump_int("total_entries", total_entries); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } + if (opt_cmd == OPT::LOG_RM) { + int r = static_cast(driver)->getRados()->log_remove(dpp(), oid); + if (r < 0) { + cerr << "error removing log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + } + } + + if (opt_cmd == OPT::POOL_ADD) { + if (pool_name.empty()) { + cerr << "need to specify pool to add!" << std::endl; + exit(1); + } + + int ret = static_cast(driver)->svc()->zone->add_bucket_placement(dpp(), pool, null_yield); + if (ret < 0) + cerr << "failed to add bucket placement: " << cpp_strerror(-ret) << std::endl; + } + + if (opt_cmd == OPT::POOL_RM) { + if (pool_name.empty()) { + cerr << "need to specify pool to remove!" 
<< std::endl; + exit(1); + } + + int ret = static_cast(driver)->svc()->zone->remove_bucket_placement(dpp(), pool, null_yield); + if (ret < 0) + cerr << "failed to remove bucket placement: " << cpp_strerror(-ret) << std::endl; + } + + if (opt_cmd == OPT::POOLS_LIST) { + set pools; + int ret = static_cast(driver)->svc()->zone->list_placement_set(dpp(), pools, null_yield); + if (ret < 0) { + cerr << "could not list placement set: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->reset(); + formatter->open_array_section("pools"); + for (auto siter = pools.begin(); siter != pools.end(); ++siter) { + formatter->open_object_section("pool"); + formatter->dump_string("name", siter->to_str()); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } + + if (opt_cmd == OPT::USAGE_SHOW) { + uint64_t start_epoch = 0; + uint64_t end_epoch = (uint64_t)-1; + + int ret; + + if (!start_date.empty()) { + ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse start date" << std::endl; + return 1; + } + } + if (!end_date.empty()) { + ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse end date" << std::endl; + return 1; + } + } + + + if (!bucket_name.empty()) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + ret = RGWUsage::show(dpp(), driver, user.get(), bucket.get(), start_epoch, + end_epoch, show_log_entries, show_log_sum, &categories, + stream_flusher); + if (ret < 0) { + cerr << "ERROR: failed to show usage" << std::endl; + return 1; + } + } + + if (opt_cmd == OPT::USAGE_TRIM) { + if (rgw::sal::User::empty(user) && bucket_name.empty() && + start_date.empty() && end_date.empty() && !yes_i_really_mean_it) { + cerr << "usage trim without 
user/date/bucket specified will remove *all* users data" << std::endl; + cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl; + return 1; + } + int ret; + uint64_t start_epoch = 0; + uint64_t end_epoch = (uint64_t)-1; + + + if (!start_date.empty()) { + ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse start date" << std::endl; + return 1; + } + } + + if (!end_date.empty()) { + ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse end date" << std::endl; + return 1; + } + } + + if (!bucket_name.empty()) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + ret = RGWUsage::trim(dpp(), driver, user.get(), bucket.get(), start_epoch, end_epoch); + if (ret < 0) { + cerr << "ERROR: read_usage() returned ret=" << ret << std::endl; + return 1; + } + } + + if (opt_cmd == OPT::USAGE_CLEAR) { + if (!yes_i_really_mean_it) { + cerr << "usage clear would remove *all* users usage data for all time" << std::endl; + cerr << "do you really mean it? 
(requires --yes-i-really-mean-it)" << std::endl; + return 1; + } + + ret = RGWUsage::clear(dpp(), driver); + if (ret < 0) { + return ret; + } + } + + + if (opt_cmd == OPT::OLH_GET || opt_cmd == OPT::OLH_READLOG) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + } + + if (opt_cmd == OPT::OLH_GET) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + RGWOLHInfo olh; + rgw_obj obj(bucket->get_key(), object); + ret = static_cast(driver)->getRados()->get_olh(dpp(), bucket->get_info(), obj, &olh); + if (ret < 0) { + cerr << "ERROR: failed reading olh: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("olh", olh, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::OLH_READLOG) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + map > log; + bool is_truncated; + + std::unique_ptr obj = bucket->get_object(object); + + RGWObjState *state; + + ret = obj->get_obj_state(dpp(), &state, null_yield); + if (ret < 0) { + return -ret; + } + + ret = static_cast(driver)->getRados()->bucket_index_read_olh_log(dpp(), bucket->get_info(), *state, obj->get_obj(), 0, &log, &is_truncated); + if (ret < 0) { + cerr << "ERROR: failed reading olh: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("result"); + encode_json("is_truncated", is_truncated, formatter.get()); + encode_json("log", log, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BI_GET) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << 
std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + rgw_obj obj(bucket->get_key(), object); + if (!object_version.empty()) { + obj.key.set_instance(object_version); + } + + rgw_cls_bi_entry entry; + ret = static_cast(driver)->getRados()->bi_get(dpp(), bucket->get_info(), obj, bi_index_type, &entry); + if (ret < 0) { + cerr << "ERROR: bi_get(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("entry", entry, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BI_PUT) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + rgw_cls_bi_entry entry; + cls_rgw_obj_key key; + ret = read_decode_json(infile, entry, &key); + if (ret < 0) { + return 1; + } + + rgw_obj obj(bucket->get_key(), key); + + ret = static_cast(driver)->getRados()->bi_put(dpp(), bucket->get_key(), obj, entry); + if (ret < 0) { + cerr << "ERROR: bi_put(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BI_LIST) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + std::list entries; + bool is_truncated; + const auto& index = bucket->get_info().layout.current_index; + const int max_shards = rgw::num_shards(index); + if (max_entries < 0) { + max_entries = 
1000; + } + + ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": max_entries=" << max_entries << + ", index=" << index << ", max_shards=" << max_shards << dendl; + + formatter->open_array_section("entries"); + + int i = (specified_shard_id ? shard_id : 0); + for (; i < max_shards; i++) { + ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": starting shard=" << i << dendl; + + RGWRados::BucketShard bs(static_cast(driver)->getRados()); + int ret = bs.init(dpp(), bucket->get_info(), index, i); + marker.clear(); + + if (ret < 0) { + cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << i << "): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + do { + entries.clear(); + // if object is specified, we use that as a filter to only retrieve some some entries + ret = static_cast(driver)->getRados()->bi_list(bs, object, marker, max_entries, &entries, &is_truncated); + if (ret < 0) { + cerr << "ERROR: bi_list(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ldpp_dout(dpp(), 20) << "INFO: " << __func__ << + ": bi_list() returned without error; entries.size()=" << + entries.size() << ", is_truncated=" << is_truncated << + ", marker=" << marker << dendl; + + for (const auto& entry : entries) { + encode_json("entry", entry, formatter.get()); + marker = entry.idx; + } + formatter->flush(cout); + } while (is_truncated); + + formatter->flush(cout); + + if (specified_shard_id) { + break; + } + } + ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": done" << dendl; + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BI_PURGE) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + std::unique_ptr cur_bucket; + ret = init_bucket(user.get(), tenant, bucket_name, string(), 
&cur_bucket); + if (ret == -ENOENT) { + // no bucket entrypoint + } else if (ret < 0) { + cerr << "ERROR: could not init current bucket info for bucket_name=" << bucket_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } else if (cur_bucket->get_bucket_id() == bucket->get_bucket_id() && + !yes_i_really_mean_it) { + cerr << "specified bucket instance points to a current bucket instance" << std::endl; + cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl; + return EINVAL; + } + + const auto& index = bucket->get_info().layout.current_index; + if (index.layout.type == rgw::BucketIndexType::Indexless) { + cerr << "ERROR: indexless bucket has no index to purge" << std::endl; + return EINVAL; + } + + const int max_shards = rgw::num_shards(index); + for (int i = 0; i < max_shards; i++) { + RGWRados::BucketShard bs(static_cast(driver)->getRados()); + int ret = bs.init(dpp(), bucket->get_info(), index, i); + if (ret < 0) { + cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << i << "): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = static_cast(driver)->getRados()->bi_remove(dpp(), bs); + if (ret < 0) { + cerr << "ERROR: failed to remove bucket index object: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + if (opt_cmd == OPT::OBJECT_PUT) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + + RGWDataAccess data_access(driver); + rgw_obj_key key(object, object_version); + + RGWDataAccess::BucketRef b; + RGWDataAccess::ObjectRef obj; + + int ret = data_access.get_bucket(dpp(), tenant, bucket_name, bucket_id, &b, null_yield); + if (ret < 0) { + cerr << "ERROR: failed to init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = b->get_object(key, &obj); + if (ret < 0) { + cerr << "ERROR: failed to get object: " << 
cpp_strerror(-ret) << std::endl; + return -ret; + } + + bufferlist bl; + ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + } + + map attrs; + ret = obj->put(bl, attrs, dpp(), null_yield); + if (ret < 0) { + cerr << "ERROR: put object returned error: " << cpp_strerror(-ret) << std::endl; + } + } + + if (opt_cmd == OPT::OBJECT_RM) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + rgw_obj_key key(object, object_version); + ret = rgw_remove_object(dpp(), driver, bucket.get(), key); + + if (ret < 0) { + cerr << "ERROR: object remove returned: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::OBJECT_REWRITE) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + std::unique_ptr obj = bucket->get_object(object); + obj->set_instance(object_version); + bool need_rewrite = true; + if (min_rewrite_stripe_size > 0) { + ret = check_min_obj_stripe_size(driver, obj.get(), min_rewrite_stripe_size, &need_rewrite); + if (ret < 0) { + ldpp_dout(dpp(), 0) << "WARNING: check_min_obj_stripe_size failed, r=" << ret << dendl; + } + } + if (need_rewrite) { + RGWRados* store = static_cast(driver)->getRados(); + ret = store->rewrite_obj(bucket->get_info(), obj->get_obj(), dpp(), null_yield); + if (ret < 0) { + cerr << "ERROR: object rewrite returned: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + ldpp_dout(dpp(), 20) << "skipped object" << dendl; + } + } // 
OPT::OBJECT_REWRITE + + if (opt_cmd == OPT::OBJECT_REINDEX) { + if (bucket_name.empty()) { + cerr << "ERROR: --bucket not specified." << std::endl; + return EINVAL; + } + if (object.empty() && objects_file.empty()) { + cerr << "ERROR: neither --object nor --objects-file specified." << std::endl; + return EINVAL; + } else if (!object.empty() && !objects_file.empty()) { + cerr << "ERROR: both --object and --objects-file specified and only one is allowed." << std::endl; + return EINVAL; + } else if (!objects_file.empty() && !object_version.empty()) { + cerr << "ERROR: cannot specify --object_version when --objects-file specified." << std::endl; + return EINVAL; + } + + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << + "." << std::endl; + return -ret; + } + + rgw::sal::RadosStore* rados_store = dynamic_cast(driver); + if (!rados_store) { + cerr << + "ERROR: this command can only work when the cluster has a RADOS backing store." << + std::endl; + return EPERM; + } + RGWRados* store = rados_store->getRados(); + + auto process = [&](const std::string& p_object, const std::string& p_object_version) -> int { + std::unique_ptr obj = bucket->get_object(p_object); + obj->set_instance(p_object_version); + ret = store->reindex_obj(bucket->get_info(), obj->get_obj(), dpp(), null_yield); + if (ret < 0) { + return ret; + } + return 0; + }; + + if (!object.empty()) { + ret = process(object, object_version); + if (ret < 0) { + return -ret; + } + } else { + std::ifstream file; + file.open(objects_file); + if (!file.is_open()) { + std::cerr << "ERROR: unable to open objects-file \"" << + objects_file << "\"." 
<< std::endl; + return ENOENT; + } + + std::string obj_name; + const std::string empty_version; + while (std::getline(file, obj_name)) { + ret = process(obj_name, empty_version); + if (ret < 0) { + std::cerr << "ERROR: while processing \"" << obj_name << + "\", received " << cpp_strerror(-ret) << "." << std::endl; + if (!yes_i_really_mean_it) { + std::cerr << + "NOTE: with *caution* you can use --yes-i-really-mean-it to push through errors and continue processing." << + std::endl; + return -ret; + } + } + } // while + } + } // OPT::OBJECT_REINDEX + + if (opt_cmd == OPT::OBJECTS_EXPIRE) { + if (!static_cast(driver)->getRados()->process_expire_objects(dpp())) { + cerr << "ERROR: process_expire_objects() processing returned error." << std::endl; + return 1; + } + } + + if (opt_cmd == OPT::OBJECTS_EXPIRE_STALE_LIST) { + ret = RGWBucketAdminOp::fix_obj_expiry(driver, bucket_op, stream_flusher, dpp(), true); + if (ret < 0) { + cerr << "ERROR: listing returned " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::OBJECTS_EXPIRE_STALE_RM) { + ret = RGWBucketAdminOp::fix_obj_expiry(driver, bucket_op, stream_flusher, dpp(), false); + if (ret < 0) { + cerr << "ERROR: removing returned " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BUCKET_REWRITE) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + uint64_t start_epoch = 0; + uint64_t end_epoch = 0; + + if (!end_date.empty()) { + int ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse end date" << std::endl; + return EINVAL; + } + } + if (!start_date.empty()) { + int ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (ret < 0) { + cerr << 
"ERROR: failed to parse start date" << std::endl; + return EINVAL; + } + } + + bool is_truncated = true; + bool cls_filtered = true; + + rgw_obj_index_key marker; + string empty_prefix; + string empty_delimiter; + + formatter->open_object_section("result"); + formatter->dump_string("bucket", bucket_name); + formatter->open_array_section("objects"); + + constexpr uint32_t NUM_ENTRIES = 1000; + uint16_t expansion_factor = 1; + while (is_truncated) { + RGWRados::ent_map_t result; + result.reserve(NUM_ENTRIES); + + const auto& current_index = bucket->get_info().layout.current_index; + int r = static_cast(driver)->getRados()->cls_bucket_list_ordered( + dpp(), bucket->get_info(), current_index, RGW_NO_SHARD, + marker, empty_prefix, empty_delimiter, + NUM_ENTRIES, true, expansion_factor, + result, &is_truncated, &cls_filtered, &marker, + null_yield, + rgw_bucket_object_check_filter); + if (r < 0 && r != -ENOENT) { + cerr << "ERROR: failed operation r=" << r << std::endl; + } else if (r == -ENOENT) { + break; + } + + if (result.size() < NUM_ENTRIES / 8) { + ++expansion_factor; + } else if (result.size() > NUM_ENTRIES * 7 / 8 && + expansion_factor > 1) { + --expansion_factor; + } + + for (auto iter = result.begin(); iter != result.end(); ++iter) { + rgw_obj_key key = iter->second.key; + rgw_bucket_dir_entry& entry = iter->second; + + formatter->open_object_section("object"); + formatter->dump_string("name", key.name); + formatter->dump_string("instance", key.instance); + formatter->dump_int("size", entry.meta.size); + utime_t ut(entry.meta.mtime); + ut.gmtime(formatter->dump_stream("mtime")); + + if ((entry.meta.size < min_rewrite_size) || + (entry.meta.size > max_rewrite_size) || + (start_epoch > 0 && start_epoch > (uint64_t)ut.sec()) || + (end_epoch > 0 && end_epoch < (uint64_t)ut.sec())) { + formatter->dump_string("status", "Skipped"); + } else { + std::unique_ptr obj = bucket->get_object(key); + + bool need_rewrite = true; + if (min_rewrite_stripe_size > 0) { + r = 
check_min_obj_stripe_size(driver, obj.get(), min_rewrite_stripe_size, &need_rewrite); + if (r < 0) { + ldpp_dout(dpp(), 0) << "WARNING: check_min_obj_stripe_size failed, r=" << r << dendl; + } + } + if (!need_rewrite) { + formatter->dump_string("status", "Skipped"); + } else { + RGWRados* store = static_cast(driver)->getRados(); + r = store->rewrite_obj(bucket->get_info(), obj->get_obj(), dpp(), null_yield); + if (r == 0) { + formatter->dump_string("status", "Success"); + } else { + formatter->dump_string("status", cpp_strerror(-r)); + } + } + } + formatter->dump_int("flags", entry.flags); + + formatter->close_section(); + formatter->flush(cout); + } + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BUCKET_RESHARD) { + int ret = check_reshard_bucket_params(driver, + bucket_name, + tenant, + bucket_id, + num_shards_specified, + num_shards, + yes_i_really_mean_it, + &bucket); + if (ret < 0) { + return ret; + } + + auto zone_svc = static_cast(driver)->svc()->zone; + if (!zone_svc->can_reshard()) { + const auto& zonegroup = zone_svc->get_zonegroup(); + std::cerr << "The zonegroup '" << zonegroup.get_name() << "' does not " + "have the resharding feature enabled." << std::endl; + return ENOTSUP; + } + if (!RGWBucketReshard::can_reshard(bucket->get_info(), zone_svc) && + !yes_i_really_mean_it) { + std::cerr << "Bucket '" << bucket->get_name() << "' already has too many " + "log generations (" << bucket->get_info().layout.logs.size() << ") " + "from previous reshards that peer zones haven't finished syncing. " + "Resharding is not recommended until the old generations sync, but " + "you can force a reshard with --yes-i-really-mean-it." 
<< std::endl; + return EINVAL; + } + + RGWBucketReshard br(static_cast(driver), + bucket->get_info(), bucket->get_attrs(), + nullptr /* no callback */); + +#define DEFAULT_RESHARD_MAX_ENTRIES 1000 + if (max_entries < 1) { + max_entries = DEFAULT_RESHARD_MAX_ENTRIES; + } + + ReshardFaultInjector fault; + if (inject_error_at) { + const int code = -inject_error_code.value_or(EIO); + fault.inject(*inject_error_at, InjectError{code, dpp()}); + } else if (inject_abort_at) { + fault.inject(*inject_abort_at, InjectAbort{}); + } else if (inject_delay_at) { + fault.inject(*inject_delay_at, InjectDelay{inject_delay, dpp()}); + } + ret = br.execute(num_shards, fault, max_entries, dpp(), + verbose, &cout, formatter.get()); + return -ret; + } + + if (opt_cmd == OPT::RESHARD_ADD) { + int ret = check_reshard_bucket_params(driver, + bucket_name, + tenant, + bucket_id, + num_shards_specified, + num_shards, + yes_i_really_mean_it, + &bucket); + if (ret < 0) { + return ret; + } + + int num_source_shards = rgw::current_num_shards(bucket->get_info().layout); + + RGWReshard reshard(static_cast(driver), dpp()); + cls_rgw_reshard_entry entry; + entry.time = real_clock::now(); + entry.tenant = tenant; + entry.bucket_name = bucket_name; + entry.bucket_id = bucket->get_info().bucket.bucket_id; + entry.old_num_shards = num_source_shards; + entry.new_num_shards = num_shards; + + return reshard.add(dpp(), entry); + } + + if (opt_cmd == OPT::RESHARD_LIST) { + int ret; + int count = 0; + if (max_entries < 0) { + max_entries = 1000; + } + + int num_logshards = + driver->ctx()->_conf.get_val("rgw_reshard_num_logs"); + + RGWReshard reshard(static_cast(driver), dpp()); + + formatter->open_array_section("reshard"); + for (int i = 0; i < num_logshards; i++) { + bool is_truncated = true; + std::string marker; + do { + std::list entries; + ret = reshard.list(dpp(), i, marker, max_entries - count, entries, &is_truncated); + if (ret < 0) { + cerr << "Error listing resharding buckets: " << cpp_strerror(-ret) 
<< std::endl; + return ret; + } + for (const auto& entry : entries) { + encode_json("entry", entry, formatter.get()); + } + if (is_truncated) { + entries.crbegin()->get_key(&marker); // last entry's key becomes marker + } + count += entries.size(); + formatter->flush(cout); + } while (is_truncated && count < max_entries); + + if (count >= max_entries) { + break; + } + } + + formatter->close_section(); + formatter->flush(cout); + + return 0; + } + + if (opt_cmd == OPT::RESHARD_STATUS) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWBucketReshard br(static_cast(driver), + bucket->get_info(), bucket->get_attrs(), + nullptr /* no callback */); + list status; + int r = br.get_status(dpp(), &status); + if (r < 0) { + cerr << "ERROR: could not get resharding status for bucket " << + bucket_name << std::endl; + return -r; + } + + show_reshard_status(status, formatter.get()); + } + + if (opt_cmd == OPT::RESHARD_PROCESS) { + RGWReshard reshard(static_cast(driver), true, &cout); + + int ret = reshard.process_all_logshards(dpp()); + if (ret < 0) { + cerr << "ERROR: failed to process reshard logs, error=" << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::RESHARD_CANCEL) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + bool bucket_initable = true; + ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + if (yes_i_really_mean_it) { + bucket_initable = false; + } else { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << + "; if you want to cancel the reshard request nonetheless, please " + "use the --yes-i-really-mean-it option" << std::endl; + return -ret; + } + } + + bool 
resharding_underway = true; + + if (bucket_initable) { + // we did not encounter an error, so let's work with the bucket + RGWBucketReshard br(static_cast(driver), + bucket->get_info(), bucket->get_attrs(), + nullptr /* no callback */); + int ret = br.cancel(dpp()); + if (ret < 0) { + if (ret == -EBUSY) { + cerr << "There is ongoing resharding, please retry after " << + driver->ctx()->_conf.get_val("rgw_reshard_bucket_lock_duration") << + " seconds." << std::endl; + return -ret; + } else if (ret == -EINVAL) { + resharding_underway = false; + // we can continue and try to unschedule + } else { + cerr << "Error cancelling bucket \"" << bucket_name << + "\" resharding: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + RGWReshard reshard(static_cast(driver), dpp()); + + cls_rgw_reshard_entry entry; + entry.tenant = tenant; + entry.bucket_name = bucket_name; + + ret = reshard.remove(dpp(), entry); + if (ret == -ENOENT) { + if (!resharding_underway) { + cerr << "Error, bucket \"" << bucket_name << + "\" is neither undergoing resharding nor scheduled to undergo " + "resharding." 
<< std::endl; + return EINVAL; + } else { + // we cancelled underway resharding above, so we're good + return 0; + } + } else if (ret < 0) { + cerr << "Error in updating reshard log with bucket \"" << + bucket_name << "\": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } // OPT_RESHARD_CANCEL + + if (opt_cmd == OPT::OBJECT_UNLINK) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + list oid_list; + rgw_obj_key key(object, object_version); + rgw_obj_index_key index_key; + key.get_index_key(&index_key); + oid_list.push_back(index_key); + + // note: under rados this removes directly from rados index objects + ret = bucket->remove_objs_from_index(dpp(), oid_list); + if (ret < 0) { + cerr << "ERROR: remove_obj_from_index() returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (opt_cmd == OPT::OBJECT_STAT) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + std::unique_ptr obj = bucket->get_object(object); + obj->set_instance(object_version); + + ret = obj->get_obj_attrs(null_yield, dpp()); + if (ret < 0) { + cerr << "ERROR: failed to stat object, returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + formatter->open_object_section("object_metadata"); + formatter->dump_string("name", object); + formatter->dump_unsigned("size", obj->get_obj_size()); + + map::iterator iter; + map other_attrs; + for (iter = obj->get_attrs().begin(); iter != obj->get_attrs().end(); ++iter) { + bufferlist& bl = iter->second; + bool handled = false; + if (iter->first == RGW_ATTR_MANIFEST) { + handled = decode_dump("manifest", bl, formatter.get()); + } else if (iter->first == RGW_ATTR_ACL) { + handled = decode_dump("policy", bl, formatter.get()); + } else 
if (iter->first == RGW_ATTR_ID_TAG) { + handled = dump_string("tag", bl, formatter.get()); + } else if (iter->first == RGW_ATTR_ETAG) { + handled = dump_string("etag", bl, formatter.get()); + } else if (iter->first == RGW_ATTR_COMPRESSION) { + handled = decode_dump("compression", bl, formatter.get()); + } else if (iter->first == RGW_ATTR_DELETE_AT) { + handled = decode_dump("delete_at", bl, formatter.get()); + } + + if (!handled) + other_attrs[iter->first] = bl; + } + + formatter->open_object_section("attrs"); + for (iter = other_attrs.begin(); iter != other_attrs.end(); ++iter) { + dump_string(iter->first.c_str(), iter->second, formatter.get()); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BUCKET_CHECK) { + if (check_head_obj_locator) { + if (bucket_name.empty()) { + cerr << "ERROR: need to specify bucket name" << std::endl; + return EINVAL; + } + do_check_object_locator(tenant, bucket_name, fix, remove_bad, formatter.get()); + } else { + RGWBucketAdminOp::check_index(driver, bucket_op, stream_flusher, null_yield, dpp()); + } + } + + if (opt_cmd == OPT::BUCKET_CHECK_OLH) { + rgw::sal::RadosStore* store = dynamic_cast(driver); + if (!store) { + cerr << + "WARNING: this command is only relevant when the cluster has a RADOS backing store." << + std::endl; + return 0; + } + RGWBucketAdminOp::check_index_olh(store, bucket_op, stream_flusher, dpp()); + } + + if (opt_cmd == OPT::BUCKET_CHECK_UNLINKED) { + rgw::sal::RadosStore* store = dynamic_cast(driver); + if (!store) { + cerr << + "WARNING: this command is only relevant when the cluster has a RADOS backing store." 
<< + std::endl; + return 0; + } + RGWBucketAdminOp::check_index_unlinked(store, bucket_op, stream_flusher, dpp()); + } + + if (opt_cmd == OPT::BUCKET_RM) { + if (!inconsistent_index) { + RGWBucketAdminOp::remove_bucket(driver, bucket_op, null_yield, dpp(), bypass_gc, true); + } else { + if (!yes_i_really_mean_it) { + cerr << "using --inconsistent_index can corrupt the bucket index " << std::endl + << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl; + return 1; + } + RGWBucketAdminOp::remove_bucket(driver, bucket_op, null_yield, dpp(), bypass_gc, false); + } + } + + if (opt_cmd == OPT::GC_LIST) { + int index = 0; + bool truncated; + bool processing_queue = false; + formatter->open_array_section("entries"); + + do { + list result; + int ret = static_cast(driver)->getRados()->list_gc_objs(&index, marker, 1000, !include_all, result, &truncated, processing_queue); + if (ret < 0) { + cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl; + return 1; + } + + + list::iterator iter; + for (iter = result.begin(); iter != result.end(); ++iter) { + cls_rgw_gc_obj_info& info = *iter; + formatter->open_object_section("chain_info"); + formatter->dump_string("tag", info.tag); + formatter->dump_stream("time") << info.time; + formatter->open_array_section("objs"); + list::iterator liter; + cls_rgw_obj_chain& chain = info.chain; + for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) { + cls_rgw_obj& obj = *liter; + encode_json("obj", obj, formatter.get()); + } + formatter->close_section(); // objs + formatter->close_section(); // obj_chain + formatter->flush(cout); + } + } while (truncated); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::GC_PROCESS) { + int ret = static_cast(driver)->getRados()->process_gc(!include_all); + if (ret < 0) { + cerr << "ERROR: gc processing returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (opt_cmd == OPT::LC_LIST) { + 
formatter->open_array_section("lifecycle_list"); + vector> bucket_lc_map; + string marker; + int index{0}; +#define MAX_LC_LIST_ENTRIES 100 + if (max_entries < 0) { + max_entries = MAX_LC_LIST_ENTRIES; + } + do { + int ret = static_cast(driver)->getRados()->list_lc_progress(marker, max_entries, + bucket_lc_map, index); + if (ret < 0) { + cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) + << std::endl; + return 1; + } + for (const auto& entry : bucket_lc_map) { + formatter->open_object_section("bucket_lc_info"); + formatter->dump_string("bucket", entry->get_bucket()); + formatter->dump_string("shard", entry->get_oid()); + char exp_buf[100]; + time_t t{time_t(entry->get_start_time())}; + if (std::strftime( + exp_buf, sizeof(exp_buf), + "%a, %d %b %Y %T %Z", std::gmtime(&t))) { + formatter->dump_string("started", exp_buf); + } + string lc_status = LC_STATUS[entry->get_status()]; + formatter->dump_string("status", lc_status); + formatter->close_section(); // objs + formatter->flush(cout); + } + } while (!bucket_lc_map.empty()); + + formatter->close_section(); //lifecycle list + formatter->flush(cout); + } + + + if (opt_cmd == OPT::LC_GET) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + RGWLifecycleConfiguration config; + ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto aiter = bucket->get_attrs().find(RGW_ATTR_LC); + if (aiter == bucket->get_attrs().end()) { + return -ENOENT; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + cerr << "ERROR: decode life cycle config failed" << std::endl; + return -EIO; + } + + encode_json("result", config, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::LC_PROCESS) { + if ((! bucket_name.empty()) || + (! 
bucket_id.empty())) { + int ret = init_bucket(nullptr, tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) + << std::endl; + return ret; + } + } + + int ret = + static_cast(driver)->getRados()->process_lc(bucket); + if (ret < 0) { + cerr << "ERROR: lc processing returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (opt_cmd == OPT::LC_RESHARD_FIX) { + ret = RGWBucketAdminOp::fix_lc_shards(driver, bucket_op, stream_flusher, dpp()); + if (ret < 0) { + cerr << "ERROR: fixing lc shards: " << cpp_strerror(-ret) << std::endl; + } + + } + + if (opt_cmd == OPT::ORPHANS_FIND) { + if (!yes_i_really_mean_it) { + cerr << "this command is now deprecated; please consider using the rgw-orphan-list tool; " + << "accidental removal of active objects cannot be reversed; " + << "do you really mean it? (requires --yes-i-really-mean-it)" + << std::endl; + return EINVAL; + } else { + cerr << "IMPORTANT: this command is now deprecated; please consider using the rgw-orphan-list tool" + << std::endl; + } + + RGWOrphanSearch search(static_cast(driver), max_concurrent_ios, orphan_stale_secs); + + if (job_id.empty()) { + cerr << "ERROR: --job-id not specified" << std::endl; + return EINVAL; + } + if (pool_name.empty()) { + cerr << "ERROR: --pool not specified" << std::endl; + return EINVAL; + } + + RGWOrphanSearchInfo info; + + info.pool = pool; + info.job_name = job_id; + info.num_shards = num_shards; + + int ret = search.init(dpp(), job_id, &info, detail); + if (ret < 0) { + cerr << "could not init search, ret=" << ret << std::endl; + return -ret; + } + ret = search.run(dpp()); + if (ret < 0) { + return -ret; + } + } + + if (opt_cmd == OPT::ORPHANS_FINISH) { + if (!yes_i_really_mean_it) { + cerr << "this command is now deprecated; please consider using the rgw-orphan-list tool; " + << "accidental removal of active objects cannot be reversed; " + << "do you really mean it? 
(requires --yes-i-really-mean-it)" + << std::endl; + return EINVAL; + } else { + cerr << "IMPORTANT: this command is now deprecated; please consider using the rgw-orphan-list tool" + << std::endl; + } + + RGWOrphanSearch search(static_cast(driver), max_concurrent_ios, orphan_stale_secs); + + if (job_id.empty()) { + cerr << "ERROR: --job-id not specified" << std::endl; + return EINVAL; + } + int ret = search.init(dpp(), job_id, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + cerr << "job not found" << std::endl; + } + return -ret; + } + ret = search.finish(); + if (ret < 0) { + return -ret; + } + } + + if (opt_cmd == OPT::ORPHANS_LIST_JOBS){ + if (!yes_i_really_mean_it) { + cerr << "this command is now deprecated; please consider using the rgw-orphan-list tool; " + << "do you really mean it? (requires --yes-i-really-mean-it)" + << std::endl; + return EINVAL; + } else { + cerr << "IMPORTANT: this command is now deprecated; please consider using the rgw-orphan-list tool" + << std::endl; + } + + RGWOrphanStore orphan_store(static_cast(driver)); + int ret = orphan_store.init(dpp()); + if (ret < 0){ + cerr << "connection to cluster failed!" 
<< std::endl; + return -ret; + } + + map m; + ret = orphan_store.list_jobs(m); + if (ret < 0) { + cerr << "job list failed" << std::endl; + return -ret; + } + formatter->open_array_section("entries"); + for (const auto &it: m){ + if (!extra_info){ + formatter->dump_string("job-id",it.first); + } else { + encode_json("orphan_search_state", it.second, formatter.get()); + } + } + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::USER_CHECK) { + check_bad_user_bucket_mapping(driver, *user.get(), fix, null_yield, dpp()); + } + + if (opt_cmd == OPT::USER_STATS) { + if (rgw::sal::User::empty(user)) { + cerr << "ERROR: uid not specified" << std::endl; + return EINVAL; + } + if (reset_stats) { + if (!bucket_name.empty()) { + cerr << "ERROR: --reset-stats does not work on buckets and " + "bucket specified" << std::endl; + return EINVAL; + } + if (sync_stats) { + cerr << "ERROR: sync-stats includes the reset-stats functionality, " + "so at most one of the two should be specified" << std::endl; + return EINVAL; + } + ret = static_cast(driver)->svc()->user->reset_bucket_stats(dpp(), user->get_id(), null_yield); + if (ret < 0) { + cerr << "ERROR: could not reset user stats: " << cpp_strerror(-ret) << + std::endl; + return -ret; + } + } + + if (sync_stats) { + if (!bucket_name.empty()) { + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = bucket->sync_user_stats(dpp(), null_yield); + if (ret < 0) { + cerr << "ERROR: could not sync bucket stats: " << + cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + int ret = rgw_user_sync_all_stats(dpp(), driver, user.get(), null_yield); + if (ret < 0) { + cerr << "ERROR: could not sync user stats: " << + cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + constexpr bool omit_utilized_stats = false; + RGWStorageStats 
stats(omit_utilized_stats); + ceph::real_time last_stats_sync; + ceph::real_time last_stats_update; + int ret = user->read_stats(dpp(), null_yield, &stats, &last_stats_sync, &last_stats_update); + if (ret < 0) { + if (ret == -ENOENT) { /* in case of ENOENT */ + cerr << "User has not been initialized or user does not exist" << std::endl; + } else { + cerr << "ERROR: can't read user: " << cpp_strerror(ret) << std::endl; + } + return -ret; + } + + + { + Formatter::ObjectSection os(*formatter, "result"); + encode_json("stats", stats, formatter.get()); + utime_t last_sync_ut(last_stats_sync); + encode_json("last_stats_sync", last_sync_ut, formatter.get()); + utime_t last_update_ut(last_stats_update); + encode_json("last_stats_update", last_update_ut, formatter.get()); + } + formatter->flush(cout); + } + + if (opt_cmd == OPT::METADATA_GET) { + int ret = static_cast(driver)->ctl()->meta.mgr->get(metadata_key, formatter.get(), null_yield, dpp()); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + formatter->flush(cout); + } + + if (opt_cmd == OPT::METADATA_PUT) { + bufferlist bl; + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = static_cast(driver)->ctl()->meta.mgr->put(metadata_key, bl, null_yield, dpp(), RGWMDLogSyncType::APPLY_ALWAYS, false); + if (ret < 0) { + cerr << "ERROR: can't put key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::METADATA_RM) { + int ret = static_cast(driver)->ctl()->meta.mgr->remove(metadata_key, null_yield, dpp()); + if (ret < 0) { + cerr << "ERROR: can't remove key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::METADATA_LIST || opt_cmd == OPT::USER_LIST) { + if (opt_cmd == OPT::USER_LIST) { + metadata_key = "user"; + } + void *handle; + int max = 1000; + int ret = driver->meta_list_keys_init(dpp(), 
metadata_key, marker, &handle); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bool truncated; + uint64_t count = 0; + + if (max_entries_specified) { + formatter->open_object_section("result"); + } + formatter->open_array_section("keys"); + + uint64_t left; + do { + list keys; + left = (max_entries_specified ? max_entries - count : max); + ret = driver->meta_list_keys_next(dpp(), handle, left, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } if (ret != -ENOENT) { + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + formatter->dump_string("key", *iter); + ++count; + } + formatter->flush(cout); + } + } while (truncated && left > 0); + + formatter->close_section(); + + if (max_entries_specified) { + encode_json("truncated", truncated, formatter.get()); + encode_json("count", count, formatter.get()); + if (truncated) { + encode_json("marker", driver->meta_get_marker(handle), formatter.get()); + } + formatter->close_section(); + } + formatter->flush(cout); + + driver->meta_list_keys_complete(handle); + } + + if (opt_cmd == OPT::MDLOG_LIST) { + if (!start_date.empty()) { + std::cerr << "start-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_date.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_marker.empty()) { + std::cerr << "end-marker not allowed." << std::endl; + return -EINVAL; + } + if (!start_marker.empty()) { + if (marker.empty()) { + marker = start_marker; + } else { + std::cerr << "start-marker and marker not both allowed." << std::endl; + return -EINVAL; + } + } + + int i = (specified_shard_id ? 
shard_id : 0); + + if (period_id.empty()) { + // use realm's current period + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 ) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + period_id = realm.current_period; + std::cerr << "No --period given, using current period=" + << period_id << std::endl; + } + RGWMetadataLog *meta_log = static_cast(driver)->svc()->mdlog->get_log(period_id); + + formatter->open_array_section("entries"); + for (; i < g_ceph_context->_conf->rgw_md_log_max_shards; i++) { + void *handle; + list entries; + + meta_log->init_list_entries(i, {}, {}, marker, &handle); + bool truncated; + do { + int ret = meta_log->list_entries(dpp(), handle, 1000, entries, NULL, &truncated); + if (ret < 0) { + cerr << "ERROR: meta_log->list_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + cls_log_entry& entry = *iter; + static_cast(driver)->ctl()->meta.mgr->dump_log_entry(entry, formatter.get()); + } + formatter->flush(cout); + } while (truncated); + + meta_log->complete_list_entries(handle); + + if (specified_shard_id) + break; + } + + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::MDLOG_STATUS) { + int i = (specified_shard_id ? 
shard_id : 0); + + if (period_id.empty()) { + // use realm's current period + RGWRealm realm; + int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(), + realm_id, realm_name, realm); + if (ret < 0 ) { + cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + period_id = realm.current_period; + std::cerr << "No --period given, using current period=" + << period_id << std::endl; + } + RGWMetadataLog *meta_log = static_cast(driver)->svc()->mdlog->get_log(period_id); + + formatter->open_array_section("entries"); + + for (; i < g_ceph_context->_conf->rgw_md_log_max_shards; i++) { + RGWMetadataLogInfo info; + meta_log->get_info(dpp(), i, &info); + + ::encode_json("info", info, formatter.get()); + + if (specified_shard_id) + break; + } + + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::MDLOG_AUTOTRIM) { + // need a full history for purging old mdlog periods + static_cast(driver)->svc()->mdlog->init_oldest_log_period(null_yield, dpp()); + + RGWCoroutinesManager crs(driver->ctx(), driver->get_cr_registry()); + RGWHTTPManager http(driver->ctx(), crs.get_completion_mgr()); + int ret = http.start(); + if (ret < 0) { + cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + auto num_shards = g_conf()->rgw_md_log_max_shards; + auto mltcr = create_admin_meta_log_trim_cr( + dpp(), static_cast(driver), &http, num_shards); + if (!mltcr) { + cerr << "Cluster misconfigured! Unable to trim." << std::endl; + return -EIO; + } + ret = crs.run(dpp(), mltcr); + if (ret < 0) { + cerr << "automated mdlog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::MDLOG_TRIM) { + if (!start_date.empty()) { + std::cerr << "start-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_date.empty()) { + std::cerr << "end-date not allowed." 
<< std::endl; + return -EINVAL; + } + if (!start_marker.empty()) { + std::cerr << "start-marker not allowed." << std::endl; + return -EINVAL; + } + if (!end_marker.empty()) { + if (marker.empty()) { + marker = end_marker; + } else { + std::cerr << "end-marker and marker not both allowed." << std::endl; + return -EINVAL; + } + } + + if (!specified_shard_id) { + cerr << "ERROR: shard-id must be specified for trim operation" << std::endl; + return EINVAL; + } + + if (marker.empty()) { + cerr << "ERROR: marker must be specified for trim operation" << std::endl; + return EINVAL; + } + + if (period_id.empty()) { + std::cerr << "missing --period argument" << std::endl; + return EINVAL; + } + RGWMetadataLog *meta_log = static_cast(driver)->svc()->mdlog->get_log(period_id); + + // trim until -ENODATA + do { + ret = meta_log->trim(dpp(), shard_id, {}, {}, {}, marker); + } while (ret == 0); + if (ret < 0 && ret != -ENODATA) { + cerr << "ERROR: meta_log->trim(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::SYNC_INFO) { + sync_info(opt_effective_zone_id, opt_bucket, zone_formatter.get()); + } + + if (opt_cmd == OPT::SYNC_STATUS) { + sync_status(formatter.get()); + } + + if (opt_cmd == OPT::METADATA_SYNC_STATUS) { + RGWMetaSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor()); + + int ret = sync.init(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + rgw_meta_sync_status sync_status; + ret = sync.read_sync_status(dpp(), &sync_status); + if (ret < 0) { + cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + + formatter->open_object_section("summary"); + encode_json("sync_status", sync_status, formatter.get()); + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + if 
(marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) { + full_complete += marker_iter.second.pos; + } else { + full_complete += marker_iter.second.total_entries; + } + } + + formatter->open_object_section("full_sync"); + encode_json("total", full_total, formatter.get()); + encode_json("complete", full_complete, formatter.get()); + formatter->close_section(); + formatter->dump_string("current_time", + to_iso_8601(ceph::real_clock::now(), + iso_8601_format::YMDhms)); + formatter->close_section(); + + formatter->flush(cout); + + } + + if (opt_cmd == OPT::METADATA_SYNC_INIT) { + RGWMetaSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor()); + + int ret = sync.init(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + ret = sync.init_sync_status(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + } + + + if (opt_cmd == OPT::METADATA_SYNC_RUN) { + RGWMetaSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor()); + + int ret = sync.init(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + ret = sync.run(dpp(), null_yield); + if (ret < 0) { + cerr << "ERROR: sync.run() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::DATA_SYNC_STATUS) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + RGWDataSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor(), source_zone, nullptr); + + int ret = sync.init(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + rgw_data_sync_status sync_status; + if (specified_shard_id) { + set pending_buckets; + set recovering_buckets; + rgw_data_sync_marker 
sync_marker; + ret = sync.read_shard_status(dpp(), shard_id, pending_buckets, recovering_buckets, &sync_marker, + max_entries_specified ? max_entries : 20); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: sync.read_shard_status() returned ret=" << ret << std::endl; + return -ret; + } + formatter->open_object_section("summary"); + encode_json("shard_id", shard_id, formatter.get()); + encode_json("marker", sync_marker, formatter.get()); + encode_json("pending_buckets", pending_buckets, formatter.get()); + encode_json("recovering_buckets", recovering_buckets, formatter.get()); + formatter->dump_string("current_time", + to_iso_8601(ceph::real_clock::now(), + iso_8601_format::YMDhms)); + formatter->close_section(); + formatter->flush(cout); + } else { + ret = sync.read_sync_status(dpp(), &sync_status); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + + formatter->open_object_section("summary"); + encode_json("sync_status", sync_status, formatter.get()); + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) { + full_complete += marker_iter.second.pos; + } else { + full_complete += marker_iter.second.total_entries; + } + } + + formatter->open_object_section("full_sync"); + encode_json("total", full_total, formatter.get()); + encode_json("complete", full_complete, formatter.get()); + formatter->close_section(); + formatter->dump_string("current_time", + to_iso_8601(ceph::real_clock::now(), + iso_8601_format::YMDhms)); + formatter->close_section(); + + formatter->flush(cout); + } + } + + if (opt_cmd == OPT::DATA_SYNC_INIT) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + + RGWDataSyncStatusManager sync(static_cast(driver), 
static_cast(driver)->svc()->rados->get_async_processor(), source_zone, nullptr); + + int ret = sync.init(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + ret = sync.init_sync_status(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::DATA_SYNC_RUN) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + + RGWSyncModuleInstanceRef sync_module; + int ret = static_cast(driver)->svc()->sync_modules->get_manager()->create_instance(dpp(), g_ceph_context, static_cast(driver)->svc()->zone->get_zone().tier_type, + static_cast(driver)->svc()->zone->get_zone_params().tier_config, &sync_module); + if (ret < 0) { + ldpp_dout(dpp(), -1) << "ERROR: failed to init sync module instance, ret=" << ret << dendl; + return ret; + } + + RGWDataSyncStatusManager sync(static_cast(driver), static_cast(driver)->svc()->rados->get_async_processor(), source_zone, nullptr, sync_module); + + ret = sync.init(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + ret = sync.run(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.run() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BUCKET_SYNC_INIT) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket_for_sync(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + auto opt_sb = opt_source_bucket; + if (opt_sb && opt_sb->bucket_id.empty()) { + string sbid; + std::unique_ptr sbuck; + int ret = init_bucket_for_sync(user.get(), opt_sb->tenant, opt_sb->name, sbid, &sbuck); + if (ret < 0) { + return -ret; + } + opt_sb = 
sbuck->get_key(); + } + + auto sync = RGWBucketPipeSyncStatusManager::construct( + dpp(), static_cast(driver), source_zone, opt_sb, + bucket->get_key(), extra_info ? &std::cout : nullptr); + + if (!sync) { + cerr << "ERROR: sync.init() returned error=" << sync.error() << std::endl; + return -sync.error(); + } + ret = (*sync)->init_sync_status(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BUCKET_SYNC_CHECKPOINT) { + std::optional opt_source_zone; + if (!source_zone.empty()) { + opt_source_zone = source_zone; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + + if (!static_cast(driver)->ctl()->bucket->bucket_imports_data(bucket->get_key(), null_yield, dpp())) { + std::cout << "Sync is disabled for bucket " << bucket_name << std::endl; + return 0; + } + + RGWBucketSyncPolicyHandlerRef handler; + ret = driver->get_sync_policy_handler(dpp(), std::nullopt, bucket->get_key(), &handler, null_yield); + if (ret < 0) { + std::cerr << "ERROR: failed to get policy handler for bucket (" + << bucket << "): r=" << ret << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto timeout_at = ceph::coarse_mono_clock::now() + opt_timeout_sec; + ret = rgw_bucket_sync_checkpoint(dpp(), static_cast(driver), *handler, bucket->get_info(), + opt_source_zone, opt_source_bucket, + opt_retry_delay_ms, timeout_at); + if (ret < 0) { + ldpp_dout(dpp(), -1) << "bucket sync checkpoint failed: " << cpp_strerror(ret) << dendl; + return -ret; + } + } + + if ((opt_cmd == OPT::BUCKET_SYNC_DISABLE) || (opt_cmd == OPT::BUCKET_SYNC_ENABLE)) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (opt_cmd == OPT::BUCKET_SYNC_DISABLE) { + 
bucket_op.set_sync_bucket(false); + } else { + bucket_op.set_sync_bucket(true); + } + bucket_op.set_tenant(tenant); + string err_msg; + ret = RGWBucketAdminOp::sync_bucket(driver, bucket_op, dpp(), &err_msg); + if (ret < 0) { + cerr << err_msg << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BUCKET_SYNC_INFO) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + bucket_sync_info(driver, bucket->get_info(), std::cout); + } + + if (opt_cmd == OPT::BUCKET_SYNC_STATUS) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + bucket_sync_status(driver, bucket->get_info(), source_zone, opt_source_bucket, std::cout); + } + + if (opt_cmd == OPT::BUCKET_SYNC_MARKERS) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket_for_sync(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + auto sync = RGWBucketPipeSyncStatusManager::construct( + dpp(), static_cast(driver), source_zone, + opt_source_bucket, bucket->get_key(), nullptr); + + if (!sync) { + cerr << "ERROR: sync.init() returned error=" << sync.error() << std::endl; + return -sync.error(); + } + + auto sync_status = (*sync)->read_sync_status(dpp()); + if (!sync_status) { + cerr << "ERROR: sync.read_sync_status() returned error=" + << sync_status.error() << std::endl; + return -sync_status.error(); + } + + encode_json("sync_status", *sync_status, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BUCKET_SYNC_RUN) { + if (source_zone.empty()) { 
+ cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket_for_sync(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + auto sync = RGWBucketPipeSyncStatusManager::construct( + dpp(), static_cast(driver), source_zone, + opt_source_bucket, bucket->get_key(), extra_info ? &std::cout : nullptr); + + if (!sync) { + cerr << "ERROR: sync.init() returned error=" << sync.error() << std::endl; + return -sync.error(); + } + + ret = (*sync)->run(dpp()); + if (ret < 0) { + cerr << "ERROR: sync.run() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BILOG_LIST) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_array_section("entries"); + bool truncated; + int count = 0; + if (max_entries < 0) + max_entries = 1000; + + const auto& logs = bucket->get_info().layout.logs; + auto log_layout = std::reference_wrapper{logs.back()}; + if (gen) { + auto i = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen)); + if (i == logs.end()) { + cerr << "ERROR: no log layout with gen=" << *gen << std::endl; + return ENOENT; + } + log_layout = *i; + } + + do { + list entries; + ret = static_cast(driver)->svc()->bilog_rados->log_list(dpp(), bucket->get_info(), log_layout, shard_id, marker, max_entries - count, entries, &truncated); + if (ret < 0) { + cerr << "ERROR: list_bi_log_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += entries.size(); + + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_bi_log_entry& entry = *iter; + encode_json("entry", 
entry, formatter.get()); + + marker = entry.id; + } + formatter->flush(cout); + } while (truncated && count < max_entries); + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::SYNC_ERROR_LIST) { + if (max_entries < 0) { + max_entries = 1000; + } + if (!start_date.empty()) { + std::cerr << "start-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_date.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_marker.empty()) { + std::cerr << "end-marker not allowed." << std::endl; + return -EINVAL; + } + if (!start_marker.empty()) { + if (marker.empty()) { + marker = start_marker; + } else { + std::cerr << "start-marker and marker not both allowed." << std::endl; + return -EINVAL; + } + } + + bool truncated; + + if (shard_id < 0) { + shard_id = 0; + } + + formatter->open_array_section("entries"); + + for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) { + formatter->open_object_section("shard"); + encode_json("shard_id", shard_id, formatter.get()); + formatter->open_array_section("entries"); + + int count = 0; + string oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX, shard_id); + + do { + list entries; + ret = static_cast(driver)->svc()->cls->timelog.list(dpp(), oid, {}, {}, max_entries - count, entries, marker, &marker, &truncated, + null_yield); + if (ret == -ENOENT) { + break; + } + if (ret < 0) { + cerr << "ERROR: svc.cls->timelog.list(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += entries.size(); + + for (auto& cls_entry : entries) { + rgw_sync_error_info log_entry; + + auto iter = cls_entry.data.cbegin(); + try { + decode(log_entry, iter); + } catch (buffer::error& err) { + cerr << "ERROR: failed to decode log entry" << std::endl; + continue; + } + formatter->open_object_section("entry"); + encode_json("id", cls_entry.id, formatter.get()); + encode_json("section", cls_entry.section, formatter.get()); + encode_json("name", 
cls_entry.name, formatter.get()); + encode_json("timestamp", cls_entry.timestamp, formatter.get()); + encode_json("info", log_entry, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + } while (truncated && count < max_entries); + + formatter->close_section(); + formatter->close_section(); + + if (specified_shard_id) { + break; + } + } + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::SYNC_ERROR_TRIM) { + if (!start_date.empty()) { + std::cerr << "start-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_date.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + if (!start_marker.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_marker.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + + if (shard_id < 0) { + shard_id = 0; + } + + for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) { + ret = trim_sync_error_log(shard_id, marker, trim_delay_ms); + if (ret < 0) { + cerr << "ERROR: sync error trim: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (specified_shard_id) { + break; + } + } + } + + if (opt_cmd == OPT::SYNC_GROUP_CREATE || + opt_cmd == OPT::SYNC_GROUP_MODIFY) { + CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL); + CHECK_TRUE(require_opt(opt_status), "ERROR: --status is not specified (options: forbidden, allowed, enabled)", EINVAL); + + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + if (opt_cmd == OPT::SYNC_GROUP_MODIFY) { + auto iter = sync_policy.groups.find(*opt_group_id); + if (iter == sync_policy.groups.end()) { + cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl; + return ENOENT; + } + } + + auto& group = 
sync_policy.groups[*opt_group_id]; + group.id = *opt_group_id; + + if (opt_status) { + if (!group.set_status(*opt_status)) { + cerr << "ERROR: unrecognized status (options: forbidden, allowed, enabled)" << std::endl; + return EINVAL; + } + } + + ret = sync_policy_ctx.write_policy(); + if (ret < 0) { + return -ret; + } + + show_result(sync_policy, zone_formatter.get(), cout); + } + + if (opt_cmd == OPT::SYNC_GROUP_GET) { + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + auto& groups = sync_policy.groups; + + if (!opt_group_id) { + show_result(groups, zone_formatter.get(), cout); + } else { + auto iter = sync_policy.groups.find(*opt_group_id); + if (iter == sync_policy.groups.end()) { + cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl; + return ENOENT; + } + + show_result(iter->second, zone_formatter.get(), cout); + } + } + + if (opt_cmd == OPT::SYNC_GROUP_REMOVE) { + CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL); + + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + sync_policy.groups.erase(*opt_group_id); + + ret = sync_policy_ctx.write_policy(); + if (ret < 0) { + return -ret; + } + + { + Formatter::ObjectSection os(*zone_formatter.get(), "result"); + encode_json("sync_policy", sync_policy, zone_formatter.get()); + } + + zone_formatter->flush(cout); + } + + if (opt_cmd == OPT::SYNC_GROUP_FLOW_CREATE) { + CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_flow_id), "ERROR: --flow-id not specified", EINVAL); + CHECK_TRUE(require_opt(opt_flow_type), + "ERROR: --flow-type not specified 
(options: symmetrical, directional)", EINVAL); + CHECK_TRUE((symmetrical_flow_opt(*opt_flow_type) || + directional_flow_opt(*opt_flow_type)), + "ERROR: --flow-type invalid (options: symmetrical, directional)", EINVAL); + + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + auto iter = sync_policy.groups.find(*opt_group_id); + if (iter == sync_policy.groups.end()) { + cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl; + return ENOENT; + } + + auto& group = iter->second; + + if (symmetrical_flow_opt(*opt_flow_type)) { + CHECK_TRUE(require_non_empty_opt(opt_zone_ids), "ERROR: --zones not provided for symmetrical flow, or is empty", EINVAL); + + rgw_sync_symmetric_group *flow_group; + + group.data_flow.find_or_create_symmetrical(*opt_flow_id, &flow_group); + + for (auto& z : *opt_zone_ids) { + flow_group->zones.insert(z); + } + } else { /* directional */ + CHECK_TRUE(require_non_empty_opt(opt_source_zone_id), "ERROR: --source-zone not provided for directional flow rule, or is empty", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_dest_zone_id), "ERROR: --dest-zone not provided for directional flow rule, or is empty", EINVAL); + + rgw_sync_directional_rule *flow_rule; + + group.data_flow.find_or_create_directional(*opt_source_zone_id, *opt_dest_zone_id, &flow_rule); + } + + ret = sync_policy_ctx.write_policy(); + if (ret < 0) { + return -ret; + } + + show_result(sync_policy, zone_formatter.get(), cout); + } + + if (opt_cmd == OPT::SYNC_GROUP_FLOW_REMOVE) { + CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_flow_id), "ERROR: --flow-id not specified", EINVAL); + CHECK_TRUE(require_opt(opt_flow_type), + "ERROR: --flow-type not specified (options: symmetrical, directional)", EINVAL); + 
CHECK_TRUE((symmetrical_flow_opt(*opt_flow_type) || + directional_flow_opt(*opt_flow_type)), + "ERROR: --flow-type invalid (options: symmetrical, directional)", EINVAL); + + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + auto iter = sync_policy.groups.find(*opt_group_id); + if (iter == sync_policy.groups.end()) { + cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl; + return ENOENT; + } + + auto& group = iter->second; + + if (symmetrical_flow_opt(*opt_flow_type)) { + group.data_flow.remove_symmetrical(*opt_flow_id, opt_zone_ids); + } else { /* directional */ + CHECK_TRUE(require_non_empty_opt(opt_source_zone_id), "ERROR: --source-zone not provided for directional flow rule, or is empty", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_dest_zone_id), "ERROR: --dest-zone not provided for directional flow rule, or is empty", EINVAL); + + group.data_flow.remove_directional(*opt_source_zone_id, *opt_dest_zone_id); + } + + ret = sync_policy_ctx.write_policy(); + if (ret < 0) { + return -ret; + } + + show_result(sync_policy, zone_formatter.get(), cout); + } + + if (opt_cmd == OPT::SYNC_GROUP_PIPE_CREATE || + opt_cmd == OPT::SYNC_GROUP_PIPE_MODIFY) { + CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_pipe_id), "ERROR: --pipe-id not specified", EINVAL); + if (opt_cmd == OPT::SYNC_GROUP_PIPE_CREATE) { + CHECK_TRUE(require_non_empty_opt(opt_source_zone_ids), "ERROR: --source-zones not provided or is empty; should be list of zones or '*'", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_dest_zone_ids), "ERROR: --dest-zones not provided or is empty; should be list of zones or '*'", EINVAL); + } + + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, 
zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + auto iter = sync_policy.groups.find(*opt_group_id); + if (iter == sync_policy.groups.end()) { + cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl; + return ENOENT; + } + + auto& group = iter->second; + + rgw_sync_bucket_pipes *pipe; + + if (opt_cmd == OPT::SYNC_GROUP_PIPE_CREATE) { + group.find_pipe(*opt_pipe_id, true, &pipe); + } else { + if (!group.find_pipe(*opt_pipe_id, false, &pipe)) { + cerr << "ERROR: could not find pipe '" << *opt_pipe_id << "'" << std::endl; + return ENOENT; + } + } + + if (opt_source_zone_ids) { + pipe->source.add_zones(*opt_source_zone_ids); + } + pipe->source.set_bucket(opt_source_tenant, + opt_source_bucket_name, + opt_source_bucket_id); + if (opt_dest_zone_ids) { + pipe->dest.add_zones(*opt_dest_zone_ids); + } + pipe->dest.set_bucket(opt_dest_tenant, + opt_dest_bucket_name, + opt_dest_bucket_id); + + pipe->params.source.filter.set_prefix(opt_prefix, !!opt_prefix_rm); + pipe->params.source.filter.set_tags(tags_add, tags_rm); + if (opt_dest_owner) { + pipe->params.dest.set_owner(*opt_dest_owner); + } + if (opt_storage_class) { + pipe->params.dest.set_storage_class(*opt_storage_class); + } + if (opt_priority) { + pipe->params.priority = *opt_priority; + } + if (opt_mode) { + if (*opt_mode == "system") { + pipe->params.mode = rgw_sync_pipe_params::MODE_SYSTEM; + } else if (*opt_mode == "user") { + pipe->params.mode = rgw_sync_pipe_params::MODE_USER; + } else { + cerr << "ERROR: bad mode value: should be one of the following: system, user" << std::endl; + return EINVAL; + } + } + + if (!rgw::sal::User::empty(user)) { + pipe->params.user = user->get_id(); + } else if (pipe->params.user.empty()) { + auto owner = sync_policy_ctx.get_owner(); + if (owner) { + pipe->params.user = *owner; + } + } + + ret = sync_policy_ctx.write_policy(); + if (ret < 0) { + return -ret; + } + + show_result(sync_policy, 
zone_formatter.get(), cout); + } + + if (opt_cmd == OPT::SYNC_GROUP_PIPE_REMOVE) { + CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL); + CHECK_TRUE(require_non_empty_opt(opt_pipe_id), "ERROR: --pipe-id not specified", EINVAL); + + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + auto iter = sync_policy.groups.find(*opt_group_id); + if (iter == sync_policy.groups.end()) { + cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl; + return ENOENT; + } + + auto& group = iter->second; + + rgw_sync_bucket_pipes *pipe; + + if (!group.find_pipe(*opt_pipe_id, false, &pipe)) { + cerr << "ERROR: could not find pipe '" << *opt_pipe_id << "'" << std::endl; + return ENOENT; + } + + if (opt_source_zone_ids) { + pipe->source.remove_zones(*opt_source_zone_ids); + } + + pipe->source.remove_bucket(opt_source_tenant, + opt_source_bucket_name, + opt_source_bucket_id); + if (opt_dest_zone_ids) { + pipe->dest.remove_zones(*opt_dest_zone_ids); + } + pipe->dest.remove_bucket(opt_dest_tenant, + opt_dest_bucket_name, + opt_dest_bucket_id); + + if (!(opt_source_zone_ids || + opt_source_tenant || + opt_source_bucket || + opt_source_bucket_id || + opt_dest_zone_ids || + opt_dest_tenant || + opt_dest_bucket || + opt_dest_bucket_id)) { + group.remove_pipe(*opt_pipe_id); + } + + ret = sync_policy_ctx.write_policy(); + if (ret < 0) { + return -ret; + } + + show_result(sync_policy, zone_formatter.get(), cout); + } + + if (opt_cmd == OPT::SYNC_POLICY_GET) { + SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket); + ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name); + if (ret < 0) { + return -ret; + } + auto& sync_policy = sync_policy_ctx.get_policy(); + + show_result(sync_policy, zone_formatter.get(), cout); + } + + if (opt_cmd == OPT::BILOG_TRIM) { + if 
(bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (!gen) { + gen = 0; + } + ret = bilog_trim(dpp(), static_cast(driver), + bucket->get_info(), *gen, + shard_id, start_marker, end_marker); + if (ret < 0) { + cerr << "ERROR: trim_bi_log_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::BILOG_STATUS) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + map markers; + const auto& logs = bucket->get_info().layout.logs; + auto log_layout = std::reference_wrapper{logs.back()}; + if (gen) { + auto i = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen)); + if (i == logs.end()) { + cerr << "ERROR: no log layout with gen=" << *gen << std::endl; + return ENOENT; + } + log_layout = *i; + } + + ret = static_cast(driver)->svc()->bilog_rados->get_log_status(dpp(), bucket->get_info(), log_layout, shard_id, + &markers, null_yield); + if (ret < 0) { + cerr << "ERROR: get_bi_log_status(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("entries"); + encode_json("markers", markers, formatter.get()); + formatter->dump_string("current_time", + to_iso_8601(ceph::real_clock::now(), + iso_8601_format::YMDhms)); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::BILOG_AUTOTRIM) { + RGWCoroutinesManager crs(driver->ctx(), driver->get_cr_registry()); + RGWHTTPManager http(driver->ctx(), crs.get_completion_mgr()); + int ret = http.start(); + if (ret < 0) { + cerr << "failed to 
initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + rgw::BucketTrimConfig config; + configure_bucket_trim(driver->ctx(), config); + + rgw::BucketTrimManager trim(static_cast(driver), config); + ret = trim.init(); + if (ret < 0) { + cerr << "trim manager init failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + ret = crs.run(dpp(), trim.create_admin_bucket_trim_cr(&http)); + if (ret < 0) { + cerr << "automated bilog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::DATALOG_LIST) { + formatter->open_array_section("entries"); + bool truncated; + int count = 0; + if (max_entries < 0) + max_entries = 1000; + if (!start_date.empty()) { + std::cerr << "start-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_date.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_marker.empty()) { + std::cerr << "end-marker not allowed." << std::endl; + return -EINVAL; + } + if (!start_marker.empty()) { + if (marker.empty()) { + marker = start_marker; + } else { + std::cerr << "start-marker and marker not both allowed." 
<< std::endl; + return -EINVAL; + } + } + + auto datalog_svc = static_cast(driver)->svc()->datalog_rados; + RGWDataChangesLog::LogMarker log_marker; + + do { + std::vector entries; + if (specified_shard_id) { + ret = datalog_svc->list_entries(dpp(), shard_id, max_entries - count, + entries, marker, + &marker, &truncated, + null_yield); + } else { + ret = datalog_svc->list_entries(dpp(), max_entries - count, entries, + log_marker, &truncated, null_yield); + } + if (ret < 0) { + cerr << "ERROR: datalog_svc->list_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += entries.size(); + + for (const auto& entry : entries) { + if (!extra_info) { + encode_json("entry", entry.entry, formatter.get()); + } else { + encode_json("entry", entry, formatter.get()); + } + } + formatter.get()->flush(cout); + } while (truncated && count < max_entries); + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::DATALOG_STATUS) { + int i = (specified_shard_id ? 
shard_id : 0); + + formatter->open_array_section("entries"); + for (; i < g_ceph_context->_conf->rgw_data_log_num_shards; i++) { + list entries; + + RGWDataChangesLogInfo info; + static_cast(driver)->svc()-> + datalog_rados->get_info(dpp(), i, &info, null_yield); + + ::encode_json("info", info, formatter.get()); + + if (specified_shard_id) + break; + } + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::DATALOG_AUTOTRIM) { + RGWCoroutinesManager crs(driver->ctx(), driver->get_cr_registry()); + RGWHTTPManager http(driver->ctx(), crs.get_completion_mgr()); + int ret = http.start(); + if (ret < 0) { + cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + auto num_shards = g_conf()->rgw_data_log_num_shards; + std::vector markers(num_shards); + ret = crs.run(dpp(), create_admin_data_log_trim_cr(dpp(), static_cast(driver), &http, num_shards, markers)); + if (ret < 0) { + cerr << "automated datalog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::DATALOG_TRIM) { + if (!start_date.empty()) { + std::cerr << "start-date not allowed." << std::endl; + return -EINVAL; + } + if (!end_date.empty()) { + std::cerr << "end-date not allowed." << std::endl; + return -EINVAL; + } + if (!start_marker.empty()) { + std::cerr << "start-marker not allowed." << std::endl; + return -EINVAL; + } + if (!end_marker.empty()) { + if (marker.empty()) { + marker = end_marker; + } else { + std::cerr << "end-marker and marker not both allowed." 
<< std::endl; + return -EINVAL; + } + } + + if (!specified_shard_id) { + cerr << "ERROR: requires a --shard-id" << std::endl; + return EINVAL; + } + + if (marker.empty()) { + cerr << "ERROR: requires a --marker" << std::endl; + return EINVAL; + } + + auto datalog = static_cast(driver)->svc()->datalog_rados; + ret = datalog->trim_entries(dpp(), shard_id, marker, null_yield); + + if (ret < 0 && ret != -ENODATA) { + cerr << "ERROR: trim_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::DATALOG_TYPE) { + if (!opt_log_type) { + std::cerr << "log-type not specified." << std::endl; + return -EINVAL; + } + auto datalog = static_cast(driver)->svc()->datalog_rados; + ret = datalog->change_format(dpp(), *opt_log_type, null_yield); + if (ret < 0) { + cerr << "ERROR: change_format(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::DATALOG_PRUNE) { + auto datalog = static_cast(driver)->svc()->datalog_rados; + std::optional through; + ret = datalog->trim_generations(dpp(), through, null_yield); + + if (ret < 0) { + cerr << "ERROR: trim_generations(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (through) { + std::cout << "Pruned " << *through << " empty generations." << std::endl; + } else { + std::cout << "No empty generations." << std::endl; + } + } + + bool quota_op = (opt_cmd == OPT::QUOTA_SET || opt_cmd == OPT::QUOTA_ENABLE || opt_cmd == OPT::QUOTA_DISABLE); + + if (quota_op) { + if (bucket_name.empty() && rgw::sal::User::empty(user)) { + cerr << "ERROR: bucket name or uid is required for quota operation" << std::endl; + return EINVAL; + } + + if (!bucket_name.empty()) { + if (!quota_scope.empty() && quota_scope != "bucket") { + cerr << "ERROR: invalid quota scope specification." 
<< std::endl; + return EINVAL; + } + set_bucket_quota(driver, opt_cmd, tenant, bucket_name, + max_size, max_objects, have_max_size, have_max_objects); + } else if (!rgw::sal::User::empty(user)) { + if (quota_scope == "bucket") { + return set_user_bucket_quota(opt_cmd, ruser, user_op, max_size, max_objects, have_max_size, have_max_objects); + } else if (quota_scope == "user") { + return set_user_quota(opt_cmd, ruser, user_op, max_size, max_objects, have_max_size, have_max_objects); + } else { + cerr << "ERROR: invalid quota scope specification. Please specify either --quota-scope=bucket, or --quota-scope=user" << std::endl; + return EINVAL; + } + } + } + + bool ratelimit_op_set = (opt_cmd == OPT::RATELIMIT_SET || opt_cmd == OPT::RATELIMIT_ENABLE || opt_cmd == OPT::RATELIMIT_DISABLE); + bool ratelimit_op_get = opt_cmd == OPT::RATELIMIT_GET; + if (ratelimit_op_set) { + if (bucket_name.empty() && rgw::sal::User::empty(user)) { + cerr << "ERROR: bucket name or uid is required for ratelimit operation" << std::endl; + return EINVAL; + } + + if (!bucket_name.empty()) { + if (!ratelimit_scope.empty() && ratelimit_scope != "bucket") { + cerr << "ERROR: invalid ratelimit scope specification. (bucket scope is not bucket but bucket has been specified)" << std::endl; + return EINVAL; + } + return set_bucket_ratelimit(driver, opt_cmd, tenant, bucket_name, + max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + } else if (!rgw::sal::User::empty(user)) { + } if (ratelimit_scope == "user") { + return set_user_ratelimit(opt_cmd, user, max_read_ops, max_write_ops, + max_read_bytes, max_write_bytes, + have_max_read_ops, have_max_write_ops, + have_max_read_bytes, have_max_write_bytes); + } else { + cerr << "ERROR: invalid ratelimit scope specification. 
Please specify either --ratelimit-scope=bucket, or --ratelimit-scope=user" << std::endl; + return EINVAL; + } + } + + if (ratelimit_op_get) { + if (bucket_name.empty() && rgw::sal::User::empty(user)) { + cerr << "ERROR: bucket name or uid is required for ratelimit operation" << std::endl; + return EINVAL; + } + + if (!bucket_name.empty()) { + if (!ratelimit_scope.empty() && ratelimit_scope != "bucket") { + cerr << "ERROR: invalid ratelimit scope specification. (bucket scope is not bucket but bucket has been specified)" << std::endl; + return EINVAL; + } + return show_bucket_ratelimit(driver, tenant, bucket_name, formatter.get()); + } else if (!rgw::sal::User::empty(user)) { + } if (ratelimit_scope == "user") { + return show_user_ratelimit(user, formatter.get()); + } else { + cerr << "ERROR: invalid ratelimit scope specification. Please specify either --ratelimit-scope=bucket, or --ratelimit-scope=user" << std::endl; + return EINVAL; + } + } + + if (opt_cmd == OPT::MFA_CREATE) { + rados::cls::otp::otp_info_t config; + + if (rgw::sal::User::empty(user)) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + if (totp_seed.empty()) { + cerr << "ERROR: TOTP device seed was not provided (via --totp-seed)" << std::endl; + return EINVAL; + } + + + rados::cls::otp::SeedType seed_type; + if (totp_seed_type == "hex") { + seed_type = rados::cls::otp::OTP_SEED_HEX; + } else if (totp_seed_type == "base32") { + seed_type = rados::cls::otp::OTP_SEED_BASE32; + } else { + cerr << "ERROR: invalid seed type: " << totp_seed_type << std::endl; + return EINVAL; + } + + config.id = totp_serial; + config.seed = totp_seed; + config.seed_type = seed_type; + + if (totp_seconds > 0) { + config.step_size = totp_seconds; + } + + if (totp_window > 0) { + config.window = totp_window; + } + + real_time mtime = 
real_clock::now(); + string oid = static_cast(driver)->svc()->cls->mfa.get_mfa_oid(user->get_id()); + + int ret = static_cast(driver)->ctl()->meta.mgr->mutate(RGWSI_MetaBackend_OTP::get_meta_key(user->get_id()), + mtime, &objv_tracker, + null_yield, dpp(), + MDLOG_STATUS_WRITE, + [&] { + return static_cast(driver)->svc()->cls->mfa.create_mfa(dpp(), user->get_id(), config, &objv_tracker, mtime, null_yield); + }); + if (ret < 0) { + cerr << "MFA creation failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWUserInfo& user_info = user_op.get_user_info(); + user_info.mfa_ids.insert(totp_serial); + user_op.set_mfa_ids(user_info.mfa_ids); + string err; + ret = ruser.modify(dpp(), user_op, null_yield, &err); + if (ret < 0) { + cerr << "ERROR: failed storing user info, error: " << err << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::MFA_REMOVE) { + if (rgw::sal::User::empty(user)) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + real_time mtime = real_clock::now(); + + int ret = static_cast(driver)->ctl()->meta.mgr->mutate(RGWSI_MetaBackend_OTP::get_meta_key(user->get_id()), + mtime, &objv_tracker, + null_yield, dpp(), + MDLOG_STATUS_WRITE, + [&] { + return static_cast(driver)->svc()->cls->mfa.remove_mfa(dpp(), user->get_id(), totp_serial, &objv_tracker, mtime, null_yield); + }); + if (ret < 0) { + cerr << "MFA removal failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWUserInfo& user_info = user_op.get_user_info(); + user_info.mfa_ids.erase(totp_serial); + user_op.set_mfa_ids(user_info.mfa_ids); + string err; + ret = ruser.modify(dpp(), user_op, null_yield, &err); + if (ret < 0) { + cerr << "ERROR: failed storing user info, error: " << err << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::MFA_GET) { + if 
(rgw::sal::User::empty(user)) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + rados::cls::otp::otp_info_t result; + int ret = static_cast(driver)->svc()->cls->mfa.get_mfa(dpp(), user->get_id(), totp_serial, &result, null_yield); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) { + cerr << "MFA serial id not found" << std::endl; + } else { + cerr << "MFA retrieval failed, error: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + formatter->open_object_section("result"); + encode_json("entry", result, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::MFA_LIST) { + if (rgw::sal::User::empty(user)) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + list result; + int ret = static_cast(driver)->svc()->cls->mfa.list_mfa(dpp(), user->get_id(), &result, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "MFA listing failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("result"); + encode_json("entries", result, formatter.get()); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT::MFA_CHECK) { + if (rgw::sal::User::empty(user)) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + if (totp_pin.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-pin)" << std::endl; + return EINVAL; + } + + list result; + int ret = static_cast(driver)->svc()->cls->mfa.check_mfa(dpp(), user->get_id(), totp_serial, totp_pin.front(), null_yield); + if (ret < 0) { + cerr << "MFA 
check failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + cout << "ok" << std::endl; + } + + if (opt_cmd == OPT::MFA_RESYNC) { + if (rgw::sal::User::empty(user)) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + if (totp_pin.size() != 2) { + cerr << "ERROR: missing two --totp-pin params (--totp-pin= --totp-pin=)" << std::endl; + return EINVAL; + } + + rados::cls::otp::otp_info_t config; + int ret = static_cast(driver)->svc()->cls->mfa.get_mfa(dpp(), user->get_id(), totp_serial, &config, null_yield); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) { + cerr << "MFA serial id not found" << std::endl; + } else { + cerr << "MFA retrieval failed, error: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + + ceph::real_time now; + + ret = static_cast(driver)->svc()->cls->mfa.otp_get_current_time(dpp(), user->get_id(), &now, null_yield); + if (ret < 0) { + cerr << "ERROR: failed to fetch current time from osd: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + time_t time_ofs; + + ret = scan_totp(driver->ctx(), now, config, totp_pin, &time_ofs); + if (ret < 0) { + if (ret == -ENOENT) { + cerr << "failed to resync, TOTP values not found in range" << std::endl; + } else { + cerr << "ERROR: failed to scan for TOTP values: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + + config.time_ofs = time_ofs; + + /* now update the backend */ + real_time mtime = real_clock::now(); + + ret = static_cast(driver)->ctl()->meta.mgr->mutate(RGWSI_MetaBackend_OTP::get_meta_key(user->get_id()), + mtime, &objv_tracker, + null_yield, dpp(), + MDLOG_STATUS_WRITE, + [&] { + return static_cast(driver)->svc()->cls->mfa.create_mfa(dpp(), user->get_id(), config, &objv_tracker, mtime, null_yield); + }); + if (ret < 0) { + cerr << "MFA update 
failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + } + + if (opt_cmd == OPT::RESHARD_STALE_INSTANCES_LIST) { + if (!static_cast(driver)->svc()->zone->can_reshard() && !yes_i_really_mean_it) { + cerr << "Resharding disabled in a multisite env, stale instances unlikely from resharding" << std::endl; + cerr << "These instances may not be safe to delete." << std::endl; + cerr << "Use --yes-i-really-mean-it to force displaying these instances." << std::endl; + return EINVAL; + } + + ret = RGWBucketAdminOp::list_stale_instances(driver, bucket_op, stream_flusher, dpp()); + if (ret < 0) { + cerr << "ERROR: listing stale instances" << cpp_strerror(-ret) << std::endl; + } + } + + if (opt_cmd == OPT::RESHARD_STALE_INSTANCES_DELETE) { + if (!static_cast(driver)->svc()->zone->can_reshard()) { + cerr << "Resharding disabled in a multisite env. Stale instances are not safe to be deleted." << std::endl; + return EINVAL; + } + + ret = RGWBucketAdminOp::clear_stale_instances(driver, bucket_op, stream_flusher, dpp()); + if (ret < 0) { + cerr << "ERROR: deleting stale instances" << cpp_strerror(-ret) << std::endl; + } + } + + if (opt_cmd == OPT::PUBSUB_NOTIFICATION_LIST) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl; + return EINVAL; + } + + RGWPubSub ps(driver, tenant); + + rgw_pubsub_bucket_topics result; + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + const RGWPubSub::Bucket b(ps, bucket.get()); + ret = b.get_topics(dpp(), result, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("result", result, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::PUBSUB_TOPIC_LIST) { + RGWPubSub ps(driver, tenant); + + rgw_pubsub_topics 
result; + int ret = ps.get_topics(dpp(), result, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("result", result, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::PUBSUB_TOPIC_GET) { + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + + RGWPubSub ps(driver, tenant); + + rgw_pubsub_topic topic; + ret = ps.get_topic(dpp(), topic_name, topic, null_yield); + if (ret < 0) { + cerr << "ERROR: could not get topic: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("topic", topic, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::PUBSUB_NOTIFICATION_GET) { + if (notification_id.empty()) { + cerr << "ERROR: notification-id was not provided (via --notification-id)" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl; + return EINVAL; + } + + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWPubSub ps(driver, tenant); + + rgw_pubsub_bucket_topics bucket_topics; + const RGWPubSub::Bucket b(ps, bucket.get()); + ret = b.get_topics(dpp(), bucket_topics, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: could not get bucket notifications: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + rgw_pubsub_topic_filter bucket_topic; + ret = b.get_notification_by_id(dpp(), notification_id, bucket_topic, null_yield); + if (ret < 0) { + cerr << "ERROR: could not get notification: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("notification", bucket_topic, formatter.get()); + formatter->flush(cout); + } + + if (opt_cmd == OPT::PUBSUB_TOPIC_RM) { + if 
(topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + + ret = rgw::notify::remove_persistent_topic(dpp(), static_cast(driver)->getRados()->get_notif_pool_ctx(), topic_name, null_yield); + if (ret < 0) { + cerr << "ERROR: could not remove persistent topic: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWPubSub ps(driver, tenant); + + ret = ps.remove_topic(dpp(), topic_name, null_yield); + if (ret < 0) { + cerr << "ERROR: could not remove topic: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT::PUBSUB_NOTIFICATION_RM) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl; + return EINVAL; + } + + int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWPubSub ps(driver, tenant); + + rgw_pubsub_bucket_topics bucket_topics; + const RGWPubSub::Bucket b(ps, bucket.get()); + ret = b.get_topics(dpp(), bucket_topics, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: could not get bucket notifications: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + rgw_pubsub_topic_filter bucket_topic; + if(notification_id.empty()) { + ret = b.remove_notifications(dpp(), null_yield); + } else { + ret = b.remove_notification_by_id(dpp(), notification_id, null_yield); + } + } + + if (opt_cmd == OPT::SCRIPT_PUT) { + if (!str_script_ctx) { + cerr << "ERROR: context was not provided (via --context)" << std::endl; + return EINVAL; + } + if (infile.empty()) { + cerr << "ERROR: infile was not provided (via --infile)" << std::endl; + return EINVAL; + } + bufferlist bl; + auto rc = read_input(infile, bl); + if (rc < 0) { + cerr << "ERROR: failed to read script: '" << infile << "'. 
error: " << rc << std::endl; + return -rc; + } + const std::string script = bl.to_str(); + std::string err_msg; + if (!rgw::lua::verify(script, err_msg)) { + cerr << "ERROR: script: '" << infile << "' has error: " << std::endl << err_msg << std::endl; + return EINVAL; + } + const rgw::lua::context script_ctx = rgw::lua::to_context(*str_script_ctx); + if (script_ctx == rgw::lua::context::none) { + cerr << "ERROR: invalid script context: " << *str_script_ctx << ". must be one of: " << LUA_CONTEXT_LIST << std::endl; + return EINVAL; + } + if (script_ctx == rgw::lua::context::background && !tenant.empty()) { + cerr << "ERROR: cannot specify tenant in background context" << std::endl; + return EINVAL; + } + auto lua_manager = driver->get_lua_manager(); + rc = rgw::lua::write_script(dpp(), lua_manager.get(), tenant, null_yield, script_ctx, script); + if (rc < 0) { + cerr << "ERROR: failed to put script. error: " << rc << std::endl; + return -rc; + } + } + + if (opt_cmd == OPT::SCRIPT_GET) { + if (!str_script_ctx) { + cerr << "ERROR: context was not provided (via --context)" << std::endl; + return EINVAL; + } + const rgw::lua::context script_ctx = rgw::lua::to_context(*str_script_ctx); + if (script_ctx == rgw::lua::context::none) { + cerr << "ERROR: invalid script context: " << *str_script_ctx << ". must be one of: " << LUA_CONTEXT_LIST << std::endl; + return EINVAL; + } + auto lua_manager = driver->get_lua_manager(); + std::string script; + const auto rc = rgw::lua::read_script(dpp(), lua_manager.get(), tenant, null_yield, script_ctx, script); + if (rc == -ENOENT) { + std::cout << "no script exists for context: " << *str_script_ctx << + (tenant.empty() ? "" : (" in tenant: " + tenant)) << std::endl; + } else if (rc < 0) { + cerr << "ERROR: failed to read script. 
error: " << rc << std::endl; + return -rc; + } else { + std::cout << script << std::endl; + } + } + + if (opt_cmd == OPT::SCRIPT_RM) { + if (!str_script_ctx) { + cerr << "ERROR: context was not provided (via --context)" << std::endl; + return EINVAL; + } + const rgw::lua::context script_ctx = rgw::lua::to_context(*str_script_ctx); + if (script_ctx == rgw::lua::context::none) { + cerr << "ERROR: invalid script context: " << *str_script_ctx << ". must be one of: " << LUA_CONTEXT_LIST << std::endl; + return EINVAL; + } + auto lua_manager = driver->get_lua_manager(); + const auto rc = rgw::lua::delete_script(dpp(), lua_manager.get(), tenant, null_yield, script_ctx); + if (rc < 0) { + cerr << "ERROR: failed to remove script. error: " << rc << std::endl; + return -rc; + } + } + + if (opt_cmd == OPT::SCRIPT_PACKAGE_ADD) { +#ifdef WITH_RADOSGW_LUA_PACKAGES + if (!script_package) { + cerr << "ERROR: lua package name was not provided (via --package)" << std::endl; + return EINVAL; + } + const auto rc = rgw::lua::add_package(dpp(), driver, null_yield, *script_package, bool(allow_compilation)); + if (rc < 0) { + cerr << "ERROR: failed to add lua package: " << script_package << " .error: " << rc << std::endl; + return -rc; + } +#else + cerr << "ERROR: adding lua packages is not permitted" << std::endl; + return EPERM; +#endif + } + + if (opt_cmd == OPT::SCRIPT_PACKAGE_RM) { +#ifdef WITH_RADOSGW_LUA_PACKAGES + if (!script_package) { + cerr << "ERROR: lua package name was not provided (via --package)" << std::endl; + return EINVAL; + } + const auto rc = rgw::lua::remove_package(dpp(), driver, null_yield, *script_package); + if (rc == -ENOENT) { + cerr << "WARNING: package " << script_package << " did not exists or already removed" << std::endl; + return 0; + } + if (rc < 0) { + cerr << "ERROR: failed to remove lua package: " << script_package << " .error: " << rc << std::endl; + return -rc; + } +#else + cerr << "ERROR: removing lua packages in not permitted" << std::endl; + 
return EPERM; +#endif + } + + if (opt_cmd == OPT::SCRIPT_PACKAGE_LIST) { +#ifdef WITH_RADOSGW_LUA_PACKAGES + rgw::lua::packages_t packages; + const auto rc = rgw::lua::list_packages(dpp(), driver, null_yield, packages); + if (rc == -ENOENT) { + std::cout << "no lua packages in allowlist" << std::endl; + } else if (rc < 0) { + cerr << "ERROR: failed to read lua packages allowlist. error: " << rc << std::endl; + return rc; + } else { + for (const auto& package : packages) { + std::cout << package << std::endl; + } + } +#else + cerr << "ERROR: listing lua packages in not permitted" << std::endl; + return EPERM; +#endif + } + + return 0; +} + diff --git a/src/rgw/rgw_aio.cc b/src/rgw/rgw_aio.cc new file mode 100644 index 000000000..4fba513b8 --- /dev/null +++ b/src/rgw/rgw_aio.cc @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include "include/rados/librados.hpp" +#include "librados/librados_asio.h" + +#include "rgw_aio.h" +#include "rgw_d3n_cacherequest.h" + +namespace rgw { + +namespace { + +void cb(librados::completion_t, void* arg); + +struct state { + Aio* aio; + librados::AioCompletion* c; + + state(Aio* aio, AioResult& r) + : aio(aio), + c(librados::Rados::aio_create_completion(&r, &cb)) {} +}; + +void cb(librados::completion_t, void* arg) { + static_assert(sizeof(AioResult::user_data) >= sizeof(state)); + static_assert(std::is_trivially_destructible_v); + auto& r = *(static_cast(arg)); + auto s = reinterpret_cast(&r.user_data); + r.result = s->c->get_return_value(); + s->c->release(); + s->aio->put(r); +} + +template +Aio::OpFunc aio_abstract(Op&& op) { + return [op = std::move(op)] (Aio* aio, AioResult& r) mutable { + constexpr bool read = std::is_same_v, librados::ObjectReadOperation>; + auto s = new (&r.user_data) state(aio, r); + if constexpr (read) { + r.result = r.obj.aio_operate(s->c, &op, &r.data); + } else { + r.result = r.obj.aio_operate(s->c, &op); + } + if (r.result < 0) { + s->c->release(); + aio->put(r); + } + }; +} + +struct Handler { + Aio* throttle = nullptr; + AioResult& r; + // write callback + void operator()(boost::system::error_code ec) const { + r.result = -ec.value(); + throttle->put(r); + } + // read callback + void operator()(boost::system::error_code ec, bufferlist bl) const { + r.result = -ec.value(); + r.data = std::move(bl); + throttle->put(r); + } +}; + +template +Aio::OpFunc aio_abstract(Op&& op, boost::asio::io_context& context, + yield_context yield) { + return [op = std::move(op), &context, yield] (Aio* aio, AioResult& r) mutable { + // arrange for the completion Handler to run on the yield_context's strand + // executor so it can safely call back into Aio without locking + using namespace boost::asio; + async_completion init(yield); + auto ex = get_associated_executor(init.completion_handler); + + auto& ref = 
r.obj.get_ref(); + librados::async_operate(context, ref.pool.ioctx(), ref.obj.oid, &op, 0, + bind_executor(ex, Handler{aio, r})); + }; +} + + +Aio::OpFunc d3n_cache_aio_abstract(const DoutPrefixProvider *dpp, optional_yield y, off_t read_ofs, off_t read_len, std::string& cache_location) { + return [dpp, y, read_ofs, read_len, cache_location] (Aio* aio, AioResult& r) mutable { + // d3n data cache requires yield context (rgw_beast_enable_async=true) + ceph_assert(y); + auto& ref = r.obj.get_ref(); + auto c = std::make_unique(); + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: d3n_cache_aio_abstract(): libaio Read From Cache, oid=" << ref.obj.oid << dendl; + c->file_aio_read_abstract(dpp, y.get_io_context(), y.get_yield_context(), cache_location, read_ofs, read_len, aio, r); + }; +} + + +template +Aio::OpFunc aio_abstract(Op&& op, optional_yield y) { + static_assert(std::is_base_of_v>); + static_assert(!std::is_lvalue_reference_v); + static_assert(!std::is_const_v); + if (y) { + return aio_abstract(std::forward(op), y.get_io_context(), + y.get_yield_context()); + } + return aio_abstract(std::forward(op)); +} + +} // anonymous namespace + +Aio::OpFunc Aio::librados_op(librados::ObjectReadOperation&& op, + optional_yield y) { + return aio_abstract(std::move(op), y); +} +Aio::OpFunc Aio::librados_op(librados::ObjectWriteOperation&& op, + optional_yield y) { + return aio_abstract(std::move(op), y); +} + +Aio::OpFunc Aio::d3n_cache_op(const DoutPrefixProvider *dpp, optional_yield y, + off_t read_ofs, off_t read_len, std::string& cache_location) { + return d3n_cache_aio_abstract(dpp, y, read_ofs, read_len, cache_location); +} + +} // namespace rgw diff --git a/src/rgw/rgw_aio.h b/src/rgw/rgw_aio.h new file mode 100644 index 000000000..a2c539c17 --- /dev/null +++ b/src/rgw/rgw_aio.h @@ -0,0 +1,104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + 
* Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include + +#include +#include "include/rados/librados_fwd.hpp" +#include "common/async/yield_context.h" + +#include "services/svc_rados.h" // cant forward declare RGWSI_RADOS::Obj + +#include "rgw_common.h" + +#include "include/function2.hpp" + +struct D3nGetObjData; + +namespace rgw { + +struct AioResult { + RGWSI_RADOS::Obj obj; + uint64_t id = 0; // id allows caller to associate a result with its request + bufferlist data; // result buffer for reads + int result = 0; + std::aligned_storage_t<3 * sizeof(void*)> user_data; + + AioResult() = default; + AioResult(const AioResult&) = delete; + AioResult& operator =(const AioResult&) = delete; + AioResult(AioResult&&) = delete; + AioResult& operator =(AioResult&&) = delete; +}; +struct AioResultEntry : AioResult, boost::intrusive::list_base_hook<> { + virtual ~AioResultEntry() {} +}; +// a list of polymorphic entries that frees them on destruction +template +struct OwningList : boost::intrusive::list { + OwningList() = default; + ~OwningList() { this->clear_and_dispose(std::default_delete{}); } + OwningList(OwningList&&) = default; + OwningList& operator=(OwningList&&) = default; + OwningList(const OwningList&) = delete; + OwningList& operator=(const OwningList&) = delete; +}; +using AioResultList = OwningList; + +// returns the first error code or 0 if all succeeded +inline int check_for_errors(const AioResultList& results) { + for (auto& e : results) { + if (e.result < 0) { + return e.result; + } + } + return 0; +} + +// interface to submit async librados operations and wait on their completions. 
+// each call returns a list of results from prior completions +class Aio { + public: + using OpFunc = fu2::unique_function; + + virtual ~Aio() {} + + virtual AioResultList get(const RGWSI_RADOS::Obj& obj, + OpFunc&& f, + uint64_t cost, uint64_t id) = 0; + virtual void put(AioResult& r) = 0; + + // poll for any ready completions without waiting + virtual AioResultList poll() = 0; + + // return any ready completions. if there are none, wait for the next + virtual AioResultList wait() = 0; + + // wait for all outstanding completions and return their results + virtual AioResultList drain() = 0; + + static OpFunc librados_op(librados::ObjectReadOperation&& op, + optional_yield y); + static OpFunc librados_op(librados::ObjectWriteOperation&& op, + optional_yield y); + static OpFunc d3n_cache_op(const DoutPrefixProvider *dpp, optional_yield y, + off_t read_ofs, off_t read_len, std::string& location); +}; + +} // namespace rgw diff --git a/src/rgw/rgw_aio_throttle.cc b/src/rgw/rgw_aio_throttle.cc new file mode 100644 index 000000000..8ada6db34 --- /dev/null +++ b/src/rgw/rgw_aio_throttle.cc @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/rados/librados.hpp" + +#include "rgw_aio_throttle.h" + +namespace rgw { + +bool Throttle::waiter_ready() const +{ + switch (waiter) { + case Wait::Available: return is_available(); + case Wait::Completion: return has_completion(); + case Wait::Drained: return is_drained(); + default: return false; + } +} + +AioResultList BlockingAioThrottle::get(const RGWSI_RADOS::Obj& obj, + OpFunc&& f, + uint64_t cost, uint64_t id) +{ + auto p = std::make_unique(); + p->obj = obj; + p->id = id; + p->cost = cost; + + std::unique_lock lock{mutex}; + if (cost > window) { + p->result = -EDEADLK; // would never succeed + completed.push_back(*p); + } else { + // wait for the write size to become available + pending_size += p->cost; + if (!is_available()) { + ceph_assert(waiter == Wait::None); + waiter = Wait::Available; + cond.wait(lock, [this] { return is_available(); }); + waiter = Wait::None; + } + + // register the pending write and attach a completion + p->parent = this; + pending.push_back(*p); + lock.unlock(); + std::move(f)(this, *static_cast(p.get())); + lock.lock(); + } + p.release(); + return std::move(completed); +} + +void BlockingAioThrottle::put(AioResult& r) +{ + auto& p = static_cast(r); + std::scoped_lock lock{mutex}; + + // move from pending to completed + pending.erase(pending.iterator_to(p)); + completed.push_back(p); + + pending_size -= p.cost; + + if (waiter_ready()) { + cond.notify_one(); + } +} + +AioResultList BlockingAioThrottle::poll() +{ + std::unique_lock lock{mutex}; + return std::move(completed); +} + +AioResultList BlockingAioThrottle::wait() +{ + std::unique_lock lock{mutex}; + if (completed.empty() && !pending.empty()) { + ceph_assert(waiter == Wait::None); + waiter = Wait::Completion; + cond.wait(lock, [this] { return has_completion(); }); + waiter = Wait::None; + } + return std::move(completed); +} + +AioResultList BlockingAioThrottle::drain() +{ + std::unique_lock lock{mutex}; + if (!pending.empty()) { + 
ceph_assert(waiter == Wait::None); + waiter = Wait::Drained; + cond.wait(lock, [this] { return is_drained(); }); + waiter = Wait::None; + } + return std::move(completed); +} + +template +auto YieldingAioThrottle::async_wait(CompletionToken&& token) +{ + using boost::asio::async_completion; + using Signature = void(boost::system::error_code); + async_completion init(token); + completion = Completion::create(context.get_executor(), + std::move(init.completion_handler)); + return init.result.get(); +} + +AioResultList YieldingAioThrottle::get(const RGWSI_RADOS::Obj& obj, + OpFunc&& f, + uint64_t cost, uint64_t id) +{ + auto p = std::make_unique(); + p->obj = obj; + p->id = id; + p->cost = cost; + + if (cost > window) { + p->result = -EDEADLK; // would never succeed + completed.push_back(*p); + } else { + // wait for the write size to become available + pending_size += p->cost; + if (!is_available()) { + ceph_assert(waiter == Wait::None); + ceph_assert(!completion); + + boost::system::error_code ec; + waiter = Wait::Available; + async_wait(yield[ec]); + } + + // register the pending write and initiate the operation + pending.push_back(*p); + std::move(f)(this, *static_cast(p.get())); + } + p.release(); + return std::move(completed); +} + +void YieldingAioThrottle::put(AioResult& r) +{ + auto& p = static_cast(r); + + // move from pending to completed + pending.erase(pending.iterator_to(p)); + completed.push_back(p); + + pending_size -= p.cost; + + if (waiter_ready()) { + ceph_assert(completion); + ceph::async::post(std::move(completion), boost::system::error_code{}); + waiter = Wait::None; + } +} + +AioResultList YieldingAioThrottle::poll() +{ + return std::move(completed); +} + +AioResultList YieldingAioThrottle::wait() +{ + if (!has_completion() && !pending.empty()) { + ceph_assert(waiter == Wait::None); + ceph_assert(!completion); + + boost::system::error_code ec; + waiter = Wait::Completion; + async_wait(yield[ec]); + } + return std::move(completed); +} + 
+AioResultList YieldingAioThrottle::drain() +{ + if (!is_drained()) { + ceph_assert(waiter == Wait::None); + ceph_assert(!completion); + + boost::system::error_code ec; + waiter = Wait::Drained; + async_wait(yield[ec]); + } + return std::move(completed); +} +} // namespace rgw diff --git a/src/rgw/rgw_aio_throttle.h b/src/rgw/rgw_aio_throttle.h new file mode 100644 index 000000000..30ae93cd6 --- /dev/null +++ b/src/rgw/rgw_aio_throttle.h @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "include/rados/librados_fwd.hpp" +#include +#include "common/ceph_mutex.h" +#include "common/async/completion.h" +#include "common/async/yield_context.h" +#include "services/svc_rados.h" +#include "rgw_aio.h" + +namespace rgw { + +class Throttle { + protected: + const uint64_t window; + uint64_t pending_size = 0; + + AioResultList pending; + AioResultList completed; + + bool is_available() const { return pending_size <= window; } + bool has_completion() const { return !completed.empty(); } + bool is_drained() const { return pending.empty(); } + + enum class Wait { None, Available, Completion, Drained }; + Wait waiter = Wait::None; + + bool waiter_ready() const; + + public: + Throttle(uint64_t window) : window(window) {} + + virtual ~Throttle() { + // must drain before destructing + ceph_assert(pending.empty()); + ceph_assert(completed.empty()); + } +}; + +// a throttle for aio operations. 
all public functions must be called from +// the same thread +class BlockingAioThrottle final : public Aio, private Throttle { + ceph::mutex mutex = ceph::make_mutex("AioThrottle"); + ceph::condition_variable cond; + + struct Pending : AioResultEntry { + BlockingAioThrottle *parent = nullptr; + uint64_t cost = 0; + librados::AioCompletion *completion = nullptr; + }; + public: + BlockingAioThrottle(uint64_t window) : Throttle(window) {} + + virtual ~BlockingAioThrottle() override {}; + + AioResultList get(const RGWSI_RADOS::Obj& obj, OpFunc&& f, + uint64_t cost, uint64_t id) override final; + + void put(AioResult& r) override final; + + AioResultList poll() override final; + + AioResultList wait() override final; + + AioResultList drain() override final; +}; + +// a throttle that yields the coroutine instead of blocking. all public +// functions must be called within the coroutine strand +class YieldingAioThrottle final : public Aio, private Throttle { + boost::asio::io_context& context; + yield_context yield; + struct Handler; + + // completion callback associated with the waiter + using Completion = ceph::async::Completion; + std::unique_ptr completion; + + template + auto async_wait(CompletionToken&& token); + + struct Pending : AioResultEntry { uint64_t cost = 0; }; + + public: + YieldingAioThrottle(uint64_t window, boost::asio::io_context& context, + yield_context yield) + : Throttle(window), context(context), yield(yield) + {} + + virtual ~YieldingAioThrottle() override {}; + + AioResultList get(const RGWSI_RADOS::Obj& obj, OpFunc&& f, + uint64_t cost, uint64_t id) override final; + + void put(AioResult& r) override final; + + AioResultList poll() override final; + + AioResultList wait() override final; + + AioResultList drain() override final; +}; + +// return a smart pointer to Aio +inline auto make_throttle(uint64_t window_size, optional_yield y) +{ + std::unique_ptr aio; + if (y) { + aio = std::make_unique(window_size, + y.get_io_context(), + 
y.get_yield_context()); + } else { + aio = std::make_unique(window_size); + } + return aio; +} + +} // namespace rgw diff --git a/src/rgw/rgw_amqp.cc b/src/rgw/rgw_amqp.cc new file mode 100644 index 000000000..3014edd1d --- /dev/null +++ b/src/rgw/rgw_amqp.cc @@ -0,0 +1,1051 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_amqp.h" +#include +#include +#include +#include +#include "include/ceph_assert.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/dout.h" +#include + +#define dout_subsys ceph_subsys_rgw + +// TODO investigation, not necessarily issues: +// (1) in case of single threaded writer context use spsc_queue +// (2) support multiple channels +// (3) check performance of emptying queue to local list, and go over the list and publish +// (4) use std::shared_mutex (c++17) or equivalent for the connections lock + +namespace rgw::amqp { + +// RGW AMQP status codes for publishing +static const int RGW_AMQP_STATUS_BROKER_NACK = -0x1001; +static const int RGW_AMQP_STATUS_CONNECTION_CLOSED = -0x1002; +static const int RGW_AMQP_STATUS_QUEUE_FULL = -0x1003; +static const int RGW_AMQP_STATUS_MAX_INFLIGHT = -0x1004; +static const int RGW_AMQP_STATUS_MANAGER_STOPPED = -0x1005; +// RGW AMQP status code for connection opening +static const int RGW_AMQP_STATUS_CONN_ALLOC_FAILED = -0x2001; +static const int RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED = -0x2002; +static const int RGW_AMQP_STATUS_SOCKET_OPEN_FAILED = -0x2003; +static const int RGW_AMQP_STATUS_LOGIN_FAILED = -0x2004; +static const int RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED = -0x2005; +static const int RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED = -0x2006; +static const int RGW_AMQP_STATUS_Q_DECLARE_FAILED = -0x2007; +static const int RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED = -0x2008; +static const int RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED = -0x2009; +static const int 
RGW_AMQP_STATUS_SOCKET_CACERT_FAILED = -0x2010; + +static const int RGW_AMQP_RESPONSE_SOCKET_ERROR = -0x3008; +static const int RGW_AMQP_NO_REPLY_CODE = 0x0; + +// the amqp_connection_info struct does not hold any memory and just points to the URL string +// so, strings are copied into connection_id_t +connection_id_t::connection_id_t(const amqp_connection_info& info, const std::string& _exchange) + : host(info.host), port(info.port), vhost(info.vhost), exchange(_exchange), ssl(info.ssl) {} + +// equality operator and hasher functor are needed +// so that connection_id_t could be used as key in unordered_map +bool operator==(const connection_id_t& lhs, const connection_id_t& rhs) { + return lhs.host == rhs.host && lhs.port == rhs.port && + lhs.vhost == rhs.vhost && lhs.exchange == rhs.exchange; +} + +struct connection_id_hasher { + std::size_t operator()(const connection_id_t& k) const { + std::size_t h = 0; + boost::hash_combine(h, k.host); + boost::hash_combine(h, k.port); + boost::hash_combine(h, k.vhost); + boost::hash_combine(h, k.exchange); + return h; + } +}; + +std::string to_string(const connection_id_t& id) { + return fmt::format("{}://{}:{}{}?exchange={}", + id.ssl ? 
"amqps" : "amqp", + id.host, id.port, id.vhost, id.exchange); +} + +// automatically cleans amqp state when gets out of scope +class ConnectionCleaner { + private: + amqp_connection_state_t state; + public: + ConnectionCleaner(amqp_connection_state_t _state) : state(_state) {} + ~ConnectionCleaner() { + if (state) { + amqp_destroy_connection(state); + } + } + // call reset() if cleanup is not needed anymore + void reset() { + state = nullptr; + } +}; + +// struct for holding the callback and its tag in the callback list +struct reply_callback_with_tag_t { + uint64_t tag; + reply_callback_t cb; + + reply_callback_with_tag_t(uint64_t _tag, reply_callback_t _cb) : tag(_tag), cb(_cb) {} + + bool operator==(uint64_t rhs) { + return tag == rhs; + } +}; + +typedef std::vector CallbackList; + +// struct for holding the connection state object as well as the exchange +struct connection_t { + CephContext* cct = nullptr; + amqp_connection_state_t state = nullptr; + amqp_bytes_t reply_to_queue = amqp_empty_bytes; + uint64_t delivery_tag = 1; + int status = AMQP_STATUS_OK; + int reply_type = AMQP_RESPONSE_NORMAL; + int reply_code = RGW_AMQP_NO_REPLY_CODE; + CallbackList callbacks; + ceph::coarse_real_clock::time_point next_reconnect = ceph::coarse_real_clock::now(); + bool mandatory = false; + const bool use_ssl = false; + std::string user; + std::string password; + bool verify_ssl = true; + boost::optional ca_location; + utime_t timestamp = ceph_clock_now(); + + connection_t(CephContext* _cct, const amqp_connection_info& info, bool _verify_ssl, boost::optional _ca_location) : + cct(_cct), use_ssl(info.ssl), user(info.user), password(info.password), verify_ssl(_verify_ssl), ca_location(_ca_location) {} + + // cleanup of all internal connection resource + // the object can still remain, and internal connection + // resources created again on successful reconnection + void destroy(int s) { + status = s; + ConnectionCleaner clean_state(state); + state = nullptr; + 
amqp_bytes_free(reply_to_queue); + reply_to_queue = amqp_empty_bytes; + // fire all remaining callbacks + std::for_each(callbacks.begin(), callbacks.end(), [this](auto& cb_tag) { + cb_tag.cb(status); + ldout(cct, 20) << "AMQP destroy: invoking callback with tag=" << cb_tag.tag << dendl; + }); + callbacks.clear(); + delivery_tag = 1; + } + + bool is_ok() const { + return (state != nullptr); + } + + // dtor also destroys the internals + ~connection_t() { + destroy(RGW_AMQP_STATUS_CONNECTION_CLOSED); + } +}; + +// convert connection info to string +std::string to_string(const amqp_connection_info& info) { + std::stringstream ss; + ss << "connection info:" << + "\nHost: " << info.host << + "\nPort: " << info.port << + "\nUser: " << info.user << + "\nPassword: " << info.password << + "\nvhost: " << info.vhost << + "\nSSL support: " << info.ssl << std::endl; + return ss.str(); +} + +// convert reply to error code +int reply_to_code(const amqp_rpc_reply_t& reply) { + switch (reply.reply_type) { + case AMQP_RESPONSE_NONE: + case AMQP_RESPONSE_NORMAL: + return RGW_AMQP_NO_REPLY_CODE; + case AMQP_RESPONSE_LIBRARY_EXCEPTION: + return reply.library_error; + case AMQP_RESPONSE_SERVER_EXCEPTION: + if (reply.reply.decoded) { + const amqp_connection_close_t* m = (amqp_connection_close_t*)reply.reply.decoded; + return m->reply_code; + } + return reply.reply.id; + } + return RGW_AMQP_NO_REPLY_CODE; +} + +// convert reply to string +std::string to_string(const amqp_rpc_reply_t& reply) { + std::stringstream ss; + switch (reply.reply_type) { + case AMQP_RESPONSE_NORMAL: + return ""; + case AMQP_RESPONSE_NONE: + return "missing RPC reply type"; + case AMQP_RESPONSE_LIBRARY_EXCEPTION: + return amqp_error_string2(reply.library_error); + case AMQP_RESPONSE_SERVER_EXCEPTION: + { + switch (reply.reply.id) { + case AMQP_CONNECTION_CLOSE_METHOD: + ss << "server connection error: "; + break; + case AMQP_CHANNEL_CLOSE_METHOD: + ss << "server channel error: "; + break; + default: + ss << "server 
unknown error: "; + break; + } + if (reply.reply.decoded) { + amqp_connection_close_t* m = (amqp_connection_close_t*)reply.reply.decoded; + ss << m->reply_code << " text: " << std::string((char*)m->reply_text.bytes, m->reply_text.len); + } + return ss.str(); + } + default: + ss << "unknown error, method id: " << reply.reply.id; + return ss.str(); + } +} + +// convert status enum to string +std::string to_string(amqp_status_enum s) { + switch (s) { + case AMQP_STATUS_OK: + return "AMQP_STATUS_OK"; + case AMQP_STATUS_NO_MEMORY: + return "AMQP_STATUS_NO_MEMORY"; + case AMQP_STATUS_BAD_AMQP_DATA: + return "AMQP_STATUS_BAD_AMQP_DATA"; + case AMQP_STATUS_UNKNOWN_CLASS: + return "AMQP_STATUS_UNKNOWN_CLASS"; + case AMQP_STATUS_UNKNOWN_METHOD: + return "AMQP_STATUS_UNKNOWN_METHOD"; + case AMQP_STATUS_HOSTNAME_RESOLUTION_FAILED: + return "AMQP_STATUS_HOSTNAME_RESOLUTION_FAILED"; + case AMQP_STATUS_INCOMPATIBLE_AMQP_VERSION: + return "AMQP_STATUS_INCOMPATIBLE_AMQP_VERSION"; + case AMQP_STATUS_CONNECTION_CLOSED: + return "AMQP_STATUS_CONNECTION_CLOSED"; + case AMQP_STATUS_BAD_URL: + return "AMQP_STATUS_BAD_URL"; + case AMQP_STATUS_SOCKET_ERROR: + return "AMQP_STATUS_SOCKET_ERROR"; + case AMQP_STATUS_INVALID_PARAMETER: + return "AMQP_STATUS_INVALID_PARAMETER"; + case AMQP_STATUS_TABLE_TOO_BIG: + return "AMQP_STATUS_TABLE_TOO_BIG"; + case AMQP_STATUS_WRONG_METHOD: + return "AMQP_STATUS_WRONG_METHOD"; + case AMQP_STATUS_TIMEOUT: + return "AMQP_STATUS_TIMEOUT"; + case AMQP_STATUS_TIMER_FAILURE: + return "AMQP_STATUS_TIMER_FAILURE"; + case AMQP_STATUS_HEARTBEAT_TIMEOUT: + return "AMQP_STATUS_HEARTBEAT_TIMEOUT"; + case AMQP_STATUS_UNEXPECTED_STATE: + return "AMQP_STATUS_UNEXPECTED_STATE"; + case AMQP_STATUS_SOCKET_CLOSED: + return "AMQP_STATUS_SOCKET_CLOSED"; + case AMQP_STATUS_SOCKET_INUSE: + return "AMQP_STATUS_SOCKET_INUSE"; + case AMQP_STATUS_BROKER_UNSUPPORTED_SASL_METHOD: + return "AMQP_STATUS_BROKER_UNSUPPORTED_SASL_METHOD"; +#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 8, 0, 0) 
+ case AMQP_STATUS_UNSUPPORTED: + return "AMQP_STATUS_UNSUPPORTED"; +#endif + case _AMQP_STATUS_NEXT_VALUE: + return "AMQP_STATUS_INTERNAL"; + case AMQP_STATUS_TCP_ERROR: + return "AMQP_STATUS_TCP_ERROR"; + case AMQP_STATUS_TCP_SOCKETLIB_INIT_ERROR: + return "AMQP_STATUS_TCP_SOCKETLIB_INIT_ERROR"; + case _AMQP_STATUS_TCP_NEXT_VALUE: + return "AMQP_STATUS_INTERNAL"; + case AMQP_STATUS_SSL_ERROR: + return "AMQP_STATUS_SSL_ERROR"; + case AMQP_STATUS_SSL_HOSTNAME_VERIFY_FAILED: + return "AMQP_STATUS_SSL_HOSTNAME_VERIFY_FAILED"; + case AMQP_STATUS_SSL_PEER_VERIFY_FAILED: + return "AMQP_STATUS_SSL_PEER_VERIFY_FAILED"; + case AMQP_STATUS_SSL_CONNECTION_FAILED: + return "AMQP_STATUS_SSL_CONNECTION_FAILED"; + case _AMQP_STATUS_SSL_NEXT_VALUE: + return "AMQP_STATUS_INTERNAL"; +#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 11, 0, 0) + case AMQP_STATUS_SSL_SET_ENGINE_FAILED: + return "AMQP_STATUS_SSL_SET_ENGINE_FAILED"; +#endif + default: + return "AMQP_STATUS_UNKNOWN"; + } +} + +// TODO: add status_to_string on the connection object to prinf full status + +// convert int status to string - including RGW specific values +std::string status_to_string(int s) { + switch (s) { + case RGW_AMQP_STATUS_BROKER_NACK: + return "RGW_AMQP_STATUS_BROKER_NACK"; + case RGW_AMQP_STATUS_CONNECTION_CLOSED: + return "RGW_AMQP_STATUS_CONNECTION_CLOSED"; + case RGW_AMQP_STATUS_QUEUE_FULL: + return "RGW_AMQP_STATUS_QUEUE_FULL"; + case RGW_AMQP_STATUS_MAX_INFLIGHT: + return "RGW_AMQP_STATUS_MAX_INFLIGHT"; + case RGW_AMQP_STATUS_MANAGER_STOPPED: + return "RGW_AMQP_STATUS_MANAGER_STOPPED"; + case RGW_AMQP_STATUS_CONN_ALLOC_FAILED: + return "RGW_AMQP_STATUS_CONN_ALLOC_FAILED"; + case RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED: + return "RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED"; + case RGW_AMQP_STATUS_SOCKET_OPEN_FAILED: + return "RGW_AMQP_STATUS_SOCKET_OPEN_FAILED"; + case RGW_AMQP_STATUS_LOGIN_FAILED: + return "RGW_AMQP_STATUS_LOGIN_FAILED"; + case RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED: + return 
"RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED"; + case RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED: + return "RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED"; + case RGW_AMQP_STATUS_Q_DECLARE_FAILED: + return "RGW_AMQP_STATUS_Q_DECLARE_FAILED"; + case RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED: + return "RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED"; + case RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED: + return "RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED"; + case RGW_AMQP_STATUS_SOCKET_CACERT_FAILED: + return "RGW_AMQP_STATUS_SOCKET_CACERT_FAILED"; + } + return to_string((amqp_status_enum)s); +} + +// check the result from calls and return if error (=null) +#define RETURN_ON_ERROR(C, S, OK) \ + if (!OK) { \ + C->status = S; \ + return false; \ + } + +// in case of RPC calls, getting the RPC reply and return if an error is detected +#define RETURN_ON_REPLY_ERROR(C, ST, S) { \ + const auto reply = amqp_get_rpc_reply(ST); \ + if (reply.reply_type != AMQP_RESPONSE_NORMAL) { \ + C->status = S; \ + C->reply_type = reply.reply_type; \ + C->reply_code = reply_to_code(reply); \ + return false; \ + } \ + } + +static const amqp_channel_t CHANNEL_ID = 1; +static const amqp_channel_t CONFIRMING_CHANNEL_ID = 2; + +// utility function to create a connection, when the connection object already exists +bool new_state(connection_t* conn, const connection_id_t& conn_id) { + // state must be null at this point + ceph_assert(!conn->state); + // reset all status codes + conn->status = AMQP_STATUS_OK; + conn->reply_type = AMQP_RESPONSE_NORMAL; + conn->reply_code = RGW_AMQP_NO_REPLY_CODE; + + auto state = amqp_new_connection(); + if (!state) { + conn->status = RGW_AMQP_STATUS_CONN_ALLOC_FAILED; + return false; + } + // make sure that the connection state is cleaned up in case of error + ConnectionCleaner state_guard(state); + + // create and open socket + amqp_socket_t *socket = nullptr; + if (conn->use_ssl) { + socket = amqp_ssl_socket_new(state); +#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 10, 0, 1) + SSL_CTX* ssl_ctx = 
reinterpret_cast(amqp_ssl_socket_get_context(socket)); +#else + // taken from https://github.com/alanxz/rabbitmq-c/pull/560 + struct hack { + const struct amqp_socket_class_t *klass; + SSL_CTX *ctx; + }; + + struct hack *h = reinterpret_cast(socket); + SSL_CTX* ssl_ctx = h->ctx; +#endif + // ensure system CA certificates get loaded + SSL_CTX_set_default_verify_paths(ssl_ctx); + } + else { + socket = amqp_tcp_socket_new(state); + } + + if (!socket) { + conn->status = RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED; + return false; + } + if (conn->use_ssl) { + if (!conn->verify_ssl) { + amqp_ssl_socket_set_verify_peer(socket, 0); + amqp_ssl_socket_set_verify_hostname(socket, 0); + } + if (conn->ca_location.has_value()) { + const auto s = amqp_ssl_socket_set_cacert(socket, conn->ca_location.get().c_str()); + if (s != AMQP_STATUS_OK) { + conn->status = RGW_AMQP_STATUS_SOCKET_CACERT_FAILED; + conn->reply_code = s; + return false; + } + } + } + const auto s = amqp_socket_open(socket, conn_id.host.c_str(), conn_id.port); + if (s < 0) { + conn->status = RGW_AMQP_STATUS_SOCKET_OPEN_FAILED; + conn->reply_type = RGW_AMQP_RESPONSE_SOCKET_ERROR; + conn->reply_code = s; + return false; + } + + // login to broker + const auto reply = amqp_login(state, + conn_id.vhost.c_str(), + AMQP_DEFAULT_MAX_CHANNELS, + AMQP_DEFAULT_FRAME_SIZE, + 0, // no heartbeat TODO: add conf + AMQP_SASL_METHOD_PLAIN, // TODO: add other types of security + conn->user.c_str(), + conn->password.c_str()); + if (reply.reply_type != AMQP_RESPONSE_NORMAL) { + conn->status = RGW_AMQP_STATUS_LOGIN_FAILED; + conn->reply_type = reply.reply_type; + conn->reply_code = reply_to_code(reply); + return false; + } + + // open channels + { + const auto ok = amqp_channel_open(state, CHANNEL_ID); + RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED, ok); + RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED); + } + { + const auto ok = amqp_channel_open(state, CONFIRMING_CHANNEL_ID); + RETURN_ON_ERROR(conn, 
RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED, ok); + RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED); + } + { + const auto ok = amqp_confirm_select(state, CONFIRMING_CHANNEL_ID); + RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED, ok); + RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED); + } + + // verify that the topic exchange is there + // TODO: make this step optional + { + const auto ok = amqp_exchange_declare(state, + CHANNEL_ID, + amqp_cstring_bytes(conn_id.exchange.c_str()), + amqp_cstring_bytes("topic"), + 1, // passive - exchange must already exist on broker + 1, // durable + 0, // dont auto-delete + 0, // not internal + amqp_empty_table); + RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED, ok); + RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED); + } + { + // create queue for confirmations + const auto queue_ok = amqp_queue_declare(state, + CHANNEL_ID, // use the regular channel for this call + amqp_empty_bytes, // let broker allocate queue name + 0, // not passive - create the queue + 0, // not durable + 1, // exclusive + 1, // auto-delete + amqp_empty_table // not args TODO add args from conf: TTL, max length etc. 
+ ); + RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_Q_DECLARE_FAILED, queue_ok); + RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_Q_DECLARE_FAILED); + + // define consumption for connection + const auto consume_ok = amqp_basic_consume(state, + CONFIRMING_CHANNEL_ID, + queue_ok->queue, + amqp_empty_bytes, // broker will generate consumer tag + 1, // messages sent from client are never routed back + 1, // client does not ack thr acks + 1, // exclusive access to queue + amqp_empty_table // no parameters + ); + + RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED, consume_ok); + RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED); + // broker generated consumer_tag could be used to cancel sending of n/acks from broker - not needed + + state_guard.reset(); + conn->state = state; + conn->reply_to_queue = amqp_bytes_malloc_dup(queue_ok->queue); + } + return true; +} + +/// struct used for holding messages in the message queue +struct message_wrapper_t { + connection_id_t conn_id; + std::string topic; + std::string message; + reply_callback_t cb; + + message_wrapper_t(const connection_id_t& _conn_id, + const std::string& _topic, + const std::string& _message, + reply_callback_t _cb) : conn_id(_conn_id), topic(_topic), message(_message), cb(_cb) {} +}; + +using connection_t_ptr = std::unique_ptr; + +typedef std::unordered_map ConnectionList; +typedef boost::lockfree::queue> MessageQueue; + +// macros used inside a loop where an iterator is either incremented or erased +#define INCREMENT_AND_CONTINUE(IT) \ + ++IT; \ + continue; + +#define ERASE_AND_CONTINUE(IT,CONTAINER) \ + IT=CONTAINER.erase(IT); \ + --connection_count; \ + continue; + +class Manager { +public: + const size_t max_connections; + const size_t max_inflight; + const size_t max_queue; + const size_t max_idle_time; +private: + std::atomic connection_count; + std::atomic stopped; + struct timeval read_timeout; + ConnectionList connections; + MessageQueue messages; + std::atomic 
queued; + std::atomic dequeued; + CephContext* const cct; + mutable std::mutex connections_lock; + const ceph::coarse_real_clock::duration idle_time; + const ceph::coarse_real_clock::duration reconnect_time; + std::thread runner; + + void publish_internal(message_wrapper_t* message) { + const std::unique_ptr msg_owner(message); + const auto& conn_id = message->conn_id; + auto conn_it = connections.find(conn_id); + if (conn_it == connections.end()) { + ldout(cct, 1) << "AMQP publish: connection '" << to_string(conn_id) << "' not found" << dendl; + if (message->cb) { + message->cb(RGW_AMQP_STATUS_CONNECTION_CLOSED); + } + return; + } + + auto& conn = conn_it->second; + + conn->timestamp = ceph_clock_now(); + + if (!conn->is_ok()) { + // connection had an issue while message was in the queue + ldout(cct, 1) << "AMQP publish: connection '" << to_string(conn_id) << "' is closed" << dendl; + if (message->cb) { + message->cb(RGW_AMQP_STATUS_CONNECTION_CLOSED); + } + return; + } + + if (message->cb == nullptr) { + const auto rc = amqp_basic_publish(conn->state, + CHANNEL_ID, + amqp_cstring_bytes(conn_id.exchange.c_str()), + amqp_cstring_bytes(message->topic.c_str()), + 0, // does not have to be routable + 0, // not immediate + nullptr, // no properties needed + amqp_cstring_bytes(message->message.c_str())); + if (rc == AMQP_STATUS_OK) { + ldout(cct, 20) << "AMQP publish (no callback): OK" << dendl; + return; + } + ldout(cct, 1) << "AMQP publish (no callback): failed with error " << status_to_string(rc) << dendl; + // an error occurred, close connection + // it will be retied by the main loop + conn->destroy(rc); + return; + } + + amqp_basic_properties_t props; + props._flags = + AMQP_BASIC_DELIVERY_MODE_FLAG | + AMQP_BASIC_REPLY_TO_FLAG; + props.delivery_mode = 2; // persistent delivery TODO take from conf + props.reply_to = conn->reply_to_queue; + + const auto rc = amqp_basic_publish(conn->state, + CONFIRMING_CHANNEL_ID, + amqp_cstring_bytes(conn_id.exchange.c_str()), + 
amqp_cstring_bytes(message->topic.c_str()), + conn->mandatory, + 0, // not immediate + &props, + amqp_cstring_bytes(message->message.c_str())); + + if (rc == AMQP_STATUS_OK) { + auto const q_len = conn->callbacks.size(); + if (q_len < max_inflight) { + ldout(cct, 20) << "AMQP publish (with callback, tag=" << conn->delivery_tag << "): OK. Queue has: " << q_len << " callbacks" << dendl; + conn->callbacks.emplace_back(conn->delivery_tag++, message->cb); + } else { + // immediately invoke callback with error + ldout(cct, 1) << "AMQP publish (with callback): failed with error: callback queue full" << dendl; + message->cb(RGW_AMQP_STATUS_MAX_INFLIGHT); + } + } else { + // an error occurred, close connection + // it will be retied by the main loop + ldout(cct, 1) << "AMQP publish (with callback): failed with error: " << status_to_string(rc) << dendl; + conn->destroy(rc); + // immediately invoke callback with error + message->cb(rc); + } + } + + // the managers thread: + // (1) empty the queue of messages to be published + // (2) loop over all connections and read acks + // (3) manages deleted connections + // (4) TODO reconnect on connection errors + // (5) TODO cleanup timedout callbacks + void run() noexcept { + amqp_frame_t frame; + while (!stopped) { + + // publish all messages in the queue + const auto count = messages.consume_all(std::bind(&Manager::publish_internal, this, std::placeholders::_1)); + dequeued += count; + ConnectionList::iterator conn_it; + ConnectionList::const_iterator end_it; + { + // thread safe access to the connection list + // once the iterators are fetched they are guaranteed to remain valid + std::lock_guard lock(connections_lock); + conn_it = connections.begin(); + end_it = connections.end(); + } + auto incoming_message = false; + // loop over all connections to read acks + for (;conn_it != end_it;) { + + const auto& conn_id = conn_it->first; + auto& conn = conn_it->second; + + if(conn->timestamp.sec() + max_idle_time < ceph_clock_now()) { + 
ldout(cct, 20) << "AMQP run: Time for deleting a connection due to idle behaviour: " << ceph_clock_now() << dendl; + ERASE_AND_CONTINUE(conn_it, connections); + } + + // try to reconnect the connection if it has an error + if (!conn->is_ok()) { + const auto now = ceph::coarse_real_clock::now(); + if (now >= conn->next_reconnect) { + // pointers are used temporarily inside the amqp_connection_info object + // as read-only values, hence the assignment, and const_cast are safe here + ldout(cct, 20) << "AMQP run: retry connection" << dendl; + if (!new_state(conn.get(), conn_id)) { + ldout(cct, 10) << "AMQP run: connection '" << to_string(conn_id) << "' retry failed. error: " << + status_to_string(conn->status) << " (" << conn->reply_code << ")" << dendl; + // TODO: add error counter for failed retries + // TODO: add exponential backoff for retries + conn->next_reconnect = now + reconnect_time; + } else { + ldout(cct, 10) << "AMQP run: connection '" << to_string(conn_id) << "' retry successfull" << dendl; + } + } + INCREMENT_AND_CONTINUE(conn_it); + } + + const auto rc = amqp_simple_wait_frame_noblock(conn->state, &frame, &read_timeout); + + if (rc == AMQP_STATUS_TIMEOUT) { + // TODO mark connection as idle + INCREMENT_AND_CONTINUE(conn_it); + } + + // this is just to prevent spinning idle, does not indicate that a message + // was successfully processed or not + incoming_message = true; + + // check if error occurred that require reopening the connection + if (rc != AMQP_STATUS_OK) { + // an error occurred, close connection + // it will be retied by the main loop + ldout(cct, 1) << "AMQP run: connection read error: " << status_to_string(rc) << dendl; + conn->destroy(rc); + INCREMENT_AND_CONTINUE(conn_it); + } + + if (frame.frame_type != AMQP_FRAME_METHOD) { + ldout(cct, 10) << "AMQP run: ignoring non n/ack messages. 
frame type: " + << unsigned(frame.frame_type) << dendl; + // handler is for publish confirmation only - handle only method frames + INCREMENT_AND_CONTINUE(conn_it); + } + + uint64_t tag; + bool multiple; + int result; + + switch (frame.payload.method.id) { + case AMQP_BASIC_ACK_METHOD: + { + result = AMQP_STATUS_OK; + const auto ack = (amqp_basic_ack_t*)frame.payload.method.decoded; + ceph_assert(ack); + tag = ack->delivery_tag; + multiple = ack->multiple; + break; + } + case AMQP_BASIC_NACK_METHOD: + { + result = RGW_AMQP_STATUS_BROKER_NACK; + const auto nack = (amqp_basic_nack_t*)frame.payload.method.decoded; + ceph_assert(nack); + tag = nack->delivery_tag; + multiple = nack->multiple; + break; + } + case AMQP_BASIC_REJECT_METHOD: + { + result = RGW_AMQP_STATUS_BROKER_NACK; + const auto reject = (amqp_basic_reject_t*)frame.payload.method.decoded; + tag = reject->delivery_tag; + multiple = false; + break; + } + case AMQP_CONNECTION_CLOSE_METHOD: + // TODO on channel close, no need to reopen the connection + case AMQP_CHANNEL_CLOSE_METHOD: + { + // other side closed the connection, no need to continue + ldout(cct, 10) << "AMQP run: connection was closed by broker" << dendl; + conn->destroy(rc); + INCREMENT_AND_CONTINUE(conn_it); + } + case AMQP_BASIC_RETURN_METHOD: + // message was not delivered, returned to sender + ldout(cct, 10) << "AMQP run: message was not routable" << dendl; + INCREMENT_AND_CONTINUE(conn_it); + break; + default: + // unexpected method + ldout(cct, 10) << "AMQP run: unexpected message" << dendl; + INCREMENT_AND_CONTINUE(conn_it); + } + + const auto tag_it = std::find(conn->callbacks.begin(), conn->callbacks.end(), tag); + if (tag_it != conn->callbacks.end()) { + if (multiple) { + // n/ack all up to (and including) the tag + ldout(cct, 20) << "AMQP run: multiple n/acks received with tag=" << tag << " and result=" << result << dendl; + auto it = conn->callbacks.begin(); + while (it->tag <= tag && it != conn->callbacks.end()) { + ldout(cct, 20) 
<< "AMQP run: invoking callback with tag=" << it->tag << dendl; + it->cb(result); + it = conn->callbacks.erase(it); + } + } else { + // n/ack a specific tag + ldout(cct, 20) << "AMQP run: n/ack received, invoking callback with tag=" << tag << " and result=" << result << dendl; + tag_it->cb(result); + conn->callbacks.erase(tag_it); + } + } else { + ldout(cct, 10) << "AMQP run: unsolicited n/ack received with tag=" << tag << dendl; + } + // just increment the iterator + ++conn_it; + } + // if no messages were received or published, sleep for 100ms + if (count == 0 && !incoming_message) { + std::this_thread::sleep_for(idle_time); + } + } + } + + // used in the dtor for message cleanup + static void delete_message(const message_wrapper_t* message) { + delete message; + } + +public: + Manager(size_t _max_connections, + size_t _max_inflight, + size_t _max_queue, + long _usec_timeout, + unsigned reconnect_time_ms, + unsigned idle_time_ms, + CephContext* _cct) : + max_connections(_max_connections), + max_inflight(_max_inflight), + max_queue(_max_queue), + max_idle_time(30), + connection_count(0), + stopped(false), + read_timeout{0, _usec_timeout}, + connections(_max_connections), + messages(max_queue), + queued(0), + dequeued(0), + cct(_cct), + idle_time(std::chrono::milliseconds(idle_time_ms)), + reconnect_time(std::chrono::milliseconds(reconnect_time_ms)), + runner(&Manager::run, this) { + // The hashmap has "max connections" as the initial number of buckets, + // and allows for 10 collisions per bucket before rehash. + // This is to prevent rehashing so that iterators are not invalidated + // when a new connection is added. 
+ connections.max_load_factor(10.0); + // give the runner thread a name for easier debugging + const auto rc = ceph_pthread_setname(runner.native_handle(), "amqp_manager"); + ceph_assert(rc==0); + } + + // non copyable + Manager(const Manager&) = delete; + const Manager& operator=(const Manager&) = delete; + + // stop the main thread + void stop() { + stopped = true; + } + + // connect to a broker, or reuse an existing connection if already connected + bool connect(connection_id_t& id, const std::string& url, const std::string& exchange, bool mandatory_delivery, bool verify_ssl, + boost::optional ca_location) { + if (stopped) { + ldout(cct, 1) << "AMQP connect: manager is stopped" << dendl; + return false; + } + + amqp_connection_info info; + // cache the URL so that parsing could happen in-place + std::vector url_cache(url.c_str(), url.c_str()+url.size()+1); + const auto retcode = amqp_parse_url(url_cache.data(), &info); + if (AMQP_STATUS_OK != retcode) { + ldout(cct, 1) << "AMQP connect: URL parsing failed. error: " << retcode << dendl; + return false; + } + connection_id_t tmp_id(info, exchange); + + std::lock_guard lock(connections_lock); + const auto it = connections.find(tmp_id); + if (it != connections.end()) { + // connection found - return even if non-ok + ldout(cct, 20) << "AMQP connect: connection found" << dendl; + id = it->first; + return true; + } + + // connection not found, creating a new one + if (connection_count >= max_connections) { + ldout(cct, 1) << "AMQP connect: max connections exceeded" << dendl; + return false; + } + // if error occurred during creation the creation will be retried in the main thread + ++connection_count; + auto conn = connections.emplace(tmp_id, std::make_unique(cct, info, verify_ssl, ca_location)).first->second.get(); + ldout(cct, 10) << "AMQP connect: new connection is created. 
Total connections: " << connection_count << dendl; + if (!new_state(conn, tmp_id)) { + ldout(cct, 1) << "AMQP connect: new connection '" << to_string(tmp_id) << "' is created. but state creation failed (will retry). error: " << + status_to_string(conn->status) << " (" << conn->reply_code << ")" << dendl; + } + id = std::move(tmp_id); + return true; + } + + // TODO publish with confirm is needed in "none" case as well, cb should be invoked publish is ok (no ack) + int publish(const connection_id_t& conn_id, + const std::string& topic, + const std::string& message) { + if (stopped) { + ldout(cct, 1) << "AMQP publish: manager is not running" << dendl; + return RGW_AMQP_STATUS_MANAGER_STOPPED; + } + auto wrapper = std::make_unique(conn_id, topic, message, nullptr); + if (messages.push(wrapper.get())) { + std::ignore = wrapper.release(); + ++queued; + return AMQP_STATUS_OK; + } + ldout(cct, 1) << "AMQP publish: queue is full" << dendl; + return RGW_AMQP_STATUS_QUEUE_FULL; + } + + int publish_with_confirm(const connection_id_t& conn_id, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (stopped) { + ldout(cct, 1) << "AMQP publish_with_confirm: manager is not running" << dendl; + return RGW_AMQP_STATUS_MANAGER_STOPPED; + } + auto wrapper = std::make_unique(conn_id, topic, message, cb); + if (messages.push(wrapper.get())) { + std::ignore = wrapper.release(); + ++queued; + return AMQP_STATUS_OK; + } + ldout(cct, 1) << "AMQP publish_with_confirm: queue is full" << dendl; + return RGW_AMQP_STATUS_QUEUE_FULL; + } + + // dtor wait for thread to stop + // then connection are cleaned-up + ~Manager() { + stopped = true; + runner.join(); + messages.consume_all(delete_message); + } + + // get the number of connections + size_t get_connection_count() const { + return connection_count; + } + + // get the number of in-flight messages + size_t get_inflight() const { + size_t sum = 0; + std::lock_guard lock(connections_lock); + 
std::for_each(connections.begin(), connections.end(), [&sum](auto& conn_pair) { + // concurrent access to the callback vector is safe without locking + sum += conn_pair.second->callbacks.size(); + }); + return sum; + } + + // running counter of the queued messages + size_t get_queued() const { + return queued; + } + + // running counter of the dequeued messages + size_t get_dequeued() const { + return dequeued; + } +}; + +// singleton manager +// note that the manager itself is not a singleton, and multiple instances may co-exist +// TODO make the pointer atomic in allocation and deallocation to avoid race conditions +static Manager* s_manager = nullptr; + +static const size_t MAX_CONNECTIONS_DEFAULT = 256; +static const size_t MAX_INFLIGHT_DEFAULT = 8192; +static const size_t MAX_QUEUE_DEFAULT = 8192; +static const long READ_TIMEOUT_USEC = 100; +static const unsigned IDLE_TIME_MS = 100; +static const unsigned RECONNECT_TIME_MS = 100; + +bool init(CephContext* cct) { + if (s_manager) { + return false; + } + // TODO: take conf from CephContext + s_manager = new Manager(MAX_CONNECTIONS_DEFAULT, MAX_INFLIGHT_DEFAULT, MAX_QUEUE_DEFAULT, + READ_TIMEOUT_USEC, IDLE_TIME_MS, RECONNECT_TIME_MS, cct); + return true; +} + +void shutdown() { + delete s_manager; + s_manager = nullptr; +} + +bool connect(connection_id_t& conn_id, const std::string& url, const std::string& exchange, bool mandatory_delivery, bool verify_ssl, + boost::optional ca_location) { + if (!s_manager) return false; + return s_manager->connect(conn_id, url, exchange, mandatory_delivery, verify_ssl, ca_location); +} + +int publish(const connection_id_t& conn_id, + const std::string& topic, + const std::string& message) { + if (!s_manager) return RGW_AMQP_STATUS_MANAGER_STOPPED; + return s_manager->publish(conn_id, topic, message); +} + +int publish_with_confirm(const connection_id_t& conn_id, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (!s_manager) return 
RGW_AMQP_STATUS_MANAGER_STOPPED; + return s_manager->publish_with_confirm(conn_id, topic, message, cb); +} + +size_t get_connection_count() { + if (!s_manager) return 0; + return s_manager->get_connection_count(); +} + +size_t get_inflight() { + if (!s_manager) return 0; + return s_manager->get_inflight(); +} + +size_t get_queued() { + if (!s_manager) return 0; + return s_manager->get_queued(); +} + +size_t get_dequeued() { + if (!s_manager) return 0; + return s_manager->get_dequeued(); +} + +size_t get_max_connections() { + if (!s_manager) return MAX_CONNECTIONS_DEFAULT; + return s_manager->max_connections; +} + +size_t get_max_inflight() { + if (!s_manager) return MAX_INFLIGHT_DEFAULT; + return s_manager->max_inflight; +} + +size_t get_max_queue() { + if (!s_manager) return MAX_QUEUE_DEFAULT; + return s_manager->max_queue; +} + +} // namespace amqp + diff --git a/src/rgw/rgw_amqp.h b/src/rgw/rgw_amqp.h new file mode 100644 index 000000000..c363f4d74 --- /dev/null +++ b/src/rgw/rgw_amqp.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "include/common_fwd.h" + +struct amqp_connection_info; + +namespace rgw::amqp { + +// the reply callback is expected to get an integer parameter +// indicating the result, and not to return anything +typedef std::function reply_callback_t; + +// initialize the amqp manager +bool init(CephContext* cct); + +// shutdown the amqp manager +void shutdown(); + +// key class for the connection list +struct connection_id_t { + std::string host; + int port; + std::string vhost; + std::string exchange; + bool ssl; + connection_id_t() = default; + connection_id_t(const amqp_connection_info& info, const std::string& _exchange); +}; + +std::string to_string(const connection_id_t& id); + +// connect to an amqp endpoint +bool connect(connection_id_t& conn_id, const std::string& url, const std::string& exchange, 
bool mandatory_delivery, bool verify_ssl, + boost::optional ca_location); + +// publish a message over a connection that was already created +int publish(const connection_id_t& conn_id, + const std::string& topic, + const std::string& message); + +// publish a message over a connection that was already created +// and pass a callback that will be invoked (async) when broker confirms +// receiving the message +int publish_with_confirm(const connection_id_t& conn_id, + const std::string& topic, + const std::string& message, + reply_callback_t cb); + +// convert the integer status returned from the "publish" function to a string +std::string status_to_string(int s); + +// number of connections +size_t get_connection_count(); + +// return the number of messages that were sent +// to broker, but were not yet acked/nacked/timedout +size_t get_inflight(); + +// running counter of successfully queued messages +size_t get_queued(); + +// running counter of dequeued messages +size_t get_dequeued(); + +// number of maximum allowed connections +size_t get_max_connections(); + +// number of maximum allowed inflight messages +size_t get_max_inflight(); + +// maximum number of messages in the queue +size_t get_max_queue(); + +} + diff --git a/src/rgw/rgw_appmain.cc b/src/rgw/rgw_appmain.cc new file mode 100644 index 000000000..361f622b9 --- /dev/null +++ b/src/rgw/rgw_appmain.cc @@ -0,0 +1,605 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include "global/global_init.h" +#include "global/signal_handler.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/TracepointProvider.h" +#include "common/openssl_opts_handler.h" +#include "common/numa.h" +#include "include/compat.h" +#include "include/str_list.h" +#include "include/stringify.h" +#include "rgw_main.h" +#include "rgw_common.h" +#include "rgw_sal_rados.h" +#include "rgw_period_pusher.h" +#include "rgw_realm_reloader.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_swift.h" +#include "rgw_rest_admin.h" +#include "rgw_rest_info.h" +#include "rgw_rest_usage.h" +#include "rgw_rest_bucket.h" +#include "rgw_rest_metadata.h" +#include "rgw_rest_log.h" +#include "rgw_rest_config.h" +#include "rgw_rest_realm.h" +#include "rgw_rest_ratelimit.h" +#include "rgw_swift_auth.h" +#include "rgw_log.h" +#include "rgw_lib.h" +#include "rgw_frontend.h" +#include "rgw_lib_frontend.h" +#include "rgw_tools.h" +#include "rgw_resolve.h" +#include "rgw_process.h" +#include "rgw_frontend.h" +#include "rgw_http_client_curl.h" +#include "rgw_kmip_client.h" +#include "rgw_kmip_client_impl.h" +#include "rgw_perf_counters.h" +#include "rgw_signal.h" +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +#include "rgw_amqp.h" +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +#include "rgw_kafka.h" +#endif +#ifdef WITH_ARROW_FLIGHT +#include "rgw_flight_frontend.h" +#endif +#include "rgw_asio_frontend.h" +#include "rgw_dmclock_scheduler_ctx.h" +#include "rgw_lua.h" +#ifdef WITH_RADOSGW_DBSTORE +#include "rgw_sal_dbstore.h" +#endif +#include "rgw_lua_background.h" +#include "services/svc_zone.h" + +#ifdef HAVE_SYS_PRCTL_H +#include +#endif + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace { + TracepointProvider::Traits rgw_op_tracepoint_traits( + "librgw_op_tp.so", "rgw_op_tracing"); + TracepointProvider::Traits rgw_rados_tracepoint_traits( + "librgw_rados_tp.so", "rgw_rados_tracing"); 
+} + +OpsLogFile* rgw::AppMain::ops_log_file; + +void rgw::AppMain::init_frontends1(bool nfs) +{ + this->nfs = nfs; + std::string fe_key = (nfs) ? "rgw_nfs_frontends" : "rgw_frontends"; + std::vector frontends; + std::string rgw_frontends_str = g_conf().get_val(fe_key); + g_conf().early_expand_meta(rgw_frontends_str, &cerr); + get_str_vec(rgw_frontends_str, ",", frontends); + + /* default frontends */ + if (nfs) { + const auto is_rgw_nfs = [](const auto& s){return s == "rgw-nfs";}; + if (std::find_if(frontends.begin(), frontends.end(), is_rgw_nfs) == frontends.end()) { + frontends.push_back("rgw-nfs"); + } + } else { + if (frontends.empty()) { + frontends.push_back("beast"); + } + } + + for (auto &f : frontends) { + if (f.find("beast") != string::npos) { + have_http_frontend = true; + if (f.find("port") != string::npos) { + // check for the most common ws problems + if ((f.find("port=") == string::npos) || + (f.find("port= ") != string::npos)) { + derr << + R"(WARNING: radosgw frontend config found unexpected spacing around 'port' + (ensure frontend port parameter has the form 'port=80' with no spaces + before or after '='))" + << dendl; + } + } + } else { + if (f.find("civetweb") != string::npos) { + have_http_frontend = true; + } + } /* fe !beast */ + + RGWFrontendConfig *config = new RGWFrontendConfig(f); + int r = config->init(); + if (r < 0) { + delete config; + cerr << "ERROR: failed to init config: " << f << std::endl; + continue; + } + + fe_configs.push_back(config); + fe_map.insert( + pair(config->get_framework(), config)); + } /* for each frontend */ + + // maintain existing region root pool for new multisite objects + if (!g_conf()->rgw_region_root_pool.empty()) { + const char *root_pool = g_conf()->rgw_region_root_pool.c_str(); + if (g_conf()->rgw_zonegroup_root_pool.empty()) { + g_conf().set_val_or_die("rgw_zonegroup_root_pool", root_pool); + } + if (g_conf()->rgw_period_root_pool.empty()) { + g_conf().set_val_or_die("rgw_period_root_pool", root_pool); 
+ } + if (g_conf()->rgw_realm_root_pool.empty()) { + g_conf().set_val_or_die("rgw_realm_root_pool", root_pool); + } + } + + // for region -> zonegroup conversion (must happen before + // common_init_finish()) + if (!g_conf()->rgw_region.empty() && g_conf()->rgw_zonegroup.empty()) { + g_conf().set_val_or_die("rgw_zonegroup", g_conf()->rgw_region.c_str()); + } + + ceph::crypto::init_openssl_engine_once(); +} /* init_frontends1 */ + +void rgw::AppMain::init_numa() +{ + if (nfs) { + return; + } + + int numa_node = g_conf().get_val("rgw_numa_node"); + size_t numa_cpu_set_size = 0; + cpu_set_t numa_cpu_set; + + if (numa_node >= 0) { + int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set); + if (r < 0) { + dout(1) << __func__ << " unable to determine rgw numa node " << numa_node + << " CPUs" << dendl; + numa_node = -1; + } else { + r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set); + if (r < 0) { + derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r) + << dendl; + } + } + } else { + dout(1) << __func__ << " not setting numa affinity" << dendl; + } +} /* init_numa */ + +void rgw::AppMain::init_storage() +{ + auto run_gc = + (g_conf()->rgw_enable_gc_threads && + ((!nfs) || (nfs && g_conf()->rgw_nfs_run_gc_threads))); + + auto run_lc = + (g_conf()->rgw_enable_lc_threads && + ((!nfs) || (nfs && g_conf()->rgw_nfs_run_lc_threads))); + + auto run_quota = + (g_conf()->rgw_enable_quota_threads && + ((!nfs) || (nfs && g_conf()->rgw_nfs_run_quota_threads))); + + auto run_sync = + (g_conf()->rgw_run_sync_thread && + ((!nfs) || (nfs && g_conf()->rgw_nfs_run_sync_thread))); + + DriverManager::Config cfg = DriverManager::get_config(false, g_ceph_context); + env.driver = DriverManager::get_storage(dpp, dpp->get_cct(), + cfg, + run_gc, + run_lc, + run_quota, + run_sync, + g_conf().get_val("rgw_dynamic_resharding"), + g_conf()->rgw_cache_enabled); + +} /* init_storage */ + +void rgw::AppMain::init_perfcounters() +{ + (void) 
rgw_perf_start(dpp->get_cct()); +} /* init_perfcounters */ + +void rgw::AppMain::init_http_clients() +{ + rgw_init_resolver(); + rgw::curl::setup_curl(fe_map); + rgw_http_client_init(dpp->get_cct()); + rgw_kmip_client_init(*new RGWKMIPManagerImpl(dpp->get_cct())); +} /* init_http_clients */ + +void rgw::AppMain::cond_init_apis() +{ + rgw_rest_init(g_ceph_context, env.driver->get_zone()->get_zonegroup()); + + if (have_http_frontend) { + std::vector apis; + get_str_vec(g_conf()->rgw_enable_apis, apis); + + std::map apis_map; + for (auto &api : apis) { + apis_map[api] = true; + } + + /* warn about insecure keystone secret config options */ + if (!(g_ceph_context->_conf->rgw_keystone_admin_token.empty() || + g_ceph_context->_conf->rgw_keystone_admin_password.empty())) { + dout(0) + << "WARNING: rgw_keystone_admin_token and " + "rgw_keystone_admin_password should be avoided as they can " + "expose secrets. Prefer the new rgw_keystone_admin_token_path " + "and rgw_keystone_admin_password_path options, which read their " + "secrets from files." 
+ << dendl; + } + + // S3 website mode is a specialization of S3 + const bool s3website_enabled = apis_map.count("s3website") > 0; + const bool sts_enabled = apis_map.count("sts") > 0; + const bool iam_enabled = apis_map.count("iam") > 0; + const bool pubsub_enabled = + apis_map.count("pubsub") > 0 || apis_map.count("notifications") > 0; + // Swift API entrypoint could placed in the root instead of S3 + const bool swift_at_root = g_conf()->rgw_swift_url_prefix == "/"; + if (apis_map.count("s3") > 0 || s3website_enabled) { + if (!swift_at_root) { + rest.register_default_mgr(set_logging( + rest_filter(env.driver, RGW_REST_S3, + new RGWRESTMgr_S3(s3website_enabled, sts_enabled, + iam_enabled, pubsub_enabled)))); + } else { + derr << "Cannot have the S3 or S3 Website enabled together with " + << "Swift API placed in the root of hierarchy" << dendl; + } + } + + if (apis_map.count("swift") > 0) { + RGWRESTMgr_SWIFT* const swift_resource = new RGWRESTMgr_SWIFT; + + if (! g_conf()->rgw_cross_domain_policy.empty()) { + swift_resource->register_resource("crossdomain.xml", + set_logging(new RGWRESTMgr_SWIFT_CrossDomain)); + } + + swift_resource->register_resource("healthcheck", + set_logging(new RGWRESTMgr_SWIFT_HealthCheck)); + + swift_resource->register_resource("info", + set_logging(new RGWRESTMgr_SWIFT_Info)); + + if (! swift_at_root) { + rest.register_resource(g_conf()->rgw_swift_url_prefix, + set_logging(rest_filter(env.driver, RGW_REST_SWIFT, + swift_resource))); + } else { + if (env.driver->get_zone()->get_zonegroup().get_zone_count() > 1) { + derr << "Placing Swift API in the root of URL hierarchy while running" + << " multi-site configuration requires another instance of RadosGW" + << " with S3 API enabled!" 
<< dendl; + } + + rest.register_default_mgr(set_logging(swift_resource)); + } + } + + if (apis_map.count("swift_auth") > 0) { + rest.register_resource(g_conf()->rgw_swift_auth_entry, + set_logging(new RGWRESTMgr_SWIFT_Auth)); + } + + if (apis_map.count("admin") > 0) { + RGWRESTMgr_Admin *admin_resource = new RGWRESTMgr_Admin; + admin_resource->register_resource("info", new RGWRESTMgr_Info); + admin_resource->register_resource("usage", new RGWRESTMgr_Usage); + /* Register driver-specific admin APIs */ + env.driver->register_admin_apis(admin_resource); + rest.register_resource(g_conf()->rgw_admin_entry, admin_resource); + } + } /* have_http_frontend */ +} /* init_apis */ + +void rgw::AppMain::init_ldap() +{ + CephContext* cct = env.driver->ctx(); + const string &ldap_uri = cct->_conf->rgw_ldap_uri; + const string &ldap_binddn = cct->_conf->rgw_ldap_binddn; + const string &ldap_searchdn = cct->_conf->rgw_ldap_searchdn; + const string &ldap_searchfilter = cct->_conf->rgw_ldap_searchfilter; + const string &ldap_dnattr = cct->_conf->rgw_ldap_dnattr; + std::string ldap_bindpw = parse_rgw_ldap_bindpw(cct); + + ldh.reset(new rgw::LDAPHelper(ldap_uri, ldap_binddn, + ldap_bindpw.c_str(), ldap_searchdn, ldap_searchfilter, ldap_dnattr)); + ldh->init(); + ldh->bind(); +} /* init_ldap */ + +void rgw::AppMain::init_opslog() +{ + rgw_log_usage_init(dpp->get_cct(), env.driver); + + OpsLogManifold *olog_manifold = new OpsLogManifold(); + if (!g_conf()->rgw_ops_log_socket_path.empty()) { + OpsLogSocket *olog_socket = + new OpsLogSocket(g_ceph_context, g_conf()->rgw_ops_log_data_backlog); + olog_socket->init(g_conf()->rgw_ops_log_socket_path); + olog_manifold->add_sink(olog_socket); + } + if (!g_conf()->rgw_ops_log_file_path.empty()) { + ops_log_file = + new OpsLogFile(g_ceph_context, g_conf()->rgw_ops_log_file_path, + g_conf()->rgw_ops_log_data_backlog); + ops_log_file->start(); + olog_manifold->add_sink(ops_log_file); + } + olog_manifold->add_sink(new OpsLogRados(env.driver)); + olog 
= olog_manifold; +} /* init_opslog */ + +int rgw::AppMain::init_frontends2(RGWLib* rgwlib) +{ + int r{0}; + vector frontends_def; + std::string frontend_defs_str = + g_conf().get_val("rgw_frontend_defaults"); + get_str_vec(frontend_defs_str, ",", frontends_def); + + service_map_meta["pid"] = stringify(getpid()); + + std::map > fe_def_map; + for (auto& f : frontends_def) { + RGWFrontendConfig *config = new RGWFrontendConfig(f); + int r = config->init(); + if (r < 0) { + delete config; + cerr << "ERROR: failed to init default config: " << f << std::endl; + continue; + } + fe_def_map[config->get_framework()].reset(config); + } + + /* Initialize the registry of auth strategies which will coordinate + * the dynamic reconfiguration. */ + implicit_tenant_context.reset(new rgw::auth::ImplicitTenants{g_conf()}); + g_conf().add_observer(implicit_tenant_context.get()); + + /* allocate a mime table (you'd never guess that from the name) */ + rgw_tools_init(dpp, dpp->get_cct()); + + /* Header custom behavior */ + rest.register_x_headers(g_conf()->rgw_log_http_headers); + + sched_ctx.reset(new rgw::dmclock::SchedulerCtx{dpp->get_cct()}); + ratelimiter.reset(new ActiveRateLimiter{dpp->get_cct()}); + ratelimiter->start(); + + // initialize RGWProcessEnv + env.rest = &rest; + env.olog = olog; + env.auth_registry = rgw::auth::StrategyRegistry::create( + dpp->get_cct(), *implicit_tenant_context, env.driver); + env.ratelimiting = ratelimiter.get(); + + int fe_count = 0; + for (multimap::iterator fiter = fe_map.begin(); + fiter != fe_map.end(); ++fiter, ++fe_count) { + RGWFrontendConfig *config = fiter->second; + string framework = config->get_framework(); + + auto def_iter = fe_def_map.find(framework); + if (def_iter != fe_def_map.end()) { + config->set_default_config(*def_iter->second); + } + + RGWFrontend* fe = nullptr; + + if (framework == "loadgen") { + fe = new RGWLoadGenFrontend(env, config); + } + else if (framework == "beast") { + fe = new RGWAsioFrontend(env, config, 
*sched_ctx); + } + else if (framework == "rgw-nfs") { + fe = new RGWLibFrontend(env, config); + if (rgwlib) { + rgwlib->set_fe(static_cast(fe)); + } + } + else if (framework == "arrow_flight") { +#ifdef WITH_ARROW_FLIGHT + int port; + config->get_val("port", 8077, &port); + fe = new rgw::flight::FlightFrontend(env, config, port); +#else + derr << "WARNING: arrow_flight frontend requested, but not included in build; skipping" << dendl; + continue; +#endif + } + + service_map_meta["frontend_type#" + stringify(fe_count)] = framework; + service_map_meta["frontend_config#" + stringify(fe_count)] = config->get_config(); + + if (! fe) { + dout(0) << "WARNING: skipping unknown framework: " << framework << dendl; + continue; + } + + dout(0) << "starting handler: " << fiter->first << dendl; + int r = fe->init(); + if (r < 0) { + derr << "ERROR: failed initializing frontend" << dendl; + return -r; + } + r = fe->run(); + if (r < 0) { + derr << "ERROR: failed run" << dendl; + return -r; + } + + fes.push_back(fe); + } + + std::string daemon_type = (nfs) ? 
"rgw-nfs" : "rgw"; + r = env.driver->register_to_service_map(dpp, daemon_type, service_map_meta); + if (r < 0) { + derr << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl; + /* ignore error */ + } + + if (env.driver->get_name() == "rados") { + // add a watcher to respond to realm configuration changes + pusher = std::make_unique(dpp, env.driver, null_yield); + fe_pauser = std::make_unique(fes, pusher.get()); + rgw_pauser = std::make_unique(); + rgw_pauser->add_pauser(fe_pauser.get()); + if (env.lua.background) { + rgw_pauser->add_pauser(env.lua.background); + } + reloader = std::make_unique( + env, *implicit_tenant_context, service_map_meta, rgw_pauser.get()); + realm_watcher = std::make_unique(dpp, g_ceph_context, + static_cast(env.driver)->svc()->zone->get_realm()); + realm_watcher->add_watcher(RGWRealmNotify::Reload, *reloader); + realm_watcher->add_watcher(RGWRealmNotify::ZonesNeedPeriod, *pusher.get()); + } + + return r; +} /* init_frontends2 */ + +void rgw::AppMain::init_tracepoints() +{ + TracepointProvider::initialize(dpp->get_cct()); + TracepointProvider::initialize(dpp->get_cct()); + tracing::rgw::tracer.init("rgw"); +} /* init_tracepoints() */ + +void rgw::AppMain::init_notification_endpoints() +{ +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + if (!rgw::amqp::init(dpp->get_cct())) { + derr << "ERROR: failed to initialize AMQP manager" << dendl; + } +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + if (!rgw::kafka::init(dpp->get_cct())) { + derr << "ERROR: failed to initialize Kafka manager" << dendl; + } +#endif +} /* init_notification_endpoints */ + +void rgw::AppMain::init_lua() +{ + rgw::sal::Driver* driver = env.driver; + int r{0}; + std::string path = g_conf().get_val("rgw_luarocks_location"); + if (!path.empty()) { + path += "/" + g_conf()->name.to_str(); + } + env.lua.luarocks_path = path; + +#ifdef WITH_RADOSGW_LUA_PACKAGES + rgw::lua::packages_t failed_packages; + std::string output; + r = rgw::lua::install_packages(dpp, driver, 
null_yield, path, + failed_packages, output); + if (r < 0) { + dout(1) << "WARNING: failed to install lua packages from allowlist" + << dendl; + } + if (!output.empty()) { + dout(10) << "INFO: lua packages installation output: \n" << output << dendl; + } + for (const auto &p : failed_packages) { + dout(5) << "WARNING: failed to install lua package: " << p + << " from allowlist" << dendl; + } +#endif + + env.lua.manager = env.driver->get_lua_manager(); + + if (driver->get_name() == "rados") { /* Supported for only RadosStore */ + lua_background = std::make_unique< + rgw::lua::Background>(driver, dpp->get_cct(), path); + lua_background->start(); + env.lua.background = lua_background.get(); + } +} /* init_lua */ + +void rgw::AppMain::shutdown(std::function finalize_async_signals) +{ + if (env.driver->get_name() == "rados") { + reloader.reset(); // stop the realm reloader + } + + for (auto& fe : fes) { + fe->stop(); + } + + for (auto& fe : fes) { + fe->join(); + delete fe; + } + + for (auto& fec : fe_configs) { + delete fec; + } + + ldh.reset(nullptr); // deletes + finalize_async_signals(); // callback + rgw_log_usage_finalize(); + + delete olog; + + if (lua_background) { + lua_background->shutdown(); + } + + DriverManager::close_storage(env.driver); + + rgw_tools_cleanup(); + rgw_shutdown_resolver(); + rgw_http_client_cleanup(); + rgw_kmip_client_cleanup(); + rgw::curl::cleanup_curl(); + g_conf().remove_observer(implicit_tenant_context.get()); + implicit_tenant_context.reset(); // deletes +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + rgw::amqp::shutdown(); +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + rgw::kafka::shutdown(); +#endif + rgw_perf_stop(g_ceph_context); + ratelimiter.reset(); // deletes--ensure this happens before we destruct +} /* AppMain::shutdown */ diff --git a/src/rgw/rgw_arn.cc b/src/rgw/rgw_arn.cc new file mode 100644 index 000000000..fddc3d769 --- /dev/null +++ b/src/rgw/rgw_arn.cc @@ -0,0 +1,387 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_arn.h" +#include "rgw_common.h" +#include + +using namespace std; + +namespace rgw { + +namespace { +boost::optional to_partition(const smatch::value_type& p, + bool wildcards) { + if (p == "aws") { + return Partition::aws; + } else if (p == "aws-cn") { + return Partition::aws_cn; + } else if (p == "aws-us-gov") { + return Partition::aws_us_gov; + } else if (p == "*" && wildcards) { + return Partition::wildcard; + } else { + return boost::none; + } + + ceph_abort(); +} + +boost::optional to_service(const smatch::value_type& s, + bool wildcards) { + static const unordered_map services = { + { "acm", Service::acm }, + { "apigateway", Service::apigateway }, + { "appstream", Service::appstream }, + { "artifact", Service::artifact }, + { "autoscaling", Service::autoscaling }, + { "aws-marketplace", Service::aws_marketplace }, + { "aws-marketplace-management", + Service::aws_marketplace_management }, + { "aws-portal", Service::aws_portal }, + { "cloudformation", Service::cloudformation }, + { "cloudfront", Service::cloudfront }, + { "cloudhsm", Service::cloudhsm }, + { "cloudsearch", Service::cloudsearch }, + { "cloudtrail", Service::cloudtrail }, + { "cloudwatch", Service::cloudwatch }, + { "codebuild", Service::codebuild }, + { "codecommit", Service::codecommit }, + { "codedeploy", Service::codedeploy }, + { "codepipeline", Service::codepipeline }, + { "cognito-identity", Service::cognito_identity }, + { "cognito-idp", Service::cognito_idp }, + { "cognito-sync", Service::cognito_sync }, + { "config", Service::config }, + { "datapipeline", Service::datapipeline }, + { "devicefarm", Service::devicefarm }, + { "directconnect", Service::directconnect }, + { "dms", Service::dms }, + { "ds", Service::ds }, + { "dynamodb", Service::dynamodb }, + { "ec2", Service::ec2 }, + { "ecr", Service::ecr }, + { "ecs", Service::ecs }, + { "elasticache", Service::elasticache }, + { "elasticbeanstalk", 
Service::elasticbeanstalk }, + { "elasticfilesystem", Service::elasticfilesystem }, + { "elasticloadbalancing", Service::elasticloadbalancing }, + { "elasticmapreduce", Service::elasticmapreduce }, + { "elastictranscoder", Service::elastictranscoder }, + { "es", Service::es }, + { "events", Service::events }, + { "firehose", Service::firehose }, + { "gamelift", Service::gamelift }, + { "glacier", Service::glacier }, + { "health", Service::health }, + { "iam", Service::iam }, + { "importexport", Service::importexport }, + { "inspector", Service::inspector }, + { "iot", Service::iot }, + { "kinesis", Service::kinesis }, + { "kinesisanalytics", Service::kinesisanalytics }, + { "kms", Service::kms }, + { "lambda", Service::lambda }, + { "lightsail", Service::lightsail }, + { "logs", Service::logs }, + { "machinelearning", Service::machinelearning }, + { "mobileanalytics", Service::mobileanalytics }, + { "mobilehub", Service::mobilehub }, + { "opsworks", Service::opsworks }, + { "opsworks-cm", Service::opsworks_cm }, + { "polly", Service::polly }, + { "rds", Service::rds }, + { "redshift", Service::redshift }, + { "route53", Service::route53 }, + { "route53domains", Service::route53domains }, + { "s3", Service::s3 }, + { "sdb", Service::sdb }, + { "servicecatalog", Service::servicecatalog }, + { "ses", Service::ses }, + { "sns", Service::sns }, + { "sqs", Service::sqs }, + { "ssm", Service::ssm }, + { "states", Service::states }, + { "storagegateway", Service::storagegateway }, + { "sts", Service::sts }, + { "support", Service::support }, + { "swf", Service::swf }, + { "trustedadvisor", Service::trustedadvisor }, + { "waf", Service::waf }, + { "workmail", Service::workmail }, + { "workspaces", Service::workspaces }}; + + if (wildcards && s == "*") { + return Service::wildcard; + } + + auto i = services.find(s); + if (i == services.end()) { + return boost::none; + } else { + return i->second; + } +} +} +ARN::ARN(const rgw_obj& o) + : partition(Partition::aws), + 
service(Service::s3), + region(), + account(o.bucket.tenant), + resource(o.bucket.name) +{ + resource.push_back('/'); + resource.append(o.key.name); +} + +ARN::ARN(const rgw_bucket& b) + : partition(Partition::aws), + service(Service::s3), + region(), + account(b.tenant), + resource(b.name) { } + +ARN::ARN(const rgw_bucket& b, const std::string& o) + : partition(Partition::aws), + service(Service::s3), + region(), + account(b.tenant), + resource(b.name) { + resource.push_back('/'); + resource.append(o); +} + +ARN::ARN(const std::string& resource_name, const std::string& type, const std::string& tenant, bool has_path) + : partition(Partition::aws), + service(Service::iam), + region(), + account(tenant), + resource(type) { + if (! has_path) + resource.push_back('/'); + resource.append(resource_name); +} + +boost::optional ARN::parse(const std::string& s, bool wildcards) { + static const std::regex rx_wild("arn:([^:]*):([^:]*):([^:]*):([^:]*):([^:]*)", + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + static const std::regex rx_no_wild( + "arn:([^:*]*):([^:*]*):([^:*]*):([^:*]*):(.*)", + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + + smatch match; + + if ((s == "*") && wildcards) { + return ARN(Partition::wildcard, Service::wildcard, "*", "*", "*"); + } else if (regex_match(s, match, wildcards ? 
rx_wild : rx_no_wild) && + match.size() == 6) { + if (auto p = to_partition(match[1], wildcards)) { + if (auto s = to_service(match[2], wildcards)) { + return ARN(*p, *s, match[3], match[4], match[5]); + } + } + } + return boost::none; +} + +std::string ARN::to_string() const { + std::string s{"arn:"}; + + if (partition == Partition::aws) { + s.append("aws:"); + } else if (partition == Partition::aws_cn) { + s.append("aws-cn:"); + } else if (partition == Partition::aws_us_gov) { + s.append("aws-us-gov:"); + } else { + s.append("*:"); + } + + static const std::unordered_map services = { + { Service::acm, "acm" }, + { Service::apigateway, "apigateway" }, + { Service::appstream, "appstream" }, + { Service::artifact, "artifact" }, + { Service::autoscaling, "autoscaling" }, + { Service::aws_marketplace, "aws-marketplace" }, + { Service::aws_marketplace_management, "aws-marketplace-management" }, + { Service::aws_portal, "aws-portal" }, + { Service::cloudformation, "cloudformation" }, + { Service::cloudfront, "cloudfront" }, + { Service::cloudhsm, "cloudhsm" }, + { Service::cloudsearch, "cloudsearch" }, + { Service::cloudtrail, "cloudtrail" }, + { Service::cloudwatch, "cloudwatch" }, + { Service::codebuild, "codebuild" }, + { Service::codecommit, "codecommit" }, + { Service::codedeploy, "codedeploy" }, + { Service::codepipeline, "codepipeline" }, + { Service::cognito_identity, "cognito-identity" }, + { Service::cognito_idp, "cognito-idp" }, + { Service::cognito_sync, "cognito-sync" }, + { Service::config, "config" }, + { Service::datapipeline, "datapipeline" }, + { Service::devicefarm, "devicefarm" }, + { Service::directconnect, "directconnect" }, + { Service::dms, "dms" }, + { Service::ds, "ds" }, + { Service::dynamodb, "dynamodb" }, + { Service::ec2, "ec2" }, + { Service::ecr, "ecr" }, + { Service::ecs, "ecs" }, + { Service::elasticache, "elasticache" }, + { Service::elasticbeanstalk, "elasticbeanstalk" }, + { Service::elasticfilesystem, "elasticfilesystem" }, + { 
Service::elasticloadbalancing, "elasticloadbalancing" }, + { Service::elasticmapreduce, "elasticmapreduce" }, + { Service::elastictranscoder, "elastictranscoder" }, + { Service::es, "es" }, + { Service::events, "events" }, + { Service::firehose, "firehose" }, + { Service::gamelift, "gamelift" }, + { Service::glacier, "glacier" }, + { Service::health, "health" }, + { Service::iam, "iam" }, + { Service::importexport, "importexport" }, + { Service::inspector, "inspector" }, + { Service::iot, "iot" }, + { Service::kinesis, "kinesis" }, + { Service::kinesisanalytics, "kinesisanalytics" }, + { Service::kms, "kms" }, + { Service::lambda, "lambda" }, + { Service::lightsail, "lightsail" }, + { Service::logs, "logs" }, + { Service::machinelearning, "machinelearning" }, + { Service::mobileanalytics, "mobileanalytics" }, + { Service::mobilehub, "mobilehub" }, + { Service::opsworks, "opsworks" }, + { Service::opsworks_cm, "opsworks-cm" }, + { Service::polly, "polly" }, + { Service::rds, "rds" }, + { Service::redshift, "redshift" }, + { Service::route53, "route53" }, + { Service::route53domains, "route53domains" }, + { Service::s3, "s3" }, + { Service::sdb, "sdb" }, + { Service::servicecatalog, "servicecatalog" }, + { Service::ses, "ses" }, + { Service::sns, "sns" }, + { Service::sqs, "sqs" }, + { Service::ssm, "ssm" }, + { Service::states, "states" }, + { Service::storagegateway, "storagegateway" }, + { Service::sts, "sts" }, + { Service::support, "support" }, + { Service::swf, "swf" }, + { Service::trustedadvisor, "trustedadvisor" }, + { Service::waf, "waf" }, + { Service::workmail, "workmail" }, + { Service::workspaces, "workspaces" }}; + + auto i = services.find(service); + if (i != services.end()) { + s.append(i->second); + } else { + s.push_back('*'); + } + s.push_back(':'); + + s.append(region); + s.push_back(':'); + + s.append(account); + s.push_back(':'); + + s.append(resource); + + return s; +} + +bool operator ==(const ARN& l, const ARN& r) { + return ((l.partition == 
r.partition) && + (l.service == r.service) && + (l.region == r.region) && + (l.account == r.account) && + (l.resource == r.resource)); +} +bool operator <(const ARN& l, const ARN& r) { + return ((l.partition < r.partition) || + (l.service < r.service) || + (l.region < r.region) || + (l.account < r.account) || + (l.resource < r.resource)); +} + +// The candidate is not allowed to have wildcards. The only way to +// do that sanely would be to use unification rather than matching. +bool ARN::match(const ARN& candidate) const { + if ((candidate.partition == Partition::wildcard) || + (partition != candidate.partition && partition + != Partition::wildcard)) { + return false; + } + + if ((candidate.service == Service::wildcard) || + (service != candidate.service && service != Service::wildcard)) { + return false; + } + + if (!match_policy(region, candidate.region, MATCH_POLICY_ARN)) { + return false; + } + + if (!match_policy(account, candidate.account, MATCH_POLICY_ARN)) { + return false; + } + + if (!match_policy(resource, candidate.resource, MATCH_POLICY_RESOURCE)) { + return false; + } + + return true; +} + +boost::optional ARNResource::parse(const std::string& s) { + static const std::regex rx("^([^:/]*)[:/]?([^:/]*)?[:/]?(.*)$", + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + std::smatch match; + if (!regex_match(s, match, rx)) { + return boost::none; + } + if (match[2].str().empty() && match[3].str().empty()) { + // only resource exist + return rgw::ARNResource("", match[1], ""); + } + + // resource type also exist, and cannot be wildcard + if (match[1] != std::string(wildcard)) { + // resource type cannot be wildcard + return rgw::ARNResource(match[1], match[2], match[3]); + } + + return boost::none; +} + +std::string ARNResource::to_string() const { + std::string s; + + if (!resource_type.empty()) { + s.append(resource_type); + s.push_back(':'); + + s.append(resource); + s.push_back(':'); + + s.append(qualifier); + } else { + 
s.append(resource); + } + + return s; +} + +} + diff --git a/src/rgw/rgw_arn.h b/src/rgw/rgw_arn.h new file mode 100644 index 000000000..406a9f429 --- /dev/null +++ b/src/rgw/rgw_arn.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#include +#include + +class rgw_obj; +class rgw_bucket; + +namespace rgw { + +enum struct Partition { + aws, aws_cn, aws_us_gov, wildcard + // If we wanted our own ARNs for principal type unique to us + // (maybe to integrate better with Swift) or for anything else we + // provide that doesn't map onto S3, we could add an 'rgw' + // partition type. +}; + +enum struct Service { + apigateway, appstream, artifact, autoscaling, aws_portal, acm, + cloudformation, cloudfront, cloudhsm, cloudsearch, cloudtrail, + cloudwatch, events, logs, codebuild, codecommit, codedeploy, + codepipeline, cognito_idp, cognito_identity, cognito_sync, + config, datapipeline, dms, devicefarm, directconnect, + ds, dynamodb, ec2, ecr, ecs, ssm, elasticbeanstalk, elasticfilesystem, + elasticloadbalancing, elasticmapreduce, elastictranscoder, elasticache, + es, gamelift, glacier, health, iam, importexport, inspector, iot, + kms, kinesisanalytics, firehose, kinesis, lambda, lightsail, + machinelearning, aws_marketplace, aws_marketplace_management, + mobileanalytics, mobilehub, opsworks, opsworks_cm, polly, + redshift, rds, route53, route53domains, sts, servicecatalog, + ses, sns, sqs, s3, swf, sdb, states, storagegateway, support, + trustedadvisor, waf, workmail, workspaces, wildcard +}; + +/* valid format: + * 'arn:partition:service:region:account-id:resource' + * The 'resource' part can be further broken down via ARNResource +*/ +struct ARN { + Partition partition; + Service service; + std::string region; + // Once we refit tenant, we should probably use that instead of a + // string. 
+ std::string account; + std::string resource; + + ARN() + : partition(Partition::wildcard), service(Service::wildcard) {} + ARN(Partition partition, Service service, std::string region, + std::string account, std::string resource) + : partition(partition), service(service), region(std::move(region)), + account(std::move(account)), resource(std::move(resource)) {} + ARN(const rgw_obj& o); + ARN(const rgw_bucket& b); + ARN(const rgw_bucket& b, const std::string& o); + ARN(const std::string& resource_name, const std::string& type, const std::string& tenant, bool has_path=false); + + static boost::optional parse(const std::string& s, + bool wildcard = false); + std::string to_string() const; + + // `this` is the pattern + bool match(const ARN& candidate) const; +}; + +inline std::string to_string(const ARN& a) { + return a.to_string(); +} + +inline std::ostream& operator <<(std::ostream& m, const ARN& a) { + return m << to_string(a); +} + +bool operator ==(const ARN& l, const ARN& r); +bool operator <(const ARN& l, const ARN& r); + +/* valid formats (only resource part): + * 'resource' + * 'resourcetype/resource' + * 'resourcetype/resource/qualifier' + * 'resourcetype/resource:qualifier' + * 'resourcetype:resource' + * 'resourcetype:resource:qualifier' + * Note that 'resourceType' cannot be wildcard +*/ +struct ARNResource { + constexpr static const char* const wildcard = "*"; + std::string resource_type; + std::string resource; + std::string qualifier; + + ARNResource() : resource_type(""), resource(wildcard), qualifier("") {} + + ARNResource(const std::string& _resource_type, const std::string& _resource, const std::string& _qualifier) : + resource_type(std::move(_resource_type)), resource(std::move(_resource)), qualifier(std::move(_qualifier)) {} + + static boost::optional parse(const std::string& s); + + std::string to_string() const; +}; + +inline std::string to_string(const ARNResource& r) { + return r.to_string(); +} + +} // namespace rgw + +namespace std { 
+template<> +struct hash<::rgw::Service> { + size_t operator()(const ::rgw::Service& s) const noexcept { + // Invoke a default-constructed hash object for int. + return hash()(static_cast(s)); + } +}; +} // namespace std + diff --git a/src/rgw/rgw_asio_client.cc b/src/rgw/rgw_asio_client.cc new file mode 100644 index 000000000..a0ec0bf5c --- /dev/null +++ b/src/rgw/rgw_asio_client.cc @@ -0,0 +1,192 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "rgw_asio_client.h" +#include "rgw_perf_counters.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace rgw::asio; + +ClientIO::ClientIO(parser_type& parser, bool is_ssl, + const endpoint_type& local_endpoint, + const endpoint_type& remote_endpoint) + : parser(parser), is_ssl(is_ssl), + local_endpoint(local_endpoint), + remote_endpoint(remote_endpoint), + txbuf(*this) +{ +} + +ClientIO::~ClientIO() = default; + +int ClientIO::init_env(CephContext *cct) +{ + env.init(cct); + + perfcounter->inc(l_rgw_qlen); + perfcounter->inc(l_rgw_qactive); + + const auto& request = parser.get(); + const auto& headers = request; + for (auto header = headers.begin(); header != headers.end(); ++header) { + const auto& field = header->name(); // enum type for known headers + const auto& name = header->name_string(); + const auto& value = header->value(); + + if (field == beast::http::field::content_length) { + env.set("CONTENT_LENGTH", value.to_string()); + continue; + } + if (field == beast::http::field::content_type) { + env.set("CONTENT_TYPE", value.to_string()); + continue; + } + + static const std::string_view HTTP_{"HTTP_"}; + + char buf[name.size() + HTTP_.size() + 1]; + auto dest = std::copy(std::begin(HTTP_), std::end(HTTP_), buf); + for (auto src = name.begin(); src != name.end(); ++src, ++dest) { + if (*src == '-') { + *dest = '_'; + } else if (*src == '_') { + *dest = '-'; + } else { + *dest = 
std::toupper(*src); + } + } + *dest = '\0'; + + env.set(buf, value.to_string()); + } + + int major = request.version() / 10; + int minor = request.version() % 10; + env.set("HTTP_VERSION", std::to_string(major) + '.' + std::to_string(minor)); + + env.set("REQUEST_METHOD", request.method_string().to_string()); + + // split uri from query + auto uri = request.target(); + auto pos = uri.find('?'); + if (pos != uri.npos) { + auto query = uri.substr(pos + 1); + env.set("QUERY_STRING", query.to_string()); + uri = uri.substr(0, pos); + } + env.set("SCRIPT_URI", uri.to_string()); + + env.set("REQUEST_URI", request.target().to_string()); + + char port_buf[16]; + snprintf(port_buf, sizeof(port_buf), "%d", local_endpoint.port()); + env.set("SERVER_PORT", port_buf); + if (is_ssl) { + env.set("SERVER_PORT_SECURE", port_buf); + } + env.set("REMOTE_ADDR", remote_endpoint.address().to_string()); + // TODO: set REMOTE_USER if authenticated + return 0; +} + +size_t ClientIO::complete_request() +{ + perfcounter->inc(l_rgw_qlen, -1); + perfcounter->inc(l_rgw_qactive, -1); + return 0; +} + +void ClientIO::flush() +{ + txbuf.pubsync(); +} + +size_t ClientIO::send_status(int status, const char* status_name) +{ + static constexpr size_t STATUS_BUF_SIZE = 128; + + char statusbuf[STATUS_BUF_SIZE]; + const auto statuslen = snprintf(statusbuf, sizeof(statusbuf), + "HTTP/1.1 %d %s\r\n", status, status_name); + + return txbuf.sputn(statusbuf, statuslen); +} + +size_t ClientIO::send_100_continue() +{ + const char HTTTP_100_CONTINUE[] = "HTTP/1.1 100 CONTINUE\r\n\r\n"; + const size_t sent = txbuf.sputn(HTTTP_100_CONTINUE, + sizeof(HTTTP_100_CONTINUE) - 1); + flush(); + sent100continue = true; + return sent; +} + +static constexpr size_t TIME_BUF_SIZE = 128; +static size_t dump_date_header(char (&timestr)[TIME_BUF_SIZE]) +{ + const time_t gtime = time(nullptr); + struct tm result; + struct tm const * const tmp = gmtime_r(&gtime, &result); + if (tmp == nullptr) { + return 0; + } + return strftime(timestr,
sizeof(timestr), + "Date: %a, %d %b %Y %H:%M:%S %Z\r\n", tmp); +} + +size_t ClientIO::complete_header() +{ + size_t sent = 0; + + char timestr[TIME_BUF_SIZE]; + if (dump_date_header(timestr)) { + sent += txbuf.sputn(timestr, strlen(timestr)); + } + + if (parser.keep_alive()) { + constexpr char CONN_KEEP_ALIVE[] = "Connection: Keep-Alive\r\n"; + sent += txbuf.sputn(CONN_KEEP_ALIVE, sizeof(CONN_KEEP_ALIVE) - 1); + } else { + constexpr char CONN_KEEP_CLOSE[] = "Connection: close\r\n"; + sent += txbuf.sputn(CONN_KEEP_CLOSE, sizeof(CONN_KEEP_CLOSE) - 1); + } + + constexpr char HEADER_END[] = "\r\n"; + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + flush(); + return sent; +} + +size_t ClientIO::send_header(const std::string_view& name, + const std::string_view& value) +{ + static constexpr char HEADER_SEP[] = ": "; + static constexpr char HEADER_END[] = "\r\n"; + + size_t sent = 0; + + sent += txbuf.sputn(name.data(), name.length()); + sent += txbuf.sputn(HEADER_SEP, sizeof(HEADER_SEP) - 1); + sent += txbuf.sputn(value.data(), value.length()); + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + return sent; +} + +size_t ClientIO::send_content_length(uint64_t len) +{ + static constexpr size_t CONLEN_BUF_SIZE = 128; + + char sizebuf[CONLEN_BUF_SIZE]; + const auto sizelen = snprintf(sizebuf, sizeof(sizebuf), + "Content-Length: %" PRIu64 "\r\n", len); + + return txbuf.sputn(sizebuf, sizelen); +} diff --git a/src/rgw/rgw_asio_client.h b/src/rgw/rgw_asio_client.h new file mode 100644 index 000000000..e2ab943dd --- /dev/null +++ b/src/rgw/rgw_asio_client.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include "include/ceph_assert.h" + +#include "rgw_client_io.h" + +namespace rgw { +namespace asio { + +namespace beast = boost::beast; +using parser_type = beast::http::request_parser; + +class ClientIO : public io::RestfulClient, 
+ public io::BuffererSink { + protected: + parser_type& parser; + private: + const bool is_ssl; + using endpoint_type = boost::asio::ip::tcp::endpoint; + endpoint_type local_endpoint; + endpoint_type remote_endpoint; + + RGWEnv env; + + rgw::io::StaticOutputBufferer<> txbuf; + bool sent100continue = false; + + public: + ClientIO(parser_type& parser, bool is_ssl, + const endpoint_type& local_endpoint, + const endpoint_type& remote_endpoint); + ~ClientIO() override; + + int init_env(CephContext *cct) override; + size_t complete_request() override; + void flush() override; + size_t send_status(int status, const char *status_name) override; + size_t send_100_continue() override; + size_t send_header(const std::string_view& name, + const std::string_view& value) override; + size_t send_content_length(uint64_t len) override; + size_t complete_header() override; + + size_t send_body(const char* buf, size_t len) override { + return write_data(buf, len); + } + + RGWEnv& get_env() noexcept override { + return env; + } + + bool sent_100_continue() const { return sent100continue; } +}; + +} // namespace asio +} // namespace rgw diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc new file mode 100644 index 000000000..633a29633 --- /dev/null +++ b/src/rgw/rgw_asio_frontend.cc @@ -0,0 +1,1199 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "common/async/shared_mutex.h" +#include "common/errno.h" +#include "common/strtol.h" + +#include "rgw_asio_client.h" +#include "rgw_asio_frontend.h" + +#ifdef WITH_RADOSGW_BEAST_OPENSSL +#include +#endif + +#include "common/split.h" + +#include "services/svc_config_key.h" +#include "services/svc_zone.h" + +#include "rgw_zone.h" + +#include "rgw_asio_frontend_timer.h" +#include "rgw_dmclock_async_scheduler.h" + +#define dout_subsys ceph_subsys_rgw + 
+namespace { + +using tcp = boost::asio::ip::tcp; +namespace http = boost::beast::http; +#ifdef WITH_RADOSGW_BEAST_OPENSSL +namespace ssl = boost::asio::ssl; +#endif + +struct Connection; + +// use explicit executor types instead of the type-erased boost::asio::executor +using executor_type = boost::asio::io_context::executor_type; + +using tcp_socket = boost::asio::basic_stream_socket; +using tcp_stream = boost::beast::basic_stream; + +using timeout_timer = rgw::basic_timeout_timer; + +static constexpr size_t parse_buffer_size = 65536; +using parse_buffer = boost::beast::flat_static_buffer; + +// use mmap/mprotect to allocate 512k coroutine stacks +auto make_stack_allocator() { + return boost::context::protected_fixedsize_stack{512*1024}; +} + +using namespace std; + +template +class StreamIO : public rgw::asio::ClientIO { + CephContext* const cct; + Stream& stream; + timeout_timer& timeout; + yield_context yield; + parse_buffer& buffer; + boost::system::error_code fatal_ec; + public: + StreamIO(CephContext *cct, Stream& stream, timeout_timer& timeout, + rgw::asio::parser_type& parser, yield_context yield, + parse_buffer& buffer, bool is_ssl, + const tcp::endpoint& local_endpoint, + const tcp::endpoint& remote_endpoint) + : ClientIO(parser, is_ssl, local_endpoint, remote_endpoint), + cct(cct), stream(stream), timeout(timeout), yield(yield), + buffer(buffer) + {} + + boost::system::error_code get_fatal_error_code() const { return fatal_ec; } + + size_t write_data(const char* buf, size_t len) override { + boost::system::error_code ec; + timeout.start(); + auto bytes = boost::asio::async_write(stream, boost::asio::buffer(buf, len), + yield[ec]); + timeout.cancel(); + if (ec) { + ldout(cct, 4) << "write_data failed: " << ec.message() << dendl; + if (ec == boost::asio::error::broken_pipe) { + boost::system::error_code ec_ignored; + stream.lowest_layer().shutdown(tcp_socket::shutdown_both, ec_ignored); + } + if (!fatal_ec) { + fatal_ec = ec; + } + throw 
rgw::io::Exception(ec.value(), std::system_category()); + } + return bytes; + } + + size_t recv_body(char* buf, size_t max) override { + auto& message = parser.get(); + auto& body_remaining = message.body(); + body_remaining.data = buf; + body_remaining.size = max; + + while (body_remaining.size && !parser.is_done()) { + boost::system::error_code ec; + timeout.start(); + http::async_read_some(stream, buffer, parser, yield[ec]); + timeout.cancel(); + if (ec == http::error::need_buffer) { + break; + } + if (ec) { + ldout(cct, 4) << "failed to read body: " << ec.message() << dendl; + if (!fatal_ec) { + fatal_ec = ec; + } + throw rgw::io::Exception(ec.value(), std::system_category()); + } + } + return max - body_remaining.size; + } +}; + +// output the http version as a string, ie 'HTTP/1.1' +struct http_version { + unsigned major_ver; + unsigned minor_ver; + explicit http_version(unsigned version) + : major_ver(version / 10), minor_ver(version % 10) {} +}; +std::ostream& operator<<(std::ostream& out, const http_version& v) { + return out << "HTTP/" << v.major_ver << '.' 
<< v.minor_ver; +} + +// log an http header value or '-' if it's missing +struct log_header { + const http::fields& fields; + http::field field; + std::string_view quote; + log_header(const http::fields& fields, http::field field, + std::string_view quote = "") + : fields(fields), field(field), quote(quote) {} +}; +std::ostream& operator<<(std::ostream& out, const log_header& h) { + auto p = h.fields.find(h.field); + if (p == h.fields.end()) { + return out << '-'; + } + return out << h.quote << p->value() << h.quote; +} + +// log fractional seconds in milliseconds +struct log_ms_remainder { + ceph::coarse_real_time t; + log_ms_remainder(ceph::coarse_real_time t) : t(t) {} +}; +std::ostream& operator<<(std::ostream& out, const log_ms_remainder& m) { + using namespace std::chrono; + return out << std::setfill('0') << std::setw(3) + << duration_cast(m.t.time_since_epoch()).count() % 1000; +} + +// log time in apache format: day/month/year:hour:minute:second zone +struct log_apache_time { + ceph::coarse_real_time t; + log_apache_time(ceph::coarse_real_time t) : t(t) {} +}; +std::ostream& operator<<(std::ostream& out, const log_apache_time& a) { + const auto t = ceph::coarse_real_clock::to_time_t(a.t); + const auto local = std::localtime(&t); + return out << std::put_time(local, "%d/%b/%Y:%T.") << log_ms_remainder{a.t} + << std::put_time(local, " %z"); +}; + +using SharedMutex = ceph::async::SharedMutex; + +template +void handle_connection(boost::asio::io_context& context, + RGWProcessEnv& env, Stream& stream, + timeout_timer& timeout, size_t header_limit, + parse_buffer& buffer, bool is_ssl, + SharedMutex& pause_mutex, + rgw::dmclock::Scheduler *scheduler, + const std::string& uri_prefix, + boost::system::error_code& ec, + yield_context yield) +{ + // don't impose a limit on the body, since we read it in pieces + static constexpr size_t body_limit = std::numeric_limits::max(); + + auto cct = env.driver->ctx(); + + // read messages from the stream until eof + for (;;) { 
+ // configure the parser + rgw::asio::parser_type parser; + parser.header_limit(header_limit); + parser.body_limit(body_limit); + timeout.start(); + // parse the header + http::async_read_header(stream, buffer, parser, yield[ec]); + timeout.cancel(); + if (ec == boost::asio::error::connection_reset || + ec == boost::asio::error::bad_descriptor || + ec == boost::asio::error::operation_aborted || +#ifdef WITH_RADOSGW_BEAST_OPENSSL + ec == ssl::error::stream_truncated || +#endif + ec == http::error::end_of_stream) { + ldout(cct, 20) << "failed to read header: " << ec.message() << dendl; + return; + } + auto& message = parser.get(); + if (ec) { + ldout(cct, 1) << "failed to read header: " << ec.message() << dendl; + http::response response; + response.result(http::status::bad_request); + response.version(message.version() == 10 ? 10 : 11); + response.prepare_payload(); + timeout.start(); + http::async_write(stream, response, yield[ec]); + timeout.cancel(); + if (ec) { + ldout(cct, 5) << "failed to write response: " << ec.message() << dendl; + } + ldout(cct, 1) << "====== req done http_status=400 ======" << dendl; + return; + } + + bool expect_continue = (message[http::field::expect] == "100-continue"); + + { + auto lock = pause_mutex.async_lock_shared(yield[ec]); + if (ec == boost::asio::error::operation_aborted) { + return; + } else if (ec) { + ldout(cct, 1) << "failed to lock: " << ec.message() << dendl; + return; + } + + // process the request + RGWRequest req{env.driver->get_new_req_id()}; + + auto& socket = stream.lowest_layer(); + const auto& remote_endpoint = socket.remote_endpoint(ec); + if (ec) { + ldout(cct, 1) << "failed to connect client: " << ec.message() << dendl; + return; + } + const auto& local_endpoint = socket.local_endpoint(ec); + if (ec) { + ldout(cct, 1) << "failed to connect client: " << ec.message() << dendl; + return; + } + + StreamIO real_client{cct, stream, timeout, parser, yield, buffer, + is_ssl, local_endpoint, remote_endpoint}; + + auto 
real_client_io = rgw::io::add_reordering( + rgw::io::add_buffering(cct, + rgw::io::add_chunking( + rgw::io::add_conlen_controlling( + &real_client)))); + RGWRestfulIO client(cct, &real_client_io); + optional_yield y = null_yield; + if (cct->_conf->rgw_beast_enable_async) { + y = optional_yield{context, yield}; + } + int http_ret = 0; + string user = "-"; + const auto started = ceph::coarse_real_clock::now(); + ceph::coarse_real_clock::duration latency{}; + process_request(env, &req, uri_prefix, &client, y, + scheduler, &user, &latency, &http_ret); + + if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_access, 1)) { + // access log line elements begin per Apache Combined Log Format with additions following + lsubdout(cct, rgw_access, 1) << "beast: " << std::hex << &req << std::dec << ": " + << remote_endpoint.address() << " - " << user << " [" << log_apache_time{started} << "] \"" + << message.method_string() << ' ' << message.target() << ' ' + << http_version{message.version()} << "\" " << http_ret << ' ' + << client.get_bytes_sent() + client.get_bytes_received() << ' ' + << log_header{message, http::field::referer, "\""} << ' ' + << log_header{message, http::field::user_agent, "\""} << ' ' + << log_header{message, http::field::range} << " latency=" + << latency << dendl; + } + + // process_request() can't distinguish between connection errors and + // http/s3 errors, so check StreamIO for fatal connection errors + ec = real_client.get_fatal_error_code(); + if (ec) { + return; + } + + if (real_client.sent_100_continue()) { + expect_continue = false; + } + } + + if (!parser.keep_alive()) { + return; + } + + // if we failed before reading the entire message, discard any remaining + // bytes before reading the next + while (!expect_continue && !parser.is_done()) { + static std::array discard_buffer; + + auto& body = parser.get().body(); + body.size = discard_buffer.size(); + body.data = discard_buffer.data(); + + timeout.start(); + http::async_read_some(stream, 
buffer, parser, yield[ec]); + timeout.cancel(); + if (ec == http::error::need_buffer) { + continue; + } + if (ec == boost::asio::error::connection_reset) { + return; + } + if (ec) { + ldout(cct, 5) << "failed to discard unread message: " + << ec.message() << dendl; + return; + } + } + } +} + +// timeout support requires that connections are reference-counted, because the +// timeout_handler can outlive the coroutine +struct Connection : boost::intrusive::list_base_hook<>, + boost::intrusive_ref_counter +{ + tcp_socket socket; + parse_buffer buffer; + + explicit Connection(tcp_socket&& socket) noexcept + : socket(std::move(socket)) {} + + void close(boost::system::error_code& ec) { + socket.close(ec); + } + + tcp_socket& get_socket() { return socket; } +}; + +class ConnectionList { + using List = boost::intrusive::list; + List connections; + std::mutex mutex; + + void remove(Connection& c) { + std::lock_guard lock{mutex}; + if (c.is_linked()) { + connections.erase(List::s_iterator_to(c)); + } + } + public: + class Guard { + ConnectionList *list; + Connection *conn; + public: + Guard(ConnectionList *list, Connection *conn) : list(list), conn(conn) {} + ~Guard() { list->remove(*conn); } + }; + [[nodiscard]] Guard add(Connection& conn) { + std::lock_guard lock{mutex}; + connections.push_back(conn); + return Guard{this, &conn}; + } + void close(boost::system::error_code& ec) { + std::lock_guard lock{mutex}; + for (auto& conn : connections) { + conn.socket.close(ec); + } + connections.clear(); + } +}; + +namespace dmc = rgw::dmclock; +class AsioFrontend { + RGWProcessEnv& env; + RGWFrontendConfig* conf; + boost::asio::io_context context; + std::string uri_prefix; + ceph::timespan request_timeout = std::chrono::milliseconds(REQUEST_TIMEOUT); + size_t header_limit = 16384; +#ifdef WITH_RADOSGW_BEAST_OPENSSL + boost::optional ssl_context; + int get_config_key_val(string name, + const string& type, + bufferlist *pbl); + int ssl_set_private_key(const string& name, bool 
is_ssl_cert); + int ssl_set_certificate_chain(const string& name); + int init_ssl(); +#endif + SharedMutex pause_mutex; + std::unique_ptr scheduler; + + struct Listener { + tcp::endpoint endpoint; + tcp::acceptor acceptor; + tcp_socket socket; + bool use_ssl = false; + bool use_nodelay = false; + + explicit Listener(boost::asio::io_context& context) + : acceptor(context), socket(context) {} + }; + std::vector listeners; + + ConnectionList connections; + + // work guard to keep run() threads busy while listeners are paused + using Executor = boost::asio::io_context::executor_type; + std::optional> work; + + std::vector threads; + std::atomic going_down{false}; + + CephContext* ctx() const { return env.driver->ctx(); } + std::optional client_counters; + std::unique_ptr client_config; + void accept(Listener& listener, boost::system::error_code ec); + + public: + AsioFrontend(RGWProcessEnv& env, RGWFrontendConfig* conf, + dmc::SchedulerCtx& sched_ctx) + : env(env), conf(conf), pause_mutex(context.get_executor()) + { + auto sched_t = dmc::get_scheduler_t(ctx()); + switch(sched_t){ + case dmc::scheduler_t::dmclock: + scheduler.reset(new dmc::AsyncScheduler(ctx(), + context, + std::ref(sched_ctx.get_dmc_client_counters()), + sched_ctx.get_dmc_client_config(), + *sched_ctx.get_dmc_client_config(), + dmc::AtLimit::Reject)); + break; + case dmc::scheduler_t::none: + lderr(ctx()) << "Got invalid scheduler type for beast, defaulting to throttler" << dendl; + [[fallthrough]]; + case dmc::scheduler_t::throttler: + scheduler.reset(new dmc::SimpleThrottler(ctx())); + + } + } + + int init(); + int run(); + void stop(); + void join(); + void pause(); + void unpause(); +}; + +unsigned short parse_port(const char *input, boost::system::error_code& ec) +{ + char *end = nullptr; + auto port = std::strtoul(input, &end, 10); + if (port > std::numeric_limits::max()) { + ec.assign(ERANGE, boost::system::system_category()); + } else if (port == 0 && end == input) { + ec.assign(EINVAL, 
boost::system::system_category()); + } + return port; +} + +tcp::endpoint parse_endpoint(boost::asio::string_view input, + unsigned short default_port, + boost::system::error_code& ec) +{ + tcp::endpoint endpoint; + + if (input.empty()) { + ec = boost::asio::error::invalid_argument; + return endpoint; + } + + if (input[0] == '[') { // ipv6 + const size_t addr_begin = 1; + const size_t addr_end = input.find(']'); + if (addr_end == input.npos) { // no matching ] + ec = boost::asio::error::invalid_argument; + return endpoint; + } + if (addr_end + 1 < input.size()) { + // :port must must follow [ipv6] + if (input[addr_end + 1] != ':') { + ec = boost::asio::error::invalid_argument; + return endpoint; + } else { + auto port_str = input.substr(addr_end + 2); + endpoint.port(parse_port(port_str.data(), ec)); + } + } else { + endpoint.port(default_port); + } + auto addr = input.substr(addr_begin, addr_end - addr_begin); + endpoint.address(boost::asio::ip::make_address_v6(addr, ec)); + } else { // ipv4 + auto colon = input.find(':'); + if (colon != input.npos) { + auto port_str = input.substr(colon + 1); + endpoint.port(parse_port(port_str.data(), ec)); + if (ec) { + return endpoint; + } + } else { + endpoint.port(default_port); + } + auto addr = input.substr(0, colon); + endpoint.address(boost::asio::ip::make_address_v4(addr, ec)); + } + return endpoint; +} + +static int drop_privileges(CephContext *ctx) +{ + uid_t uid = ctx->get_set_uid(); + gid_t gid = ctx->get_set_gid(); + std::string uid_string = ctx->get_set_uid_string(); + std::string gid_string = ctx->get_set_gid_string(); + if (gid && setgid(gid) != 0) { + int err = errno; + ldout(ctx, -1) << "unable to setgid " << gid << ": " << cpp_strerror(err) << dendl; + return -err; + } + if (uid && setuid(uid) != 0) { + int err = errno; + ldout(ctx, -1) << "unable to setuid " << uid << ": " << cpp_strerror(err) << dendl; + return -err; + } + if (uid && gid) { + ldout(ctx, 0) << "set uid:gid to " << uid << ":" << gid + << " (" 
<< uid_string << ":" << gid_string << ")" << dendl; + } + return 0; +} + +int AsioFrontend::init() +{ + boost::system::error_code ec; + auto& config = conf->get_config_map(); + + if (auto i = config.find("prefix"); i != config.end()) { + uri_prefix = i->second; + } + +// Setting global timeout + auto timeout = config.find("request_timeout_ms"); + if (timeout != config.end()) { + auto timeout_number = ceph::parse(timeout->second); + if (timeout_number) { + request_timeout = std::chrono::milliseconds(*timeout_number); + } else { + lderr(ctx()) << "WARNING: invalid value for request_timeout_ms: " + << timeout->second << " setting it to the default value: " + << REQUEST_TIMEOUT << dendl; + } + } + + auto max_header_size = config.find("max_header_size"); + if (max_header_size != config.end()) { + auto limit = ceph::parse(max_header_size->second); + if (!limit) { + lderr(ctx()) << "WARNING: invalid value for max_header_size: " + << max_header_size->second << ", using the default value: " + << header_limit << dendl; + } else if (*limit > parse_buffer_size) { // can't exceed parse buffer size + header_limit = parse_buffer_size; + lderr(ctx()) << "WARNING: max_header_size " << max_header_size->second + << " capped at maximum value " << header_limit << dendl; + } else { + header_limit = *limit; + } + } + +#ifdef WITH_RADOSGW_BEAST_OPENSSL + int r = init_ssl(); + if (r < 0) { + return r; + } +#endif + + // parse endpoints + auto ports = config.equal_range("port"); + for (auto i = ports.first; i != ports.second; ++i) { + auto port = parse_port(i->second.c_str(), ec); + if (ec) { + lderr(ctx()) << "failed to parse port=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint.port(port); + + listeners.emplace_back(context); + listeners.back().endpoint = tcp::endpoint(tcp::v6(), port); + } + + auto endpoints = config.equal_range("endpoint"); + for (auto i = endpoints.first; i != endpoints.second; ++i) { + auto endpoint = 
parse_endpoint(i->second, 80, ec); + if (ec) { + lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint = endpoint; + } + // parse tcp nodelay + auto nodelay = config.find("tcp_nodelay"); + if (nodelay != config.end()) { + for (auto& l : listeners) { + l.use_nodelay = (nodelay->second == "1"); + } + } + + + bool socket_bound = false; + // start listeners + for (auto& l : listeners) { + l.acceptor.open(l.endpoint.protocol(), ec); + if (ec) { + if (ec == boost::asio::error::address_family_not_supported) { + ldout(ctx(), 0) << "WARNING: cannot open socket for endpoint=" << l.endpoint + << ", " << ec.message() << dendl; + continue; + } + + lderr(ctx()) << "failed to open socket: " << ec.message() << dendl; + return -ec.value(); + } + + if (l.endpoint.protocol() == tcp::v6()) { + l.acceptor.set_option(boost::asio::ip::v6_only(true), ec); + if (ec) { + lderr(ctx()) << "failed to set v6_only socket option: " + << ec.message() << dendl; + return -ec.value(); + } + } + + l.acceptor.set_option(tcp::acceptor::reuse_address(true)); + l.acceptor.bind(l.endpoint, ec); + if (ec) { + lderr(ctx()) << "failed to bind address " << l.endpoint + << ": " << ec.message() << dendl; + return -ec.value(); + } + + auto it = config.find("max_connection_backlog"); + auto max_connection_backlog = boost::asio::socket_base::max_listen_connections; + if (it != config.end()) { + string err; + max_connection_backlog = strict_strtol(it->second.c_str(), 10, &err); + if (!err.empty()) { + ldout(ctx(), 0) << "WARNING: invalid value for max_connection_backlog=" << it->second << dendl; + max_connection_backlog = boost::asio::socket_base::max_listen_connections; + } + } + l.acceptor.listen(max_connection_backlog); + l.acceptor.async_accept(l.socket, + [this, &l] (boost::system::error_code ec) { + accept(l, ec); + }); + + ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl; + socket_bound = 
true; + } + if (!socket_bound) { + lderr(ctx()) << "Unable to listen at any endpoints" << dendl; + return -EINVAL; + } + + return drop_privileges(ctx()); +} + +#ifdef WITH_RADOSGW_BEAST_OPENSSL + +static string config_val_prefix = "config://"; + +namespace { + +class ExpandMetaVar { + map meta_map; + +public: + ExpandMetaVar(rgw::sal::Zone* zone_svc) { + meta_map["realm"] = zone_svc->get_realm_name(); + meta_map["realm_id"] = zone_svc->get_realm_id(); + meta_map["zonegroup"] = zone_svc->get_zonegroup().get_name(); + meta_map["zonegroup_id"] = zone_svc->get_zonegroup().get_id(); + meta_map["zone"] = zone_svc->get_name(); + meta_map["zone_id"] = zone_svc->get_id(); + } + + string process_str(const string& in); +}; + +string ExpandMetaVar::process_str(const string& in) +{ + if (meta_map.empty()) { + return in; + } + + auto pos = in.find('$'); + if (pos == std::string::npos) { + return in; + } + + string out; + decltype(pos) last_pos = 0; + + while (pos != std::string::npos) { + if (pos > last_pos) { + out += in.substr(last_pos, pos - last_pos); + } + + string var; + const char *valid_chars = "abcdefghijklmnopqrstuvwxyz_"; + + size_t endpos = 0; + if (in[pos+1] == '{') { + // ...${foo_bar}... + endpos = in.find_first_not_of(valid_chars, pos + 2); + if (endpos != std::string::npos && + in[endpos] == '}') { + var = in.substr(pos + 2, endpos - pos - 2); + endpos++; + } + } else { + // ...$foo... 
+ endpos = in.find_first_not_of(valid_chars, pos + 1); + if (endpos != std::string::npos) + var = in.substr(pos + 1, endpos - pos - 1); + else + var = in.substr(pos + 1); + } + string var_source = in.substr(pos, endpos - pos); + last_pos = endpos; + + auto iter = meta_map.find(var); + if (iter != meta_map.end()) { + out += iter->second; + } else { + out += var_source; + } + pos = in.find('$', last_pos); + } + if (last_pos != std::string::npos) { + out += in.substr(last_pos); + } + + return out; +} + +} /* anonymous namespace */ + +int AsioFrontend::get_config_key_val(string name, + const string& type, + bufferlist *pbl) +{ + if (name.empty()) { + lderr(ctx()) << "bad " << type << " config value" << dendl; + return -EINVAL; + } + + int r = env.driver->get_config_key_val(name, pbl); + if (r < 0) { + lderr(ctx()) << type << " was not found: " << name << dendl; + return r; + } + return 0; +} + +int AsioFrontend::ssl_set_private_key(const string& name, bool is_ssl_certificate) +{ + boost::system::error_code ec; + + if (!boost::algorithm::starts_with(name, config_val_prefix)) { + ssl_context->use_private_key_file(name, ssl::context::pem, ec); + } else { + bufferlist bl; + int r = get_config_key_val(name.substr(config_val_prefix.size()), + "ssl_private_key", + &bl); + if (r < 0) { + return r; + } + ssl_context->use_private_key(boost::asio::buffer(bl.c_str(), bl.length()), + ssl::context::pem, ec); + } + + if (ec) { + if (!is_ssl_certificate) { + lderr(ctx()) << "failed to add ssl_private_key=" << name + << ": " << ec.message() << dendl; + } else { + lderr(ctx()) << "failed to use ssl_certificate=" << name + << " as a private key: " << ec.message() << dendl; + } + return -ec.value(); + } + + return 0; +} + +int AsioFrontend::ssl_set_certificate_chain(const string& name) +{ + boost::system::error_code ec; + + if (!boost::algorithm::starts_with(name, config_val_prefix)) { + ssl_context->use_certificate_chain_file(name, ec); + } else { + bufferlist bl; + int r = 
get_config_key_val(name.substr(config_val_prefix.size()), + "ssl_certificate", + &bl); + if (r < 0) { + return r; + } + ssl_context->use_certificate_chain(boost::asio::buffer(bl.c_str(), bl.length()), + ec); + } + + if (ec) { + lderr(ctx()) << "failed to use ssl_certificate=" << name + << ": " << ec.message() << dendl; + return -ec.value(); + } + + return 0; +} + +int AsioFrontend::init_ssl() +{ + boost::system::error_code ec; + auto& config = conf->get_config_map(); + + // ssl configuration + std::optional cert = conf->get_val("ssl_certificate"); + if (cert) { + // only initialize the ssl context if it's going to be used + ssl_context = boost::in_place(ssl::context::tls); + } + + std::optional key = conf->get_val("ssl_private_key"); + bool have_cert = false; + + if (key && !cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_private_key" << dendl; + return -EINVAL; + } + + std::optional options = conf->get_val("ssl_options"); + if (options) { + if (!cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_options" << dendl; + return -EINVAL; + } + } else if (cert) { + options = "no_sslv2:no_sslv3:no_tlsv1:no_tlsv1_1"; + } + + if (options) { + for (auto &option : ceph::split(*options, ":")) { + if (option == "default_workarounds") { + ssl_context->set_options(ssl::context::default_workarounds); + } else if (option == "no_compression") { + ssl_context->set_options(ssl::context::no_compression); + } else if (option == "no_sslv2") { + ssl_context->set_options(ssl::context::no_sslv2); + } else if (option == "no_sslv3") { + ssl_context->set_options(ssl::context::no_sslv3); + } else if (option == "no_tlsv1") { + ssl_context->set_options(ssl::context::no_tlsv1); + } else if (option == "no_tlsv1_1") { + ssl_context->set_options(ssl::context::no_tlsv1_1); + } else if (option == "no_tlsv1_2") { + ssl_context->set_options(ssl::context::no_tlsv1_2); + } else if (option == "single_dh_use") { + ssl_context->set_options(ssl::context::single_dh_use); + } else 
{ + lderr(ctx()) << "ignoring unknown ssl option '" << option << "'" << dendl; + } + } + } + + std::optional ciphers = conf->get_val("ssl_ciphers"); + if (ciphers) { + if (!cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_ciphers" << dendl; + return -EINVAL; + } + + int r = SSL_CTX_set_cipher_list(ssl_context->native_handle(), + ciphers->c_str()); + if (r == 0) { + lderr(ctx()) << "no cipher could be selected from ssl_ciphers: " + << *ciphers << dendl; + return -EINVAL; + } + } + + auto ports = config.equal_range("ssl_port"); + auto endpoints = config.equal_range("ssl_endpoint"); + + /* + * don't try to config certificate if frontend isn't configured for ssl + */ + if (ports.first == ports.second && + endpoints.first == endpoints.second) { + return 0; + } + + bool key_is_cert = false; + + if (cert) { + if (!key) { + key = cert; + key_is_cert = true; + } + + ExpandMetaVar emv(env.driver->get_zone()); + + cert = emv.process_str(*cert); + key = emv.process_str(*key); + + int r = ssl_set_private_key(*key, key_is_cert); + bool have_private_key = (r >= 0); + if (r < 0) { + if (!key_is_cert) { + r = ssl_set_private_key(*cert, true); + have_private_key = (r >= 0); + } + } + + if (have_private_key) { + int r = ssl_set_certificate_chain(*cert); + have_cert = (r >= 0); + } + } + + // parse ssl endpoints + for (auto i = ports.first; i != ports.second; ++i) { + if (!have_cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_port" << dendl; + return -EINVAL; + } + auto port = parse_port(i->second.c_str(), ec); + if (ec) { + lderr(ctx()) << "failed to parse ssl_port=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint.port(port); + listeners.back().use_ssl = true; + + listeners.emplace_back(context); + listeners.back().endpoint = tcp::endpoint(tcp::v6(), port); + listeners.back().use_ssl = true; + } + + for (auto i = endpoints.first; i != endpoints.second; ++i) { + if (!have_cert) { + 
lderr(ctx()) << "no ssl_certificate configured for ssl_endpoint" << dendl; + return -EINVAL; + } + auto endpoint = parse_endpoint(i->second, 443, ec); + if (ec) { + lderr(ctx()) << "failed to parse ssl_endpoint=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint = endpoint; + listeners.back().use_ssl = true; + } + return 0; +} +#endif // WITH_RADOSGW_BEAST_OPENSSL + +void AsioFrontend::accept(Listener& l, boost::system::error_code ec) +{ + if (!l.acceptor.is_open()) { + return; + } else if (ec == boost::asio::error::operation_aborted) { + return; + } else if (ec) { + ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl; + return; + } + auto stream = std::move(l.socket); + stream.set_option(tcp::no_delay(l.use_nodelay), ec); + l.acceptor.async_accept(l.socket, + [this, &l] (boost::system::error_code ec) { + accept(l, ec); + }); + + // spawn a coroutine to handle the connection +#ifdef WITH_RADOSGW_BEAST_OPENSSL + if (l.use_ssl) { + spawn::spawn(context, + [this, s=std::move(stream)] (yield_context yield) mutable { + auto conn = boost::intrusive_ptr{new Connection(std::move(s))}; + auto c = connections.add(*conn); + // wrap the tcp stream in an ssl stream + boost::asio::ssl::stream stream{conn->socket, *ssl_context}; + auto timeout = timeout_timer{context.get_executor(), request_timeout, conn}; + // do ssl handshake + boost::system::error_code ec; + timeout.start(); + auto bytes = stream.async_handshake(ssl::stream_base::server, + conn->buffer.data(), yield[ec]); + timeout.cancel(); + if (ec) { + ldout(ctx(), 1) << "ssl handshake failed: " << ec.message() << dendl; + return; + } + conn->buffer.consume(bytes); + handle_connection(context, env, stream, timeout, header_limit, + conn->buffer, true, pause_mutex, scheduler.get(), + uri_prefix, ec, yield); + if (!ec) { + // ssl shutdown (ignoring errors) + stream.async_shutdown(yield[ec]); + } + conn->socket.shutdown(tcp::socket::shutdown_both, ec); + }, 
make_stack_allocator()); + } else { +#else + { +#endif // WITH_RADOSGW_BEAST_OPENSSL + spawn::spawn(context, + [this, s=std::move(stream)] (yield_context yield) mutable { + auto conn = boost::intrusive_ptr{new Connection(std::move(s))}; + auto c = connections.add(*conn); + auto timeout = timeout_timer{context.get_executor(), request_timeout, conn}; + boost::system::error_code ec; + handle_connection(context, env, conn->socket, timeout, header_limit, + conn->buffer, false, pause_mutex, scheduler.get(), + uri_prefix, ec, yield); + conn->socket.shutdown(tcp_socket::shutdown_both, ec); + }, make_stack_allocator()); + } +} + +int AsioFrontend::run() +{ + auto cct = ctx(); + const int thread_count = cct->_conf->rgw_thread_pool_size; + threads.reserve(thread_count); + + ldout(cct, 4) << "frontend spawning " << thread_count << " threads" << dendl; + + // the worker threads call io_context::run(), which will return when there's + // no work left. hold a work guard to keep these threads going until join() + work.emplace(boost::asio::make_work_guard(context)); + + for (int i = 0; i < thread_count; i++) { + threads.emplace_back([this]() noexcept { + // request warnings on synchronous librados calls in this thread + is_asio_thread = true; + // Have uncaught exceptions kill the process and give a + // stacktrace, not be swallowed. + context.run(); + }); + } + return 0; +} + +void AsioFrontend::stop() +{ + ldout(ctx(), 4) << "frontend initiating shutdown..." << dendl; + + going_down = true; + + boost::system::error_code ec; + // close all listeners + for (auto& listener : listeners) { + listener.acceptor.close(ec); + } + // close all connections + connections.close(ec); + pause_mutex.cancel(); +} + +void AsioFrontend::join() +{ + if (!going_down) { + stop(); + } + work.reset(); + + ldout(ctx(), 4) << "frontend joining threads..." 
<< dendl; + for (auto& thread : threads) { + thread.join(); + } + ldout(ctx(), 4) << "frontend done" << dendl; +} + +void AsioFrontend::pause() +{ + ldout(ctx(), 4) << "frontend pausing connections..." << dendl; + + // cancel pending calls to accept(), but don't close the sockets + boost::system::error_code ec; + for (auto& l : listeners) { + l.acceptor.cancel(ec); + } + + // pause and wait for outstanding requests to complete + pause_mutex.lock(ec); + + if (ec) { + ldout(ctx(), 1) << "frontend failed to pause: " << ec.message() << dendl; + } else { + ldout(ctx(), 4) << "frontend paused" << dendl; + } +} + +void AsioFrontend::unpause() +{ + // unpause to unblock connections + pause_mutex.unlock(); + + // start accepting connections again + for (auto& l : listeners) { + l.acceptor.async_accept(l.socket, + [this, &l] (boost::system::error_code ec) { + accept(l, ec); + }); + } + + ldout(ctx(), 4) << "frontend unpaused" << dendl; +} + +} // anonymous namespace + +class RGWAsioFrontend::Impl : public AsioFrontend { + public: + Impl(RGWProcessEnv& env, RGWFrontendConfig* conf, + rgw::dmclock::SchedulerCtx& sched_ctx) + : AsioFrontend(env, conf, sched_ctx) {} +}; + +RGWAsioFrontend::RGWAsioFrontend(RGWProcessEnv& env, + RGWFrontendConfig* conf, + rgw::dmclock::SchedulerCtx& sched_ctx) + : impl(new Impl(env, conf, sched_ctx)) +{ +} + +RGWAsioFrontend::~RGWAsioFrontend() = default; + +int RGWAsioFrontend::init() +{ + return impl->init(); +} + +int RGWAsioFrontend::run() +{ + return impl->run(); +} + +void RGWAsioFrontend::stop() +{ + impl->stop(); +} + +void RGWAsioFrontend::join() +{ + impl->join(); +} + +void RGWAsioFrontend::pause_for_new_config() +{ + impl->pause(); +} + +void RGWAsioFrontend::unpause_with_new_config() +{ + impl->unpause(); +} diff --git a/src/rgw/rgw_asio_frontend.h b/src/rgw/rgw_asio_frontend.h new file mode 100644 index 000000000..2de6f337a --- /dev/null +++ b/src/rgw/rgw_asio_frontend.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_frontend.h" +#define REQUEST_TIMEOUT 65000 + +class RGWAsioFrontend : public RGWFrontend { + class Impl; + std::unique_ptr impl; +public: + RGWAsioFrontend(RGWProcessEnv& env, RGWFrontendConfig* conf, + rgw::dmclock::SchedulerCtx& sched_ctx); + ~RGWAsioFrontend() override; + + int init() override; + int run() override; + void stop() override; + void join() override; + + void pause_for_new_config() override; + void unpause_with_new_config() override; +}; diff --git a/src/rgw/rgw_asio_frontend_timer.h b/src/rgw/rgw_asio_frontend_timer.h new file mode 100644 index 000000000..bc58790d6 --- /dev/null +++ b/src/rgw/rgw_asio_frontend_timer.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +#include "common/ceph_time.h" + +namespace rgw { + +// a WaitHandler that closes a stream if the timeout expires +template +struct timeout_handler { + // this handler may outlive the timer/stream, so we need to hold a reference + // to keep the stream alive + boost::intrusive_ptr stream; + + explicit timeout_handler(boost::intrusive_ptr stream) noexcept + : stream(std::move(stream)) {} + + void operator()(boost::system::error_code ec) { + if (!ec) { // wait was not canceled + boost::system::error_code ec_ignored; + stream->get_socket().cancel(); + stream->get_socket().shutdown(boost::asio::ip::tcp::socket::shutdown_both, ec_ignored); + } + } +}; + +// a timeout timer for stream operations +template +class basic_timeout_timer { + public: + using clock_type = Clock; + using duration = typename clock_type::duration; + using executor_type = Executor; + + explicit basic_timeout_timer(const executor_type& ex, duration dur, + boost::intrusive_ptr stream) + : timer(ex), dur(dur), stream(std::move(stream)) + {} + + basic_timeout_timer(const basic_timeout_timer&) = delete; + basic_timeout_timer& operator=(const basic_timeout_timer&) = delete; + + void start() { + if 
(dur.count() > 0) { + timer.expires_after(dur); + timer.async_wait(timeout_handler{stream}); + } + } + + void cancel() { + if (dur.count() > 0) { + timer.cancel(); + } + } + + private: + using Timer = boost::asio::basic_waitable_timer, executor_type>; + Timer timer; + duration dur; + boost::intrusive_ptr stream; +}; + +} // namespace rgw diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc new file mode 100644 index 000000000..2c61b8361 --- /dev/null +++ b/src/rgw/rgw_auth.cc @@ -0,0 +1,934 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "rgw_common.h" +#include "rgw_auth.h" +#include "rgw_quota.h" +#include "rgw_user.h" +#include "rgw_http_client.h" +#include "rgw_keystone.h" +#include "rgw_sal.h" +#include "rgw_log.h" + +#include "include/str_list.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw { +namespace auth { + +std::unique_ptr +transform_old_authinfo(CephContext* const cct, + const rgw_user& auth_id, + const int perm_mask, + const bool is_admin, + const uint32_t type) +{ + /* This class is not intended for public use. Should be removed altogether + * with this function after moving all our APIs to the new authentication + * infrastructure. */ + class DummyIdentityApplier : public rgw::auth::Identity { + CephContext* const cct; + + /* For this particular case it's OK to use rgw_user structure to convey + * the identity info as this was the policy for doing that before the + * new auth. 
*/ + const rgw_user id; + const int perm_mask; + const bool is_admin; + const uint32_t type; + public: + DummyIdentityApplier(CephContext* const cct, + const rgw_user& auth_id, + const int perm_mask, + const bool is_admin, + const uint32_t type) + : cct(cct), + id(auth_id), + perm_mask(perm_mask), + is_admin(is_admin), + type(type) { + } + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return rgw_perms_from_aclspec_default_strategy(id, aclspec, dpp); + } + + bool is_admin_of(const rgw_user& acct_id) const override { + return is_admin; + } + + bool is_owner_of(const rgw_user& acct_id) const override { + return id == acct_id; + } + + bool is_identity(const idset_t& ids) const override { + for (auto& p : ids) { + if (p.is_wildcard()) { + return true; + } else if (p.is_tenant() && p.get_tenant() == id.tenant) { + return true; + } else if (p.is_user() && + (p.get_tenant() == id.tenant) && + (p.get_id() == id.id)) { + return true; + } + } + return false; + } + + uint32_t get_perm_mask() const override { + return perm_mask; + } + + uint32_t get_identity_type() const override { + return type; + } + + string get_acct_name() const override { + return {}; + } + + string get_subuser() const override { + return {}; + } + + void to_str(std::ostream& out) const override { + out << "RGWDummyIdentityApplier(auth_id=" << id + << ", perm_mask=" << perm_mask + << ", is_admin=" << is_admin << ")"; + } + }; + + return std::unique_ptr( + new DummyIdentityApplier(cct, + auth_id, + perm_mask, + is_admin, + type)); +} + +std::unique_ptr +transform_old_authinfo(const req_state* const s) +{ + return transform_old_authinfo(s->cct, + s->user->get_id(), + s->perm_mask, + /* System user has admin permissions by default - it's supposed to pass + * through any security check. 
*/ + s->system_request, + s->user->get_type()); +} + +} /* namespace auth */ +} /* namespace rgw */ + + +uint32_t rgw_perms_from_aclspec_default_strategy( + const rgw_user& uid, + const rgw::auth::Identity::aclspec_t& aclspec, + const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 5) << "Searching permissions for uid=" << uid << dendl; + + const auto iter = aclspec.find(uid.to_str()); + if (std::end(aclspec) != iter) { + ldpp_dout(dpp, 5) << "Found permission: " << iter->second << dendl; + return iter->second; + } + + ldpp_dout(dpp, 5) << "Permissions for user not found" << dendl; + return 0; +} + + +static inline const std::string make_spec_item(const std::string& tenant, + const std::string& id) +{ + return tenant + ":" + id; +} + + +static inline std::pair +strategy_handle_rejected(rgw::auth::Engine::result_t&& engine_result, + const rgw::auth::Strategy::Control policy, + rgw::auth::Engine::result_t&& strategy_result) +{ + using Control = rgw::auth::Strategy::Control; + switch (policy) { + case Control::REQUISITE: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::SUFFICIENT: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::FALLBACK: + /* Don't try next. */ + return std::make_pair(false, std::move(strategy_result)); + + default: + /* Huh, memory corruption? */ + ceph_abort(); + } +} + +static inline std::pair +strategy_handle_denied(rgw::auth::Engine::result_t&& engine_result, + const rgw::auth::Strategy::Control policy, + rgw::auth::Engine::result_t&& strategy_result) +{ + using Control = rgw::auth::Strategy::Control; + switch (policy) { + case Control::REQUISITE: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::SUFFICIENT: + /* Just try next. 
*/ + return std::make_pair(true, std::move(engine_result)); + + case Control::FALLBACK: + return std::make_pair(true, std::move(strategy_result)); + + default: + /* Huh, memory corruption? */ + ceph_abort(); + } +} + +static inline std::pair +strategy_handle_granted(rgw::auth::Engine::result_t&& engine_result, + const rgw::auth::Strategy::Control policy, + rgw::auth::Engine::result_t&& strategy_result) +{ + using Control = rgw::auth::Strategy::Control; + switch (policy) { + case Control::REQUISITE: + /* Try next. */ + return std::make_pair(true, std::move(engine_result)); + + case Control::SUFFICIENT: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::FALLBACK: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + default: + /* Huh, memory corruption? */ + ceph_abort(); + } +} + +rgw::auth::Engine::result_t +rgw::auth::Strategy::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const +{ + result_t strategy_result = result_t::deny(); + + for (const stack_item_t& kv : auth_stack) { + const rgw::auth::Engine& engine = kv.first; + const auto& policy = kv.second; + + ldpp_dout(dpp, 20) << get_name() << ": trying " << engine.get_name() << dendl; + + result_t engine_result = result_t::deny(); + try { + engine_result = engine.authenticate(dpp, s, y); + } catch (const int err) { + engine_result = result_t::deny(err); + } + + bool try_next = true; + switch (engine_result.get_status()) { + case result_t::Status::REJECTED: { + ldpp_dout(dpp, 20) << engine.get_name() << " rejected with reason=" + << engine_result.get_reason() << dendl; + + std::tie(try_next, strategy_result) = \ + strategy_handle_rejected(std::move(engine_result), policy, + std::move(strategy_result)); + break; + } + case result_t::Status::DENIED: { + ldpp_dout(dpp, 20) << engine.get_name() << " denied with reason=" + << engine_result.get_reason() << dendl; + + std::tie(try_next, 
strategy_result) = \ + strategy_handle_denied(std::move(engine_result), policy, + std::move(strategy_result)); + break; + } + case result_t::Status::GRANTED: { + ldpp_dout(dpp, 20) << engine.get_name() << " granted access" << dendl; + + std::tie(try_next, strategy_result) = \ + strategy_handle_granted(std::move(engine_result), policy, + std::move(strategy_result)); + break; + } + default: { + ceph_abort(); + } + } + + if (! try_next) { + break; + } + } + + return strategy_result; +} + +int +rgw::auth::Strategy::apply(const DoutPrefixProvider *dpp, const rgw::auth::Strategy& auth_strategy, + req_state* const s, optional_yield y) noexcept +{ + try { + auto result = auth_strategy.authenticate(dpp, s, y); + if (result.get_status() != decltype(result)::Status::GRANTED) { + /* Access denied is acknowledged by returning a std::unique_ptr with + * nullptr inside. */ + ldpp_dout(dpp, 5) << "Failed the auth strategy, reason=" + << result.get_reason() << dendl; + return result.get_reason(); + } + + try { + rgw::auth::IdentityApplier::aplptr_t applier = result.get_applier(); + rgw::auth::Completer::cmplptr_t completer = result.get_completer(); + + /* Account used by a given RGWOp is decoupled from identity employed + * in the authorization phase (RGWOp::verify_permissions). */ + applier->load_acct_info(dpp, s->user->get_info()); + s->perm_mask = applier->get_perm_mask(); + + /* This is the single place where we pass req_state as a pointer + * to non-const and thus its modification is allowed. In the time + * of writing only RGWTempURLEngine needed that feature. 
*/ + applier->modify_request_state(dpp, s); + if (completer) { + completer->modify_request_state(dpp, s); + } + + s->auth.identity = std::move(applier); + s->auth.completer = std::move(completer); + + return 0; + } catch (const int err) { + ldpp_dout(dpp, 5) << "applier throwed err=" << err << dendl; + return err; + } catch (const std::exception& e) { + ldpp_dout(dpp, 5) << "applier throwed unexpected err: " << e.what() + << dendl; + return -EPERM; + } + } catch (const int err) { + ldpp_dout(dpp, 5) << "auth engine throwed err=" << err << dendl; + return err; + } catch (const std::exception& e) { + ldpp_dout(dpp, 5) << "auth engine throwed unexpected err: " << e.what() + << dendl; + } + + /* We never should be here. */ + return -EPERM; +} + +void +rgw::auth::Strategy::add_engine(const Control ctrl_flag, + const Engine& engine) noexcept +{ + auth_stack.push_back(std::make_pair(std::cref(engine), ctrl_flag)); +} + +void rgw::auth::WebIdentityApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::WebIdentityApplier(sub =" << sub + << ", user_name=" << user_name + << ", provider_id =" << iss << ")"; +} + +string rgw::auth::WebIdentityApplier::get_idp_url() const +{ + string idp_url = this->iss; + idp_url = url_remove_prefix(idp_url); + return idp_url; +} + +void rgw::auth::WebIdentityApplier::create_account(const DoutPrefixProvider* dpp, + const rgw_user& acct_user, + const string& display_name, + RGWUserInfo& user_info) const /* out */ +{ + std::unique_ptr user = driver->get_user(acct_user); + user->get_info().display_name = display_name; + user->get_info().type = TYPE_WEB; + user->get_info().max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + rgw_apply_default_bucket_quota(user->get_info().quota.bucket_quota, cct->_conf); + rgw_apply_default_user_quota(user->get_info().quota.user_quota, cct->_conf); + + int ret = user->store_user(dpp, null_yield, true); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to store new user info: user=" + << user 
<< " ret=" << ret << dendl; + throw ret; + } + user_info = user->get_info(); +} + +void rgw::auth::WebIdentityApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const { + rgw_user federated_user; + federated_user.id = this->sub; + federated_user.tenant = role_tenant; + federated_user.ns = "oidc"; + + std::unique_ptr user = driver->get_user(federated_user); + + //Check in oidc namespace + if (user->load_user(dpp, null_yield) >= 0) { + /* Succeeded. */ + user_info = user->get_info(); + return; + } + + user->clear_ns(); + //Check for old users which wouldn't have been created in oidc namespace + if (user->load_user(dpp, null_yield) >= 0) { + /* Succeeded. */ + user_info = user->get_info(); + return; + } + + //Check if user_id.buckets already exists, may have been from the time, when shadow users didnt exist + RGWStorageStats stats; + int ret = user->read_stats(dpp, null_yield, &stats); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: reading stats for the user returned error " << ret << dendl; + return; + } + if (ret == -ENOENT) { /* in case of ENOENT, which means user doesnt have buckets */ + //In this case user will be created in oidc namespace + ldpp_dout(dpp, 5) << "NOTICE: incoming user has no buckets " << federated_user << dendl; + federated_user.ns = "oidc"; + } else { + //User already has buckets associated, hence wont be created in oidc namespace. 
+ ldpp_dout(dpp, 5) << "NOTICE: incoming user already has buckets associated " << federated_user << ", won't be created in oidc namespace"<< dendl; + federated_user.ns = ""; + } + + ldpp_dout(dpp, 0) << "NOTICE: couldn't map oidc federated user " << federated_user << dendl; + create_account(dpp, federated_user, this->user_name, user_info); +} + +void rgw::auth::WebIdentityApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const +{ + s->info.args.append("sub", this->sub); + s->info.args.append("aud", this->aud); + s->info.args.append("provider_id", this->iss); + s->info.args.append("client_id", this->client_id); + + string condition; + string idp_url = get_idp_url(); + for (auto& claim : token_claims) { + if (claim.first == "aud") { + condition.clear(); + condition = idp_url + ":app_id"; + s->env.emplace(condition, claim.second); + } + condition.clear(); + condition = idp_url + ":" + claim.first; + s->env.emplace(condition, claim.second); + } + + if (principal_tags) { + constexpr size_t KEY_SIZE = 128, VAL_SIZE = 256; + std::set> p_tags = principal_tags.get(); + for (auto& it : p_tags) { + string key = it.first; + string val = it.second; + if (key.find("aws:") == 0 || val.find("aws:") == 0) { + ldpp_dout(dpp, 0) << "ERROR: Tag/Value can't start with aws:, hence skipping it" << dendl; + continue; + } + if (key.size() > KEY_SIZE || val.size() > VAL_SIZE) { + ldpp_dout(dpp, 0) << "ERROR: Invalid tag/value size, hence skipping it" << dendl; + continue; + } + std::string p_key = "aws:PrincipalTag/"; + p_key.append(key); + s->principal_tags.emplace_back(std::make_pair(p_key, val)); + ldpp_dout(dpp, 10) << "Principal Tag Key: " << p_key << " Value: " << val << dendl; + + std::string e_key = "aws:RequestTag/"; + e_key.append(key); + s->env.emplace(e_key, val); + ldpp_dout(dpp, 10) << "RGW Env Tag Key: " << e_key << " Value: " << val << dendl; + + s->env.emplace("aws:TagKeys", key); + ldpp_dout(dpp, 10) << "aws:TagKeys: " << key << dendl; + + if 
(s->principal_tags.size() == 50) { + ldpp_dout(dpp, 0) << "ERROR: Number of tag/value pairs exceeding 50, hence skipping the rest" << dendl; + break; + } + } + } + + if (role_tags) { + for (auto& it : role_tags.get()) { + std::string p_key = "aws:PrincipalTag/"; + p_key.append(it.first); + s->principal_tags.emplace_back(std::make_pair(p_key, it.second)); + ldpp_dout(dpp, 10) << "Principal Tag Key: " << p_key << " Value: " << it.second << dendl; + + std::string e_key = "iam:ResourceTag/"; + e_key.append(it.first); + s->env.emplace(e_key, it.second); + ldpp_dout(dpp, 10) << "RGW Env Tag Key: " << e_key << " Value: " << it.second << dendl; + } + } +} + +bool rgw::auth::WebIdentityApplier::is_identity(const idset_t& ids) const +{ + if (ids.size() > 1) { + return false; + } + + for (auto id : ids) { + string idp_url = get_idp_url(); + if (id.is_oidc_provider() && id.get_idp_url() == idp_url) { + return true; + } + } + return false; +} + +const std::string rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER; +const std::string rgw::auth::RemoteApplier::AuthInfo::NO_ACCESS_KEY; + +/* rgw::auth::RemoteAuthApplier */ +uint32_t rgw::auth::RemoteApplier::get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const +{ + uint32_t perm = 0; + + /* For backward compatibility with ACLOwner. */ + perm |= rgw_perms_from_aclspec_default_strategy(info.acct_user, + aclspec, dpp); + + /* We also need to cover cases where rgw_keystone_implicit_tenants + * was enabled. */ + if (info.acct_user.tenant.empty()) { + const rgw_user tenanted_acct_user(info.acct_user.id, info.acct_user.id); + + perm |= rgw_perms_from_aclspec_default_strategy(tenanted_acct_user, + aclspec, dpp); + } + + /* Now it's a time for invoking additional strategy that was supplied by + * a specific auth engine. 
*/ + if (extra_acl_strategy) { + perm |= extra_acl_strategy(aclspec); + } + + ldpp_dout(dpp, 20) << "from ACL got perm=" << perm << dendl; + return perm; +} + +bool rgw::auth::RemoteApplier::is_admin_of(const rgw_user& uid) const +{ + return info.is_admin; +} + +bool rgw::auth::RemoteApplier::is_owner_of(const rgw_user& uid) const +{ + if (info.acct_user.tenant.empty()) { + const rgw_user tenanted_acct_user(info.acct_user.id, info.acct_user.id); + + if (tenanted_acct_user == uid) { + return true; + } + } + + return info.acct_user == uid; +} + +bool rgw::auth::RemoteApplier::is_identity(const idset_t& ids) const { + for (auto& id : ids) { + if (id.is_wildcard()) { + return true; + + // We also need to cover cases where rgw_keystone_implicit_tenants + // was enabled. */ + } else if (id.is_tenant() && + (info.acct_user.tenant.empty() ? + info.acct_user.id : + info.acct_user.tenant) == id.get_tenant()) { + return true; + } else if (id.is_user() && + info.acct_user.id == id.get_id() && + (info.acct_user.tenant.empty() ? 
+ info.acct_user.id : + info.acct_user.tenant) == id.get_tenant()) { + return true; + } + } + return false; +} + +void rgw::auth::RemoteApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::RemoteApplier(acct_user=" << info.acct_user + << ", acct_name=" << info.acct_name + << ", perm_mask=" << info.perm_mask + << ", is_admin=" << info.is_admin << ")"; +} + +void rgw::auth::ImplicitTenants::recompute_value(const ConfigProxy& c) +{ + std::string s = c.get_val("rgw_keystone_implicit_tenants"); + int v = 0; + if (boost::iequals(s, "both") + || boost::iequals(s, "true") + || boost::iequals(s, "1")) { + v = IMPLICIT_TENANTS_S3|IMPLICIT_TENANTS_SWIFT; + } else if (boost::iequals(s, "0") + || boost::iequals(s, "none") + || boost::iequals(s, "false")) { + v = 0; + } else if (boost::iequals(s, "s3")) { + v = IMPLICIT_TENANTS_S3; + } else if (boost::iequals(s, "swift")) { + v = IMPLICIT_TENANTS_SWIFT; + } else { /* "" (and anything else) */ + v = IMPLICIT_TENANTS_BAD; + // assert(0); + } + saved = v; +} + +const char **rgw::auth::ImplicitTenants::get_tracked_conf_keys() const +{ + static const char *keys[] = { + "rgw_keystone_implicit_tenants", + nullptr }; + return keys; +} + +void rgw::auth::ImplicitTenants::handle_conf_change(const ConfigProxy& c, + const std::set &changed) +{ + if (changed.count("rgw_keystone_implicit_tenants")) { + recompute_value(c); + } +} + +void rgw::auth::RemoteApplier::create_account(const DoutPrefixProvider* dpp, + const rgw_user& acct_user, + bool implicit_tenant, + RGWUserInfo& user_info) const /* out */ +{ + rgw_user new_acct_user = acct_user; + + /* An upper layer may enforce creating new accounts within their own + * tenants. 
*/ + if (new_acct_user.tenant.empty() && implicit_tenant) { + new_acct_user.tenant = new_acct_user.id; + } + + std::unique_ptr user = driver->get_user(new_acct_user); + user->get_info().display_name = info.acct_name; + if (info.acct_type) { + //ldap/keystone for s3 users + user->get_info().type = info.acct_type; + } + user->get_info().max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + rgw_apply_default_bucket_quota(user->get_info().quota.bucket_quota, cct->_conf); + rgw_apply_default_user_quota(user->get_info().quota.user_quota, cct->_conf); + user_info = user->get_info(); + + int ret = user->store_user(dpp, null_yield, true); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to store new user info: user=" + << user << " ret=" << ret << dendl; + throw ret; + } +} + +void rgw::auth::RemoteApplier::write_ops_log_entry(rgw_log_entry& entry) const +{ + entry.access_key_id = info.access_key_id; + entry.subuser = info.subuser; +} + +/* TODO(rzarzynski): we need to handle display_name changes. */ +void rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +{ + /* It's supposed that RGWRemoteAuthApplier tries to load account info + * that belongs to the authenticated identity. Another policy may be + * applied by using a RGWThirdPartyAccountAuthApplier decorator. */ + const rgw_user& acct_user = info.acct_user; + auto implicit_value = implicit_tenant_context.get_value(); + bool implicit_tenant = implicit_value.implicit_tenants_for_(implicit_tenant_bit); + bool split_mode = implicit_value.is_split_mode(); + std::unique_ptr user; + + /* Normally, empty "tenant" field of acct_user means the authenticated + * identity has the legacy, global tenant. However, due to inclusion + * of multi-tenancy, we got some special compatibility kludge for remote + * backends like Keystone. + * If the global tenant is the requested one, we try the same tenant as + * the user name first. 
If that RGWUserInfo exists, we use it. This way, + * migrated OpenStack users can get their namespaced containers and nobody's + * the wiser. + * If that fails, we look up in the requested (possibly empty) tenant. + * If that fails too, we create the account within the global or separated + * namespace depending on rgw_keystone_implicit_tenants. + * For compatibility with previous versions of ceph, it is possible + * to enable implicit_tenants for only s3 or only swift. + * in this mode ("split_mode"), we must constrain the id lookups to + * only use the identifier space that would be used if the id were + * to be created. */ + + if (split_mode && !implicit_tenant) + ; /* suppress lookup for id used by "other" protocol */ + else if (acct_user.tenant.empty()) { + const rgw_user tenanted_uid(acct_user.id, acct_user.id); + user = driver->get_user(tenanted_uid); + + if (user->load_user(dpp, null_yield) >= 0) { + /* Succeeded. */ + user_info = user->get_info(); + return; + } + } + + user = driver->get_user(acct_user); + + if (split_mode && implicit_tenant) + ; /* suppress lookup for id used by "other" protocol */ + else if (user->load_user(dpp, null_yield) >= 0) { + /* Succeeded. */ + user_info = user->get_info(); + return; + } + + ldpp_dout(dpp, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl; + create_account(dpp, acct_user, implicit_tenant, user_info); + + /* Succeeded if we are here (create_account() hasn't throwed). 
*/ +} + +/* rgw::auth::LocalApplier */ +/* static declaration */ +const std::string rgw::auth::LocalApplier::NO_SUBUSER; +const std::string rgw::auth::LocalApplier::NO_ACCESS_KEY; + +uint32_t rgw::auth::LocalApplier::get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const +{ + return rgw_perms_from_aclspec_default_strategy(user_info.user_id, aclspec, dpp); +} + +bool rgw::auth::LocalApplier::is_admin_of(const rgw_user& uid) const +{ + return user_info.admin || user_info.system; +} + +bool rgw::auth::LocalApplier::is_owner_of(const rgw_user& uid) const +{ + return uid == user_info.user_id; +} + +bool rgw::auth::LocalApplier::is_identity(const idset_t& ids) const { + for (auto& id : ids) { + if (id.is_wildcard()) { + return true; + } else if (id.is_tenant() && + id.get_tenant() == user_info.user_id.tenant) { + return true; + } else if (id.is_user() && + (id.get_tenant() == user_info.user_id.tenant)) { + if (id.get_id() == user_info.user_id.id) { + return true; + } + std::string wildcard_subuser = user_info.user_id.id; + wildcard_subuser.append(":*"); + if (wildcard_subuser == id.get_id()) { + return true; + } else if (subuser != NO_SUBUSER) { + std::string user = user_info.user_id.id; + user.append(":"); + user.append(subuser); + if (user == id.get_id()) { + return true; + } + } + } + } + return false; +} + +void rgw::auth::LocalApplier::to_str(std::ostream& out) const { + out << "rgw::auth::LocalApplier(acct_user=" << user_info.user_id + << ", acct_name=" << user_info.display_name + << ", subuser=" << subuser + << ", perm_mask=" << get_perm_mask() + << ", is_admin=" << static_cast(user_info.admin) << ")"; +} + +uint32_t rgw::auth::LocalApplier::get_perm_mask(const std::string& subuser_name, + const RGWUserInfo &uinfo) const +{ + if (! 
subuser_name.empty() && subuser_name != NO_SUBUSER) { + const auto iter = uinfo.subusers.find(subuser_name); + + if (iter != std::end(uinfo.subusers)) { + return iter->second.perm_mask; + } else { + /* Subuser specified but not found. */ + return RGW_PERM_NONE; + } + } else { + /* Due to backward compatibility. */ + return RGW_PERM_FULL_CONTROL; + } +} + +void rgw::auth::LocalApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +{ + /* Load the account that belongs to the authenticated identity. An extra call + * to RADOS may be safely skipped in this case. */ + user_info = this->user_info; +} + +void rgw::auth::LocalApplier::write_ops_log_entry(rgw_log_entry& entry) const +{ + entry.access_key_id = access_key_id; + entry.subuser = subuser; +} + +void rgw::auth::RoleApplier::to_str(std::ostream& out) const { + out << "rgw::auth::RoleApplier(role name =" << role.name; + for (auto& policy: role.role_policies) { + out << ", role policy =" << policy; + } + out << ", token policy =" << token_attrs.token_policy; + out << ")"; +} + +bool rgw::auth::RoleApplier::is_identity(const idset_t& ids) const { + for (auto& p : ids) { + if (p.is_wildcard()) { + return true; + } else if (p.is_role()) { + string name = p.get_id(); + string tenant = p.get_tenant(); + if (name == role.name && tenant == role.tenant) { + return true; + } + } else if (p.is_assumed_role()) { + string tenant = p.get_tenant(); + string role_session = role.name + "/" + token_attrs.role_session_name; //role/role-session + if (role.tenant == tenant && role_session == p.get_role_session()) { + return true; + } + } else { + string id = p.get_id(); + string tenant = p.get_tenant(); + string oidc_id; + if (token_attrs.user_id.ns.empty()) { + oidc_id = token_attrs.user_id.id; + } else { + oidc_id = token_attrs.user_id.ns + "$" + token_attrs.user_id.id; + } + if (oidc_id == id && token_attrs.user_id.tenant == tenant) { + return true; + } + } + } + return false; +} + +void 
rgw::auth::RoleApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +{ + /* Load the user id */ + user_info.user_id = this->token_attrs.user_id; +} + +void rgw::auth::RoleApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const /* in/out: adds policies, env entries and token claims to the request state */ +{ + for (auto it: role.role_policies) { /* by-value copy of each policy string */ + try { + bufferlist bl = bufferlist::static_from_string(it); + rgw::IAM::Policy p(s->cct, role.tenant, bl, false); /* non-const on purpose: std::move() of a const object selects the copy overload of push_back() */ + s->iam_user_policies.push_back(std::move(p)); + } catch (rgw::IAM::PolicyParseException& e) { + //Control shouldn't reach here as the policy has already been + //verified earlier + ldpp_dout(dpp, 20) << "failed to parse role policy: " << e.what() << dendl; + } + } + + if (!this->token_attrs.token_policy.empty()) { + try { + string policy = this->token_attrs.token_policy; + bufferlist bl = bufferlist::static_from_string(policy); + rgw::IAM::Policy p(s->cct, role.tenant, bl, false); /* non-const so that push_back() can really move it */ + s->session_policies.push_back(std::move(p)); + } catch (rgw::IAM::PolicyParseException& e) { + //Control shouldn't reach here as the policy has already been + //verified earlier + ldpp_dout(dpp, 20) << "failed to parse token policy: " << e.what() << dendl; + } + } + + string condition = "aws:userid"; + string value = role.id + ":" + token_attrs.role_session_name; + s->env.emplace(condition, value); + + s->env.emplace("aws:TokenIssueTime", token_attrs.token_issued_at); + + for (auto& m : token_attrs.principal_tags) { + s->env.emplace(m.first, m.second); + ldpp_dout(dpp, 10) << "Principal Tag Key: " << m.first << " Value: " << m.second << dendl; + std::size_t pos = m.first.find('/'); + string key = m.first.substr(pos + 1); /* without a '/', find() returns npos and the +1 wraps to 0: the whole key is kept */ + s->env.emplace("aws:TagKeys", key); + ldpp_dout(dpp, 10) << "aws:TagKeys: " << key << dendl; + } + + s->token_claims.emplace_back("sts"); + s->token_claims.emplace_back("role_name:" + role.tenant + "$" + role.name); + s->token_claims.emplace_back("role_session:" + token_attrs.role_session_name); + for (auto& it
: token_attrs.token_claims) { + s->token_claims.emplace_back(it); + } +} + +rgw::auth::Engine::result_t +rgw::auth::AnonymousEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const +{ + if (! is_applicable(s)) { + return result_t::deny(-EPERM); + } else { + RGWUserInfo user_info; + rgw_get_anon_user(user_info); + + auto apl = \ + apl_factory->create_apl_local(cct, s, user_info, + rgw::auth::LocalApplier::NO_SUBUSER, + std::nullopt, rgw::auth::LocalApplier::NO_ACCESS_KEY); + return result_t::grant(std::move(apl)); + } +} diff --git a/src/rgw/rgw_auth.h b/src/rgw/rgw_auth.h new file mode 100644 index 000000000..82e0d0c97 --- /dev/null +++ b/src/rgw/rgw_auth.h @@ -0,0 +1,791 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rgw_common.h" +#include "rgw_web_idp.h" + +#define RGW_USER_ANON_ID "anonymous" + +class RGWCtl; +struct rgw_log_entry; +struct req_state; + +namespace rgw { +namespace auth { + +using Exception = std::system_error; + + +/* Load information about identity that will be used by RGWOp to authorize + * any operation that comes from an authenticated user. */ +class Identity { +public: + typedef std::map aclspec_t; + using idset_t = boost::container::flat_set; + + virtual ~Identity() = default; + + /* Translate the ACL provided in @aclspec into concrete permission set that + * can be used during the authorization phase (RGWOp::verify_permission). + * On error throws rgw::auth::Exception storing the reason. + * + * NOTE: an implementation is responsible for giving the real semantic to + * the items in @aclspec. That is, their meaning may depend on particular + * applier that is being used.
*/ + virtual uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const = 0; + + /* Verify whether a given identity *can be treated as* an admin of rgw_user + * (account in Swift's terminology) specified in @uid. On error throws + * rgw::auth::Exception storing the reason. */ + virtual bool is_admin_of(const rgw_user& uid) const = 0; + + /* Verify whether a given identity *is* the owner of the rgw_user (account + * in Swift's terminology) specified in @uid. On internal error throws + * rgw::auth::Exception storing the reason. */ + virtual bool is_owner_of(const rgw_user& uid) const = 0; + + /* Return the permission mask that is used to narrow down the set of + * operations allowed for a given identity. This method reflects the idea + * of subuser tied to RGWUserInfo. On error throws rgw::auth::Exception + * with the reason. */ + virtual uint32_t get_perm_mask() const = 0; + + virtual bool is_anonymous() const { + /* If the identity owns the anonymous account (rgw_user), it's considered + * the anonymous identity. On error throws rgw::auth::Exception storing + * the reason.
*/ + return is_owner_of(rgw_user(RGW_USER_ANON_ID)); + } + + virtual void to_str(std::ostream& out) const = 0; + + /* Verify whether a given identity corresponds to an identity in the + provided set */ + virtual bool is_identity(const idset_t& ids) const = 0; + + /* Identity Type: RGW/ LDAP/ Keystone */ + virtual uint32_t get_identity_type() const = 0; + + /* Name of Account */ + virtual std::string get_acct_name() const = 0; + + /* Subuser of Account */ + virtual std::string get_subuser() const = 0; + + virtual std::string get_role_tenant() const { return ""; } /* empty unless an implementation overrides it */ + + /* write any auth-specific fields that are safe to expose in the ops log */ + virtual void write_ops_log_entry(rgw_log_entry& entry) const {}; +}; + +inline std::ostream& operator<<(std::ostream& out, + const rgw::auth::Identity& id) { + id.to_str(out); + return out; +} + + +std::unique_ptr +transform_old_authinfo(CephContext* const cct, + const rgw_user& auth_id, + const int perm_mask, + const bool is_admin, + const uint32_t type); +std::unique_ptr transform_old_authinfo(const req_state* const s); + + +/* Interface for classes applying changes to request state/RADOS store + * imposed by a particular rgw::auth::Engine. + * + * In contrast to rgw::auth::Engine, implementations of this interface + * are allowed to handle req_state or RGWUserCtl in a read-write manner. + * + * It's expected that most (if not all) implementations will also + * conform to the rgw::auth::Identity interface to provide authorization + * policy (ACLs, account's ownership and entitlement). */ +class IdentityApplier : public Identity { +public: + typedef std::unique_ptr aplptr_t; + + virtual ~IdentityApplier() {}; + + /* Fill provided RGWUserInfo with information about the account that + * RGWOp will operate on. Errors are handled solely through exceptions. + * + * XXX: be aware that the "account" term refers to rgw_user. The naming + * is legacy.
*/ + virtual void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const = 0; /* out */ + + /* Apply any changes to request state. This method will be most useful for + * TempURL of Swift API. */ + virtual void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const {} /* in/out */ +}; + + +/* Interface class for completing the two-step authentication process. + * Completer provides the second step - the complete() method that should + * be called after Engine::authenticate() but before *committing* results + * of an RGWOp (or sending a response in the case of non-mutating ops). + * + * The motivation driving the interface is to address those authentication + * schemes that require message integrity verification *without* in-memory + * data buffering. Typical examples are AWS Auth v4 and the auth mechanism + * of browser upload facilities both in S3 and Swift APIs (see RGWPostObj). + * The workflow of a request from the authentication point of view looks + * like the following one: + * A. authenticate (Engine::authenticate), + * B. authorize (see RGWOp::verify_permissions), + * C. execute-prepare (init potential data modifications), + * D. authenticate-complete - (Completer::complete), + * E. execute-commit - commit the modifications from point C. */ +class Completer { +public: + /* It's expected that Completers would tend to implement many interfaces + * and be used not only in req_state::auth::completer. Ref counting their + * instances would be helpful. */ + typedef std::shared_ptr cmplptr_t; + + virtual ~Completer() = default; + + /* Complete the authentication process. Return boolean indicating whether + * the completion succeeded. On error throws rgw::auth::Exception storing + * the reason. */ + virtual bool complete() = 0; + + /* Apply any changes to request state. The initial use case was injecting + * the AWSv4 filter over rgw::io::RestfulClient in req_state.
*/ + virtual void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) = 0; /* in/out */ +}; + + +/* Interface class for authentication backends (auth engines) in RadosGW. + * + * An engine is supposed only to authenticate (not authorize!) requests + * basing on their req_state and - if access has been granted - provide + * an upper layer with: + * - rgw::auth::IdentityApplier to commit all changes to the request state as + * well as to the RADOS store (creating an account, synchronizing + * user-related information with external databases and so on). + * - rgw::auth::Completer (optionally) to finish the authentication + * of the request. Typical use case is verifying message integrity + * in AWS Auth v4 and browser uploads (RGWPostObj). + * + * Both of them are supposed to be wrapped in Engine::AuthResult. + * + * The authentication process consists of two steps: + * - Engine::authenticate() which should be called before *initiating* + * any modifications to RADOS store that are related to an operation + * a client wants to perform (RGWOp::execute). + * - Completer::complete() supposed to be called, if completer has been + * returned, after the authenticate() step but before *committing* + * those modifications or sending a response (RGWOp::complete). + * + * An engine outlives both Applier and Completer. It's intended to live + * since RadosGW's initialization and handle multiple requests till + * a reconfiguration. + * + * Auth engine MUST NOT make any changes to req_state nor RADOS store. + * This is solely an Applier's responsibility! + * + * Separation between authentication and global state modification has + * been introduced because many auth engines are orthogonal to appliers + * and thus they can be decoupled. Additional motivation is to clearly + * distinguish all portions of code modifying data structures.
*/ +class Engine { +public: + virtual ~Engine() = default; + + class AuthResult { + struct rejection_mark_t {}; + bool is_rejected = false; + int reason = 0; + + std::pair result_pair; + + explicit AuthResult(const int reason) + : reason(reason) { /* neither rejected nor carrying an applier: get_status() reports DENIED */ + } + + AuthResult(rejection_mark_t&&, const int reason) + : is_rejected(true), + reason(reason) { + } + + /* Allow only the reasonable combinations - returning just Completer + * without accompanying IdentityApplier is strictly prohibited! */ + explicit AuthResult(IdentityApplier::aplptr_t&& applier) + : result_pair(std::move(applier), nullptr) { + } + + AuthResult(IdentityApplier::aplptr_t&& applier, + Completer::cmplptr_t&& completer) + : result_pair(std::move(applier), std::move(completer)) { + } + + public: + enum class Status { + /* Engine doesn't grant the access but also doesn't reject it. */ + DENIED, + + /* Engine successfully authenticated the requester. */ + GRANTED, + + /* Engine strictly indicates that a request should be rejected + * without trying any further engine. */ + REJECTED + }; + + Status get_status() const { + if (is_rejected) { + return Status::REJECTED; + } else if (!
result_pair.first) { + return Status::DENIED; + } else { + return Status::GRANTED; + } + } + + int get_reason() const { + return reason; + } + + IdentityApplier::aplptr_t get_applier() { /* moves the stored applier out of result_pair */ + return std::move(result_pair.first); + } + + Completer::cmplptr_t&& get_completer() { /* returns an rvalue reference to the stored completer; the caller performs the actual move */ + return std::move(result_pair.second); + } + + static AuthResult reject(const int reason = -EACCES) { + return AuthResult(rejection_mark_t(), reason); + } + + static AuthResult deny(const int reason = -EACCES) { + return AuthResult(reason); + } + + static AuthResult grant(IdentityApplier::aplptr_t&& applier) { + return AuthResult(std::move(applier)); + } + + static AuthResult grant(IdentityApplier::aplptr_t&& applier, + Completer::cmplptr_t&& completer) { + return AuthResult(std::move(applier), std::move(completer)); + } + }; + + using result_t = AuthResult; + + /* Get name of the auth engine. */ + virtual const char* get_name() const noexcept = 0; + + /* Throwing method for identity verification. When the check is positive + * an implementation should return Engine::result_t containing: + * - a non-null pointer to an object conforming to the Applier interface. + * Otherwise, the authentication is treated as failed. + * - a (potentially null) pointer to an object conforming to the Completer + * interface. + * + * On error throws rgw::auth::Exception containing the reason. */ + virtual result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s, optional_yield y) const = 0; +}; + + +/* Interface for extracting a token based on data carried by req_state. */ +class TokenExtractor { +public: + virtual ~TokenExtractor() = default; + virtual std::string get_token(const req_state* s) const = 0; +}; + + +/* Abstract class for stacking sub-engines to expose them as a single + * Engine. It is responsible for ordering its sub-engines and managing + * fall-backs between them.
Derivatee is supposed to encapsulate engine + * instances and add them using the add_engine() method in the order it + * wants to be tried during the call to authenticate(). + * + * Each new Strategy should be exposed to StrategyRegistry for handling + * the dynamic reconfiguration. */ +class Strategy : public Engine { +public: + /* Specifiers controlling what happens when an associated engine fails. + * The names and semantics have been borrowed mostly from libpam. */ + enum class Control { + /* Failure of an engine injected with the REQUISITE specifier aborts + * the strategy's authentication process immediately. No other engine + * will be tried. */ + REQUISITE, + + /* Success of an engine injected with the SUFFICIENT specifier ends + * the strategy's authentication process successfully. However, denying + * doesn't abort it -- there will be a fall-back to the following engine + * if the one that failed wasn't the last one. */ + SUFFICIENT, + + /* Like SUFFICIENT with the exception that on failure the reason code + * is not overridden. Instead, it's taken directly from the last tried + * non-FALLBACK engine. If there was no previous non-FALLBACK engine + * in a Strategy, then the result_t::deny(reason = -EACCES) is used. */ + FALLBACK, + }; + + Engine::result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s, optional_yield y) const override final; + + bool is_empty() const { /* true while no engine has been stored in auth_stack */ + return auth_stack.empty(); + } + + static int apply(const DoutPrefixProvider* dpp, const Strategy& auth_strategy, req_state* s, optional_yield y) noexcept; + +private: + /* Using the reference wrapper here to explicitly point out we are not + * interested in storing nulls while preserving the dynamic polymorphism. */ + using stack_item_t = std::pair, + Control>; + std::vector auth_stack; + +protected: + void add_engine(Control ctrl_flag, const Engine& engine) noexcept; +}; + + +/* A class aggregating the knowledge about all Strategies in RadosGW.
It is + * responsible for handling the dynamic reconfiguration on e.g. realm update. + * The definition is in rgw/rgw_auth_registry.h, + * + * Each new Strategy should be exposed to it. */ +class StrategyRegistry; + +class WebIdentityApplier : public IdentityApplier { /* identity assembled from web-identity token claims */ + std::string sub; + std::string iss; + std::string aud; + std::string client_id; + std::string user_name; +protected: + CephContext* const cct; + rgw::sal::Driver* driver; + std::string role_session; + std::string role_tenant; + std::unordered_multimap token_claims; + boost::optional> role_tags; + boost::optional>> principal_tags; + + std::string get_idp_url() const; + + void create_account(const DoutPrefixProvider* dpp, + const rgw_user& acct_user, + const std::string& display_name, + RGWUserInfo& user_info) const; /* out */ +public: + WebIdentityApplier( CephContext* const cct, + rgw::sal::Driver* driver, + const std::string& role_session, + const std::string& role_tenant, + const std::unordered_multimap& token_claims, + boost::optional> role_tags, + boost::optional>> principal_tags) + : cct(cct), + driver(driver), + role_session(role_session), + role_tenant(role_tenant), + token_claims(token_claims), + role_tags(role_tags), + principal_tags(principal_tags) { + const auto& sub = token_claims.find("sub"); /* token_claims is the constructor parameter here; it shadows the member */ + if(sub != token_claims.end()) { + this->sub = sub->second; + } + + const auto& iss = token_claims.find("iss"); + if(iss != token_claims.end()) { + this->iss = iss->second; + } + + const auto& aud = token_claims.find("aud"); + if(aud != token_claims.end()) { + this->aud = aud->second; + } + + const auto& client_id = token_claims.find("client_id"); + if(client_id != token_claims.end()) { + this->client_id = client_id->second; + } else { /* no "client_id" claim: fall back to "azp" */ + const auto& azp = token_claims.find("azp"); + if (azp != token_claims.end()) { + this->client_id = azp->second; + } + } + + const auto& user_name = token_claims.find("username"); + if(user_name != token_claims.end()) { + this->user_name = user_name->second; + }
else { /* no "username" claim: fall back to "given_username" */ + const auto& given_username = token_claims.find("given_username"); + if (given_username != token_claims.end()) { + this->user_name = given_username->second; + } + } + } + + void modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const override; + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return RGW_PERM_NONE; + } + + bool is_admin_of(const rgw_user& uid) const override { + return false; + } + + bool is_owner_of(const rgw_user& uid) const override { /* owner iff the "sub" claim, the role tenant and the "oidc" namespace all match */ + if (uid.id == this->sub && uid.tenant == role_tenant && uid.ns == "oidc") { + return true; + } + return false; + } + + uint32_t get_perm_mask() const override { + return RGW_PERM_NONE; + } + + void to_str(std::ostream& out) const override; + + bool is_identity(const idset_t& ids) const override; + + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; + + uint32_t get_identity_type() const override { + return TYPE_WEB; + } + + std::string get_acct_name() const override { + return this->user_name; + } + + std::string get_subuser() const override { + return {}; + } + + struct Factory { + virtual ~Factory() {} + + virtual aplptr_t create_apl_web_identity( CephContext* cct, + const req_state* s, + const std::string& role_session, + const std::string& role_tenant, + const std::unordered_multimap& token, + boost::optional>, + boost::optional>> principal_tags) const = 0; + }; +}; + +class ImplicitTenants: public md_config_obs_t { +public: + enum implicit_tenant_flag_bits {IMPLICIT_TENANTS_SWIFT=1, + IMPLICIT_TENANTS_S3=2, IMPLICIT_TENANTS_BAD = -1, }; +private: + int saved; + void recompute_value(const ConfigProxy& ); + class ImplicitTenantValue { + friend class ImplicitTenants; + int v; + ImplicitTenantValue(int v) : v(v) {}; + public: + bool inline is_split_mode() + { + assert(v != IMPLICIT_TENANTS_BAD); + return v == IMPLICIT_TENANTS_SWIFT || v == IMPLICIT_TENANTS_S3; /* true when v is exactly one of the two API bits */ + } + bool inline
implicit_tenants_for_(const implicit_tenant_flag_bits bit) + { + assert(v != IMPLICIT_TENANTS_BAD); + return static_cast(v&bit); /* whether the requested bit is set in v */ + } + }; +public: + ImplicitTenants(const ConfigProxy& c) { recompute_value(c);} + ImplicitTenantValue get_value() const { /* wraps the currently saved flag value */ + return ImplicitTenantValue(saved); + } +private: + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) override; +}; + +std::tuple implicit_tenants_enabled_for_swift(CephContext * const cct); +std::tuple implicit_tenants_enabled_for_s3(CephContext * const cct); + +/* rgw::auth::RemoteApplier targets those authentication engines which don't + * need to ask the RADOS store while performing the auth process. Instead, + * they obtain credentials from an external source like Keystone or LDAP. + * + * As the authenticated user may not have an account yet, RGWRemoteAuthApplier + * must be able to create it basing on data passed by an auth engine. Those + * data will be used to fill RGWUserInfo structure.
*/ +class RemoteApplier : public IdentityApplier { +public: + class AuthInfo { + friend class RemoteApplier; + protected: + const rgw_user acct_user; + const std::string acct_name; + const uint32_t perm_mask; + const bool is_admin; + const uint32_t acct_type; + const std::string access_key_id; + const std::string subuser; + + public: + enum class acct_privilege_t { + IS_ADMIN_ACCT, + IS_PLAIN_ACCT + }; + + static const std::string NO_SUBUSER; + static const std::string NO_ACCESS_KEY; + + AuthInfo(const rgw_user& acct_user, + const std::string& acct_name, + const uint32_t perm_mask, + const acct_privilege_t level, + const std::string access_key_id, + const std::string subuser, + const uint32_t acct_type=TYPE_NONE) + : acct_user(acct_user), + acct_name(acct_name), + perm_mask(perm_mask), + is_admin(acct_privilege_t::IS_ADMIN_ACCT == level), + acct_type(acct_type), + access_key_id(access_key_id), + subuser(subuser) { + } + }; + + using aclspec_t = rgw::auth::Identity::aclspec_t; + typedef std::function acl_strategy_t; + +protected: + CephContext* const cct; + + /* Read-write is intentional here due to RGWUserInfo creation process. */ + rgw::sal::Driver* driver; + + /* Supplemental strategy for extracting permissions from ACLs. Its results + * will be combined (ORed) with a default strategy that is responsible for + * handling backward compatibility.
*/ + const acl_strategy_t extra_acl_strategy; + + const AuthInfo info; + const rgw::auth::ImplicitTenants& implicit_tenant_context; + const rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit; + + virtual void create_account(const DoutPrefixProvider* dpp, + const rgw_user& acct_user, + bool implicit_tenant, + RGWUserInfo& user_info) const; /* out */ + +public: + RemoteApplier(CephContext* const cct, + rgw::sal::Driver* driver, + acl_strategy_t&& extra_acl_strategy, + const AuthInfo& info, + const rgw::auth::ImplicitTenants& implicit_tenant_context, + rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit) + : cct(cct), + driver(driver), + extra_acl_strategy(std::move(extra_acl_strategy)), + info(info), + implicit_tenant_context(implicit_tenant_context), + implicit_tenant_bit(implicit_tenant_bit) { + } + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override; + bool is_admin_of(const rgw_user& uid) const override; + bool is_owner_of(const rgw_user& uid) const override; + bool is_identity(const idset_t& ids) const override; + + uint32_t get_perm_mask() const override { return info.perm_mask; } + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + void write_ops_log_entry(rgw_log_entry& entry) const override; + uint32_t get_identity_type() const override { return info.acct_type; } + std::string get_acct_name() const override { return info.acct_name; } + std::string get_subuser() const override { return {}; } /* NOTE(review): info.subuser is stored but not returned here -- confirm this is intended */ + + struct Factory { + virtual ~Factory() {} + /* Providing r-value reference here is required intentionally. Callee is + * thus disallowed to handle std::function in a way that could inhibit + * the move behaviour (like forgetting about std::moving an l-value).
*/ + virtual aplptr_t create_apl_remote(CephContext* cct, + const req_state* s, + acl_strategy_t&& extra_acl_strategy, + const AuthInfo &info) const = 0; + }; +}; + + +/* rgw::auth::LocalApplier targets those auth engines that base on the data + * enclosed in the RGWUserInfo control structure. As a side effect of doing + * the authentication process, they must have it loaded. Leveraging this is + * a way to avoid unnecessary calls to the underlying RADOS store. */ +class LocalApplier : public IdentityApplier { + using aclspec_t = rgw::auth::Identity::aclspec_t; + +protected: + const RGWUserInfo user_info; + const std::string subuser; + uint32_t perm_mask; + const std::string access_key_id; + + uint32_t get_perm_mask(const std::string& subuser_name, + const RGWUserInfo &uinfo) const; + +public: + static const std::string NO_SUBUSER; + static const std::string NO_ACCESS_KEY; + + LocalApplier(CephContext* const cct, + const RGWUserInfo& user_info, + std::string subuser, + const std::optional& perm_mask, + const std::string access_key_id) + : user_info(user_info), + subuser(std::move(subuser)), + perm_mask(perm_mask.value_or(RGW_PERM_INVALID)), /* RGW_PERM_INVALID stands for "no explicit mask was given" */ + access_key_id(access_key_id) { + } + + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override; + bool is_admin_of(const rgw_user& uid) const override; + bool is_owner_of(const rgw_user& uid) const override; + bool is_identity(const idset_t& ids) const override; + uint32_t get_perm_mask() const override { + if (this->perm_mask == RGW_PERM_INVALID) { /* no explicit mask: compute it from subuser and user_info */ + return get_perm_mask(subuser, user_info); + } else { + return this->perm_mask; + } + } + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + uint32_t get_identity_type() const override { return TYPE_RGW; } + std::string get_acct_name() const override { return {}; } + std::string get_subuser() const override { return subuser; } + void
write_ops_log_entry(rgw_log_entry& entry) const override; + + struct Factory { + virtual ~Factory() {} + virtual aplptr_t create_apl_local(CephContext* cct, + const req_state* s, + const RGWUserInfo& user_info, + const std::string& subuser, + const std::optional& perm_mask, + const std::string& access_key_id) const = 0; + }; +}; + +class RoleApplier : public IdentityApplier { +public: + struct Role { + std::string id; + std::string name; + std::string tenant; + std::vector role_policies; + }; + struct TokenAttrs { + rgw_user user_id; + std::string token_policy; + std::string role_session_name; + std::vector token_claims; + std::string token_issued_at; + std::vector> principal_tags; + }; +protected: + Role role; + TokenAttrs token_attrs; + +public: + + RoleApplier(CephContext* const cct, + const Role& role, + const TokenAttrs& token_attrs) + : role(role), + token_attrs(token_attrs) {} + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return 0; + } + bool is_admin_of(const rgw_user& uid) const override { + return false; + } + bool is_owner_of(const rgw_user& uid) const override { /* owner iff id, tenant and ns of the token's user_id all equal those of uid */ + return (this->token_attrs.user_id.id == uid.id && this->token_attrs.user_id.tenant == uid.tenant && this->token_attrs.user_id.ns == uid.ns); + } + bool is_identity(const idset_t& ids) const override; + uint32_t get_perm_mask() const override { + return RGW_PERM_NONE; + } + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + uint32_t get_identity_type() const override { return TYPE_ROLE; } + std::string get_acct_name() const override { return {}; } + std::string get_subuser() const override { return {}; } + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; + std::string get_role_tenant() const override { return role.tenant; } + + struct Factory { + virtual ~Factory() {} + virtual aplptr_t
create_apl_role( CephContext* cct, + const req_state* s, + const rgw::auth::RoleApplier::Role& role, + const rgw::auth::RoleApplier::TokenAttrs& token_attrs) const = 0; + }; +}; + +/* The anonymous abstract engine. */ +class AnonymousEngine : public Engine { + CephContext* const cct; + const rgw::auth::LocalApplier::Factory* const apl_factory; + +public: + AnonymousEngine(CephContext* const cct, + const rgw::auth::LocalApplier::Factory* const apl_factory) + : cct(cct), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::AnonymousEngine"; + } + + Engine::result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s, optional_yield y) const override final; + +protected: + virtual bool is_applicable(const req_state*) const noexcept { /* default: every request qualifies; virtual so a subclass can narrow it */ + return true; + } +}; + +} /* namespace auth */ +} /* namespace rgw */ + + +uint32_t rgw_perms_from_aclspec_default_strategy( + const rgw_user& uid, + const rgw::auth::Identity::aclspec_t& aclspec, + const DoutPrefixProvider *dpp); diff --git a/src/rgw/rgw_auth_filters.h b/src/rgw/rgw_auth_filters.h new file mode 100644 index 000000000..9e3818bef --- /dev/null +++ b/src/rgw/rgw_auth_filters.h @@ -0,0 +1,302 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include +#include + +#include "rgw_service.h" +#include "rgw_common.h" +#include "rgw_auth.h" +#include "rgw_user.h" + +namespace rgw { +namespace auth { + +/* Abstract decorator over any implementation of rgw::auth::IdentityApplier + * which could be provided both as a pointer-to-object or the object itself.
*/ +template +class DecoratedApplier : public rgw::auth::IdentityApplier { + typedef typename std::remove_pointer::type DerefedDecorateeT; + + static_assert(std::is_base_of::value, + "DecorateeT must be a subclass of rgw::auth::IdentityApplier"); + + DecorateeT decoratee; + + /* There is an indirection layer over accessing decoratee to share the same + * code base between dynamic and static decorators. The difference is about + * what we store internally: pointer to a decorated object versus the whole + * object itself. Googling for "SFINAE" can help to understand the code. */ + template ::value, T>::type* = nullptr> + DerefedDecorateeT& get_decoratee() { + return *decoratee; + } + + template ::value, T>::type* = nullptr> + DerefedDecorateeT& get_decoratee() { + return decoratee; + } + + template ::value, T>::type* = nullptr> + const DerefedDecorateeT& get_decoratee() const { + return *decoratee; + } + + template ::value, T>::type* = nullptr> + const DerefedDecorateeT& get_decoratee() const { + return decoratee; + } + +public: + explicit DecoratedApplier(DecorateeT&& decoratee) + : decoratee(std::forward(decoratee)) { + } + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return get_decoratee().get_perms_from_aclspec(dpp, aclspec); + } + + bool is_admin_of(const rgw_user& uid) const override { + return get_decoratee().is_admin_of(uid); + } + + bool is_owner_of(const rgw_user& uid) const override { + return get_decoratee().is_owner_of(uid); + } + + bool is_anonymous() const override { + return get_decoratee().is_anonymous(); + } + + uint32_t get_perm_mask() const override { + return get_decoratee().get_perm_mask(); + } + + uint32_t get_identity_type() const override { + return get_decoratee().get_identity_type(); + } + + std::string get_acct_name() const override { + return get_decoratee().get_acct_name(); + } + + std::string get_subuser() const override { + return get_decoratee().get_subuser(); + } + +
bool is_identity( + const boost::container::flat_set& ids) const override { + return get_decoratee().is_identity(ids); + } + + void to_str(std::ostream& out) const override { + get_decoratee().to_str(out); + } + + std::string get_role_tenant() const override { /* forwarded to the decoratee */ + return get_decoratee().get_role_tenant(); + } + + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override { /* out */ + return get_decoratee().load_acct_info(dpp, user_info); + } + + void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override { /* in/out */ + return get_decoratee().modify_request_state(dpp, s); + } + + void write_ops_log_entry(rgw_log_entry& entry) const override { + return get_decoratee().write_ops_log_entry(entry); + } +}; + + +template +class ThirdPartyAccountApplier : public DecoratedApplier { + rgw::sal::Driver* driver; + const rgw_user acct_user_override; + +public: + /* A value representing situations where there is no requested account + * override. In other words, acct_user_override will be equal to this + * constant when the request isn't a cross-tenant one. */ + static const rgw_user UNKNOWN_ACCT; + + template + ThirdPartyAccountApplier(rgw::sal::Driver* driver, + const rgw_user &acct_user_override, + U&& decoratee) + : DecoratedApplier(std::move(decoratee)), /* NOTE(review): decoratee is declared U&& yet always std::move()d; if U is deduced, an lvalue argument would be moved from -- confirm callers pass temporaries */ + driver(driver), + acct_user_override(acct_user_override) { + } + + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ +}; + +/* static declaration: UNKNOWN_ACCT will be an empty rgw_user that is a result + * of the default construction.
*/ +template +const rgw_user ThirdPartyAccountApplier::UNKNOWN_ACCT; + +template +void ThirdPartyAccountApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::ThirdPartyAccountApplier(" + acct_user_override.to_str() + ")" + << " -> "; + DecoratedApplier::to_str(out); +} + +template +void ThirdPartyAccountApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const +{ + if (UNKNOWN_ACCT == acct_user_override) { + /* There is no override specified by the upper layer. This means that we'll + * load the account owned by the authenticated identity (aka auth_user). */ + DecoratedApplier::load_acct_info(dpp, user_info); + } else if (DecoratedApplier::is_owner_of(acct_user_override)) { + /* The override has been specified but the account belongs to the authenticated + * identity. We may safely forward the call to a next stage. */ + DecoratedApplier::load_acct_info(dpp, user_info); + } else if (this->is_anonymous()) { + /* If the user was authed by the anonymous engine then scope the ANON user + * to the correct tenant */ + if (acct_user_override.tenant.empty()) + user_info.user_id = rgw_user(acct_user_override.id, RGW_USER_ANON_ID); + else + user_info.user_id = rgw_user(acct_user_override.tenant, RGW_USER_ANON_ID); + } else { + /* Compatibility mechanism for multi-tenancy. For more details refer to + * load_acct_info method of rgw::auth::RemoteApplier. */ + std::unique_ptr user; + + if (acct_user_override.tenant.empty()) { + const rgw_user tenanted_uid(acct_user_override.id, acct_user_override.id); + user = driver->get_user(tenanted_uid); + + if (user->load_user(dpp, null_yield) >= 0) { + user_info = user->get_info(); + /* Succeeded. */ + return; + } + } + + user = driver->get_user(acct_user_override); + const int ret = user->load_user(dpp, null_yield); + if (ret < 0) { + /* We aren't trying to recover from ENOENT here. It's supposed that creating + * someone else's account isn't a thing we want to support in this filter. 
*/ + if (ret == -ENOENT) { + throw -EACCES; + } else { + throw ret; + } + } + user_info = user->get_info(); + } +} + +template static inline +ThirdPartyAccountApplier add_3rdparty(rgw::sal::Driver* driver, + const rgw_user &acct_user_override, + T&& t) { + return ThirdPartyAccountApplier(driver, acct_user_override, + std::forward(t)); +} + + +template +class SysReqApplier : public DecoratedApplier { + CephContext* const cct; + rgw::sal::Driver* driver; + const RGWHTTPArgs& args; + mutable boost::tribool is_system; + +public: + template + SysReqApplier(CephContext* const cct, + rgw::sal::Driver* driver, + const req_state* const s, + U&& decoratee) + : DecoratedApplier(std::forward(decoratee)), + cct(cct), + driver(driver), + args(s->info.args), + is_system(boost::logic::indeterminate) { + } + + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; /* in/out */ +}; + +template +void SysReqApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::SysReqApplier" << " -> "; + DecoratedApplier::to_str(out); +} + +template +void SysReqApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const +{ + DecoratedApplier::load_acct_info(dpp, user_info); + is_system = user_info.system; + + if (is_system) { + //ldpp_dout(dpp, 20) << "system request" << dendl; + + rgw_user effective_uid(args.sys_get(RGW_SYS_PARAM_PREFIX "uid")); + if (! effective_uid.empty()) { + /* We aren't writing directly to user_info for consistency and security + * reasons. rgw_get_user_info_by_uid doesn't trigger the operator=() but + * calls ::decode instead. */ + std::unique_ptr user = driver->get_user(effective_uid); + if (user->load_user(dpp, null_yield) < 0) { + //ldpp_dout(dpp, 0) << "User lookup failed!" 
<< dendl; + throw -EACCES; + } + user_info = user->get_info(); + } + } +} + +template +void SysReqApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s) const +{ + if (boost::logic::indeterminate(is_system)) { + RGWUserInfo unused_info; + load_acct_info(dpp, unused_info); + } + + if (is_system) { + s->info.args.set_system(); + s->system_request = true; + } + DecoratedApplier::modify_request_state(dpp, s); +} + +template static inline +SysReqApplier add_sysreq(CephContext* const cct, + rgw::sal::Driver* driver, + const req_state* const s, + T&& t) { + return SysReqApplier(cct, driver, s, std::forward(t)); +} + +} /* namespace auth */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_auth_keystone.cc b/src/rgw/rgw_auth_keystone.cc new file mode 100644 index 000000000..81588d50c --- /dev/null +++ b/src/rgw/rgw_auth_keystone.cc @@ -0,0 +1,767 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include +#include + +#include "rgw_b64.h" + +#include "common/errno.h" +#include "common/ceph_json.h" +#include "include/types.h" +#include "include/str_list.h" + +#include "rgw_common.h" +#include "rgw_keystone.h" +#include "rgw_auth_keystone.h" +#include "rgw_rest_s3.h" +#include "rgw_auth_s3.h" + +#include "common/ceph_crypto.h" +#include "common/Cond.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw { +namespace auth { +namespace keystone { + +bool +TokenEngine::is_applicable(const std::string& token) const noexcept +{ + return ! token.empty() && ! cct->_conf->rgw_keystone_url.empty(); +} + +boost::optional +TokenEngine::get_from_keystone(const DoutPrefixProvider* dpp, const std::string& token, bool allow_expired) const +{ + /* Unfortunately, we can't use the short form of "using" here. It's because + * we're aliasing a class' member, not namespace. 
*/ + using RGWValidateKeystoneToken = \ + rgw::keystone::Service::RGWValidateKeystoneToken; + + /* The container for plain response obtained from Keystone. It will be + * parsed token_envelope_t::parse method. */ + ceph::bufferlist token_body_bl; + RGWValidateKeystoneToken validate(cct, "GET", "", &token_body_bl); + + std::string url = config.get_endpoint_url(); + if (url.empty()) { + throw -EINVAL; + } + + const auto keystone_version = config.get_api_version(); + if (keystone_version == rgw::keystone::ApiVersion::VER_2) { + url.append("v2.0/tokens/" + token); + } else if (keystone_version == rgw::keystone::ApiVersion::VER_3) { + url.append("v3/auth/tokens"); + + if (allow_expired) { + url.append("?allow_expired=1"); + } + + validate.append_header("X-Subject-Token", token); + } + + std::string admin_token; + if (rgw::keystone::Service::get_admin_token(dpp, cct, token_cache, config, + admin_token) < 0) { + throw -EINVAL; + } + + validate.append_header("X-Auth-Token", admin_token); + validate.set_send_length(0); + + validate.set_url(url); + + int ret = validate.process(null_yield); + + /* NULL terminate for debug output. */ + token_body_bl.append(static_cast(0)); + + /* Detect Keystone rejection earlier than during the token parsing. + * Although failure at the parsing phase doesn't impose a threat, + * this allows to return proper error code (EACCESS instead of EINVAL + * or similar) and thus improves logging. */ + if (validate.get_http_status() == + /* Most likely: wrong admin credentials or admin token. */ + RGWValidateKeystoneToken::HTTP_STATUS_UNAUTHORIZED || + validate.get_http_status() == + /* Most likely: non-existent token supplied by the client. 
*/ + RGWValidateKeystoneToken::HTTP_STATUS_NOTFOUND) { + ldpp_dout(dpp, 5) << "Failed keystone auth from " << url << " with " + << validate.get_http_status() << dendl; + return boost::none; + } + // throw any other http or connection errors + if (ret < 0) { + throw ret; + } + + ldpp_dout(dpp, 20) << "received response status=" << validate.get_http_status() + << ", body=" << token_body_bl.c_str() << dendl; + + TokenEngine::token_envelope_t token_body; + ret = token_body.parse(dpp, cct, token, token_body_bl, config.get_api_version()); + if (ret < 0) { + throw ret; + } + + return token_body; +} + +TokenEngine::auth_info_t +TokenEngine::get_creds_info(const TokenEngine::token_envelope_t& token, + const std::vector& admin_roles + ) const noexcept +{ + using acct_privilege_t = rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + /* Check whether the user has an admin status. */ + acct_privilege_t level = acct_privilege_t::IS_PLAIN_ACCT; + for (const auto& admin_role : admin_roles) { + if (token.has_role(admin_role)) { + level = acct_privilege_t::IS_ADMIN_ACCT; + break; + } + } + + return auth_info_t { + /* Suggested account name for the authenticated user. */ + rgw_user(token.get_project_id()), + /* User's display name (aka real name). */ + token.get_project_name(), + /* Keystone doesn't support RGW's subuser concept, so we cannot cut down + * the access rights through the perm_mask. At least at this layer. */ + RGW_PERM_FULL_CONTROL, + level, + rgw::auth::RemoteApplier::AuthInfo::NO_ACCESS_KEY, + rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER, + TYPE_KEYSTONE +}; +} + +static inline const std::string +make_spec_item(const std::string& tenant, const std::string& id) +{ + return tenant + ":" + id; +} + +TokenEngine::acl_strategy_t +TokenEngine::get_acl_strategy(const TokenEngine::token_envelope_t& token) const +{ + /* The primary identity is constructed upon UUIDs. 
*/ + const auto& tenant_uuid = token.get_project_id(); + const auto& user_uuid = token.get_user_id(); + + /* For Keystone v2 an alias may be also used. */ + const auto& tenant_name = token.get_project_name(); + const auto& user_name = token.get_user_name(); + + /* Construct all possible combinations including Swift's wildcards. */ + const std::array allowed_items = { + make_spec_item(tenant_uuid, user_uuid), + make_spec_item(tenant_name, user_name), + + /* Wildcards. */ + make_spec_item(tenant_uuid, "*"), + make_spec_item(tenant_name, "*"), + make_spec_item("*", user_uuid), + make_spec_item("*", user_name), + }; + + /* Lambda will obtain a copy of (not a reference to!) allowed_items. */ + return [allowed_items](const rgw::auth::Identity::aclspec_t& aclspec) { + uint32_t perm = 0; + + for (const auto& allowed_item : allowed_items) { + const auto iter = aclspec.find(allowed_item); + + if (std::end(aclspec) != iter) { + perm |= iter->second; + } + } + + return perm; + }; +} + +TokenEngine::result_t +TokenEngine::authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const std::string& service_token, + const req_state* const s) const +{ + bool allow_expired = false; + boost::optional t; + + /* This will be initialized on the first call to this method. In C++11 it's + * also thread-safe. */ + static const struct RolesCacher { + explicit RolesCacher(CephContext* const cct) { + get_str_vec(cct->_conf->rgw_keystone_accepted_roles, plain); + get_str_vec(cct->_conf->rgw_keystone_accepted_admin_roles, admin); + + /* Let's suppose that having an admin role implies also a regular one. 
*/ + plain.insert(std::end(plain), std::begin(admin), std::end(admin)); + } + + std::vector plain; + std::vector admin; + } roles(cct); + + static const struct ServiceTokenRolesCacher { + explicit ServiceTokenRolesCacher(CephContext* const cct) { + get_str_vec(cct->_conf->rgw_keystone_service_token_accepted_roles, plain); + } + + std::vector plain; + } service_token_roles(cct); + + if (! is_applicable(token)) { + return result_t::deny(); + } + + /* Token ID is a legacy of supporting the service-side validation + * of PKI/PKIz token type which are already-removed-in-OpenStack. + * The idea was to bury in cache only a short hash instead of few + * kilobytes. RadosGW doesn't do the local validation anymore. */ + const auto& token_id = rgw_get_token_id(token); + ldpp_dout(dpp, 20) << "token_id=" << token_id << dendl; + + /* Check cache first. */ + t = token_cache.find(token_id); + if (t) { + ldpp_dout(dpp, 20) << "cached token.project.id=" << t->get_project_id() + << dendl; + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t), + get_creds_info(*t, roles.admin)); + return result_t::grant(std::move(apl)); + } + + /* We have a service token and a token so we verify the service + * token and if it's invalid the request is invalid. If it's valid + * we allow an expired token to be used when doing lookup in Keystone. + * We never get to this if the token is in the cache. */ + if (g_conf()->rgw_keystone_service_token_enabled && ! service_token.empty()) { + boost::optional st; + + const auto& service_token_id = rgw_get_token_id(service_token); + ldpp_dout(dpp, 20) << "service_token_id=" << service_token_id << dendl; + + /* Check cache for service token first. */ + st = token_cache.find_service(service_token_id); + if (st) { + ldpp_dout(dpp, 20) << "cached service_token.project.id=" << st->get_project_id() + << dendl; + + /* We found the service token in the cache so we allow using an expired + * token for this request. 
*/ + allow_expired = true; + ldpp_dout(dpp, 20) << "allowing expired tokens because service_token_id=" + << service_token_id + << " was found in cache" << dendl; + } else { + /* Service token was not found in cache. Go to Keystone for validating + * the token. The allow_expired here must always be false. */ + ceph_assert(allow_expired == false); + st = get_from_keystone(dpp, service_token, allow_expired); + + if (! st) { + return result_t::deny(-EACCES); + } + + /* Verify expiration of service token. */ + if (st->expired()) { + ldpp_dout(dpp, 0) << "got expired service token: " << st->get_project_name() + << ":" << st->get_user_name() + << " expired " << st->get_expires() << dendl; + return result_t::deny(-EPERM); + } + + /* Check for necessary roles for service token. */ + for (const auto& role : service_token_roles.plain) { + if (st->has_role(role) == true) { + /* Service token is valid so we allow using an expired token for + * this request. */ + ldpp_dout(dpp, 20) << "allowing expired tokens because service_token_id=" + << service_token_id + << " is valid, role: " + << role << dendl; + allow_expired = true; + token_cache.add_service(service_token_id, *st); + break; + } + } + + if (!allow_expired) { + ldpp_dout(dpp, 0) << "service token user does not hold a matching role; required roles: " + << g_conf()->rgw_keystone_service_token_accepted_roles << dendl; + return result_t::deny(-EPERM); + } + } + } + + /* Token not in cache. Go to the Keystone for validation. This happens even + * for the legacy PKI/PKIz token types. That's it, after the PKI/PKIz + * RadosGW-side validation has been removed, we always ask Keystone. */ + t = get_from_keystone(dpp, token, allow_expired); + + if (! t) { + return result_t::deny(-EACCES); + } + + /* Verify expiration. 
*/ + if (t->expired()) { + if (allow_expired) { + ldpp_dout(dpp, 20) << "allowing expired token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expired: " << t->get_expires() + << " because of valid service token" << dendl; + } else { + ldpp_dout(dpp, 0) << "got expired token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expired: " << t->get_expires() << dendl; + return result_t::deny(-EPERM); + } + } + + /* Check for necessary roles. */ + for (const auto& role : roles.plain) { + if (t->has_role(role) == true) { + /* If this token was an allowed expired token because we got a + * service token we need to update the expiration before we cache it. */ + if (allow_expired) { + time_t now = ceph_clock_now().sec(); + time_t new_expires = now + g_conf()->rgw_keystone_expired_token_cache_expiration; + ldpp_dout(dpp, 20) << "updating expiration of allowed expired token" + << " from old " << t->get_expires() << " to now " << now << " + " + << g_conf()->rgw_keystone_expired_token_cache_expiration + << " secs = " + << new_expires << dendl; + t->set_expires(new_expires); + } + ldpp_dout(dpp, 0) << "validated token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expires: " << t->get_expires() << dendl; + token_cache.add(token_id, *t); + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t), + get_creds_info(*t, roles.admin)); + return result_t::grant(std::move(apl)); + } + } + + ldpp_dout(dpp, 0) << "user does not hold a matching role; required roles: " + << g_conf()->rgw_keystone_accepted_roles << dendl; + + return result_t::deny(-EPERM); +} + + +/* + * Try to validate S3 auth against keystone s3token interface + */ +std::pair, int> +EC2Engine::get_from_keystone(const DoutPrefixProvider* dpp, const std::string_view& access_key_id, + const std::string& string_to_sign, + const std::string_view& signature) const +{ + /* prepare keystone url */ + std::string keystone_url = config.get_endpoint_url(); + if 
(keystone_url.empty()) { + throw -EINVAL; + } + + const auto api_version = config.get_api_version(); + if (api_version == rgw::keystone::ApiVersion::VER_3) { + keystone_url.append("v3/s3tokens"); + } else { + keystone_url.append("v2.0/s3tokens"); + } + + /* get authentication token for Keystone. */ + std::string admin_token; + int ret = rgw::keystone::Service::get_admin_token(dpp, cct, token_cache, config, + admin_token); + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: cannot get token for keystone access" + << dendl; + throw ret; + } + + using RGWValidateKeystoneToken + = rgw::keystone::Service::RGWValidateKeystoneToken; + + /* The container for plain response obtained from Keystone. It will be + * parsed token_envelope_t::parse method. */ + ceph::bufferlist token_body_bl; + RGWValidateKeystoneToken validate(cct, "POST", keystone_url, &token_body_bl); + + /* set required headers for keystone request */ + validate.append_header("X-Auth-Token", admin_token); + validate.append_header("Content-Type", "application/json"); + + /* check if we want to verify keystone's ssl certs */ + validate.set_verify_ssl(cct->_conf->rgw_keystone_verify_ssl); + + /* create json credentials request body */ + JSONFormatter credentials(false); + credentials.open_object_section(""); + credentials.open_object_section("credentials"); + credentials.dump_string("access", sview2cstr(access_key_id).data()); + credentials.dump_string("token", rgw::to_base64(string_to_sign)); + credentials.dump_string("signature", sview2cstr(signature).data()); + credentials.close_section(); + credentials.close_section(); + + std::stringstream os; + credentials.flush(os); + validate.set_post_data(os.str()); + validate.set_send_length(os.str().length()); + + /* send request */ + ret = validate.process(null_yield); + + /* if the supplied signature is wrong, we will get 401 from Keystone */ + if (validate.get_http_status() == + decltype(validate)::HTTP_STATUS_UNAUTHORIZED) { + return std::make_pair(boost::none, 
-ERR_SIGNATURE_NO_MATCH); + } else if (validate.get_http_status() == + decltype(validate)::HTTP_STATUS_NOTFOUND) { + return std::make_pair(boost::none, -ERR_INVALID_ACCESS_KEY); + } + // throw any other http or connection errors + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: token validation ERROR: " + << token_body_bl.c_str() << dendl; + throw ret; + } + + /* now parse response */ + rgw::keystone::TokenEnvelope token_envelope; + ret = token_envelope.parse(dpp, cct, std::string(), token_body_bl, api_version); + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: token parsing failed, ret=0" << ret + << dendl; + throw ret; + } + + return std::make_pair(std::move(token_envelope), 0); +} + +std::pair, int> EC2Engine::get_secret_from_keystone(const DoutPrefixProvider* dpp, + const std::string& user_id, + const std::string_view& access_key_id) const +{ + /* Fetch from /users/{USER_ID}/credentials/OS-EC2/{ACCESS_KEY_ID} */ + /* Should return json with response key "credential" which contains entry "secret"*/ + + /* prepare keystone url */ + std::string keystone_url = config.get_endpoint_url(); + if (keystone_url.empty()) { + return make_pair(boost::none, -EINVAL); + } + + const auto api_version = config.get_api_version(); + if (api_version == rgw::keystone::ApiVersion::VER_3) { + keystone_url.append("v3/"); + } else { + keystone_url.append("v2.0/"); + } + keystone_url.append("users/"); + keystone_url.append(user_id); + keystone_url.append("/credentials/OS-EC2/"); + keystone_url.append(std::string(access_key_id)); + + /* get authentication token for Keystone. 
*/ + std::string admin_token; + int ret = rgw::keystone::Service::get_admin_token(dpp, cct, token_cache, config, + admin_token); + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: cannot get token for keystone access" + << dendl; + return make_pair(boost::none, ret); + } + + using RGWGetAccessSecret + = rgw::keystone::Service::RGWKeystoneHTTPTransceiver; + + /* The container for plain response obtained from Keystone.*/ + ceph::bufferlist token_body_bl; + RGWGetAccessSecret secret(cct, "GET", keystone_url, &token_body_bl); + + /* set required headers for keystone request */ + secret.append_header("X-Auth-Token", admin_token); + + /* check if we want to verify keystone's ssl certs */ + secret.set_verify_ssl(cct->_conf->rgw_keystone_verify_ssl); + + /* send request */ + ret = secret.process(null_yield); + + /* if the supplied access key isn't found, we will get 404 from Keystone */ + if (secret.get_http_status() == + decltype(secret)::HTTP_STATUS_NOTFOUND) { + return make_pair(boost::none, -ERR_INVALID_ACCESS_KEY); + } + // return any other http or connection errors + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: secret fetching error: " + << token_body_bl.c_str() << dendl; + return make_pair(boost::none, ret); + } + + /* now parse response */ + + JSONParser parser; + if (! 
parser.parse(token_body_bl.c_str(), token_body_bl.length())) { + ldpp_dout(dpp, 0) << "Keystone credential parse error: malformed json" << dendl; + return make_pair(boost::none, -EINVAL); + } + + JSONObjIter credential_iter = parser.find_first("credential"); + std::string secret_string; + + try { + if (!credential_iter.end()) { + JSONDecoder::decode_json("secret", secret_string, *credential_iter, true); + } else { + ldpp_dout(dpp, 0) << "Keystone credential not present in return from server" << dendl; + return make_pair(boost::none, -EINVAL); + } + } catch (const JSONDecoder::err& err) { + ldpp_dout(dpp, 0) << "Keystone credential parse error: " << err.what() << dendl; + return make_pair(boost::none, -EINVAL); + } + + return make_pair(secret_string, 0); +} + +/* + * Try to get a token for S3 authentication, using a secret cache if available + */ +auto EC2Engine::get_access_token(const DoutPrefixProvider* dpp, + const std::string_view& access_key_id, + const std::string& string_to_sign, + const std::string_view& signature, + const signature_factory_t& signature_factory) const + -> access_token_result +{ + using server_signature_t = VersionAbstractor::server_signature_t; + boost::optional token; + boost::optional secret; + int failure_reason; + + /* Get a token from the cache if one has already been stored */ + boost::optional> + t = secret_cache.find(std::string(access_key_id)); + + /* Check that credentials can correctly be used to sign data */ + if (t) { + std::string sig(signature); + server_signature_t server_signature = signature_factory(cct, t->get<1>(), string_to_sign); + if (sig.compare(server_signature) == 0) { + return {t->get<0>(), t->get<1>(), 0}; + } else { + ldpp_dout(dpp, 0) << "Secret string does not correctly sign payload, cache miss" << dendl; + } + } else { + ldpp_dout(dpp, 0) << "No stored secret string, cache miss" << dendl; + } + + /* No cached token, token expired, or secret invalid: fall back to keystone */ + std::tie(token, failure_reason) = 
get_from_keystone(dpp, access_key_id, string_to_sign, signature); + + if (token) { + /* Fetch secret from keystone for the access_key_id */ + std::tie(secret, failure_reason) = + get_secret_from_keystone(dpp, token->get_user_id(), access_key_id); + + if (secret) { + /* Add token, secret pair to cache, and set timeout */ + secret_cache.add(std::string(access_key_id), *token, *secret); + } + } + + return {token, secret, failure_reason}; +} + +EC2Engine::acl_strategy_t +EC2Engine::get_acl_strategy(const EC2Engine::token_envelope_t&) const +{ + /* This is based on the assumption that the default acl strategy in + * get_perms_from_aclspec, will take care. Extra acl spec is not required. */ + return nullptr; +} + +EC2Engine::auth_info_t +EC2Engine::get_creds_info(const EC2Engine::token_envelope_t& token, + const std::vector& admin_roles, + const std::string& access_key_id + ) const noexcept +{ + using acct_privilege_t = \ + rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + /* Check whether the user has an admin status. */ + acct_privilege_t level = acct_privilege_t::IS_PLAIN_ACCT; + for (const auto& admin_role : admin_roles) { + if (token.has_role(admin_role)) { + level = acct_privilege_t::IS_ADMIN_ACCT; + break; + } + } + + return auth_info_t { + /* Suggested account name for the authenticated user. */ + rgw_user(token.get_project_id()), + /* User's display name (aka real name). */ + token.get_project_name(), + /* Keystone doesn't support RGW's subuser concept, so we cannot cut down + * the access rights through the perm_mask. At least at this layer. 
*/ + RGW_PERM_FULL_CONTROL, + level, + access_key_id, + rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER, + TYPE_KEYSTONE + }; +} + +rgw::auth::Engine::result_t EC2Engine::authenticate( + const DoutPrefixProvider* dpp, + const std::string_view& access_key_id, + const std::string_view& signature, + const std::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + /* Passthorugh only! */ + const req_state* s, + optional_yield y) const +{ + /* This will be initialized on the first call to this method. In C++11 it's + * also thread-safe. */ + static const struct RolesCacher { + explicit RolesCacher(CephContext* const cct) { + get_str_vec(cct->_conf->rgw_keystone_accepted_roles, plain); + get_str_vec(cct->_conf->rgw_keystone_accepted_admin_roles, admin); + + /* Let's suppose that having an admin role implies also a regular one. */ + plain.insert(std::end(plain), std::begin(admin), std::end(admin)); + } + + std::vector plain; + std::vector admin; + } accepted_roles(cct); + + auto [t, secret_key, failure_reason] = + get_access_token(dpp, access_key_id, string_to_sign, signature, signature_factory); + if (! t) { + if (failure_reason == -ERR_SIGNATURE_NO_MATCH) { + // we looked up a secret but it didn't generate the same signature as + // the client. since we found this access key in keystone, we should + // reject the request instead of trying other engines + return result_t::reject(failure_reason); + } + return result_t::deny(failure_reason); + } + + /* Verify expiration. */ + if (t->expired()) { + ldpp_dout(dpp, 0) << "got expired token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expired: " << t->get_expires() << dendl; + return result_t::deny(); + } + + /* check if we have a valid role */ + bool found = false; + for (const auto& role : accepted_roles.plain) { + if (t->has_role(role) == true) { + found = true; + break; + } + } + + if (! 
found) { + ldpp_dout(dpp, 5) << "s3 keystone: user does not hold a matching role;" + " required roles: " + << cct->_conf->rgw_keystone_accepted_roles << dendl; + return result_t::deny(); + } else { + /* everything seems fine, continue with this user */ + ldpp_dout(dpp, 5) << "s3 keystone: validated token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expires: " << t->get_expires() << dendl; + + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t), + get_creds_info(*t, accepted_roles.admin, std::string(access_key_id))); + return result_t::grant(std::move(apl), completer_factory(secret_key)); + } +} + +bool SecretCache::find(const std::string& token_id, + SecretCache::token_envelope_t& token, + std::string &secret) +{ + std::lock_guard l(lock); + + map::iterator iter = secrets.find(token_id); + if (iter == secrets.end()) { + return false; + } + + secret_entry& entry = iter->second; + secrets_lru.erase(entry.lru_iter); + + const utime_t now = ceph_clock_now(); + if (entry.token.expired() || now > entry.expires) { + secrets.erase(iter); + return false; + } + token = entry.token; + secret = entry.secret; + + secrets_lru.push_front(token_id); + entry.lru_iter = secrets_lru.begin(); + + return true; +} + +void SecretCache::add(const std::string& token_id, + const SecretCache::token_envelope_t& token, + const std::string& secret) +{ + std::lock_guard l(lock); + + map::iterator iter = secrets.find(token_id); + if (iter != secrets.end()) { + secret_entry& e = iter->second; + secrets_lru.erase(e.lru_iter); + } + + const utime_t now = ceph_clock_now(); + secrets_lru.push_front(token_id); + secret_entry& entry = secrets[token_id]; + entry.token = token; + entry.secret = secret; + entry.expires = now + s3_token_expiry_length; + entry.lru_iter = secrets_lru.begin(); + + while (secrets_lru.size() > max) { + list::reverse_iterator riter = secrets_lru.rbegin(); + iter = secrets.find(*riter); + assert(iter != secrets.end()); + secrets.erase(iter); 
+ secrets_lru.pop_back(); + } +} + +}; /* namespace keystone */ +}; /* namespace auth */ +}; /* namespace rgw */ diff --git a/src/rgw/rgw_auth_keystone.h b/src/rgw/rgw_auth_keystone.h new file mode 100644 index 000000000..9d79bc878 --- /dev/null +++ b/src/rgw/rgw_auth_keystone.h @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "rgw_auth.h" +#include "rgw_rest_s3.h" +#include "rgw_common.h" +#include "rgw_keystone.h" + +namespace rgw { +namespace auth { +namespace keystone { + +/* Dedicated namespace for Keystone-related auth engines. We need it because + * Keystone offers three different authentication mechanisms (token, EC2 and + * regular user/pass). RadosGW actually does support the first two. */ + +class TokenEngine : public rgw::auth::Engine { + CephContext* const cct; + + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + using auth_info_t = rgw::auth::RemoteApplier::AuthInfo; + using result_t = rgw::auth::Engine::result_t; + using token_envelope_t = rgw::keystone::TokenEnvelope; + + const rgw::auth::TokenExtractor* const auth_token_extractor; + const rgw::auth::TokenExtractor* const service_token_extractor; + const rgw::auth::RemoteApplier::Factory* const apl_factory; + rgw::keystone::Config& config; + rgw::keystone::TokenCache& token_cache; + + /* Helper methods. 
*/ + bool is_applicable(const std::string& token) const noexcept; + + boost::optional + get_from_keystone(const DoutPrefixProvider* dpp, const std::string& token, bool allow_expired) const; + + acl_strategy_t get_acl_strategy(const token_envelope_t& token) const; + auth_info_t get_creds_info(const token_envelope_t& token, + const std::vector& admin_roles + ) const noexcept; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const std::string& service_token, + const req_state* s) const; + +public: + TokenEngine(CephContext* const cct, + const rgw::auth::TokenExtractor* const auth_token_extractor, + const rgw::auth::TokenExtractor* const service_token_extractor, + const rgw::auth::RemoteApplier::Factory* const apl_factory, + rgw::keystone::Config& config, + rgw::keystone::TokenCache& token_cache) + : cct(cct), + auth_token_extractor(auth_token_extractor), + service_token_extractor(service_token_extractor), + apl_factory(apl_factory), + config(config), + token_cache(token_cache) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::keystone::TokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, + optional_yield y) const override { + return authenticate(dpp, auth_token_extractor->get_token(s), service_token_extractor->get_token(s), s); + } +}; /* class TokenEngine */ + +class SecretCache { + using token_envelope_t = rgw::keystone::TokenEnvelope; + + struct secret_entry { + token_envelope_t token; + std::string secret; + utime_t expires; + std::list::iterator lru_iter; + }; + + const boost::intrusive_ptr cct; + + std::map secrets; + std::list secrets_lru; + + std::mutex lock; + + const size_t max; + + const utime_t s3_token_expiry_length; + + SecretCache() + : cct(g_ceph_context), + lock(), + max(cct->_conf->rgw_keystone_token_cache_size), + s3_token_expiry_length(300, 0) { + } + + ~SecretCache() {} + +public: + SecretCache(const SecretCache&) = delete; + void 
operator=(const SecretCache&) = delete; + + static SecretCache& get_instance() { + /* In C++11 this is thread safe. */ + static SecretCache instance; + return instance; + } + + bool find(const std::string& token_id, token_envelope_t& token, std::string& secret); + boost::optional> find(const std::string& token_id) { + token_envelope_t token_envlp; + std::string secret; + if (find(token_id, token_envlp, secret)) { + return boost::make_tuple(token_envlp, secret); + } + return boost::none; + } + void add(const std::string& token_id, const token_envelope_t& token, const std::string& secret); +}; /* class SecretCache */ + +class EC2Engine : public rgw::auth::s3::AWSEngine { + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + using auth_info_t = rgw::auth::RemoteApplier::AuthInfo; + using result_t = rgw::auth::Engine::result_t; + using token_envelope_t = rgw::keystone::TokenEnvelope; + + const rgw::auth::RemoteApplier::Factory* const apl_factory; + rgw::keystone::Config& config; + rgw::keystone::TokenCache& token_cache; + rgw::auth::keystone::SecretCache& secret_cache; + + /* Helper methods. 
*/ + acl_strategy_t get_acl_strategy(const token_envelope_t& token) const; + auth_info_t get_creds_info(const token_envelope_t& token, + const std::vector& admin_roles, + const std::string& access_key_id + ) const noexcept; + std::pair, int> + get_from_keystone(const DoutPrefixProvider* dpp, + const std::string_view& access_key_id, + const std::string& string_to_sign, + const std::string_view& signature) const; + + struct access_token_result { + boost::optional token; + boost::optional secret_key; + int failure_reason = 0; + }; + access_token_result + get_access_token(const DoutPrefixProvider* dpp, + const std::string_view& access_key_id, + const std::string& string_to_sign, + const std::string_view& signature, + const signature_factory_t& signature_factory) const; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string_view& access_key_id, + const std::string_view& signature, + const std::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* s, + optional_yield y) const override; + std::pair, int> get_secret_from_keystone(const DoutPrefixProvider* dpp, + const std::string& user_id, + const std::string_view& access_key_id) const; +public: + EC2Engine(CephContext* const cct, + const rgw::auth::s3::AWSEngine::VersionAbstractor* const ver_abstractor, + const rgw::auth::RemoteApplier::Factory* const apl_factory, + rgw::keystone::Config& config, + /* The token cache is used ONLY for the retrieving admin token. + * Due to the architecture of AWS Auth S3 credentials cannot be + * cached at all. 
*/ + rgw::keystone::TokenCache& token_cache, + rgw::auth::keystone::SecretCache& secret_cache) + : AWSEngine(cct, *ver_abstractor), + apl_factory(apl_factory), + config(config), + token_cache(token_cache), + secret_cache(secret_cache) { + } + + using AWSEngine::authenticate; + + const char* get_name() const noexcept override { + return "rgw::auth::keystone::EC2Engine"; + } + +}; /* class EC2Engine */ + +}; /* namespace keystone */ +}; /* namespace auth */ +}; /* namespace rgw */ diff --git a/src/rgw/rgw_auth_registry.h b/src/rgw/rgw_auth_registry.h new file mode 100644 index 000000000..b9d239aec --- /dev/null +++ b/src/rgw/rgw_auth_registry.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include + +#include "rgw_auth.h" +#include "rgw_auth_s3.h" +#include "rgw_swift_auth.h" +#include "rgw_rest_sts.h" + +namespace rgw { +namespace auth { + +/* A class aggregating the knowledge about all Strategies in RadosGW. It is + * responsible for handling the dynamic reconfiguration on e.g. realm update. 
*/ +class StrategyRegistry { + template + using s3_strategy_t = \ + rgw::auth::s3::AWSAuthStrategy; + + struct s3_main_strategy_t : public Strategy { + using s3_main_strategy_plain_t = \ + s3_strategy_t; + using s3_main_strategy_boto2_t = \ + s3_strategy_t; + + s3_main_strategy_plain_t s3_main_strategy_plain; + s3_main_strategy_boto2_t s3_main_strategy_boto2; + + s3_main_strategy_t(CephContext* const cct, + const ImplicitTenants& implicit_tenant_context, + rgw::sal::Driver* driver) + : s3_main_strategy_plain(cct, implicit_tenant_context, driver), + s3_main_strategy_boto2(cct, implicit_tenant_context, driver) { + add_engine(Strategy::Control::SUFFICIENT, s3_main_strategy_plain); + add_engine(Strategy::Control::FALLBACK, s3_main_strategy_boto2); + } + + const char* get_name() const noexcept override { + return "rgw::auth::StrategyRegistry::s3_main_strategy_t"; + } + } s3_main_strategy; + + using s3_post_strategy_t = \ + s3_strategy_t; + s3_post_strategy_t s3_post_strategy; + + rgw::auth::swift::DefaultStrategy swift_strategy; + + rgw::auth::sts::DefaultStrategy sts_strategy; + +public: + StrategyRegistry(CephContext* const cct, + const ImplicitTenants& implicit_tenant_context, + rgw::sal::Driver* driver) + : s3_main_strategy(cct, implicit_tenant_context, driver), + s3_post_strategy(cct, implicit_tenant_context, driver), + swift_strategy(cct, implicit_tenant_context, driver), + sts_strategy(cct, implicit_tenant_context, driver) { + } + + const s3_main_strategy_t& get_s3_main() const { + return s3_main_strategy; + } + + const s3_post_strategy_t& get_s3_post() const { + return s3_post_strategy; + } + + const rgw::auth::swift::DefaultStrategy& get_swift() const { + return swift_strategy; + } + + const rgw::auth::sts::DefaultStrategy& get_sts() const { + return sts_strategy; + } + + static std::unique_ptr + create(CephContext* const cct, + const ImplicitTenants& implicit_tenant_context, + rgw::sal::Driver* driver) { + return std::make_unique(cct, implicit_tenant_context, 
driver); + } +}; + +} /* namespace auth */ +} /* namespace rgw */ + +using rgw_auth_registry_t = rgw::auth::StrategyRegistry; +using rgw_auth_registry_ptr_t = std::unique_ptr; diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc new file mode 100644 index 000000000..0797f8184 --- /dev/null +++ b/src/rgw/rgw_auth_s3.cc @@ -0,0 +1,1355 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include +#include +#include + +#include "common/armor.h" +#include "common/utf8.h" +#include "rgw_rest_s3.h" +#include "rgw_auth_s3.h" +#include "rgw_common.h" +#include "rgw_client_io.h" +#include "rgw_rest.h" +#include "rgw_crypt_sanitize.h" + +#include +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static const auto signed_subresources = { + "acl", + "cors", + "delete", + "encryption", + "lifecycle", + "location", + "logging", + "notification", + "partNumber", + "policy", + "policyStatus", + "publicAccessBlock", + "requestPayment", + "response-cache-control", + "response-content-disposition", + "response-content-encoding", + "response-content-language", + "response-content-type", + "response-expires", + "tagging", + "torrent", + "uploadId", + "uploads", + "versionId", + "versioning", + "versions", + "website", + "object-lock" +}; + +/* + * ?get the canonical amazon-style header for something? 
+ */ + +static std::string +get_canon_amz_hdr(const meta_map_t& meta_map) +{ + std::string dest; + + for (const auto& kv : meta_map) { + dest.append(kv.first); + dest.append(":"); + dest.append(kv.second); + dest.append("\n"); + } + + return dest; +} + +/* + * ?get the canonical representation of the object's location + */ +static std::string +get_canon_resource(const DoutPrefixProvider *dpp, const char* const request_uri, + const std::map& sub_resources) +{ + std::string dest; + + if (request_uri) { + dest.append(request_uri); + } + + bool initial = true; + for (const auto& subresource : signed_subresources) { + const auto iter = sub_resources.find(subresource); + if (iter == std::end(sub_resources)) { + continue; + } + + if (initial) { + dest.append("?"); + initial = false; + } else { + dest.append("&"); + } + + dest.append(iter->first); + if (! iter->second.empty()) { + dest.append("="); + dest.append(iter->second); + } + } + + ldpp_dout(dpp, 10) << "get_canon_resource(): dest=" << dest << dendl; + return dest; +} + +/* + * get the header authentication information required to + * compute a request's signature + */ +void rgw_create_s3_canonical_header( + const DoutPrefixProvider *dpp, + const char* const method, + const char* const content_md5, + const char* const content_type, + const char* const date, + const meta_map_t& meta_map, + const meta_map_t& qs_map, + const char* const request_uri, + const std::map& sub_resources, + std::string& dest_str) +{ + std::string dest; + + if (method) { + dest = method; + } + dest.append("\n"); + + if (content_md5) { + dest.append(content_md5); + } + dest.append("\n"); + + if (content_type) { + dest.append(content_type); + } + dest.append("\n"); + + if (date) { + dest.append(date); + } + dest.append("\n"); + + dest.append(get_canon_amz_hdr(meta_map)); + dest.append(get_canon_amz_hdr(qs_map)); + dest.append(get_canon_resource(dpp, request_uri, sub_resources)); + + dest_str = dest; +} + +static inline bool 
is_base64_for_content_md5(unsigned char c) { + return (isalnum(c) || isspace(c) || (c == '+') || (c == '/') || (c == '=')); +} + +static inline void get_v2_qs_map(const req_info& info, + meta_map_t& qs_map) { + const auto& params = const_cast(info.args).get_params(); + for (const auto& elt : params) { + std::string k = boost::algorithm::to_lower_copy(elt.first); + if (k.find("x-amz-meta-") == /* offset */ 0) { + rgw_add_amz_meta_header(qs_map, k, elt.second); + } + if (k == "x-amz-security-token") { + qs_map[k] = elt.second; + } + } +} + +/* + * get the header authentication information required to + * compute a request's signature + */ +bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, + const req_info& info, + utime_t* const header_time, + std::string& dest, + const bool qsr) +{ + const char* const content_md5 = info.env->get("HTTP_CONTENT_MD5"); + if (content_md5) { + for (const char *p = content_md5; *p; p++) { + if (!is_base64_for_content_md5(*p)) { + ldpp_dout(dpp, 0) << "NOTICE: bad content-md5 provided (not base64)," + << " aborting request p=" << *p << " " << (int)*p << dendl; + return false; + } + } + } + + const char *content_type = info.env->get("CONTENT_TYPE"); + + std::string date; + meta_map_t qs_map; + + if (qsr) { + get_v2_qs_map(info, qs_map); // handle qs metadata + date = info.args.get("Expires"); + } else { + const char *str = info.env->get("HTTP_X_AMZ_DATE"); + const char *req_date = str; + if (str == NULL) { + req_date = info.env->get("HTTP_DATE"); + if (!req_date) { + ldpp_dout(dpp, 0) << "NOTICE: missing date for auth header" << dendl; + return false; + } + date = req_date; + } + + if (header_time) { + struct tm t; + uint32_t ns = 0; + if (!parse_rfc2616(req_date, &t) && !parse_iso8601(req_date, &t, &ns, false)) { + ldpp_dout(dpp, 0) << "NOTICE: failed to parse date <" << req_date << "> for auth header" << dendl; + return false; + } + if (t.tm_year < 70) { + ldpp_dout(dpp, 0) << "NOTICE: bad date (predates epoch): " << 
req_date << dendl; + return false; + } + *header_time = utime_t(internal_timegm(&t), 0); + *header_time -= t.tm_gmtoff; + } + } + + const auto& meta_map = info.x_meta_map; + const auto& sub_resources = info.args.get_sub_resources(); + + std::string request_uri; + if (info.effective_uri.empty()) { + request_uri = info.request_uri; + } else { + request_uri = info.effective_uri; + } + + rgw_create_s3_canonical_header(dpp, info.method, content_md5, content_type, + date.c_str(), meta_map, qs_map, + request_uri.c_str(), sub_resources, dest); + return true; +} + + +namespace rgw::auth::s3 { + +bool is_time_skew_ok(time_t t) +{ + auto req_tp = ceph::coarse_real_clock::from_time_t(t); + auto cur_tp = ceph::coarse_real_clock::now(); + + if (std::chrono::abs(cur_tp - req_tp) > RGW_AUTH_GRACE) { + dout(10) << "NOTICE: request time skew too big." << dendl; + using ceph::operator<<; + dout(10) << "req_tp=" << req_tp << ", cur_tp=" << cur_tp << dendl; + return false; + } + + return true; +} + +static inline int parse_v4_query_string(const req_info& info, /* in */ + std::string_view& credential, /* out */ + std::string_view& signedheaders, /* out */ + std::string_view& signature, /* out */ + std::string_view& date, /* out */ + std::string_view& sessiontoken) /* out */ +{ + /* auth ships with req params ... */ + + /* look for required params */ + credential = info.args.get("x-amz-credential"); + if (credential.size() == 0) { + return -EPERM; + } + + date = info.args.get("x-amz-date"); + struct tm date_t; + if (!parse_iso8601(sview2cstr(date).data(), &date_t, nullptr, false)) { + return -EPERM; + } + + std::string_view expires = info.args.get("x-amz-expires"); + if (expires.empty()) { + return -EPERM; + } + /* X-Amz-Expires provides the time period, in seconds, for which + the generated presigned URL is valid. 
The minimum value + you can set is 1, and the maximum is 604800 (seven days) */ + time_t exp = atoll(expires.data()); + if ((exp < 1) || (exp > 7*24*60*60)) { + dout(10) << "NOTICE: exp out of range, exp = " << exp << dendl; + return -EPERM; + } + /* handle expiration in epoch time */ + uint64_t req_sec = (uint64_t)internal_timegm(&date_t); + uint64_t now = ceph_clock_now(); + if (now >= req_sec + exp) { + dout(10) << "NOTICE: now = " << now << ", req_sec = " << req_sec << ", exp = " << exp << dendl; + return -EPERM; + } + + signedheaders = info.args.get("x-amz-signedheaders"); + if (signedheaders.size() == 0) { + return -EPERM; + } + + signature = info.args.get("x-amz-signature"); + if (signature.size() == 0) { + return -EPERM; + } + + if (info.args.exists("x-amz-security-token")) { + sessiontoken = info.args.get("x-amz-security-token"); + if (sessiontoken.size() == 0) { + return -EPERM; + } + } + + return 0; +} + +static bool get_next_token(const std::string_view& s, + size_t& pos, + const char* const delims, + std::string_view& token) +{ + const size_t start = s.find_first_not_of(delims, pos); + if (start == std::string_view::npos) { + pos = s.size(); + return false; + } + + size_t end = s.find_first_of(delims, start); + if (end != std::string_view::npos) + pos = end + 1; + else { + pos = end = s.size(); + } + + token = s.substr(start, end - start); + return true; +} + +template +boost::container::small_vector +get_str_vec(const std::string_view& str, const char* const delims) +{ + boost::container::small_vector str_vec; + + size_t pos = 0; + std::string_view token; + while (pos < str.size()) { + if (get_next_token(str, pos, delims, token)) { + if (token.size() > 0) { + str_vec.push_back(token); + } + } + } + + return str_vec; +} + +template +boost::container::small_vector +get_str_vec(const std::string_view& str) +{ + const char delims[] = ";,= \t"; + return get_str_vec(str, delims); +} + +static inline int parse_v4_auth_header(const req_info& info, /* in */ + 
std::string_view& credential, /* out */ + std::string_view& signedheaders, /* out */ + std::string_view& signature, /* out */ + std::string_view& date, /* out */ + std::string_view& sessiontoken, /* out */ + const DoutPrefixProvider *dpp) +{ + std::string_view input(info.env->get("HTTP_AUTHORIZATION", "")); + try { + input = input.substr(::strlen(AWS4_HMAC_SHA256_STR) + 1); + } catch (std::out_of_range&) { + /* We should never ever run into this situation as the presence of + * AWS4_HMAC_SHA256_STR had been verified earlier. */ + ldpp_dout(dpp, 10) << "credentials string is too short" << dendl; + return -EINVAL; + } + + std::map kv; + for (const auto& s : get_str_vec<4>(input, ",")) { + const auto parsed_pair = parse_key_value(s); + if (parsed_pair) { + kv[parsed_pair->first] = parsed_pair->second; + } else { + ldpp_dout(dpp, 10) << "NOTICE: failed to parse auth header (s=" << s << ")" + << dendl; + return -EINVAL; + } + } + + static const std::array required_keys = { + "Credential", + "SignedHeaders", + "Signature" + }; + + /* Ensure that the presigned required keys are really there. 
*/ + for (const auto& k : required_keys) { + if (kv.find(k) == std::end(kv)) { + ldpp_dout(dpp, 10) << "NOTICE: auth header missing key: " << k << dendl; + return -EINVAL; + } + } + + credential = kv["Credential"]; + signedheaders = kv["SignedHeaders"]; + signature = kv["Signature"]; + + /* sig hex str */ + ldpp_dout(dpp, 10) << "v4 signature format = " << signature << dendl; + + /* ------------------------- handle x-amz-date header */ + + /* grab date */ + + const char *d = info.env->get("HTTP_X_AMZ_DATE"); + + struct tm t; + if (unlikely(d == NULL)) { + d = info.env->get("HTTP_DATE"); + } + if (!d || !parse_iso8601(d, &t, NULL, false)) { + ldpp_dout(dpp, 10) << "error reading date via http_x_amz_date and http_date" << dendl; + return -EACCES; + } + date = d; + + if (!is_time_skew_ok(internal_timegm(&t))) { + return -ERR_REQUEST_TIME_SKEWED; + } + + auto token = info.env->get_optional("HTTP_X_AMZ_SECURITY_TOKEN"); + if (token) { + sessiontoken = *token; + } + + return 0; +} + +bool is_non_s3_op(RGWOpType op_type) +{ + if (op_type == RGW_STS_GET_SESSION_TOKEN || + op_type == RGW_STS_ASSUME_ROLE || + op_type == RGW_STS_ASSUME_ROLE_WEB_IDENTITY || + op_type == RGW_OP_CREATE_ROLE || + op_type == RGW_OP_DELETE_ROLE || + op_type == RGW_OP_GET_ROLE || + op_type == RGW_OP_MODIFY_ROLE_TRUST_POLICY || + op_type == RGW_OP_LIST_ROLES || + op_type == RGW_OP_PUT_ROLE_POLICY || + op_type == RGW_OP_GET_ROLE_POLICY || + op_type == RGW_OP_LIST_ROLE_POLICIES || + op_type == RGW_OP_DELETE_ROLE_POLICY || + op_type == RGW_OP_PUT_USER_POLICY || + op_type == RGW_OP_GET_USER_POLICY || + op_type == RGW_OP_LIST_USER_POLICIES || + op_type == RGW_OP_DELETE_USER_POLICY || + op_type == RGW_OP_CREATE_OIDC_PROVIDER || + op_type == RGW_OP_DELETE_OIDC_PROVIDER || + op_type == RGW_OP_GET_OIDC_PROVIDER || + op_type == RGW_OP_LIST_OIDC_PROVIDERS || + op_type == RGW_OP_PUBSUB_TOPIC_CREATE || + op_type == RGW_OP_PUBSUB_TOPICS_LIST || + op_type == RGW_OP_PUBSUB_TOPIC_GET || + op_type == 
RGW_OP_PUBSUB_TOPIC_DELETE || + op_type == RGW_OP_TAG_ROLE || + op_type == RGW_OP_LIST_ROLE_TAGS || + op_type == RGW_OP_UNTAG_ROLE || + op_type == RGW_OP_UPDATE_ROLE) { + return true; + } + return false; +} + +int parse_v4_credentials(const req_info& info, /* in */ + std::string_view& access_key_id, /* out */ + std::string_view& credential_scope, /* out */ + std::string_view& signedheaders, /* out */ + std::string_view& signature, /* out */ + std::string_view& date, /* out */ + std::string_view& session_token, /* out */ + const bool using_qs, /* in */ + const DoutPrefixProvider *dpp) +{ + std::string_view credential; + int ret; + if (using_qs) { + ret = parse_v4_query_string(info, credential, signedheaders, + signature, date, session_token); + } else { + ret = parse_v4_auth_header(info, credential, signedheaders, + signature, date, session_token, dpp); + } + + if (ret < 0) { + return ret; + } + + /* access_key/YYYYMMDD/region/service/aws4_request */ + ldpp_dout(dpp, 10) << "v4 credential format = " << credential << dendl; + + if (std::count(credential.begin(), credential.end(), '/') != 4) { + return -EINVAL; + } + + /* credential must end with 'aws4_request' */ + if (credential.find("aws4_request") == std::string::npos) { + return -EINVAL; + } + + /* grab access key id */ + const size_t pos = credential.find("/"); + access_key_id = credential.substr(0, pos); + ldpp_dout(dpp, 10) << "access key id = " << access_key_id << dendl; + + /* grab credential scope */ + credential_scope = credential.substr(pos + 1); + ldpp_dout(dpp, 10) << "credential scope = " << credential_scope << dendl; + + return 0; +} + +string gen_v4_scope(const ceph::real_time& timestamp, + const string& region, + const string& service) +{ + + auto sec = real_clock::to_time_t(timestamp); + + struct tm bt; + gmtime_r(&sec, &bt); + + auto year = 1900 + bt.tm_year; + auto mon = bt.tm_mon + 1; + auto day = bt.tm_mday; + + return fmt::format(FMT_STRING("{:d}{:02d}{:02d}/{:s}/{:s}/aws4_request"), + year, 
mon, day, region, service); +} + +std::string get_v4_canonical_qs(const req_info& info, const bool using_qs) +{ + const std::string *params = &info.request_params; + std::string copy_params; + if (params->empty()) { + /* Optimize the typical flow. */ + return std::string(); + } + if (params->find_first_of('+') != std::string::npos) { + copy_params = *params; + boost::replace_all(copy_params, "+", "%20"); + params = ©_params; + } + + /* Handle case when query string exists. Step 3 described in: http://docs. + * aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html */ + std::map canonical_qs_map; + for (const auto& s : get_str_vec<5>(*params, "&")) { + std::string_view key, val; + const auto parsed_pair = parse_key_value(s); + if (parsed_pair) { + std::tie(key, val) = *parsed_pair; + } else { + /* Handling a parameter without any value (even the empty one). That's + * it, we've encountered something like "this_param&other_param=val" + * which is used by S3 for subresources. */ + key = s; + } + + if (using_qs && boost::iequals(key, "X-Amz-Signature")) { + /* Preserving the original behaviour of get_v4_canonical_qs() here. */ + continue; + } + + // while awsv4 specs ask for all slashes to be encoded, s3 itself is relaxed + // in its implementation allowing non-url-encoded slashes to be present in + // presigned urls for instance + canonical_qs_map[aws4_uri_recode(key, true)] = aws4_uri_recode(val, true); + } + + /* Thanks to the early exist we have the guarantee that canonical_qs_map has + * at least one element. 
*/ + auto iter = std::begin(canonical_qs_map); + std::string canonical_qs; + canonical_qs.append(iter->first) + .append("=", ::strlen("=")) + .append(iter->second); + + for (iter++; iter != std::end(canonical_qs_map); iter++) { + canonical_qs.append("&", ::strlen("&")) + .append(iter->first) + .append("=", ::strlen("=")) + .append(iter->second); + } + + return canonical_qs; +} + +static void add_v4_canonical_params_from_map(const map& m, + std::map *result, + bool is_non_s3_op) +{ + for (auto& entry : m) { + const auto& key = entry.first; + if (key.empty() || (is_non_s3_op && key == "PayloadHash")) { + continue; + } + + (*result)[aws4_uri_recode(key, true)] = aws4_uri_recode(entry.second, true); + } +} + +std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op) +{ + std::map canonical_qs_map; + + add_v4_canonical_params_from_map(info.args.get_params(), &canonical_qs_map, is_non_s3_op); + add_v4_canonical_params_from_map(info.args.get_sys_params(), &canonical_qs_map, false); + + if (canonical_qs_map.empty()) { + return string(); + } + + /* Thanks to the early exit we have the guarantee that canonical_qs_map has + * at least one element. */ + auto iter = std::begin(canonical_qs_map); + std::string canonical_qs; + canonical_qs.append(iter->first) + .append("=", ::strlen("=")) + .append(iter->second); + + for (iter++; iter != std::end(canonical_qs_map); iter++) { + canonical_qs.append("&", ::strlen("&")) + .append(iter->first) + .append("=", ::strlen("=")) + .append(iter->second); + } + + return canonical_qs; +} + +std::string get_v4_canonical_method(const req_state* s) +{ + /* If this is a OPTIONS request we need to compute the v4 signature for the + * intended HTTP method and not the OPTIONS request itself. */ + if (s->op_type == RGW_OP_OPTIONS_CORS) { + const char *cors_method = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + + if (cors_method) { + /* Validate request method passed in access-control-request-method is valid. 
*/ + auto cors_flags = get_cors_method_flags(cors_method); + if (!cors_flags) { + ldpp_dout(s, 1) << "invalid access-control-request-method header = " + << cors_method << dendl; + throw -EINVAL; + } + + ldpp_dout(s, 10) << "canonical req method = " << cors_method + << ", due to access-control-request-method header" << dendl; + return cors_method; + } else { + ldpp_dout(s, 1) << "invalid http options req missing " + << "access-control-request-method header" << dendl; + throw -EINVAL; + } + } + + return s->info.method; +} + +boost::optional +get_v4_canonical_headers(const req_info& info, + const std::string_view& signedheaders, + const bool using_qs, + const bool force_boto2_compat) +{ + std::map canonical_hdrs_map; + for (const auto& token : get_str_vec<5>(signedheaders, ";")) { + /* TODO(rzarzynski): we'd like to switch to sstring here but it should + * get push_back() and reserve() first. */ + std::string token_env = "HTTP_"; + token_env.reserve(token.length() + std::strlen("HTTP_") + 1); + + std::transform(std::begin(token), std::end(token), + std::back_inserter(token_env), [](const int c) { + return c == '-' ? '_' : c == '_' ? 
'-' : std::toupper(c); + }); + + if (token_env == "HTTP_CONTENT_LENGTH") { + token_env = "CONTENT_LENGTH"; + } else if (token_env == "HTTP_CONTENT_TYPE") { + token_env = "CONTENT_TYPE"; + } + const char* const t = info.env->get(token_env.c_str()); + if (!t) { + dout(10) << "warning env var not available " << token_env.c_str() << dendl; + continue; + } + + std::string token_value(t); + if (token_env == "HTTP_CONTENT_MD5" && + !std::all_of(std::begin(token_value), std::end(token_value), + is_base64_for_content_md5)) { + dout(0) << "NOTICE: bad content-md5 provided (not base64)" + << ", aborting request" << dendl; + return boost::none; + } + + if (force_boto2_compat && using_qs && token == "host") { + std::string_view port = info.env->get("SERVER_PORT", ""); + std::string_view secure_port = info.env->get("SERVER_PORT_SECURE", ""); + + if (!secure_port.empty()) { + if (secure_port != "443") + token_value.append(":", std::strlen(":")) + .append(secure_port.data(), secure_port.length()); + } else if (!port.empty()) { + if (port != "80") + token_value.append(":", std::strlen(":")) + .append(port.data(), port.length()); + } + } + + canonical_hdrs_map[token] = rgw_trim_whitespace(token_value); + } + + std::string canonical_hdrs; + for (const auto& header : canonical_hdrs_map) { + const std::string_view& name = header.first; + std::string value = header.second; + boost::trim_all(value); + + canonical_hdrs.append(name.data(), name.length()) + .append(":", std::strlen(":")) + .append(value) + .append("\n", std::strlen("\n")); + } + return canonical_hdrs; +} + +static void handle_header(const string& header, const string& val, + std::map *canonical_hdrs_map) +{ + /* TODO(rzarzynski): we'd like to switch to sstring here but it should + * get push_back() and reserve() first. 
*/ + + std::string token; + token.reserve(header.length()); + + if (header == "HTTP_CONTENT_LENGTH") { + token = "content-length"; + } else if (header == "HTTP_CONTENT_TYPE") { + token = "content-type"; + } else { + auto start = std::begin(header); + if (boost::algorithm::starts_with(header, "HTTP_")) { + start += 5; /* len("HTTP_") */ + } + + std::transform(start, std::end(header), + std::back_inserter(token), [](const int c) { + return c == '_' ? '-' : std::tolower(c); + }); + } + + (*canonical_hdrs_map)[token] = rgw_trim_whitespace(val); +} + +std::string gen_v4_canonical_headers(const req_info& info, + const map& extra_headers, + string *signed_hdrs) +{ + std::map canonical_hdrs_map; + for (auto& entry : info.env->get_map()) { + handle_header(entry.first, entry.second, &canonical_hdrs_map); + } + for (auto& entry : extra_headers) { + handle_header(entry.first, entry.second, &canonical_hdrs_map); + } + + std::string canonical_hdrs; + signed_hdrs->clear(); + for (const auto& header : canonical_hdrs_map) { + const auto& name = header.first; + std::string value = header.second; + boost::trim_all(value); + + if (!signed_hdrs->empty()) { + signed_hdrs->append(";"); + } + signed_hdrs->append(name); + + canonical_hdrs.append(name.data(), name.length()) + .append(":", std::strlen(":")) + .append(value) + .append("\n", std::strlen("\n")); + } + + return canonical_hdrs; +} + +/* + * create canonical request for signature version 4 + * + * http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + */ +sha256_digest_t +get_v4_canon_req_hash(CephContext* cct, + const std::string_view& http_verb, + const std::string& canonical_uri, + const std::string& canonical_qs, + const std::string& canonical_hdrs, + const std::string_view& signed_hdrs, + const std::string_view& request_payload_hash, + const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 10) << "payload request hash = " << request_payload_hash << dendl; + + const auto canonical_req = 
string_join_reserve("\n", + http_verb, + canonical_uri, + canonical_qs, + canonical_hdrs, + signed_hdrs, + request_payload_hash); + + const auto canonical_req_hash = calc_hash_sha256(canonical_req); + + using sanitize = rgw::crypt_sanitize::log_content; + ldpp_dout(dpp, 10) << "canonical request = " << sanitize{canonical_req} << dendl; + ldpp_dout(dpp, 10) << "canonical request hash = " + << canonical_req_hash << dendl; + + return canonical_req_hash; +} + +/* + * create string to sign for signature version 4 + * + * http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html + */ +AWSEngine::VersionAbstractor::string_to_sign_t +get_v4_string_to_sign(CephContext* const cct, + const std::string_view& algorithm, + const std::string_view& request_date, + const std::string_view& credential_scope, + const sha256_digest_t& canonreq_hash, + const DoutPrefixProvider *dpp) +{ + const auto hexed_cr_hash = canonreq_hash.to_str(); + const std::string_view hexed_cr_hash_str(hexed_cr_hash); + + const auto string_to_sign = string_join_reserve("\n", + algorithm, + request_date, + credential_scope, + hexed_cr_hash_str); + + ldpp_dout(dpp, 10) << "string to sign = " + << rgw::crypt_sanitize::log_content{string_to_sign} + << dendl; + + return string_to_sign; +} + + +static inline std::tuple /* service */ +parse_cred_scope(std::string_view credential_scope) +{ + /* date cred */ + size_t pos = credential_scope.find("/"); + const auto date_cs = credential_scope.substr(0, pos); + credential_scope = credential_scope.substr(pos + 1); + + /* region cred */ + pos = credential_scope.find("/"); + const auto region_cs = credential_scope.substr(0, pos); + credential_scope = credential_scope.substr(pos + 1); + + /* service cred */ + pos = credential_scope.find("/"); + const auto service_cs = credential_scope.substr(0, pos); + + return std::make_tuple(date_cs, region_cs, service_cs); +} + +static inline std::vector +transform_secret_key(const std::string_view& secret_access_key) 
+{ + /* TODO(rzarzynski): switch to constexpr when C++14 becomes available. */ + static const std::initializer_list AWS4 { 'A', 'W', 'S', '4' }; + + /* boost::container::small_vector might be used here if someone wants to + * optimize out even more dynamic allocations. */ + std::vector secret_key_utf8; + secret_key_utf8.reserve(AWS4.size() + secret_access_key.size()); + secret_key_utf8.assign(AWS4); + + for (const auto c : secret_access_key) { + std::array buf; + const size_t n = encode_utf8(c, buf.data()); + secret_key_utf8.insert(std::end(secret_key_utf8), + std::begin(buf), std::begin(buf) + n); + } + + return secret_key_utf8; +} + +/* + * calculate the SigningKey of AWS auth version 4 + */ +static sha256_digest_t +get_v4_signing_key(CephContext* const cct, + const std::string_view& credential_scope, + const std::string_view& secret_access_key, + const DoutPrefixProvider *dpp) +{ + std::string_view date, region, service; + std::tie(date, region, service) = parse_cred_scope(credential_scope); + + const auto utfed_sec_key = transform_secret_key(secret_access_key); + const auto date_k = calc_hmac_sha256(utfed_sec_key, date); + const auto region_k = calc_hmac_sha256(date_k, region); + const auto service_k = calc_hmac_sha256(region_k, service); + + /* aws4_request */ + const auto signing_key = calc_hmac_sha256(service_k, + std::string_view("aws4_request")); + + ldpp_dout(dpp, 10) << "date_k = " << date_k << dendl; + ldpp_dout(dpp, 10) << "region_k = " << region_k << dendl; + ldpp_dout(dpp, 10) << "service_k = " << service_k << dendl; + ldpp_dout(dpp, 10) << "signing_k = " << signing_key << dendl; + + return signing_key; +} + +/* + * calculate the AWS signature version 4 + * + * http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html + * + * srv_signature_t is an alias over Ceph's basic_sstring. We're using + * it to keep everything within the stack boundaries instead of doing + * dynamic allocations. 
+ */ +AWSEngine::VersionAbstractor::server_signature_t +get_v4_signature(const std::string_view& credential_scope, + CephContext* const cct, + const std::string_view& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign, + const DoutPrefixProvider *dpp) +{ + auto signing_key = get_v4_signing_key(cct, credential_scope, secret_key, dpp); + + /* The server-side generated digest for comparison. */ + const auto digest = calc_hmac_sha256(signing_key, string_to_sign); + + /* TODO(rzarzynski): I would love to see our sstring having reserve() and + * the non-const data() variant like C++17's std::string. */ + using srv_signature_t = AWSEngine::VersionAbstractor::server_signature_t; + srv_signature_t signature(srv_signature_t::initialized_later(), + digest.SIZE * 2); + buf_to_hex(digest.v, digest.SIZE, signature.begin()); + + ldpp_dout(dpp, 10) << "generated signature = " << signature << dendl; + + return signature; +} + +AWSEngine::VersionAbstractor::server_signature_t +get_v2_signature(CephContext* const cct, + const std::string& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign) +{ + if (secret_key.empty()) { + throw -EINVAL; + } + + const auto digest = calc_hmac_sha1(secret_key, string_to_sign); + + /* 64 is really enough */; + char buf[64]; + const int ret = ceph_armor(std::begin(buf), + std::begin(buf) + 64, + reinterpret_cast(digest.v), + reinterpret_cast(digest.v + digest.SIZE)); + if (ret < 0) { + ldout(cct, 10) << "ceph_armor failed" << dendl; + throw ret; + } else { + buf[ret] = '\0'; + using srv_signature_t = AWSEngine::VersionAbstractor::server_signature_t; + return srv_signature_t(buf, ret); + } +} + +bool AWSv4ComplMulti::ChunkMeta::is_new_chunk_in_stream(size_t stream_pos) const +{ + return stream_pos >= (data_offset_in_stream + data_length); +} + +size_t AWSv4ComplMulti::ChunkMeta::get_data_size(size_t stream_pos) const +{ + if (stream_pos > (data_offset_in_stream + data_length)) { + /* Data in 
parsing_buf. */ + return data_length; + } else { + return data_offset_in_stream + data_length - stream_pos; + } +} + + +/* AWSv4 completers begin. */ +/* + * Parse the metadata that precedes a chunk's payload, i.e. + * "<hex-size>;<key>=<signature>\r\n", out of metabuf. + * + * Returns the ChunkMeta describing the new chunk paired with the number of + * metabuf bytes occupied by that metadata. Throws rgw::io::Exception(EINVAL) + * when a separator is missing, the size cannot be parsed or the signature is + * not SIG_SIZE characters long. + */ +std::pair +AWSv4ComplMulti::ChunkMeta::create_next(CephContext* const cct, + ChunkMeta&& old, + const char* const metabuf, + const size_t metabuf_len) +{ + std::string_view metastr(metabuf, metabuf_len); + + const size_t semicolon_pos = metastr.find(";"); + if (semicolon_pos == std::string_view::npos) { + ldout(cct, 20) << "AWSv4ComplMulti cannot find the ';' separator" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + char* data_field_end; + /* strtoull ignores the "\r\n" sequence after each non-first chunk. */ + const size_t data_length = std::strtoull(metabuf, &data_field_end, 16); + if (data_length == 0 && data_field_end == metabuf) { + ldout(cct, 20) << "AWSv4ComplMulti: cannot parse the data size" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + /* Parse the chunk_signature=... part. */ + const auto signature_part = metastr.substr(semicolon_pos + 1); + const size_t eq_sign_pos = signature_part.find("="); + if (eq_sign_pos == std::string_view::npos) { + ldout(cct, 20) << "AWSv4ComplMulti: cannot find the '=' separator" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + /* OK, we have at least the beginning of a signature. 
*/ + const size_t data_sep_pos = signature_part.find("\r\n"); + if (data_sep_pos == std::string_view::npos) { + ldout(cct, 20) << "AWSv4ComplMulti: no new line at signature end" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + const auto signature = \ + signature_part.substr(eq_sign_pos + 1, data_sep_pos - 1 - eq_sign_pos); + if (signature.length() != SIG_SIZE) { + ldout(cct, 20) << "AWSv4ComplMulti: signature.length() != 64" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + /* Length of the metadata: everything up to and including the "\r\n" that + * terminates the signature. It is derived from the parsed positions instead + * of a hard-coded 83 so that the value handed back to the caller always + * agrees with data_starts_in_stream and never exceeds metabuf_len, even when + * the key in front of '=' is not exactly "chunk-signature". */ + const size_t meta_len = \ + semicolon_pos + strlen(";") + data_sep_pos + strlen("\r\n"); + + const size_t data_starts_in_stream = \ + meta_len + old.data_offset_in_stream + old.data_length; + + ldout(cct, 20) << "parsed new chunk; signature=" << signature + << ", data_length=" << data_length + << ", data_starts_in_stream=" << data_starts_in_stream + << dendl; + + return std::make_pair(ChunkMeta(data_starts_in_stream, + data_length, + signature), + meta_len); +} + +std::string +AWSv4ComplMulti::calc_chunk_signature(const std::string& payload_hash) const +{ + const auto string_to_sign = string_join_reserve("\n", + AWS4_HMAC_SHA256_PAYLOAD_STR, + date, + credential_scope, + prev_chunk_signature, + AWS4_EMPTY_PAYLOAD_HASH, + payload_hash); + + ldout(cct, 20) << "AWSv4ComplMulti: string_to_sign=\n" << string_to_sign + << dendl; + + /* new chunk signature */ + const auto sig = calc_hmac_sha256(signing_key, string_to_sign); + /* FIXME(rzarzynski): std::string here is really unnecessary. 
*/ + return sig.to_str(); +} + + +bool AWSv4ComplMulti::is_signature_mismatched() +{ + /* The validity of previous chunk can be verified only after getting meta- + * data of the next one.
*/ + const auto payload_hash = calc_hash_sha256_restart_stream(&sha256_hash); + const auto calc_signature = calc_chunk_signature(payload_hash); + + if (chunk_meta.get_signature() != calc_signature) { + ldout(cct, 20) << "AWSv4ComplMulti: ERROR: chunk signature mismatch" + << dendl; + ldout(cct, 20) << "AWSv4ComplMulti: declared signature=" + << chunk_meta.get_signature() << dendl; + ldout(cct, 20) << "AWSv4ComplMulti: calculated signature=" + << calc_signature << dendl; + + return true; + } else { + prev_chunk_signature = chunk_meta.get_signature(); + return false; + } +} + +/* Fill buf with at most buf_max bytes of chunk payload and return the number + * of bytes stored. When stream_pos has moved past the current chunk, the + * metadata of the next chunk is parsed first. eof is set once the underlying + * recv_body() returns 0. */ +size_t AWSv4ComplMulti::recv_chunk(char* const buf, const size_t buf_max, bool& eof) +{ + /* Buffer stores only parsed stream. Raw values reflect the stream + * we're getting from a client. */ + size_t buf_pos = 0; + + if (chunk_meta.is_new_chunk_in_stream(stream_pos)) { + /* Verify signature of the previous chunk. We aren't doing that for new + * one as the procedure requires calculation of payload hash. This code + * won't be triggered for the last, zero-length chunk. Instead, it will + * be checked in the complete() method. */ + if (stream_pos >= ChunkMeta::META_MAX_SIZE && is_signature_mismatched()) { + throw rgw::io::Exception(ERR_SIGNATURE_NO_MATCH, std::system_category()); + } + + /* We don't have metadata for this range. This means a new chunk, so we + * need to parse a fresh portion of the stream. Let's start.
*/ + size_t to_extract = parsing_buf.capacity() - parsing_buf.size(); + do { + const size_t orig_size = parsing_buf.size(); + parsing_buf.resize(parsing_buf.size() + to_extract); + const size_t received = io_base_t::recv_body(parsing_buf.data() + orig_size, + to_extract); + parsing_buf.resize(parsing_buf.size() - (to_extract - received)); + if (received == 0) { + eof = true; + break; + } + + stream_pos += received; + to_extract -= received; + } while (to_extract > 0); + + size_t consumed; + std::tie(chunk_meta, consumed) = \ + ChunkMeta::create_next(cct, std::move(chunk_meta), + parsing_buf.data(), parsing_buf.size()); + + /* We can drop the bytes consumed during metadata parsing. The remainder + * can be chunk's data plus possibly beginning of next chunks' metadata. */ + parsing_buf.erase(std::begin(parsing_buf), + std::begin(parsing_buf) + consumed); + } + + size_t stream_pos_was = stream_pos - parsing_buf.size(); + + size_t to_extract = \ + std::min(chunk_meta.get_data_size(stream_pos_was), buf_max); + dout(30) << "AWSv4ComplMulti: stream_pos_was=" << stream_pos_was << ", to_extract=" << to_extract << dendl; + + /* It's quite probable we have a couple of real data bytes stored together + * with meta-data in the parsing_buf. We need to extract them and move to + * the final buffer. This is a trade-off between frontend's read overhead + * and memcpy. */ + if (to_extract > 0 && parsing_buf.size() > 0) { + const auto data_len = std::min(to_extract, parsing_buf.size()); + const auto data_end_iter = std::begin(parsing_buf) + data_len; + dout(30) << "AWSv4ComplMulti: to_extract=" << to_extract << ", data_len=" << data_len << dendl; + + std::copy(std::begin(parsing_buf), data_end_iter, buf); + parsing_buf.erase(std::begin(parsing_buf), data_end_iter); + + calc_hash_sha256_update_stream(sha256_hash, buf, data_len); + + to_extract -= data_len; + buf_pos += data_len; + } + + /* Now we can do the bulk read directly from RestfulClient without any extra + * buffering. 
*/ + while (to_extract > 0) { + const size_t received = io_base_t::recv_body(buf + buf_pos, to_extract); + dout(30) << "AWSv4ComplMulti: to_extract=" << to_extract << ", received=" << received << dendl; + + if (received == 0) { + eof = true; + break; + } + + calc_hash_sha256_update_stream(sha256_hash, buf + buf_pos, received); + + buf_pos += received; + stream_pos += received; + to_extract -= received; + } + + dout(20) << "AWSv4ComplMulti: filled=" << buf_pos << dendl; + return buf_pos; +} + +size_t AWSv4ComplMulti::recv_body(char* const buf, const size_t buf_max) +{ + bool eof = false; + size_t total = 0; + + while (total < buf_max && !eof) { + const size_t received = recv_chunk(buf + total, buf_max - total, eof); + total += received; + } + dout(20) << "AWSv4ComplMulti: received=" << total << dendl; + return total; +} + +void AWSv4ComplMulti::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s_rw) +{ + const char* const decoded_length = \ + s_rw->info.env->get("HTTP_X_AMZ_DECODED_CONTENT_LENGTH"); + + if (!decoded_length) { + throw -EINVAL; + } else { + s_rw->length = decoded_length; + s_rw->content_length = parse_content_length(decoded_length); + + if (s_rw->content_length < 0) { + ldpp_dout(dpp, 10) << "negative AWSv4's content length, aborting" << dendl; + throw -EINVAL; + } + } + + /* Install the filter over rgw::io::RestfulClient. */ + AWS_AUTHv4_IO(s_rw)->add_filter( + std::static_pointer_cast(shared_from_this())); +} + +bool AWSv4ComplMulti::complete() +{ + /* Now it's time to verify the signature of the last, zero-length chunk. 
*/ + if (is_signature_mismatched()) { + ldout(cct, 10) << "ERROR: signature of last chunk does not match" + << dendl; + return false; + } else { + return true; + } +} + +rgw::auth::Completer::cmplptr_t +AWSv4ComplMulti::create(const req_state* const s, + std::string_view date, + std::string_view credential_scope, + std::string_view seed_signature, + const boost::optional& secret_key) +{ + if (!secret_key) { + /* Some external authorizers (like Keystone) aren't fully compliant with + * AWSv4. They do not provide the secret_key which is necessary to handle + * the streamed upload. */ + throw -ERR_NOT_IMPLEMENTED; + } + + const auto signing_key = \ + rgw::auth::s3::get_v4_signing_key(s->cct, credential_scope, *secret_key, s); + + return std::make_shared(s, + std::move(date), + std::move(credential_scope), + std::move(seed_signature), + signing_key); +} + +size_t AWSv4ComplSingle::recv_body(char* const buf, const size_t max) +{ + const auto received = io_base_t::recv_body(buf, max); + calc_hash_sha256_update_stream(sha256_hash, buf, received); + + return received; +} + +void AWSv4ComplSingle::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s_rw) +{ + /* Install the filter over rgw::io::RestfulClient. */ + AWS_AUTHv4_IO(s_rw)->add_filter( + std::static_pointer_cast(shared_from_this())); +} + +bool AWSv4ComplSingle::complete() +{ + /* The completer is only for the cases where signed payload has been + * requested. It won't be used, for instance, during the query string-based + * authentication. 
*/ + const auto payload_hash = calc_hash_sha256_close_stream(&sha256_hash); + + /* Validate x-amz-sha256 */ + if (payload_hash.compare(expected_request_payload_hash) == 0) { + return true; + } else { + ldout(cct, 10) << "ERROR: x-amz-content-sha256 does not match" + << dendl; + ldout(cct, 10) << "ERROR: grab_aws4_sha256_hash()=" + << payload_hash << dendl; + ldout(cct, 10) << "ERROR: expected_request_payload_hash=" + << expected_request_payload_hash << dendl; + return false; + } +} + +AWSv4ComplSingle::AWSv4ComplSingle(const req_state* const s) + : io_base_t(nullptr), + cct(s->cct), + expected_request_payload_hash(get_v4_exp_payload_hash(s->info)), + sha256_hash(calc_hash_sha256_open_stream()) { +} + +rgw::auth::Completer::cmplptr_t +AWSv4ComplSingle::create(const req_state* const s, + const boost::optional&) +{ + return std::make_shared(s); +} + +} // namespace rgw::auth::s3 diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h new file mode 100644 index 000000000..c03dfad82 --- /dev/null +++ b/src/rgw/rgw_auth_s3.h @@ -0,0 +1,649 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include "common/sstring.hh" +#include "rgw_common.h" +#include "rgw_rest_s3.h" +#include "rgw_auth.h" +#include "rgw_auth_filters.h" +#include "rgw_auth_keystone.h" + + +namespace rgw { +namespace auth { +namespace s3 { + +static constexpr auto RGW_AUTH_GRACE = std::chrono::minutes{15}; + +// returns true if the request time is within RGW_AUTH_GRACE of the current time +bool is_time_skew_ok(time_t t); + +class STSAuthStrategy : public rgw::auth::Strategy, + public rgw::auth::RemoteApplier::Factory, + public rgw::auth::LocalApplier::Factory, + public rgw::auth::RoleApplier::Factory { + typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t; + rgw::sal::Driver* driver; + const rgw::auth::ImplicitTenants& implicit_tenant_context; + + 
STSEngine sts_engine; + + aplptr_t create_apl_remote(CephContext* const cct, + const req_state* const s, + rgw::auth::RemoteApplier::acl_strategy_t&& acl_alg, + const rgw::auth::RemoteApplier::AuthInfo &info) const override { + auto apl = rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::RemoteApplier(cct, driver, std::move(acl_alg), info, + implicit_tenant_context, + rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_local(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info, + const std::string& subuser, + const std::optional& perm_mask, + const std::string& access_key_id) const override { + auto apl = rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask, access_key_id)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_role(CephContext* const cct, + const req_state* const s, + const rgw::auth::RoleApplier::Role& role, + const rgw::auth::RoleApplier::TokenAttrs& token_attrs) const override { + auto apl = rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::RoleApplier(cct, role, token_attrs)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + STSAuthStrategy(CephContext* const cct, + rgw::sal::Driver* driver, + const rgw::auth::ImplicitTenants& implicit_tenant_context, + AWSEngine::VersionAbstractor* const ver_abstractor) + : driver(driver), + implicit_tenant_context(implicit_tenant_context), + sts_engine(cct, driver, *ver_abstractor, + static_cast(this), + static_cast(this), + static_cast(this)) { + if (cct->_conf->rgw_s3_auth_use_sts) { + add_engine(Control::SUFFICIENT, sts_engine); + } + } + + const char* get_name() const noexcept override { + return "rgw::auth::s3::STSAuthStrategy"; + } +}; + +class ExternalAuthStrategy : public rgw::auth::Strategy, + public rgw::auth::RemoteApplier::Factory { + typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t; + 
rgw::sal::Driver* driver; + const rgw::auth::ImplicitTenants& implicit_tenant_context; + + using keystone_config_t = rgw::keystone::CephCtxConfig; + using keystone_cache_t = rgw::keystone::TokenCache; + using secret_cache_t = rgw::auth::keystone::SecretCache; + using EC2Engine = rgw::auth::keystone::EC2Engine; + + boost::optional keystone_engine; + LDAPEngine ldap_engine; + + aplptr_t create_apl_remote(CephContext* const cct, + const req_state* const s, + rgw::auth::RemoteApplier::acl_strategy_t&& acl_alg, + const rgw::auth::RemoteApplier::AuthInfo &info) const override { + auto apl = rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::RemoteApplier(cct, driver, std::move(acl_alg), info, + implicit_tenant_context, + rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3)); + /* TODO(rzarzynski): replace with static_ptr. */ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + ExternalAuthStrategy(CephContext* const cct, + rgw::sal::Driver* driver, + const rgw::auth::ImplicitTenants& implicit_tenant_context, + AWSEngine::VersionAbstractor* const ver_abstractor) + : driver(driver), + implicit_tenant_context(implicit_tenant_context), + ldap_engine(cct, driver, *ver_abstractor, + static_cast(this)) { + + if (cct->_conf->rgw_s3_auth_use_keystone && + ! 
cct->_conf->rgw_keystone_url.empty()) { + + keystone_engine.emplace(cct, ver_abstractor, + static_cast(this), + keystone_config_t::get_instance(), + keystone_cache_t::get_instance(), + secret_cache_t::get_instance()); + add_engine(Control::SUFFICIENT, *keystone_engine); + + } + + if (ldap_engine.valid()) { + add_engine(Control::SUFFICIENT, ldap_engine); + } + } + + const char* get_name() const noexcept override { + return "rgw::auth::s3::AWSv2ExternalAuthStrategy"; + } +}; + + +template +class AWSAuthStrategy : public rgw::auth::Strategy, + public rgw::auth::LocalApplier::Factory { + typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t; + + static_assert(std::is_base_of::value, + "AbstractorT must be a subclass of rgw::auth::s3::VersionAbstractor"); + + rgw::sal::Driver* driver; + AbstractorT ver_abstractor; + + S3AnonymousEngine anonymous_engine; + ExternalAuthStrategy external_engines; + STSAuthStrategy sts_engine; + LocalEngine local_engine; + + aplptr_t create_apl_local(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info, + const std::string& subuser, + const std::optional& perm_mask, + const std::string& access_key_id) const override { + auto apl = rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask, access_key_id)); + /* TODO(rzarzynski): replace with static_ptr. 
*/ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + using engine_map_t = std::map >; + void add_engines(const std::vector & auth_order, + engine_map_t eng_map) + { + auto ctrl_flag = Control::SUFFICIENT; + for (const auto &eng : auth_order) { + // fallback to the last engine, in case of multiple engines, since ctrl + // flag is sufficient for others, error from earlier engine is returned + if (&eng == &auth_order.back() && eng_map.size() > 1) { + ctrl_flag = Control::FALLBACK; + } + if (const auto kv = eng_map.find(eng); + kv != eng_map.end()) { + add_engine(ctrl_flag, kv->second); + } + } + } + + auto parse_auth_order(CephContext* const cct) + { + std::vector result; + + const std::set allowed_auth = { "sts", "external", "local" }; + std::vector default_order = { "sts", "external", "local" }; + // supplied strings may contain a space, so let's bypass that + boost::split(result, cct->_conf->rgw_s3_auth_order, + boost::is_any_of(", "), boost::token_compress_on); + + if (std::any_of(result.begin(), result.end(), + [allowed_auth](std::string_view s) + { return allowed_auth.find(s) == allowed_auth.end();})){ + return default_order; + } + return result; + } + + AWSAuthStrategy(CephContext* const cct, + const rgw::auth::ImplicitTenants& implicit_tenant_context, + rgw::sal::Driver* driver) + : driver(driver), + ver_abstractor(cct), + anonymous_engine(cct, + static_cast(this)), + external_engines(cct, driver, implicit_tenant_context, &ver_abstractor), + sts_engine(cct, driver, implicit_tenant_context, &ver_abstractor), + local_engine(cct, driver, ver_abstractor, + static_cast(this)) { + /* The anonymous auth. */ + if (AllowAnonAccessT) { + add_engine(Control::SUFFICIENT, anonymous_engine); + } + + auto auth_order = parse_auth_order(cct); + engine_map_t engine_map; + + /* STS Auth*/ + if (! sts_engine.is_empty()) { + engine_map.insert(std::make_pair("sts", std::cref(sts_engine))); + } + + /* The external auth. */ + if (! 
external_engines.is_empty()) { + engine_map.insert(std::make_pair("external", std::cref(external_engines))); + } + /* The local auth. */ + if (cct->_conf->rgw_s3_auth_use_rados) { + engine_map.insert(std::make_pair("local", std::cref(local_engine))); + } + + add_engines(auth_order, engine_map); + } + + const char* get_name() const noexcept override { + return "rgw::auth::s3::AWSAuthStrategy"; + } +}; + + +class AWSv4ComplMulti : public rgw::auth::Completer, + public rgw::io::DecoratedRestfulClient, + public std::enable_shared_from_this { + using io_base_t = rgw::io::DecoratedRestfulClient; + using signing_key_t = sha256_digest_t; + + CephContext* const cct; + + const std::string_view date; + const std::string_view credential_scope; + const signing_key_t signing_key; + + class ChunkMeta { + size_t data_offset_in_stream = 0; + size_t data_length = 0; + std::string signature; + + ChunkMeta(const size_t data_starts_in_stream, + const size_t data_length, + const std::string_view signature) + : data_offset_in_stream(data_starts_in_stream), + data_length(data_length), + signature(std::string(signature)) { + } + + explicit ChunkMeta(const std::string_view& signature) + : signature(std::string(signature)) { + } + + public: + static constexpr size_t SIG_SIZE = 64; + + /* Let's suppose the data length fields can't exceed uint64_t. */ + static constexpr size_t META_MAX_SIZE = \ + sarrlen("\r\nffffffffffffffff;chunk-signature=") + SIG_SIZE + sarrlen("\r\n"); + + /* The metadata size for the last, empty chunk. */ + static constexpr size_t META_MIN_SIZE = \ + sarrlen("0;chunk-signature=") + SIG_SIZE + sarrlen("\r\n"); + + /* Detect whether a given stream_pos lies at or past the end of this chunk's data, i.e. whether a new chunk begins there. */ + bool is_new_chunk_in_stream(size_t stream_pos) const; + + /* Get the remaining data size.
*/ + size_t get_data_size(size_t stream_pos) const; + + const std::string& get_signature() const { + return signature; + } + + /* Factory: create an object representing metadata of first, initial chunk + * in a stream. */ + static ChunkMeta create_first(const std::string_view& seed_signature) { + return ChunkMeta(seed_signature); + } + + /* Factory: parse a block of META_MAX_SIZE bytes and creates an object + * representing non-first chunk in a stream. As the process is sequential + * and depends on the previous chunk, caller must pass it. */ + static std::pair create_next(CephContext* cct, + ChunkMeta&& prev, + const char* metabuf, + size_t metabuf_len); + } chunk_meta; + + size_t stream_pos; + boost::container::static_vector parsing_buf; + ceph::crypto::SHA256* sha256_hash; + std::string prev_chunk_signature; + + bool is_signature_mismatched(); + std::string calc_chunk_signature(const std::string& payload_hash) const; + size_t recv_chunk(char* buf, size_t max, bool& eof); + +public: + /* We need the constructor to be public because of the std::make_shared that + * is employed by the create() method. */ + AWSv4ComplMulti(const req_state* const s, + std::string_view date, + std::string_view credential_scope, + std::string_view seed_signature, + const signing_key_t& signing_key) + : io_base_t(nullptr), + cct(s->cct), + date(std::move(date)), + credential_scope(std::move(credential_scope)), + signing_key(signing_key), + + /* The evolving state. */ + chunk_meta(ChunkMeta::create_first(seed_signature)), + stream_pos(0), + sha256_hash(calc_hash_sha256_open_stream()), + prev_chunk_signature(std::move(seed_signature)) { + } + + ~AWSv4ComplMulti() { + if (sha256_hash) { + calc_hash_sha256_close_stream(&sha256_hash); + } + } + + /* rgw::io::DecoratedRestfulClient. */ + size_t recv_body(char* buf, size_t max) override; + + /* rgw::auth::Completer. 
*/ + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s_rw) override; + bool complete() override; + + /* Factories. */ + static cmplptr_t create(const req_state* s, + std::string_view date, + std::string_view credential_scope, + std::string_view seed_signature, + const boost::optional& secret_key); + +}; + +class AWSv4ComplSingle : public rgw::auth::Completer, + public rgw::io::DecoratedRestfulClient, + public std::enable_shared_from_this { + using io_base_t = rgw::io::DecoratedRestfulClient; + + CephContext* const cct; + const char* const expected_request_payload_hash; + ceph::crypto::SHA256* sha256_hash = nullptr; + +public: + /* Defined in rgw_auth_s3.cc because of get_v4_exp_payload_hash(). We need + * the constructor to be public because of the std::make_shared employed by + * the create() method. */ + explicit AWSv4ComplSingle(const req_state* const s); + + ~AWSv4ComplSingle() { + if (sha256_hash) { + calc_hash_sha256_close_stream(&sha256_hash); + } + } + + /* rgw::io::DecoratedRestfulClient. */ + size_t recv_body(char* buf, size_t max) override; + + /* rgw::auth::Completer. */ + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s_rw) override; + bool complete() override; + + /* Factories. 
*/ + static cmplptr_t create(const req_state* s, + const boost::optional&); + +}; + +} /* namespace s3 */ +} /* namespace auth */ +} /* namespace rgw */ + +void rgw_create_s3_canonical_header( + const DoutPrefixProvider *dpp, + const char *method, + const char *content_md5, + const char *content_type, + const char *date, + const meta_map_t& meta_map, + const meta_map_t& qs_map, + const char *request_uri, + const std::map& sub_resources, + std::string& dest_str); +bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, + const req_info& info, + utime_t *header_time, /* out */ + std::string& dest, /* out */ + bool qsr); +static inline std::tuple +rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, const req_info& info, const bool qsr) { + std::string dest; + utime_t header_time; + + const bool ok = rgw_create_s3_canonical_header(dpp, info, &header_time, dest, qsr); + return std::make_tuple(ok, dest, header_time); +} + +namespace rgw { +namespace auth { +namespace s3 { + +static constexpr char AWS4_HMAC_SHA256_STR[] = "AWS4-HMAC-SHA256"; +static constexpr char AWS4_HMAC_SHA256_PAYLOAD_STR[] = "AWS4-HMAC-SHA256-PAYLOAD"; + +static constexpr char AWS4_EMPTY_PAYLOAD_HASH[] = \ + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; + +static constexpr char AWS4_UNSIGNED_PAYLOAD_HASH[] = "UNSIGNED-PAYLOAD"; + +static constexpr char AWS4_STREAMING_PAYLOAD_HASH[] = \ + "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"; + +bool is_non_s3_op(RGWOpType op_type); + +int parse_v4_credentials(const req_info& info, /* in */ + std::string_view& access_key_id, /* out */ + std::string_view& credential_scope, /* out */ + std::string_view& signedheaders, /* out */ + std::string_view& signature, /* out */ + std::string_view& date, /* out */ + std::string_view& session_token, /* out */ + const bool using_qs, /* in */ + const DoutPrefixProvider *dpp); /* in */ + +string gen_v4_scope(const ceph::real_time& timestamp, + const string& region, + const string& service); + 
+static inline bool char_needs_aws4_escaping(const char c, bool encode_slash) +{ + if ((c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9')) { + return false; + } + + switch (c) { + case '-': + case '_': + case '.': + case '~': + return false; + } + + if (c == '/' && !encode_slash) + return false; + + return true; +} + +static inline std::string aws4_uri_encode(const std::string& src, bool encode_slash) +{ + std::string result; + + for (const std::string::value_type c : src) { + if (char_needs_aws4_escaping(c, encode_slash)) { + rgw_uri_escape_char(c, result); + } else { + result.push_back(c); + } + } + + return result; +} + +static inline std::string aws4_uri_recode(const std::string_view& src, bool encode_slash) +{ + std::string decoded = url_decode(src); + return aws4_uri_encode(decoded, encode_slash); +} + +static inline std::string get_v4_canonical_uri(const req_info& info) { + /* The code should normalize according to RFC 3986 but S3 does NOT do path + * normalization that SigV4 typically does. This code follows the same + * approach that boto library. See auth.py:canonical_uri(...). */ + + std::string canonical_uri = aws4_uri_recode(info.request_uri_aws4, false); + + if (canonical_uri.empty()) { + canonical_uri = "/"; + } else { + boost::replace_all(canonical_uri, "+", "%20"); + } + + return canonical_uri; +} + +static inline std::string gen_v4_canonical_uri(const req_info& info) { + /* The code should normalize according to RFC 3986 but S3 does NOT do path + * normalization that SigV4 typically does. This code follows the same + * approach that boto library. See auth.py:canonical_uri(...). 
*/ + + std::string canonical_uri = aws4_uri_recode(info.request_uri, false); + + if (canonical_uri.empty()) { + canonical_uri = "/"; + } else { + boost::replace_all(canonical_uri, "+", "%20"); + } + + return canonical_uri; +} + +static inline const string calc_v4_payload_hash(const string& payload) +{ + ceph::crypto::SHA256* sha256_hash = calc_hash_sha256_open_stream(); + calc_hash_sha256_update_stream(sha256_hash, payload.c_str(), payload.length()); + const auto payload_hash = calc_hash_sha256_close_stream(&sha256_hash); + return payload_hash; +} + +static inline const char* get_v4_exp_payload_hash(const req_info& info) +{ + /* In AWSv4 the hash of real, transferred payload IS NOT necessary to form + * a Canonical Request, and thus verify a Signature. x-amz-content-sha256 + * header lets get the information very early -- before seeing first byte + * of HTTP body. As a consequence, we can decouple Signature verification + * from payload's fingerprint check. */ + const char *expected_request_payload_hash = \ + info.env->get("HTTP_X_AMZ_CONTENT_SHA256"); + + if (!expected_request_payload_hash) { + /* An HTTP client MUST send x-amz-content-sha256. The single exception + * is the case of using the Query Parameters where "UNSIGNED-PAYLOAD" + * literals are used for crafting Canonical Request: + * + * You don't include a payload hash in the Canonical Request, because + * when you create a presigned URL, you don't know the payload content + * because the URL is used to upload an arbitrary payload. Instead, you + * use a constant string UNSIGNED-PAYLOAD. 
*/ + expected_request_payload_hash = AWS4_UNSIGNED_PAYLOAD_HASH; + } + + return expected_request_payload_hash; +} + +static inline bool is_v4_payload_unsigned(const char* const exp_payload_hash) +{ + return boost::equals(exp_payload_hash, AWS4_UNSIGNED_PAYLOAD_HASH); +} + +static inline bool is_v4_payload_empty(const req_state* const s) +{ + /* from rfc2616 - 4.3 Message Body + * + * "The presence of a message-body in a request is signaled by the inclusion + * of a Content-Length or Transfer-Encoding header field in the request's + * message-headers." */ + return s->content_length == 0 && + s->info.env->get("HTTP_TRANSFER_ENCODING") == nullptr; +} + +static inline bool is_v4_payload_streamed(const char* const exp_payload_hash) +{ + return boost::equals(exp_payload_hash, AWS4_STREAMING_PAYLOAD_HASH); +} + +std::string get_v4_canonical_qs(const req_info& info, bool using_qs); + +std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op); + +std::string get_v4_canonical_method(const req_state* s); + +boost::optional +get_v4_canonical_headers(const req_info& info, + const std::string_view& signedheaders, + bool using_qs, + bool force_boto2_compat); + +std::string gen_v4_canonical_headers(const req_info& info, + const std::map& extra_headers, + string *signed_hdrs); + +extern sha256_digest_t +get_v4_canon_req_hash(CephContext* cct, + const std::string_view& http_verb, + const std::string& canonical_uri, + const std::string& canonical_qs, + const std::string& canonical_hdrs, + const std::string_view& signed_hdrs, + const std::string_view& request_payload_hash, + const DoutPrefixProvider *dpp); + +AWSEngine::VersionAbstractor::string_to_sign_t +get_v4_string_to_sign(CephContext* cct, + const std::string_view& algorithm, + const std::string_view& request_date, + const std::string_view& credential_scope, + const sha256_digest_t& canonreq_hash, + const DoutPrefixProvider *dpp); + +extern AWSEngine::VersionAbstractor::server_signature_t +get_v4_signature(const 
std::string_view& credential_scope, + CephContext* const cct, + const std::string_view& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign, + const DoutPrefixProvider *dpp); + +extern AWSEngine::VersionAbstractor::server_signature_t +get_v2_signature(CephContext*, + const std::string& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign); +} /* namespace s3 */ +} /* namespace auth */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_b64.h b/src/rgw/rgw_b64.h new file mode 100644 index 000000000..2948f6f31 --- /dev/null +++ b/src/rgw/rgw_b64.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rgw { + + /* + * A header-only Base64 encoder built on boost::archive. The + * formula is based on a class poposed for inclusion in boost in + * 2011 by Denis Shevchenko (abandoned), updated slightly + * (e.g., uses std::string_view). + * + * Also, wrap_width added as template argument, based on + * feedback from Marcus. + */ + + template::max()> + inline std::string to_base64(std::string_view sview) + { + using namespace boost::archive::iterators; + + // output must be =padded modulo 3 + auto psize = sview.size(); + while ((psize % 3) != 0) { + ++psize; + } + + /* RFC 2045 requires linebreaks to be present in the output + * sequence every at-most 76 characters (MIME-compliance), + * but we could likely omit it. 
*/ + typedef + insert_linebreaks< + base64_from_binary< + transform_width< + std::string_view::const_iterator + ,6,8> + > + ,wrap_width + > b64_iter; + + std::string outstr(b64_iter(sview.data()), + b64_iter(sview.data() + sview.size())); + + // pad outstr with '=' to a length that is a multiple of 3 + for (size_t ix = 0; ix < (psize-sview.size()); ++ix) + outstr.push_back('='); + + return outstr; + } + + inline std::string from_base64(std::string_view sview) + { + using namespace boost::archive::iterators; + if (sview.empty()) + return std::string(); + /* MIME-compliant input will have line-breaks, so we have to + * filter WS */ + typedef + transform_width< + binary_from_base64< + remove_whitespace< + std::string_view::const_iterator>> + ,8,6 + > b64_iter; + + while (sview.back() == '=') + sview.remove_suffix(1); + + std::string outstr(b64_iter(sview.data()), + b64_iter(sview.data() + sview.size())); + + return outstr; + } +} /* namespace */ diff --git a/src/rgw/rgw_basic_types.cc b/src/rgw/rgw_basic_types.cc new file mode 100644 index 000000000..5a09c017f --- /dev/null +++ b/src/rgw/rgw_basic_types.cc @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include "cls/user/cls_user_types.h" + +#include "rgw_basic_types.h" +#include "rgw_bucket.h" +#include "rgw_xml.h" + +#include "common/ceph_json.h" +#include "common/Formatter.h" +#include "cls/user/cls_user_types.h" +#include "cls/rgw/cls_rgw_types.h" + +using std::ostream; +using std::string; +using std::stringstream; + +using namespace std; + +void decode_json_obj(rgw_user& val, JSONObj *obj) +{ + val.from_str(obj->get_data()); +} + +void encode_json(const char *name, const rgw_user& val, Formatter *f) +{ + f->dump_string(name, val.to_str()); +} + +void encode_xml(const char *name, const rgw_user& val, Formatter *f) +{ + encode_xml(name, val.to_str(), f); +} + +rgw_bucket::rgw_bucket(const rgw_user& u, 
const cls_user_bucket& b) : + tenant(u.tenant), + name(b.name), + marker(b.marker), + bucket_id(b.bucket_id), + explicit_placement(b.explicit_placement.data_pool, + b.explicit_placement.data_extra_pool, + b.explicit_placement.index_pool) +{ +} + +void rgw_bucket::convert(cls_user_bucket *b) const +{ + b->name = name; + b->marker = marker; + b->bucket_id = bucket_id; + b->explicit_placement.data_pool = explicit_placement.data_pool.to_str(); + b->explicit_placement.data_extra_pool = explicit_placement.data_extra_pool.to_str(); + b->explicit_placement.index_pool = explicit_placement.index_pool.to_str(); +} + +std::string rgw_bucket::get_key(char tenant_delim, char id_delim, size_t reserve) const +{ + const size_t max_len = tenant.size() + sizeof(tenant_delim) + + name.size() + sizeof(id_delim) + bucket_id.size() + reserve; + + std::string key; + key.reserve(max_len); + if (!tenant.empty() && tenant_delim) { + key.append(tenant); + key.append(1, tenant_delim); + } + key.append(name); + if (!bucket_id.empty() && id_delim) { + key.append(1, id_delim); + key.append(bucket_id); + } + return key; +} + +void rgw_bucket::generate_test_instances(list& o) +{ + rgw_bucket *b = new rgw_bucket; + init_bucket(b, "tenant", "name", "pool", ".index_pool", "marker", "123"); + o.push_back(b); + o.push_back(new rgw_bucket); +} + +std::string rgw_bucket_shard::get_key(char tenant_delim, char id_delim, + char shard_delim, size_t reserve) const +{ + static constexpr size_t shard_len{12}; // ":4294967295\0" + auto key = bucket.get_key(tenant_delim, id_delim, reserve + shard_len); + if (shard_id >= 0 && shard_delim) { + key.append(1, shard_delim); + key.append(std::to_string(shard_id)); + } + return key; +} + +void encode(const rgw_bucket_shard& b, bufferlist& bl, uint64_t f) +{ + encode(b.bucket, bl, f); + encode(b.shard_id, bl, f); +} + +void decode(rgw_bucket_shard& b, bufferlist::const_iterator& bl) +{ + decode(b.bucket, bl); + decode(b.shard_id, bl); +} + +void encode_json_impl(const 
char *name, const rgw_zone_id& zid, Formatter *f) +{ + encode_json(name, zid.id, f); +} + +void decode_json_obj(rgw_zone_id& zid, JSONObj *obj) +{ + decode_json_obj(zid.id, obj); +} + +void rgw_user::generate_test_instances(list& o) +{ + rgw_user *u = new rgw_user("tenant", "user"); + + o.push_back(u); + o.push_back(new rgw_user); +} + +void rgw_data_placement_target::dump(Formatter *f) const +{ + encode_json("data_pool", data_pool, f); + encode_json("data_extra_pool", data_extra_pool, f); + encode_json("index_pool", index_pool, f); +} + +void rgw_data_placement_target::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("data_pool", data_pool, obj); + JSONDecoder::decode_json("data_extra_pool", data_extra_pool, obj); + JSONDecoder::decode_json("index_pool", index_pool, obj); +} + +void rgw_bucket::dump(Formatter *f) const +{ + encode_json("name", name, f); + encode_json("marker", marker, f); + encode_json("bucket_id", bucket_id, f); + encode_json("tenant", tenant, f); + encode_json("explicit_placement", explicit_placement, f); +} + +void rgw_bucket::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("bucket_id", bucket_id, obj); + JSONDecoder::decode_json("tenant", tenant, obj); + JSONDecoder::decode_json("explicit_placement", explicit_placement, obj); + if (explicit_placement.data_pool.empty()) { + /* decoding old format */ + JSONDecoder::decode_json("pool", explicit_placement.data_pool, obj); + JSONDecoder::decode_json("data_extra_pool", explicit_placement.data_extra_pool, obj); + JSONDecoder::decode_json("index_pool", explicit_placement.index_pool, obj); + } +} + +namespace rgw { +namespace auth { +ostream& operator <<(ostream& m, const Principal& p) { + if (p.is_wildcard()) { + return m << "*"; + } + + m << "arn:aws:iam:" << p.get_tenant() << ":"; + if (p.is_tenant()) { + return m << "root"; + } + return m << (p.is_user() ? 
"user/" : "role/") << p.get_id(); +} +} +} diff --git a/src/rgw/rgw_basic_types.h b/src/rgw/rgw_basic_types.h new file mode 100644 index 000000000..25d70bdbf --- /dev/null +++ b/src/rgw/rgw_basic_types.h @@ -0,0 +1,291 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. Do not + * introduce changes or include files which can only be compiled in + * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h) + */ + +#pragma once + +#include +#include + +#include "include/types.h" +#include "rgw_compression_types.h" +#include "rgw_pool_types.h" +#include "rgw_acl_types.h" +#include "rgw_zone_types.h" +#include "rgw_user_types.h" +#include "rgw_bucket_types.h" +#include "rgw_obj_types.h" +#include "rgw_obj_manifest.h" + +#include "common/Formatter.h" + +class JSONObj; +class cls_user_bucket; + +enum RGWIntentEvent { + DEL_OBJ = 0, + DEL_DIR = 1, +}; + +/** Store error returns for output at a different point in the program */ +struct rgw_err { + rgw_err(); + void clear(); + bool is_clear() const; + bool is_err() const; + friend std::ostream& operator<<(std::ostream& oss, const rgw_err &err); + + int http_ret; + int ret; + std::string err_code; + std::string message; +}; /* rgw_err */ + +struct rgw_zone_id { + std::string id; + + rgw_zone_id() {} + rgw_zone_id(const std::string& _id) : id(_id) {} + rgw_zone_id(std::string&& _id) : id(std::move(_id)) {} + + void encode(ceph::buffer::list& bl) const { + /* backward compatiblity, not using ENCODE_{START,END} macros */ + ceph::encode(id, bl); + } + + void decode(ceph::buffer::list::const_iterator& 
bl) { + /* backward compatiblity, not using DECODE_{START,END} macros */ + ceph::decode(id, bl); + } + + void clear() { + id.clear(); + } + + bool operator==(const std::string& _id) const { + return (id == _id); + } + bool operator==(const rgw_zone_id& zid) const { + return (id == zid.id); + } + bool operator!=(const rgw_zone_id& zid) const { + return (id != zid.id); + } + bool operator<(const rgw_zone_id& zid) const { + return (id < zid.id); + } + bool operator>(const rgw_zone_id& zid) const { + return (id > zid.id); + } + + bool empty() const { + return id.empty(); + } +}; +WRITE_CLASS_ENCODER(rgw_zone_id) + +inline std::ostream& operator<<(std::ostream& os, const rgw_zone_id& zid) { + os << zid.id; + return os; +} + +struct obj_version; +struct rgw_placement_rule; +struct RGWAccessKey; +class RGWUserCaps; + +extern void encode_json(const char *name, const obj_version& v, Formatter *f); +extern void encode_json(const char *name, const RGWUserCaps& val, Formatter *f); +extern void encode_json(const char *name, const rgw_pool& pool, Formatter *f); +extern void encode_json(const char *name, const rgw_placement_rule& r, Formatter *f); +extern void encode_json_impl(const char *name, const rgw_zone_id& zid, ceph::Formatter *f); +extern void encode_json_plain(const char *name, const RGWAccessKey& val, Formatter *f); + +extern void decode_json_obj(obj_version& v, JSONObj *obj); +extern void decode_json_obj(rgw_zone_id& zid, JSONObj *obj); +extern void decode_json_obj(rgw_pool& pool, JSONObj *obj); +extern void decode_json_obj(rgw_placement_rule& v, JSONObj *obj); + +// Represents an identity. This is more wide-ranging than a +// 'User'. Its purposes is to be matched against by an +// IdentityApplier. The internal representation will doubtless change as +// more types are added. We may want to expose the type enum and make +// the member public so people can switch/case on it. 
+ +namespace rgw { +namespace auth { +class Principal { + enum types { User, Role, Tenant, Wildcard, OidcProvider, AssumedRole }; + types t; + rgw_user u; + std::string idp_url; + + explicit Principal(types t) + : t(t) {} + + Principal(types t, std::string&& n, std::string i) + : t(t), u(std::move(n), std::move(i)) {} + + Principal(std::string&& idp_url) + : t(OidcProvider), idp_url(std::move(idp_url)) {} + +public: + + static Principal wildcard() { + return Principal(Wildcard); + } + + static Principal user(std::string&& t, std::string&& u) { + return Principal(User, std::move(t), std::move(u)); + } + + static Principal role(std::string&& t, std::string&& u) { + return Principal(Role, std::move(t), std::move(u)); + } + + static Principal tenant(std::string&& t) { + return Principal(Tenant, std::move(t), {}); + } + + static Principal oidc_provider(std::string&& idp_url) { + return Principal(std::move(idp_url)); + } + + static Principal assumed_role(std::string&& t, std::string&& u) { + return Principal(AssumedRole, std::move(t), std::move(u)); + } + + bool is_wildcard() const { + return t == Wildcard; + } + + bool is_user() const { + return t == User; + } + + bool is_role() const { + return t == Role; + } + + bool is_tenant() const { + return t == Tenant; + } + + bool is_oidc_provider() const { + return t == OidcProvider; + } + + bool is_assumed_role() const { + return t == AssumedRole; + } + + const std::string& get_tenant() const { + return u.tenant; + } + + const std::string& get_id() const { + return u.id; + } + + const std::string& get_idp_url() const { + return idp_url; + } + + const std::string& get_role_session() const { + return u.id; + } + + const std::string& get_role() const { + return u.id; + } + + bool operator ==(const Principal& o) const { + return (t == o.t) && (u == o.u); + } + + bool operator <(const Principal& o) const { + return (t < o.t) || ((t == o.t) && (u < o.u)); + } +}; + +std::ostream& operator <<(std::ostream& m, const Principal& p); +} 
+} + +class JSONObj; + +void decode_json_obj(rgw_user& val, JSONObj *obj); +void encode_json(const char *name, const rgw_user& val, ceph::Formatter *f); +void encode_xml(const char *name, const rgw_user& val, ceph::Formatter *f); + +inline std::ostream& operator<<(std::ostream& out, const rgw_user &u) { + std::string s; + u.to_str(s); + return out << s; +} + +struct RGWUploadPartInfo { + uint32_t num; + uint64_t size; + uint64_t accounted_size{0}; + std::string etag; + ceph::real_time modified; + RGWObjManifest manifest; + RGWCompressionInfo cs_info; + + // Previous part obj prefixes. Recorded here for later cleanup. + std::set past_prefixes; + + RGWUploadPartInfo() : num(0), size(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(5, 2, bl); + encode(num, bl); + encode(size, bl); + encode(etag, bl); + encode(modified, bl); + encode(manifest, bl); + encode(cs_info, bl); + encode(accounted_size, bl); + encode(past_prefixes, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl); + decode(num, bl); + decode(size, bl); + decode(etag, bl); + decode(modified, bl); + if (struct_v >= 3) + decode(manifest, bl); + if (struct_v >= 4) { + decode(cs_info, bl); + decode(accounted_size, bl); + } else { + accounted_size = size; + } + if (struct_v >= 5) { + decode(past_prefixes, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(RGWUploadPartInfo) diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc new file mode 100644 index 000000000..852469b7e --- /dev/null +++ b/src/rgw/rgw_bucket.cc @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_bucket.h" + +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rgw + +// stolen from src/cls/version/cls_version.cc +#define VERSION_ATTR "ceph.objclass.version" + +using 
namespace std; + +static void set_err_msg(std::string *sink, std::string msg) +{ + if (sink && !msg.empty()) + *sink = msg; +} + +void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id) +{ + b->tenant = t; + b->name = n; + b->marker = m; + b->bucket_id = id; + b->explicit_placement.data_pool = rgw_pool(dp); + b->explicit_placement.index_pool = rgw_pool(ip); +} + +// parse key in format: [tenant/]name:instance[:shard_id] +int rgw_bucket_parse_bucket_key(CephContext *cct, const string& key, + rgw_bucket *bucket, int *shard_id) +{ + std::string_view name{key}; + std::string_view instance; + + // split tenant/name + auto pos = name.find('/'); + if (pos != string::npos) { + auto tenant = name.substr(0, pos); + bucket->tenant.assign(tenant.begin(), tenant.end()); + name = name.substr(pos + 1); + } else { + bucket->tenant.clear(); + } + + // split name:instance + pos = name.find(':'); + if (pos != string::npos) { + instance = name.substr(pos + 1); + name = name.substr(0, pos); + } + bucket->name.assign(name.begin(), name.end()); + + // split instance:shard + pos = instance.find(':'); + if (pos == string::npos) { + bucket->bucket_id.assign(instance.begin(), instance.end()); + if (shard_id) { + *shard_id = -1; + } + return 0; + } + + // parse shard id + auto shard = instance.substr(pos + 1); + string err; + auto id = strict_strtol(shard.data(), 10, &err); + if (!err.empty()) { + if (cct) { + ldout(cct, 0) << "ERROR: failed to parse bucket shard '" + << instance.data() << "': " << err << dendl; + } + return -EINVAL; + } + + if (shard_id) { + *shard_id = id; + } + instance = instance.substr(0, pos); + bucket->bucket_id.assign(instance.begin(), instance.end()); + return 0; +} + +/* + * Note that this is not a reversal of parse_bucket(). That one deals + * with the syntax we need in metadata and such. This one deals with + * the representation in RADOS pools. 
We chose '/' because it's not + * acceptable in bucket names and thus qualified buckets cannot conflict + * with the legacy or S3 buckets. + */ +std::string rgw_make_bucket_entry_name(const std::string& tenant_name, + const std::string& bucket_name) { + std::string bucket_entry; + + if (bucket_name.empty()) { + bucket_entry.clear(); + } else if (tenant_name.empty()) { + bucket_entry = bucket_name; + } else { + bucket_entry = tenant_name + "/" + bucket_name; + } + + return bucket_entry; +} + +/* + * Tenants are separated from buckets in URLs by a colon in S3. + * This function is not to be used on Swift URLs, not even for COPY arguments. + */ +int rgw_parse_url_bucket(const string &bucket, const string& auth_tenant, + string &tenant_name, string &bucket_name) { + + int pos = bucket.find(':'); + if (pos >= 0) { + /* + * N.B.: We allow ":bucket" syntax with explicit empty tenant in order + * to refer to the legacy tenant, in case users in new named tenants + * want to access old global buckets. 
+ */ + tenant_name = bucket.substr(0, pos); + bucket_name = bucket.substr(pos + 1); + if (bucket_name.empty()) { + return -ERR_INVALID_BUCKET_NAME; + } + } else { + tenant_name = auth_tenant; + bucket_name = bucket; + } + return 0; +} + +int rgw_chown_bucket_and_objects(rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, + rgw::sal::User* new_user, + const std::string& marker, std::string *err_msg, + const DoutPrefixProvider *dpp, optional_yield y) +{ + /* Chown on the bucket */ + int ret = bucket->chown(dpp, *new_user, y); + if (ret < 0) { + set_err_msg(err_msg, "Failed to change object ownership: " + cpp_strerror(-ret)); + } + + /* Now chown on all the objects in the bucket */ + map common_prefixes; + + rgw::sal::Bucket::ListParams params; + rgw::sal::Bucket::ListResults results; + + params.list_versions = true; + params.allow_unordered = true; + params.marker = marker; + + int count = 0; + int max_entries = 1000; + + //Loop through objects and update object acls to point to bucket owner + + do { + results.objs.clear(); + ret = bucket->list(dpp, params, max_entries, results, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: list objects failed: " << cpp_strerror(-ret) << dendl; + return ret; + } + + params.marker = results.next_marker; + count += results.objs.size(); + + for (const auto& obj : results.objs) { + std::unique_ptr r_obj = bucket->get_object(obj.key); + + ret = r_obj->chown(*new_user, dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: chown failed on " << r_obj << " :" << cpp_strerror(-ret) << dendl; + return ret; + } + } + cerr << count << " objects processed in " << bucket + << ". 
Next marker " << params.marker.name << std::endl; + } while(results.is_truncated); + + return ret; +} + diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h new file mode 100644 index 000000000..e62b46898 --- /dev/null +++ b/src/rgw/rgw_bucket.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include +#include + +#include "include/types.h" +#include "rgw_common.h" +#include "rgw_sal.h" + +extern void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id); + +extern int rgw_bucket_parse_bucket_key(CephContext *cct, const std::string& key, + rgw_bucket* bucket, int *shard_id); + +extern std::string rgw_make_bucket_entry_name(const std::string& tenant_name, + const std::string& bucket_name); + +[[nodiscard]] int rgw_parse_url_bucket(const std::string& bucket, + const std::string& auth_tenant, + std::string &tenant_name, + std::string &bucket_name); + +extern int rgw_chown_bucket_and_objects(rgw::sal::Driver* driver, + rgw::sal::Bucket* bucket, + rgw::sal::User* new_user, + const std::string& marker, + std::string *err_msg, + const DoutPrefixProvider *dpp, + optional_yield y); diff --git a/src/rgw/rgw_bucket_encryption.cc b/src/rgw/rgw_bucket_encryption.cc new file mode 100644 index 000000000..f029709db --- /dev/null +++ b/src/rgw/rgw_bucket_encryption.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +// +#include "rgw_bucket_encryption.h" +#include "rgw_xml.h" +#include "common/ceph_json.h" + +void ApplyServerSideEncryptionByDefault::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("KMSMasterKeyID", kmsMasterKeyID, obj, false); + RGWXMLDecoder::decode_xml("SSEAlgorithm", sseAlgorithm, obj, false); +} + +void ApplyServerSideEncryptionByDefault::dump_xml(Formatter *f) const { + 
encode_xml("SSEAlgorithm", sseAlgorithm, f); + if (kmsMasterKeyID != "") { + encode_xml("KMSMasterKeyID", kmsMasterKeyID, f); + } +} + +void ServerSideEncryptionConfiguration::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("ApplyServerSideEncryptionByDefault", applyServerSideEncryptionByDefault, obj, false); + RGWXMLDecoder::decode_xml("BucketKeyEnabled", bucketKeyEnabled, obj, false); +} + +void ServerSideEncryptionConfiguration::dump_xml(Formatter *f) const { + encode_xml("ApplyServerSideEncryptionByDefault", applyServerSideEncryptionByDefault, f); + if (bucketKeyEnabled) { + encode_xml("BucketKeyEnabled", true, f); + } +} + +void RGWBucketEncryptionConfig::decode_xml(XMLObj *obj) { + rule_exist = RGWXMLDecoder::decode_xml("Rule", rule, obj); +} + +void RGWBucketEncryptionConfig::dump_xml(Formatter *f) const { + if (rule_exist) { + encode_xml("Rule", rule, f); + } +} + +void RGWBucketEncryptionConfig::dump(Formatter *f) const { + encode_json("rule_exist", has_rule(), f); + if (has_rule()) { + encode_json("sse_algorithm", sse_algorithm(), f); + encode_json("kms_master_key_id", kms_master_key_id(), f); + encode_json("bucket_key_enabled", bucket_key_enabled(), f); + } +} diff --git a/src/rgw/rgw_bucket_encryption.h b/src/rgw/rgw_bucket_encryption.h new file mode 100644 index 000000000..ba567bc71 --- /dev/null +++ b/src/rgw/rgw_bucket_encryption.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#include + +class XMLObj; + +class ApplyServerSideEncryptionByDefault +{ + std::string kmsMasterKeyID; + std::string sseAlgorithm; + +public: + ApplyServerSideEncryptionByDefault() {}; + ApplyServerSideEncryptionByDefault(const std::string &algorithm, + const std::string &key_id) + : kmsMasterKeyID(key_id), sseAlgorithm(algorithm) {}; + + const std::string& kms_master_key_id() const { + return kmsMasterKeyID; + } + + const std::string& sse_algorithm() const { + return 
sseAlgorithm; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(kmsMasterKeyID, bl); + encode(sseAlgorithm, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(kmsMasterKeyID, bl); + decode(sseAlgorithm, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(ApplyServerSideEncryptionByDefault) + +class ServerSideEncryptionConfiguration +{ +protected: + ApplyServerSideEncryptionByDefault applyServerSideEncryptionByDefault; + bool bucketKeyEnabled; + +public: + ServerSideEncryptionConfiguration(): bucketKeyEnabled(false) {}; + ServerSideEncryptionConfiguration(const std::string &algorithm, + const std::string &keyid="", bool enabled = false) + : applyServerSideEncryptionByDefault(algorithm, keyid), + bucketKeyEnabled(enabled) {} + + const std::string& kms_master_key_id() const { + return applyServerSideEncryptionByDefault.kms_master_key_id(); + } + + const std::string& sse_algorithm() const { + return applyServerSideEncryptionByDefault.sse_algorithm(); + } + + bool bucket_key_enabled() const { + return bucketKeyEnabled; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(applyServerSideEncryptionByDefault, bl); + encode(bucketKeyEnabled, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(applyServerSideEncryptionByDefault, bl); + decode(bucketKeyEnabled, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(ServerSideEncryptionConfiguration) + +class RGWBucketEncryptionConfig +{ +protected: + bool rule_exist; + ServerSideEncryptionConfiguration rule; + +public: + RGWBucketEncryptionConfig(): rule_exist(false) {} + RGWBucketEncryptionConfig(const std::string &algorithm, + const std::string &keyid = "", bool enabled = false) + : rule_exist(true), 
rule(algorithm, keyid, enabled) {} + + const std::string& kms_master_key_id() const { + return rule.kms_master_key_id(); + } + + const std::string& sse_algorithm() const { + return rule.sse_algorithm(); + } + + bool bucket_key_enabled() const { + return rule.bucket_key_enabled(); + } + + bool has_rule() const { + return rule_exist; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rule_exist, bl); + if (rule_exist) { + encode(rule, bl); + } + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(rule_exist, bl); + if (rule_exist) { + decode(rule, bl); + } + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(RGWBucketEncryptionConfig) diff --git a/src/rgw/rgw_bucket_layout.cc b/src/rgw/rgw_bucket_layout.cc new file mode 100644 index 000000000..499e8f0cd --- /dev/null +++ b/src/rgw/rgw_bucket_layout.cc @@ -0,0 +1,380 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include "rgw_bucket_layout.h" + +namespace rgw { + +// BucketIndexType +std::string_view to_string(const BucketIndexType& t) +{ + switch (t) { + case BucketIndexType::Normal: return "Normal"; + case BucketIndexType::Indexless: return "Indexless"; + default: return "Unknown"; + } +} +bool parse(std::string_view str, BucketIndexType& t) +{ + if (boost::iequals(str, "Normal")) { + t = BucketIndexType::Normal; + return true; + } + if (boost::iequals(str, "Indexless")) { + t = BucketIndexType::Indexless; + return true; + } + return false; +} +void encode_json_impl(const char *name, const BucketIndexType& t, ceph::Formatter *f) +{ + encode_json(name, to_string(t), f); +} +void decode_json_obj(BucketIndexType& t, JSONObj *obj) +{ + std::string str; + decode_json_obj(str, obj); + parse(str, t); +} + +// BucketHashType +std::string_view to_string(const BucketHashType& t) +{ + switch (t) { + case BucketHashType::Mod: return "Mod"; + default: return "Unknown"; + } +} +bool parse(std::string_view str, BucketHashType& t) +{ + if (boost::iequals(str, "Mod")) { + t = BucketHashType::Mod; + return true; + } + return false; +} +void encode_json_impl(const char *name, const BucketHashType& t, ceph::Formatter *f) +{ + encode_json(name, to_string(t), f); +} +void decode_json_obj(BucketHashType& t, JSONObj *obj) +{ + std::string str; + decode_json_obj(str, obj); + parse(str, t); +} + +// bucket_index_normal_layout +void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f) +{ + ENCODE_START(1, 1, bl); + encode(l.num_shards, bl); + encode(l.hash_type, bl); + ENCODE_FINISH(bl); +} +void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(l.num_shards, bl); + decode(l.hash_type, bl); + DECODE_FINISH(bl); +} +void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f) +{ + f->open_object_section(name); + encode_json("num_shards", l.num_shards, f); + 
encode_json("hash_type", l.hash_type, f); + f->close_section(); +} +void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj) +{ + JSONDecoder::decode_json("num_shards", l.num_shards, obj); + JSONDecoder::decode_json("hash_type", l.hash_type, obj); +} + +// bucket_index_layout +void encode(const bucket_index_layout& l, bufferlist& bl, uint64_t f) +{ + ENCODE_START(1, 1, bl); + encode(l.type, bl); + switch (l.type) { + case BucketIndexType::Normal: + encode(l.normal, bl); + break; + case BucketIndexType::Indexless: + break; + } + ENCODE_FINISH(bl); +} +void decode(bucket_index_layout& l, bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(l.type, bl); + switch (l.type) { + case BucketIndexType::Normal: + decode(l.normal, bl); + break; + case BucketIndexType::Indexless: + break; + } + DECODE_FINISH(bl); +} +void encode_json_impl(const char *name, const bucket_index_layout& l, ceph::Formatter *f) +{ + f->open_object_section(name); + encode_json("type", l.type, f); + encode_json("normal", l.normal, f); + f->close_section(); +} +void decode_json_obj(bucket_index_layout& l, JSONObj *obj) +{ + JSONDecoder::decode_json("type", l.type, obj); + JSONDecoder::decode_json("normal", l.normal, obj); +} + +// bucket_index_layout_generation +void encode(const bucket_index_layout_generation& l, bufferlist& bl, uint64_t f) +{ + ENCODE_START(1, 1, bl); + encode(l.gen, bl); + encode(l.layout, bl); + ENCODE_FINISH(bl); +} +void decode(bucket_index_layout_generation& l, bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(l.gen, bl); + decode(l.layout, bl); + DECODE_FINISH(bl); +} +void encode_json_impl(const char *name, const bucket_index_layout_generation& l, ceph::Formatter *f) +{ + f->open_object_section(name); + encode_json("gen", l.gen, f); + encode_json("layout", l.layout, f); + f->close_section(); +} +void decode_json_obj(bucket_index_layout_generation& l, JSONObj *obj) +{ + JSONDecoder::decode_json("gen", l.gen, obj); + 
JSONDecoder::decode_json("layout", l.layout, obj);
+}
+
+// BucketLogType
+std::string_view to_string(const BucketLogType& t)  // display name for each enumerator
+{
+  switch (t) {
+  case BucketLogType::InIndex: return "InIndex";
+  default: return "Unknown";
+  }
+}
+bool parse(std::string_view str, BucketLogType& t)  // case-insensitive match; t is written only on success
+{
+  if (boost::iequals(str, "InIndex")) {
+    t = BucketLogType::InIndex;
+    return true;
+  }
+  return false;
+}
+void encode_json_impl(const char *name, const BucketLogType& t, ceph::Formatter *f)
+{
+  encode_json(name, to_string(t), f);
+}
+void decode_json_obj(BucketLogType& t, JSONObj *obj)
+{
+  std::string str;
+  decode_json_obj(str, obj);
+  parse(str, t);  // result is discarded, so an unrecognized string leaves t as it was
+}
+
+// bucket_index_log_layout
+void encode(const bucket_index_log_layout& l, bufferlist& bl, uint64_t f)  // f is not referenced in the body
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.gen, bl);  // field order here must match decode() below
+  encode(l.layout, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_index_log_layout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.gen, bl);
+  decode(l.layout, bl);
+  DECODE_FINISH(bl);
+}
+void encode_json_impl(const char *name, const bucket_index_log_layout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("gen", l.gen, f);
+  encode_json("layout", l.layout, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_index_log_layout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("gen", l.gen, obj);
+  JSONDecoder::decode_json("layout", l.layout, obj);
+}
+
+// bucket_log_layout
+void encode(const bucket_log_layout& l, bufferlist& bl, uint64_t f)  // f is not referenced in the body
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.type, bl);
+  switch (l.type) {
+  case BucketLogType::InIndex:
+    encode(l.in_index, bl);  // payload is written only for the InIndex type
+    break;
+  }
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_log_layout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.type, bl);
+  switch (l.type) {  // no default case: for any other decoded value l.in_index is left as it was
+  case BucketLogType::InIndex:
+    decode(l.in_index, bl);
+    break;
+  }
+  DECODE_FINISH(bl);
+}
+void encode_json_impl(const char *name, const bucket_log_layout& l, ceph::Formatter *f)
+{
+
f->open_object_section(name);
+  encode_json("type", l.type, f);
+  if (l.type == BucketLogType::InIndex) {  // "in_index" is emitted only for the InIndex type
+    encode_json("in_index", l.in_index, f);
+  }
+  f->close_section();
+}
+void decode_json_obj(bucket_log_layout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("type", l.type, obj);
+  JSONDecoder::decode_json("in_index", l.in_index, obj);  // read unconditionally, unlike the conditional write above
+}
+
+// bucket_log_layout_generation
+void encode(const bucket_log_layout_generation& l, bufferlist& bl, uint64_t f)  // f is not referenced in the body
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.gen, bl);  // field order here must match decode() below
+  encode(l.layout, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_log_layout_generation& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.gen, bl);
+  decode(l.layout, bl);
+  DECODE_FINISH(bl);
+}
+void encode_json_impl(const char *name, const bucket_log_layout_generation& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("gen", l.gen, f);
+  encode_json("layout", l.layout, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_log_layout_generation& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("gen", l.gen, obj);
+  JSONDecoder::decode_json("layout", l.layout, obj);
+}
+
+// BucketReshardState
+std::string_view to_string(const BucketReshardState& s)  // display name for each enumerator
+{
+  switch (s) {
+  case BucketReshardState::None: return "None";
+  case BucketReshardState::InProgress: return "InProgress";
+  default: return "Unknown";
+  }
+}
+bool parse(std::string_view str, BucketReshardState& s)  // case-insensitive match; s is written only on success
+{
+  if (boost::iequals(str, "None")) {
+    s = BucketReshardState::None;
+    return true;
+  }
+  if (boost::iequals(str, "InProgress")) {
+    s = BucketReshardState::InProgress;
+    return true;
+  }
+  return false;
+}
+void encode_json_impl(const char *name, const BucketReshardState& s, ceph::Formatter *f)
+{
+  encode_json(name, to_string(s), f);
+}
+void decode_json_obj(BucketReshardState& s, JSONObj *obj)
+{
+  std::string str;
+  decode_json_obj(str, obj);
+  parse(str, s);  // result is discarded, so an unrecognized string leaves s as it was
+}
+
+
+// BucketLayout
+void encode(const BucketLayout& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(2, 1, bl);
+  encode(l.resharding, bl);
+  encode(l.current_index, bl);
+  encode(l.target_index, bl);
+  encode(l.logs, bl);  // decode() below reads this field only when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+void decode(BucketLayout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(2, bl);
+  decode(l.resharding, bl);
+  decode(l.current_index, bl);
+  decode(l.target_index, bl);
+  if (struct_v < 2) {  // older encoding: no log list on the wire, so synthesize one
+    l.logs.clear();
+    // initialize the log layout to match the current index layout
+    if (l.current_index.layout.type == BucketIndexType::Normal) {
+      l.logs.push_back(log_layout_from_index(0, l.current_index));  // log generation 0
+    }
+  } else {
+    decode(l.logs, bl);
+  }
+  DECODE_FINISH(bl);
+}
+void encode_json_impl(const char *name, const BucketLayout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("resharding", l.resharding, f);
+  encode_json("current_index", l.current_index, f);
+  if (l.target_index) {  // key is omitted entirely when no target index is set
+    encode_json("target_index", *l.target_index, f);
+  }
+  f->open_array_section("logs");
+  for (const auto& log : l.logs) {
+    encode_json("log", log, f);
+  }
+  f->close_section(); // logs[]
+  f->close_section();
+}
+void decode_json_obj(BucketLayout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("resharding", l.resharding, obj);
+  JSONDecoder::decode_json("current_index", l.current_index, obj);
+  JSONDecoder::decode_json("target_index", l.target_index, obj);
+  JSONDecoder::decode_json("logs", l.logs, obj);
+}
+
+} // namespace rgw
diff --git a/src/rgw/rgw_bucket_layout.h b/src/rgw/rgw_bucket_layout.h
new file mode 100644
index 000000000..40aafd4dd
--- /dev/null
+++ b/src/rgw/rgw_bucket_layout.h
@@ -0,0 +1,282 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.
See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types.  Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include  // NOTE(review): header name appears to have been lost in extraction - confirm against upstream
+#include  // NOTE(review): header name appears to have been lost in extraction - confirm against upstream
+#include "include/encoding.h"
+#include "common/ceph_json.h"
+
+namespace rgw {
+
+enum class BucketIndexType : uint8_t {
+  Normal, // normal hash-based sharded index layout
+  Indexless, // no bucket index, so listing is unsupported
+};
+
+std::string_view to_string(const BucketIndexType& t);
+bool parse(std::string_view str, BucketIndexType& t);
+void encode_json_impl(const char *name, const BucketIndexType& t, ceph::Formatter *f);
+void decode_json_obj(BucketIndexType& t, JSONObj *obj);
+
+inline std::ostream& operator<<(std::ostream& out, const BucketIndexType& t)  // streams the to_string() name
+{
+  return out << to_string(t);
+}
+
+enum class BucketHashType : uint8_t {
+  Mod, // rjenkins hash of object name, modulo num_shards
+};
+
+std::string_view to_string(const BucketHashType& t);
+bool parse(std::string_view str, BucketHashType& t);
+void encode_json_impl(const char *name, const BucketHashType& t, ceph::Formatter *f);
+void decode_json_obj(BucketHashType& t, JSONObj *obj);
+
+struct bucket_index_normal_layout {
+  uint32_t num_shards = 1;
+
+  BucketHashType hash_type = BucketHashType::Mod;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_index_normal_layout& l) {
+    out << "num_shards=" << l.num_shards << ", hash_type=" << to_string(l.hash_type);
+    return out;
+  }
+};
+
+inline bool operator==(const bucket_index_normal_layout& l,
+                       const bucket_index_normal_layout& r) {  // member-wise equality over both fields
+  return l.num_shards == r.num_shards
+    && l.hash_type == r.hash_type;
+}
+inline bool operator!=(const bucket_index_normal_layout& l,
+                       const bucket_index_normal_layout& r) {
+  return !(l == r);
+}
+
+void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_normal_layout& l, 
bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj);
+
+struct bucket_index_layout {
+  BucketIndexType type = BucketIndexType::Normal;
+
+  // TODO: variant of layout types?
+  bucket_index_normal_layout normal;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_index_layout& l) {
+    out << "type=" << to_string(l.type) << ", normal=" << l.normal;
+    return out;
+  }
+};
+
+inline bool operator==(const bucket_index_layout& l,
+                       const bucket_index_layout& r) {
+  return l.type == r.type && l.normal == r.normal;  // normal is compared whatever the type is
+}
+inline bool operator!=(const bucket_index_layout& l,
+                       const bucket_index_layout& r) {
+  return !(l == r);
+}
+
+void encode(const bucket_index_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_layout& l, JSONObj *obj);
+
+struct bucket_index_layout_generation {
+  uint64_t gen = 0;
+  bucket_index_layout layout;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_index_layout_generation& g) {
+    out << "gen=" << g.gen;  // only the generation number is printed, not the layout
+    return out;
+  }
+};
+
+inline bool operator==(const bucket_index_layout_generation& l,
+                       const bucket_index_layout_generation& r) {
+  return l.gen == r.gen && l.layout == r.layout;
+}
+inline bool operator!=(const bucket_index_layout_generation& l,
+                       const bucket_index_layout_generation& r) {
+  return !(l == r);
+}
+
+void encode(const bucket_index_layout_generation& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_layout_generation& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_layout_generation& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_layout_generation& l, JSONObj *obj);
+
+
+enum class BucketLogType : 
uint8_t {
+  // colocated with bucket index, so the log layout matches the index layout
+  InIndex,
+};
+
+std::string_view to_string(const BucketLogType& t);
+bool parse(std::string_view str, BucketLogType& t);
+void encode_json_impl(const char *name, const BucketLogType& t, ceph::Formatter *f);
+void decode_json_obj(BucketLogType& t, JSONObj *obj);
+
+inline std::ostream& operator<<(std::ostream& out, const BucketLogType &log_type)  // spells the names out here instead of calling to_string()
+{
+  switch (log_type) {
+  case BucketLogType::InIndex:
+    return out << "InIndex";
+  default:
+    return out << "Unknown";
+  }
+}
+
+struct bucket_index_log_layout {
+  uint64_t gen = 0;
+  bucket_index_normal_layout layout;
+  operator bucket_index_layout_generation() const {  // implicit conversion: copies gen and layout into a Normal-type index layout
+    bucket_index_layout_generation bilg;
+    bilg.gen = gen;
+    bilg.layout.type = BucketIndexType::Normal;
+    bilg.layout.normal = layout;
+    return bilg;
+  }
+};
+
+void encode(const bucket_index_log_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_log_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_log_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_log_layout& l, JSONObj *obj);
+
+struct bucket_log_layout {
+  BucketLogType type = BucketLogType::InIndex;
+
+  bucket_index_log_layout in_index;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_log_layout& l) {
+    out << "type=" << to_string(l.type);  // in_index is not printed
+    return out;
+  }
+};
+
+void encode(const bucket_log_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_log_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_log_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_log_layout& l, JSONObj *obj);
+
+struct bucket_log_layout_generation {
+  uint64_t gen = 0;
+  bucket_log_layout layout;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_log_layout_generation& g) {
+    out << "gen=" << g.gen << ", layout=[ " << g.layout << " ]";
+    return out;
+  }
+};
+
+void encode(const bucket_log_layout_generation& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_log_layout_generation& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_log_layout_generation& l, ceph::Formatter *f);
+void decode_json_obj(bucket_log_layout_generation& l, JSONObj *obj);
+
+// return a log layout that shares its layout with the index
+inline bucket_log_layout_generation log_layout_from_index(
+    uint64_t gen, const bucket_index_layout_generation& index)
+{
+  return {gen, {BucketLogType::InIndex, {index.gen, index.layout.normal}}};  // nested aggregate init: {log gen, {type, {index gen, normal layout}}}
+}
+
+inline auto matches_gen(uint64_t gen)  // returns a predicate over bucket_log_layout_generation that compares its gen
+{
+  return [gen] (const bucket_log_layout_generation& l) { return l.gen == gen; };
+}
+
+inline bucket_index_layout_generation log_to_index_layout(const bucket_log_layout_generation& log_layout)
+{
+  ceph_assert(log_layout.layout.type == BucketLogType::InIndex);  // only InIndex logs have an index layout to copy
+  bucket_index_layout_generation index;
+  index.gen = log_layout.layout.in_index.gen;
+  index.layout.normal = log_layout.layout.in_index.layout;  // index.layout.type is not assigned here
+  return index;
+}
+
+enum class BucketReshardState : uint8_t {
+  None,
+  InProgress,
+};
+std::string_view to_string(const BucketReshardState& s);
+bool parse(std::string_view str, BucketReshardState& s);
+void encode_json_impl(const char *name, const BucketReshardState& s, ceph::Formatter *f);
+void decode_json_obj(BucketReshardState& s, JSONObj *obj);
+
+// describes the layout of bucket index objects
+struct BucketLayout {
+  BucketReshardState resharding = BucketReshardState::None;
+
+  // current bucket index layout
+  bucket_index_layout_generation current_index;
+
+  // target index layout of a resharding operation
+  std::optional target_index;  // NOTE(review): template argument appears to have been lost in extraction - confirm against upstream
+
+  // history of untrimmed bucket log layout generations, with the current
+  // generation at the back()
+  std::vector logs;  // NOTE(review): template argument appears to have been lost in extraction - confirm against upstream
+
+  friend std::ostream& operator<<(std::ostream& out, const BucketLayout& l) {
+    std::stringstream ss;  // holds the target_index text, or "none" when it is unset
+    if (l.target_index) {
+      ss << *l.target_index;
+    } 
else {
+      ss << "none";
+    }
+    out << "resharding=" << to_string(l.resharding) <<
+      ", current_index=[" << l.current_index << "], target_index=[" <<
+      ss.str() << "], logs.size()=" << l.logs.size();  // only the number of log generations is printed
+
+    return out;
+  }
+};
+
+void encode(const BucketLayout& l, bufferlist& bl, uint64_t f=0);
+void decode(BucketLayout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const BucketLayout& l, ceph::Formatter *f);
+void decode_json_obj(BucketLayout& l, JSONObj *obj);
+
+
+inline uint32_t num_shards(const bucket_index_normal_layout& index) {
+  // old buckets used num_shards=0 to mean 1
+  return index.num_shards > 0 ? index.num_shards : 1;
+}
+inline uint32_t num_shards(const bucket_index_layout& index) {
+  ceph_assert(index.type == BucketIndexType::Normal);  // asserts rather than returning a count for an Indexless layout
+  return num_shards(index.normal);
+}
+inline uint32_t num_shards(const bucket_index_layout_generation& index) {
+  return num_shards(index.layout);
+}
+inline uint32_t current_num_shards(const BucketLayout& layout) {
+  return num_shards(layout.current_index);
+}
+inline bool is_layout_indexless(const bucket_index_layout_generation& layout) {
+  return layout.layout.type == BucketIndexType::Indexless;
+}
+
+} // namespace rgw
diff --git a/src/rgw/rgw_bucket_sync_cache.h b/src/rgw/rgw_bucket_sync_cache.h
new file mode 100644
index 000000000..064fdce48
--- /dev/null
+++ b/src/rgw/rgw_bucket_sync_cache.h
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include  // NOTE(review): text inside angle brackets appears to have been stripped throughout this header by extraction (this include, `std::pair>`, `std::optional gen`, `template` with no parameter list, `intrusive_ptr cache`); restore from upstream before compiling
+#include "common/intrusive_lru.h"
+#include "rgw_data_sync.h"
+
+namespace rgw::bucket_sync {
+
+// per bucket-shard state cached by DataSyncShardCR
+struct State {
+  // the source bucket shard to sync
+  std::pair> key;
+  // current sync obligation being processed by DataSyncSingleEntry
+  std::optional obligation;
+  // incremented with each new obligation
+  uint32_t counter = 0;
+  // highest timestamp applied by all sources
+  ceph::real_time progress_timestamp;
+
+  State(const std::pair>& key ) noexcept
+    : key(key) {}
+  State(const rgw_bucket_shard& shard, std::optional gen) noexcept
+    : key(shard, gen) {}  // builds the pair key from its two parts
+};
+
+struct Entry;
+struct EntryToKey;
+class Handle;
+
+using lru_config = ceph::common::intrusive_lru_config<
+  std::pair>, Entry, EntryToKey>;
+
+// a recyclable cache entry
+struct Entry : State, ceph::common::intrusive_lru_base {
+  using State::State;  // inherit both State constructors
+};
+
+struct EntryToKey {
+  using type = std::pair>;
+  const type& operator()(const Entry& e) { return e.key; }  // key extractor: returns a reference to the entry's own key
+};
+
+// use a non-atomic reference count since these aren't shared across threads
+template 
+using thread_unsafe_ref_counter = boost::intrusive_ref_counter<
+    T, boost::thread_unsafe_counter>;
+
+// a state cache for entries within a single datalog shard
+class Cache : public thread_unsafe_ref_counter {
+  ceph::common::intrusive_lru cache;
+ protected:
+  // protected ctor to enforce the use of factory function create()
+  explicit Cache(size_t target_size) {
+    cache.set_target_size(target_size);
+  }
+ public:
+  static boost::intrusive_ptr create(size_t target_size) {
+    return new Cache(target_size);  // the raw pointer is converted to the intrusive_ptr return type
+  }
+
+  // find or create a cache entry for the given key, and return a Handle that
+  // keeps it lru-pinned until destruction
+  Handle get(const rgw_bucket_shard& shard, std::optional gen);
+};
+
+// a State handle that keeps the Cache referenced
+class Handle {
+  boost::intrusive_ptr cache;
+  boost::intrusive_ptr entry;
+ public:
+  Handle() noexcept = default;
+
~Handle() = default;
+  Handle(boost::intrusive_ptr cache,
+         boost::intrusive_ptr entry) noexcept  // NOTE(review): intrusive_ptr template arguments appear to have been lost in extraction - confirm against upstream
+    : cache(std::move(cache)), entry(std::move(entry)) {}
+  Handle(Handle&&) = default;
+  Handle(const Handle&) = default;
+  Handle& operator=(Handle&& o) noexcept {
+    // move the entry first so that its cache stays referenced over destruction
+    entry = std::move(o.entry);
+    cache = std::move(o.cache);
+    return *this;
+  }
+  Handle& operator=(const Handle& o) noexcept {
+    // copy the entry first so that its cache stays referenced over destruction
+    entry = o.entry;
+    cache = o.cache;
+    return *this;
+  }
+
+  explicit operator bool() const noexcept { return static_cast(entry); }  // NOTE(review): the cast's target type appears to have been lost in extraction - confirm against upstream
+  State& operator*() const noexcept { return *entry; }
+  State* operator->() const noexcept { return entry.get(); }
+};
+
+inline Handle Cache::get(const rgw_bucket_shard& shard, std::optional gen)
+{
+  auto result = cache.get_or_create({ shard, gen });
+  return {this, std::move(result.first)};  // only result.first is used; the Handle is built from this cache and that entry
+}
+
+} // namespace rgw::bucket_sync
diff --git a/src/rgw/rgw_bucket_types.h b/src/rgw/rgw_bucket_types.h
new file mode 100644
index 000000000..61acc58bb
--- /dev/null
+++ b/src/rgw/rgw_bucket_types.h
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types.
Do not
+ * include files which can only be compiled in radosgw or OSD
+ * contexts (e.g., rgw_sal.h, rgw_common.h) */
+
+#pragma once
+
+#include  // NOTE(review): header name appears to have been lost in extraction - confirm against upstream
+
+#include "rgw_pool_types.h"
+#include "rgw_user_types.h"
+#include "rgw_placement_types.h"
+
+#include "common/dout.h"
+#include "common/Formatter.h"
+
+struct cls_user_bucket;
+
+struct rgw_bucket_key {
+  std::string tenant;
+  std::string name;
+  std::string bucket_id;
+
+  rgw_bucket_key(const std::string& _tenant,
+                 const std::string& _name,
+                 const std::string& _bucket_id) : tenant(_tenant),
+                                                  name(_name),
+                                                  bucket_id(_bucket_id) {}
+  rgw_bucket_key(const std::string& _tenant,
+                 const std::string& _name) : tenant(_tenant),
+                                             name(_name) {}  // bucket_id is left default-constructed (empty)
+};
+
+struct rgw_bucket {
+  std::string tenant;
+  std::string name;
+  std::string marker;
+  std::string bucket_id;
+  rgw_data_placement_target explicit_placement;
+
+  rgw_bucket() { }
+  // cppcheck-suppress noExplicitConstructor
+  explicit rgw_bucket(const rgw_user& u, const cls_user_bucket& b);
+
+  rgw_bucket(const std::string& _tenant,
+             const std::string& _name,
+             const std::string& _bucket_id) : tenant(_tenant),
+                                              name(_name),
+                                              bucket_id(_bucket_id) {}
+  rgw_bucket(const rgw_bucket_key& bk) : tenant(bk.tenant),
+                                         name(bk.name),
+                                         bucket_id(bk.bucket_id) {}  // not explicit: an rgw_bucket_key converts implicitly
+  rgw_bucket(const rgw_bucket&) = default;
+  rgw_bucket(rgw_bucket&&) = default;
+
+  bool match(const rgw_bucket& b) const {  // looser than operator==: an empty bucket_id on either side matches any id
+    return (tenant == b.tenant &&
+            name == b.name &&
+            (bucket_id == b.bucket_id ||
+             bucket_id.empty() ||
+             b.bucket_id.empty()));
+  }
+
+  void convert(cls_user_bucket *b) const;
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(10, 10, bl);
+    encode(name, bl);
+    encode(marker, bl);
+    encode(bucket_id, bl);
+    encode(tenant, bl);
+    bool encode_explicit = !explicit_placement.data_pool.empty();  // the three pools are written only when a data pool is set
+    encode(encode_explicit, bl);
+    if (encode_explicit) {
+      encode(explicit_placement.data_pool, bl);
+      encode(explicit_placement.data_extra_pool, bl);
+      encode(explicit_placement.index_pool, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& bl) {  // reads the current layout and the older ones selected by struct_v below
+    DECODE_START_LEGACY_COMPAT_LEN(10, 3, 3, bl);
+    decode(name, bl);
+    if (struct_v < 10) {  // before v10 the data pool name sat directly after the bucket name
+      decode(explicit_placement.data_pool.name, bl);
+    }
+    if (struct_v >= 2) {
+      decode(marker, bl);
+      if (struct_v <= 3) {  // up to v3 the bucket id was a number; convert it to its decimal string
+        uint64_t id;
+        decode(id, bl);
+        char buf[21];  // a uint64_t needs up to 20 decimal digits plus NUL; the former 16 bytes truncated ids >= 10^15
+        snprintf(buf, sizeof(buf), "%" PRIu64, id);
+        bucket_id = buf;
+      } else {
+        decode(bucket_id, bl);
+      }
+    }
+    if (struct_v < 10) {
+      if (struct_v >= 5) {
+        decode(explicit_placement.index_pool.name, bl);
+      } else {
+        explicit_placement.index_pool = explicit_placement.data_pool;  // no separate index pool on the wire: reuse the data pool
+      }
+      if (struct_v >= 7) {
+        decode(explicit_placement.data_extra_pool.name, bl);
+      }
+    }
+    if (struct_v >= 8) {
+      decode(tenant, bl);
+    }
+    if (struct_v >= 10) {
+      bool decode_explicit = !explicit_placement.data_pool.empty();  // initial value is overwritten by the decode() on the next line
+      decode(decode_explicit, bl);
+      if (decode_explicit) {  // mirrors encode(): the three pools are present only when the flag is set
+        decode(explicit_placement.data_pool, bl);
+        decode(explicit_placement.data_extra_pool, bl);
+        decode(explicit_placement.index_pool, bl);
+      }
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void update_bucket_id(const std::string& new_bucket_id) {
+    bucket_id = new_bucket_id;
+  }
+
+  // format a key for the bucket/instance. 
pass delim=0 to skip a field
+  std::string get_key(char tenant_delim = '/',
+                      char id_delim = ':',
+                      size_t reserve = 0) const;
+
+  const rgw_pool& get_data_extra_pool() const {
+    return explicit_placement.get_data_extra_pool();
+  }
+
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list& o);  // NOTE(review): std::list template argument appears to have been lost in extraction - confirm against upstream
+
+  rgw_bucket& operator=(const rgw_bucket&) = default;
+
+  bool operator<(const rgw_bucket& b) const {  // lexicographic order over (tenant, name, bucket_id)
+    if (tenant < b.tenant) {
+      return true;
+    } else if (tenant > b.tenant) {
+      return false;
+    }
+
+    if (name < b.name) {
+      return true;
+    } else if (name > b.name) {
+      return false;
+    }
+
+    return (bucket_id < b.bucket_id);
+  }
+
+  bool operator==(const rgw_bucket& b) const {
+    return (tenant == b.tenant) && (name == b.name) && \
+           (bucket_id == b.bucket_id);
+  }
+  bool operator!=(const rgw_bucket& b) const {  // marker and explicit_placement take no part in ==, != or <
+    return (tenant != b.tenant) || (name != b.name) ||
+        (bucket_id != b.bucket_id);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_bucket)
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket &b) {
+  out << b.tenant << ":" << b.name << "[" << b.bucket_id << "])";  // NOTE(review): the trailing ")" has no matching "(" in this format - confirm whether it is intended
+  return out;
+}
+
+struct rgw_bucket_placement {
+  rgw_placement_rule placement_rule;
+  rgw_bucket bucket;
+
+  void dump(Formatter *f) const;
+}; /* rgw_bucket_placement */
+
+struct rgw_bucket_shard {
+  rgw_bucket bucket;
+  int shard_id;
+
+  rgw_bucket_shard() : shard_id(-1) {}  // default shard id is -1
+  rgw_bucket_shard(const rgw_bucket& _b, int _sid) : bucket(_b), shard_id(_sid) {}
+
+  std::string get_key(char tenant_delim = '/', char id_delim = ':',
+                      char shard_delim = ':',
+                      size_t reserve = 0) const;
+
+  bool operator<(const rgw_bucket_shard& b) const {  // orders by bucket first, then by shard_id
+    if (bucket < b.bucket) {
+      return true;
+    }
+    if (b.bucket < bucket) {
+      return false;
+    }
+    return shard_id < b.shard_id;
+  }
+
+  bool operator==(const rgw_bucket_shard& b) const {
+    return (bucket == b.bucket &&
+            shard_id == b.shard_id);
+  }
+}; /* rgw_bucket_shard */
+
+void encode(const 
rgw_bucket_shard& b, bufferlist& bl, uint64_t f=0);
+void decode(rgw_bucket_shard& b, bufferlist::const_iterator& bl);
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_shard& bs) {
+  if (bs.shard_id <= 0) {  // NOTE(review): shard 0 prints the same as "no shard" (negative id) - confirm this is intended
+    return out << bs.bucket;
+  }
+
+  return out << bs.bucket << ":" << bs.shard_id;
+}
diff --git a/src/rgw/rgw_cache.cc b/src/rgw/rgw_cache.cc
new file mode 100644
index 000000000..dd7a826cd
--- /dev/null
+++ b/src/rgw/rgw_cache.cc
@@ -0,0 +1,419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_cache.h"
+#include "rgw_perf_counters.h"
+
+#include  // NOTE(review): header name appears to have been lost in extraction - confirm against upstream
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int ObjectCache::get(const DoutPrefixProvider *dpp, const string& name, ObjectCacheInfo& info, uint32_t mask, rgw_cache_entry_info *cache_info)
+{
+
+  std::shared_lock rl{lock};
+  std::unique_lock wl{lock, std::defer_lock}; // may be promoted to write lock
+  if (!enabled) {
+    return -ENOENT;  // a disabled cache reports a miss without touching the miss counter
+  }
+  auto iter = cache_map.find(name);
+  if (iter == cache_map.end()) {
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : miss" << dendl;
+    if (perfcounter) {
+      perfcounter->inc(l_rgw_cache_miss);
+    }
+    return -ENOENT;
+  }
+
+  if (expiry.count() &&
+       (ceph::coarse_mono_clock::now() - iter->second.info.time_added) > expiry) {  // expiry of zero disables this check
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : expiry miss" << dendl;
+    rl.unlock();  // the shared lock is dropped before the exclusive one is taken, so the map may change in between
+    wl.lock(); // write lock for expiration
+    // check that wasn't already removed by other thread
+    iter = cache_map.find(name);
+    if (iter != cache_map.end()) {
+      for (auto &kv : iter->second.chained_entries)
+        kv.first->invalidate(kv.second);
+      remove_lru(name, iter->second.lru_iter);
+      cache_map.erase(iter);
+    }
+    if (perfcounter) {
+      perfcounter->inc(l_rgw_cache_miss);
+    }
+    return -ENOENT;
+  }
+
+  ObjectCacheEntry *entry = &iter->second;
+
+  if (lru_counter - entry->lru_promotion_ts > lru_window) {
+    ldpp_dout(dpp, 20) << "cache get: touching 
lru, lru_counter=" << lru_counter
+                   << " promotion_ts=" << entry->lru_promotion_ts << dendl;
+    rl.unlock();  // shared lock released first; iter and entry are re-fetched under the write lock below
+    wl.lock(); // write lock for touch_lru()
+    /* need to redo this because entry might have dropped off the cache */
+    iter = cache_map.find(name);
+    if (iter == cache_map.end()) {
+      ldpp_dout(dpp, 10) << "lost race! cache get: name=" << name << " : miss" << dendl;
+      if(perfcounter) perfcounter->inc(l_rgw_cache_miss);
+      return -ENOENT;
+    }
+
+    entry = &iter->second;
+    /* check again, we might have lost a race here */
+    if (lru_counter - entry->lru_promotion_ts > lru_window) {
+      touch_lru(dpp, name, *entry, iter->second.lru_iter);
+    }
+  }
+
+  ObjectCacheInfo& src = iter->second.info;
+  if(src.status == -ENOENT) {
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : hit (negative entry)" << dendl;
+    if (perfcounter) perfcounter->inc(l_rgw_cache_hit);
+    return -ENODATA;  // a cached negative entry counts as a hit and returns -ENODATA, where a miss returns -ENOENT
+  }
+  if ((src.flags & mask) != mask) {  // every requested flag bit must be present in the cached entry
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : type miss (requested=0x"
+                       << std::hex << mask << ", cached=0x" << src.flags
+                       << std::dec << ")" << dendl;
+    if(perfcounter) perfcounter->inc(l_rgw_cache_miss);
+    return -ENOENT;
+  }
+  ldpp_dout(dpp, 10) << "cache get: name=" << name << " : hit (requested=0x"
+                     << std::hex << mask << ", cached=0x" << src.flags
+                     << std::dec << ")" << dendl;
+
+  info = src;  // caller receives a copy of the cached info
+  if (cache_info) {
+    cache_info->cache_locator = name;
+    cache_info->gen = entry->gen;  // chain_cache_entry() later compares this gen with the entry's current gen
+  }
+  if(perfcounter) perfcounter->inc(l_rgw_cache_hit);
+
+  return 0;
+}
+
+bool ObjectCache::chain_cache_entry(const DoutPrefixProvider *dpp,
+                                    std::initializer_list cache_info_entries,  // NOTE(review): initializer_list template argument appears to have been lost in extraction - confirm against upstream
+                                    RGWChainedCache::Entry *chained_entry)
+{
+  std::unique_lock l{lock};
+
+  if (!enabled) {
+    return false;
+  }
+
+  std::vector entries;  // NOTE(review): vector template argument appears to have been lost in extraction - confirm against upstream
+  entries.reserve(cache_info_entries.size());
+  /* first verify that all entries are still valid */
+  for (auto cache_info : cache_info_entries) {
+    ldpp_dout(dpp, 10) << "chain_cache_entry: cache_locator="
+                       << cache_info->cache_locator 
<< dendl;
+    auto iter = cache_map.find(cache_info->cache_locator);
+    if (iter == cache_map.end()) {
+      ldpp_dout(dpp, 20) << "chain_cache_entry: couldn't find cache locator" << dendl;
+      return false;
+    }
+
+    auto entry = &iter->second;
+
+    if (entry->gen != cache_info->gen) {  // the entry's gen has changed since cache_info was filled in
+      ldpp_dout(dpp, 20) << "chain_cache_entry: entry.gen (" << entry->gen
+                         << ") != cache_info.gen (" << cache_info->gen << ")"
+                         << dendl;
+      return false;
+    }
+    entries.push_back(entry);
+  }
+
+
+  chained_entry->cache->chain_cb(chained_entry->key, chained_entry->data);  // reached only after every locator was found with a matching gen
+
+  for (auto entry : entries) {
+    entry->chained_entries.push_back(make_pair(chained_entry->cache,
+                                               chained_entry->key));
+  }
+
+  return true;
+}
+
+void ObjectCache::put(const DoutPrefixProvider *dpp, const string& name, ObjectCacheInfo& info, rgw_cache_entry_info *cache_info)
+{
+  std::unique_lock l{lock};
+
+  if (!enabled) {
+    return;
+  }
+
+  ldpp_dout(dpp, 10) << "cache put: name=" << name << " info.flags=0x"
+                     << std::hex << info.flags << std::dec << dendl;
+
+  auto [iter, inserted] = cache_map.emplace(name, ObjectCacheEntry{});
+  ObjectCacheEntry& entry = iter->second;
+  entry.info.time_added = ceph::coarse_mono_clock::now();  // timestamp is refreshed on every put
+  if (inserted) {
+    entry.lru_iter = lru.end();  // lru.end() marks "not in the LRU list yet"; touch_lru() tests for it
+  }
+  ObjectCacheInfo& target = entry.info;
+
+  invalidate_lru(entry);  // invalidates the chained-cache entries linked to this one
+
+  entry.chained_entries.clear();
+  entry.gen++;
+
+  touch_lru(dpp, name, entry, entry.lru_iter);
+
+  target.status = info.status;
+
+  if (info.status < 0) {  // negative status: keep only the status; returns before cache_info is filled in
+    target.flags = 0;
+    target.xattrs.clear();
+    target.data.clear();
+    return;
+  }
+
+  if (cache_info) {
+    cache_info->cache_locator = name;
+    cache_info->gen = entry.gen;
+  }
+
+  // put() must include the latest version if we're going to keep caching it
+  target.flags &= ~CACHE_FLAG_OBJV;
+
+  target.flags |= info.flags;
+
+  if (info.flags & CACHE_FLAG_META)
+    target.meta = info.meta;
+  else if (!(info.flags & CACHE_FLAG_MODIFY_XATTRS))
+    target.flags &= ~CACHE_FLAG_META; // non-meta change should reset meta
+
+  if (info.flags & 
CACHE_FLAG_XATTRS) {
+    target.xattrs = info.xattrs;  // full replacement of the cached xattrs
+    map::iterator iter;  // NOTE(review): map template arguments appear to have been lost in extraction - confirm against upstream
+    for (iter = target.xattrs.begin(); iter != target.xattrs.end(); ++iter) {  // this loop only logs
+      ldpp_dout(dpp, 10) << "updating xattr: name=" << iter->first << " bl.length()=" << iter->second.length() << dendl;
+    }
+  } else if (info.flags & CACHE_FLAG_MODIFY_XATTRS) {  // incremental update: removals are applied first, then sets
+    map::iterator iter;
+    for (iter = info.rm_xattrs.begin(); iter != info.rm_xattrs.end(); ++iter) {
+      ldpp_dout(dpp, 10) << "removing xattr: name=" << iter->first << dendl;
+      target.xattrs.erase(iter->first);
+    }
+    for (iter = info.xattrs.begin(); iter != info.xattrs.end(); ++iter) {
+      ldpp_dout(dpp, 10) << "appending xattr: name=" << iter->first << " bl.length()=" << iter->second.length() << dendl;
+      target.xattrs[iter->first] = iter->second;
+    }
+  }
+
+  if (info.flags & CACHE_FLAG_DATA)
+    target.data = info.data;
+
+  if (info.flags & CACHE_FLAG_OBJV)
+    target.version = info.version;
+}
+
+// WARNING: This function /must not/ be modified to cache a
+// negative lookup. It must only invalidate.
+bool ObjectCache::invalidate_remove(const DoutPrefixProvider *dpp, const string& name)  // returns true only if an entry was found and removed
+{
+  std::unique_lock l{lock};
+
+  if (!enabled) {
+    return false;
+  }
+
+  auto iter = cache_map.find(name);
+  if (iter == cache_map.end())
+    return false;
+
+  ldpp_dout(dpp, 10) << "removing " << name << " from cache" << dendl;
+  ObjectCacheEntry& entry = iter->second;
+
+  for (auto& kv : entry.chained_entries) {
+    kv.first->invalidate(kv.second);
+  }
+
+  remove_lru(name, iter->second.lru_iter);
+  cache_map.erase(iter);
+  return true;
+}
+
+void ObjectCache::touch_lru(const DoutPrefixProvider *dpp, const string& name, ObjectCacheEntry& entry,
+                            std::list::iterator& lru_iter)  // NOTE(review): std::list template argument appears to have been lost in extraction - confirm against upstream
+{
+  while (lru_size > (size_t)cct->_conf->rgw_cache_lru_size) {  // evict from the front until the list is back within the configured size
+    auto iter = lru.begin();
+    if ((*iter).compare(name) == 0) {
+      /*
+       * if the entry we're touching happens to be at the lru end, don't remove it,
+       * lru shrinking can wait for next time
+       */
+      break;
+    }
+    auto map_iter = cache_map.find(*iter);
+    ldout(cct, 10) << "removing entry: name=" << *iter << " from cache LRU" << dendl;  // NOTE(review): the only log line in this function that uses ldout(cct) instead of ldpp_dout(dpp) - confirm intended
+    if (map_iter != cache_map.end()) {
+      ObjectCacheEntry& entry = map_iter->second;  // shadows the `entry` parameter inside this if-body
+      invalidate_lru(entry);
+      cache_map.erase(map_iter);
+    }
+    lru.pop_front();
+    lru_size--;
+  }
+
+  if (lru_iter == lru.end()) {  // not in the list yet: append and point lru_iter at the new last element
+    lru.push_back(name);
+    lru_size++;
+    lru_iter--;
+    ldpp_dout(dpp, 10) << "adding " << name << " to cache LRU end" << dendl;
+  } else {  // already listed: move it to the back
+    ldpp_dout(dpp, 10) << "moving " << name << " to cache LRU end" << dendl;
+    lru.erase(lru_iter);
+    lru.push_back(name);
+    lru_iter = lru.end();
+    --lru_iter;
+  }
+
+  lru_counter++;
+  entry.lru_promotion_ts = lru_counter;  // get() compares this with lru_counter to decide when to touch again
+}
+
+void ObjectCache::remove_lru(const string& name,
+                             std::list::iterator& lru_iter)  // name is not referenced in the body
+{
+  if (lru_iter == lru.end())
+    return;
+
+  lru.erase(lru_iter);
+  lru_size--;
+  lru_iter = lru.end();  // reset to the "not in list" marker
+}
+
+void ObjectCache::invalidate_lru(ObjectCacheEntry& entry)  // invalidates each chained-cache key recorded on the entry
+{
+  for (auto iter = entry.chained_entries.begin();
+       iter != entry.chained_entries.end(); ++iter) {
+
RGWChainedCache *chained_cache = iter->first;
+    chained_cache->invalidate(iter->second);
+  }
+}
+
+/* Enable or disable the cache under the exclusive lock. Disabling drops all
+ * cached state through do_invalidate_all(). */
+void ObjectCache::set_enabled(bool status)
+{
+  std::unique_lock l{lock};
+
+  enabled = status;
+
+  if (!enabled) {
+    do_invalidate_all();
+  }
+}
+
+/* Locked wrapper around do_invalidate_all(). */
+void ObjectCache::invalidate_all()
+{
+  std::unique_lock l{lock};
+
+  do_invalidate_all();
+}
+
+/* Empty cache_map and the LRU list, zero the LRU bookkeeping and ask every
+ * registered chained cache to invalidate itself. Takes no lock; both callers
+ * visible in this file acquire `lock` first.
+ * NOTE(review): lru_window is zeroed here too, although set_ctx() derives it
+ * from rgw_cache_lru_size and nothing in this function restores it - confirm
+ * that this is intended. */
+void ObjectCache::do_invalidate_all()
+{
+  cache_map.clear();
+  lru.clear();
+
+  lru_size = 0;
+  lru_counter = 0;
+  lru_window = 0;
+
+  for (auto& cache : chained_cache) {
+    cache->invalidate_all();
+  }
+}
+
+/* Register @cache so that it takes part in invalidate-all notifications. */
+void ObjectCache::chain_cache(RGWChainedCache *cache) {
+  std::unique_lock l{lock};
+  chained_cache.push_back(cache);
+}
+
+/* Remove the first registration of @cache and tell it so via unregistered().
+ * Does nothing if @cache is not registered. */
+void ObjectCache::unchain_cache(RGWChainedCache *cache) {
+  std::unique_lock l{lock};
+
+  auto iter = chained_cache.begin();
+  for (; iter != chained_cache.end(); ++iter) {
+    if (cache == *iter) {
+      chained_cache.erase(iter);
+      cache->unregistered();
+      return;
+    }
+  }
+}
+
+/* Tell every chained cache that is still registered that it no longer is. */
+ObjectCache::~ObjectCache()
+{
+  for (auto cache : chained_cache) {
+    cache->unregistered();
+  }
+}
+
+/* Test fixture: one instance with a 1 MiB size and one default instance. */
+void ObjectMetaInfo::generate_test_instances(list& o)
+{
+  ObjectMetaInfo *m = new ObjectMetaInfo;
+  m->size = 1024 * 1024;
+  o.push_back(m);
+  o.push_back(new ObjectMetaInfo);
+}
+
+void ObjectMetaInfo::dump(Formatter *f) const
+{
+  encode_json("size", size, f);
+  encode_json("mtime", utime_t(mtime), f);
+}
+
+/* Test fixture: one populated instance (data, xattrs, rm_xattrs, meta.size)
+ * and one default instance. */
+void ObjectCacheInfo::generate_test_instances(list& o)
+{
+  using ceph::encode;
+  ObjectCacheInfo *i = new ObjectCacheInfo;
+  i->status = 0;
+  i->flags = CACHE_FLAG_MODIFY_XATTRS;
+  string s = "this is a string";
+  string s2 = "this is a another string";
+  bufferlist data, data2;
+  encode(s, data);
+  encode(s2, data2);
+  i->data = data;
+  i->xattrs["x1"] = data;
+  i->xattrs["x2"] = data2;
+  i->rm_xattrs["r2"] = data2;
+  i->rm_xattrs["r3"] = data;
+  i->meta.size = 512 * 1024;
+  o.push_back(i);
+  o.push_back(new ObjectCacheInfo);
+}
+
+/* Dumps status, flags, data, both xattr maps and meta. */
+void ObjectCacheInfo::dump(Formatter *f) const
+{
+  encode_json("status", status, f);
+  encode_json("flags", flags, f);
+  encode_json("data", data, f);
+  encode_json_map("xattrs", "name", "value", "length", xattrs, f);
+  encode_json_map("rm_xattrs", "name", "value", "length", rm_xattrs, f);
+  encode_json("meta", meta, f);
+
+}
+
+void RGWCacheNotifyInfo::generate_test_instances(list& o)
+{
+  o.push_back(new RGWCacheNotifyInfo);
+}
+
+void RGWCacheNotifyInfo::dump(Formatter *f) const
+{
+  encode_json("op", op, f);
+  encode_json("obj", obj, f);
+  encode_json("obj_info", obj_info, f);
+  encode_json("ofs", ofs, f);
+  encode_json("ns", ns, f);
+}
+
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
new file mode 100644
index 000000000..e70beb064
--- /dev/null
+++ b/src/rgw/rgw_cache.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include
+#include
+#include
+#include "include/types.h"
+#include "include/utime.h"
+#include "include/ceph_assert.h"
+#include "common/ceph_mutex.h"
+
+#include "cls/version/cls_version_types.h"
+#include "rgw_common.h"
+
+/* NOTE(review): presumably the values carried in RGWCacheNotifyInfo::op -
+ * confirm against the notification handler. */
+enum {
+  UPDATE_OBJ,
+  INVALIDATE_OBJ,
+};
+
+/* Bits of ObjectCacheInfo::flags. */
+#define CACHE_FLAG_DATA           0x01
+#define CACHE_FLAG_XATTRS         0x02
+#define CACHE_FLAG_META           0x04
+#define CACHE_FLAG_MODIFY_XATTRS  0x08
+#define CACHE_FLAG_OBJV           0x10
+
+/* Size and modification time of an object. */
+struct ObjectMetaInfo {
+  uint64_t size;
+  real_time mtime;
+
+  ObjectMetaInfo() : size(0) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 2, bl);
+    encode(size, bl);
+    encode(mtime, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+    decode(size, bl);
+    decode(mtime, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list& o);
+};
+WRITE_CLASS_ENCODER(ObjectMetaInfo)
+
+struct ObjectCacheInfo {
+  int status = 0;
+  uint32_t flags = 0;        /* combination of CACHE_FLAG_* */
+  uint64_t epoch = 0;
+  bufferlist data;
+  std::map xattrs;
+  std::map rm_xattrs;
+  ObjectMetaInfo meta;
+  obj_version version = {}; 
+  /* Not part of the encoding below: encode()/decode() never touch it. */
+  ceph::coarse_mono_time time_added;
+
+  ObjectCacheInfo() = default;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(5, 3, bl);
+    encode(status, bl);
+    encode(flags, bl);
+    encode(data, bl);
+    encode(xattrs, bl);
+    encode(meta, bl);
+    encode(rm_xattrs, bl);
+    encode(epoch, bl);
+    encode(version, bl);
+    ENCODE_FINISH(bl);
+  }
+  /* Fields added in later struct versions are decoded only when present. */
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+    decode(status, bl);
+    decode(flags, bl);
+    decode(data, bl);
+    decode(xattrs, bl);
+    decode(meta, bl);
+    if (struct_v >= 2)
+      decode(rm_xattrs, bl);
+    if (struct_v >= 4)
+      decode(epoch, bl);
+    if (struct_v >= 5)
+      decode(version, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list& o);
+};
+WRITE_CLASS_ENCODER(ObjectCacheInfo)
+
+/* Encodable record bundling an operation code, the raw object it refers to,
+ * the cache info for that object, an offset and a namespace string. */
+struct RGWCacheNotifyInfo {
+  uint32_t op;
+  rgw_raw_obj obj;
+  ObjectCacheInfo obj_info;
+  off_t ofs;
+  std::string ns;
+
+  RGWCacheNotifyInfo() : op(0), ofs(0) {}
+
+  void encode(bufferlist& obl) const {
+    ENCODE_START(2, 2, obl);
+    encode(op, obl);
+    encode(obj, obl);
+    encode(obj_info, obl);
+    encode(ofs, obl);
+    encode(ns, obl);
+    ENCODE_FINISH(obl);
+  }
+  void decode(bufferlist::const_iterator& ibl) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ibl);
+    decode(op, ibl);
+    decode(obj, ibl);
+    decode(obj_info, ibl);
+    decode(ofs, ibl);
+    decode(ns, ibl);
+    DECODE_FINISH(ibl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list& o);
+};
+WRITE_CLASS_ENCODER(RGWCacheNotifyInfo)
+/* Human-readable one-line form: "[op: .., obj: .., ofs: .., ns: ..]".
+ * Every field uses the same "name: value" layout. */
+inline std::ostream& operator <<(std::ostream& m, const RGWCacheNotifyInfo& cni) {
+  return m << "[op: " << cni.op << ", obj: " << cni.obj
+           << ", ofs: " << cni.ofs << ", ns: " << cni.ns << "]";
+}
+
+
+/* Interface of a secondary cache chained to ObjectCache. ObjectCache calls
+ * invalidate()/invalidate_all() when its own entries are dropped and
+ * unregistered() when the chained cache is detached. */
+class RGWChainedCache {
+public:
+  virtual ~RGWChainedCache() {}
+  virtual void chain_cb(const std::string& key, void *data) = 0;
+  virtual void invalidate(const std::string& key) = 0;
+  virtual void invalidate_all() = 0;
+  virtual void unregistered() {}
+
+  /* Argument bundle for chaining one entry.
+   * NOTE(review): `key` is held by reference, so the referenced string must
+   * outlive the Entry - confirm at the call sites. */
+  struct Entry {
+    RGWChainedCache *cache;
+    const std::string& key;
+    void *data;
+
+    Entry(RGWChainedCache *_c, const std::string& _k, void *_d) : cache(_c), key(_k), data(_d) {}
+  };
+};
+
+
+/* One cache_map value: the cached info plus its LRU bookkeeping and the
+ * chained-cache entries that depend on it. */
+struct ObjectCacheEntry {
+  ObjectCacheInfo info;
+  std::list::iterator lru_iter;
+  uint64_t lru_promotion_ts;
+  uint64_t gen;
+  std::vector > chained_entries;
+
+  ObjectCacheEntry() : lru_promotion_ts(0), gen(0) {}
+};
+
+/* Name-keyed cache of ObjectCacheInfo with LRU eviction, optional expiry and
+ * a list of chained caches that are invalidated together with it. */
+class ObjectCache {
+  std::unordered_map cache_map;
+  std::list lru;
+  unsigned long lru_size;
+  unsigned long lru_counter;
+  unsigned long lru_window;
+  ceph::shared_mutex lock = ceph::make_shared_mutex("ObjectCache");
+  CephContext *cct;
+
+  std::vector chained_cache;
+
+  bool enabled;
+  /* Value-initialised so that for_each() never reads an indeterminate
+   * duration when it runs before set_ctx(). */
+  ceph::timespan expiry{};
+
+  void touch_lru(const DoutPrefixProvider *dpp, const std::string& name, ObjectCacheEntry& entry,
+                 std::list::iterator& lru_iter);
+  void remove_lru(const std::string& name, std::list::iterator& lru_iter);
+  void invalidate_lru(ObjectCacheEntry& entry);
+
+  void do_invalidate_all();
+
+public:
+  /* Starts disabled and without a context; see set_ctx()/set_enabled(). */
+  ObjectCache() : lru_size(0), lru_counter(0), lru_window(0), cct(nullptr), enabled(false) { }
+  ~ObjectCache();
+  int get(const DoutPrefixProvider *dpp, const std::string& name, ObjectCacheInfo& bl, uint32_t mask, rgw_cache_entry_info *cache_info);
+  /* Convenience overload: returns the info, or std::nullopt when the
+   * five-argument get() reports an error (negative result). */
+  std::optional get(const DoutPrefixProvider *dpp, const std::string& name) {
+    std::optional info{std::in_place};
+    auto r = get(dpp, name, *info, 0, nullptr);
+    return r < 0 ? 
std::nullopt : info;
+  }
+
+  /* Call f(name, entry) under a shared lock for each entry added less than
+   * `expiry` ago. Nothing is visited while the cache is disabled.
+   * NOTE(review): with a zero expiry the condition below is always false, so
+   * no entry is visited at all - confirm this is the intended meaning of a
+   * zero rgw_cache_expiry_interval. */
+  template
+  void for_each(const F& f) {
+    std::shared_lock l{lock};
+    if (enabled) {
+      auto now = ceph::coarse_mono_clock::now();
+      for (const auto& [name, entry] : cache_map) {
+        if (expiry.count() && (now - entry.info.time_added) < expiry) {
+          f(name, entry);
+        }
+      }
+    }
+  }
+
+  void put(const DoutPrefixProvider *dpp, const std::string& name, ObjectCacheInfo& bl, rgw_cache_entry_info *cache_info);
+  bool invalidate_remove(const DoutPrefixProvider *dpp, const std::string& name);
+  /* Attach the context and load the LRU window (half of rgw_cache_lru_size)
+   * and the expiry interval (seconds) from its configuration. */
+  void set_ctx(CephContext *_cct) {
+    cct = _cct;
+    lru_window = cct->_conf->rgw_cache_lru_size / 2;
+    expiry = std::chrono::seconds(cct->_conf.get_val(
+                                    "rgw_cache_expiry_interval"));
+  }
+  bool chain_cache_entry(const DoutPrefixProvider *dpp,
+                         std::initializer_list cache_info_entries,
+                         RGWChainedCache::Entry *chained_entry);
+
+  void set_enabled(bool status);
+
+  void chain_cache(RGWChainedCache *cache);
+  void unchain_cache(RGWChainedCache *cache);
+  void invalidate_all();
+};
diff --git a/src/rgw/rgw_client_io.cc b/src/rgw/rgw_client_io.cc
new file mode 100644
index 000000000..ed0925093
--- /dev/null
+++ b/src/rgw/rgw_client_io.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include
+#include
+#include
+
+#include "rgw_client_io.h"
+#include "rgw_crypt.h"
+#include "rgw_crypt_sanitize.h"
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw {
+namespace io {
+
+[[nodiscard]] int BasicClient::init(CephContext *cct) {
+  int init_error = init_env(cct);
+
+  if (init_error != 0)
+    return init_error;
+
+  if (cct->_conf->subsys.should_gather()) {
+    const auto& env_map = get_env().get_map();
+
+    for (const auto& iter: env_map) {
+      rgw::crypt_sanitize::env x{iter.first, iter.second};
+      ldout(cct, 20) << iter.first << "=" << (x) << dendl;
+    }
+  }
+  return init_error;
+}
+
+} /* namespace io */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_client_io.h b/src/rgw/rgw_client_io.h
new 
file mode 100644
index 000000000..aedfe4500
--- /dev/null
+++ b/src/rgw/rgw_client_io.h
@@ -0,0 +1,435 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "include/types.h"
+#include "rgw_common.h"
+
+
+class RGWRestfulIO;
+
+namespace rgw {
+namespace io {
+
+/* Exception type thrown by the I/O methods below; carries an errno value. */
+using Exception = std::system_error;
+
+/* The minimal and simplest subset of methods through which a client of
+ * RadosGW can be interacted with. */
+class BasicClient {
+protected:
+  /* Implementation hook called by init(); returns 0 on success. */
+  virtual int init_env(CephContext *cct) = 0;
+
+public:
+  virtual ~BasicClient() = default;
+
+  /* Initialize the BasicClient and inject CephContext. */
+  int init(CephContext *cct);
+
+  /* Return the RGWEnv describing the environment that a given request lives in.
+   * The method does not throw exceptions. */
+  virtual RGWEnv& get_env() noexcept = 0;
+
+  /* Complete request.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual size_t complete_request() = 0;
+}; /* rgw::io::BasicClient */
+
+
+/* Interface for byte accounting of a client connection. */
+class Accounter {
+public:
+  virtual ~Accounter() = default;
+
+  /* Enable or disable the accounting of both sent and received data. Changing
+   * the state does not affect the counters. */
+  virtual void set_account(bool enabled) = 0;
+
+  /* Return number of bytes sent to a direct client of RadosGW (direct means
+   * eg. a web server instance in the case of using FastCGI front-end) when
+   * the accounting was enabled. */
+  virtual uint64_t get_bytes_sent() const = 0;
+
+  /* Return number of bytes received from a direct client of RadosGW (direct
+   * means eg. a web server instance in the case of using FastCGI front-end)
+   * when the accounting was enabled. */
+  virtual uint64_t get_bytes_received() const = 0;
+}; /* rgw::io::Accounter */
+
+
+/* Interface abstracting restful interactions with clients, usually through
+ * the HTTP protocol. The methods participating in the response generation
+ * process should be called in the specific order:
+ *   1. send_100_continue() - at most once,
+ *   2. send_status() - exactly once,
+ *   3. Any of:
+ *      a. send_header(),
+ *      b. send_content_length() XOR send_chunked_transfer_encoding()
+ *         Please note that only one of those two methods may be called,
+ *         and at most once.
+ *   4. complete_header() - exactly once,
+ *   5. send_body()
+ *   6. complete_request() - exactly once.
+ * There are no restrictions on flush() - it may be called in any moment.
+ *
+ * Receiving data from a client isn't a subject to any further call order
+ * restrictions besides those imposed by BasicClient. That is, get_env()
+ * and recv_body can be mixed. */
+class RestfulClient : public BasicClient {
+  template friend class DecoratedRestfulClient;
+
+public:
+  /* Generate the 100 Continue message.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual size_t send_100_continue() = 0;
+
+  /* Generate the response's status part taking the HTTP status code as @status
+   * and its name pointed in @status_name.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual size_t send_status(int status, const char *status_name) = 0;
+
+  /* Generate header. On success returns number of bytes generated for a direct
+   * client of RadosGW. On failure throws rgw::io::Exception containing errno.
+   *
+   * std::string_view is being used because of length it internally carries. */
+  virtual size_t send_header(const std::string_view& name,
+                             const std::string_view& value) = 0;
+
+  /* Inform a client about a content length. Takes number of bytes as @len.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno.
+   *
+   * CALL LIMITATIONS:
+   *  - The method must be called EXACTLY ONCE.
+   *  - The method is interchangeable with send_chunked_transfer_encoding(). */
+  virtual size_t send_content_length(uint64_t len) = 0;
+
+  /* Inform a client that the chunked transfer encoding will be used.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno.
+   *
+   * CALL LIMITATIONS:
+   *  - The method must be called EXACTLY ONCE.
+   *  - The method is interchangeable with send_content_length(). */
+  virtual size_t send_chunked_transfer_encoding() {
+    /* This is a null implementation. We don't send anything here, even the HTTP
+     * header. The intended behaviour should be provided through a decorator or
+     * directly by a given front-end. */
+    return 0;
+  }
+
+  /* Generate completion (the CRLF sequence separating headers and body in
+   * the case of HTTP) of headers. On success returns number of generated bytes
+   * for a direct client of RadosGW. On failure throws rgw::io::Exception with
+   * errno. */
+  virtual size_t complete_header() = 0;
+
+  /* Receive no more than @max bytes from a request's body and store it in
+   * buffer pointed by @buf. On success returns number of bytes received from
+   * a direct client of RadosGW that has been stored in @buf. On failure throws
+   * rgw::io::Exception containing errno. */
+  virtual size_t recv_body(char* buf, size_t max) = 0;
+
+  /* Generate a part of response's body by taking exactly @len bytes from
+   * the buffer pointed by @buf. On success returns number of generated bytes
+   * of response's body. On failure throws rgw::io::Exception. */
+  virtual size_t send_body(const char* buf, size_t len) = 0;
+
+  /* Flushes all already generated data to a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual void flush() = 0;
+} /* rgw::io::RestfulClient */;
+
+
+/* Abstract decorator over any implementation of rgw::io::RestfulClient
+ * which could be provided both as a pointer-to-object or the object itself.
+ * Every RestfulClient method is forwarded unchanged to the decoratee;
+ * concrete filters override only the calls they want to intercept. */
+template
+class DecoratedRestfulClient : public RestfulClient {
+  template friend class DecoratedRestfulClient;
+  friend RGWRestfulIO;
+
+  typedef typename std::remove_pointer::type DerefedDecorateeT;
+
+  static_assert(std::is_base_of::value,
+                "DecorateeT must be a subclass of rgw::io::RestfulClient");
+
+  DecorateeT decoratee;
+
+  /* There is an indirection layer over accessing decoratee to share the same
+   * code base between dynamic and static decorators. The difference is about
+   * what we store internally: pointer to a decorated object versus the whole
+   * object itself. */
+  /* This overload returns the stored object itself. */
+  template ::value, T>::type* = nullptr>
+  DerefedDecorateeT& get_decoratee() {
+    return decoratee;
+  }
+
+protected:
+  /* This overload dereferences the stored pointer. */
+  template ::value, T>::type* = nullptr>
+  DerefedDecorateeT& get_decoratee() {
+    return *decoratee;
+  }
+
+  /* Dynamic decorators (those storing a pointer instead of the decorated
+   * object itself) can be reconfigured on-the-fly. HOWEVER: there are no
+   * facilities for orchestrating such changes. Callers must take care of
+   * atomicity and thread-safety. */
+  template ::value, T>::type* = nullptr>
+  void set_decoratee(DerefedDecorateeT& new_dec) {
+    decoratee = &new_dec;
+  }
+
+  int init_env(CephContext *cct) override {
+    return get_decoratee().init_env(cct);
+  }
+
+public:
+  explicit DecoratedRestfulClient(DecorateeT&& decoratee)
+    : decoratee(std::forward(decoratee)) {
+  }
+
+  size_t send_status(const int status,
+                     const char* const status_name) override {
+    return get_decoratee().send_status(status, status_name);
+  }
+
+  size_t send_100_continue() override {
+    return get_decoratee().send_100_continue();
+  }
+
+  size_t send_header(const std::string_view& name,
+                     const std::string_view& value) override {
+    return get_decoratee().send_header(name, value);
+  }
+
+  size_t send_content_length(const uint64_t len) override {
+    return get_decoratee().send_content_length(len);
+  }
+
+  size_t send_chunked_transfer_encoding() override {
+    return get_decoratee().send_chunked_transfer_encoding();
+  }
+
+  size_t complete_header() override {
+    return get_decoratee().complete_header();
+  }
+
+  size_t recv_body(char* const buf, const size_t max) override {
+    return get_decoratee().recv_body(buf, max);
+  }
+
+  size_t send_body(const char* const buf,
+                   const size_t len) override {
+    return get_decoratee().send_body(buf, len);
+  }
+
+  void flush() override {
+    return get_decoratee().flush();
+  }
+
+  RGWEnv& get_env() noexcept override {
+    return get_decoratee().get_env();
+  }
+
+  size_t complete_request() override {
+    return get_decoratee().complete_request();
+  }
+} /* rgw::io::DecoratedRestfulClient */;
+
+
+/* Interface that should be provided by a front-end class wanting to use
+ * the low-level buffering offered by i.e. StaticOutputBufferer. */
+class BuffererSink {
+public:
+  virtual ~BuffererSink() = default;
+
+  /* Send exactly @len bytes from the memory location pointed by @buf.
+   * On success returns @len. On failure throws rgw::io::Exception. 
*/
+  virtual size_t write_data(const char *buf, size_t len) = 0;
+};
+
+/* Utility class providing RestfulClient's implementations with facilities
+ * for low-level buffering without relying on dynamic memory allocations.
+ * The buffer is carried entirely on stack. This narrows down applicability
+ * to these situations where buffers are relatively small. This perfectly
+ * fits the needs of composing an HTTP header. Without that a front-end
+ * might need to issue a lot of small IO operations leading to increased
+ * overhead on syscalls and fragmentation of a message if the Nagle's
+ * algorithm won't be able to form a single TCP segment (usually when
+ * running on extremely fast network interfaces like the loopback). */
+template
+class StaticOutputBufferer : public std::streambuf {
+  static_assert(BufferSizeV >= sizeof(std::streambuf::char_type),
+                "Buffer size must be bigger than a single char_type.");
+
+  using std::streambuf::int_type;
+
+  /* Called when the put area is full. The constructor keeps one char_type
+   * slot beyond epptr() in reserve, so @c can always be stored before the
+   * buffer is flushed to the sink.
+   *
+   * Per the std::streambuf contract, @c may be eof(), meaning "flush only":
+   * in that case nothing is appended, and success is reported with a value
+   * other than eof(). */
+  int_type overflow(const int_type c) override {
+    using traits = std::streambuf::traits_type;
+
+    if (! traits::eq_int_type(c, traits::eof())) {
+      *pptr() = traits::to_char_type(c);
+      pbump(sizeof(std::streambuf::char_type));
+    }
+
+    if (! sync()) {
+      /* No error, the buffer has been successfully synchronized. */
+      return traits::not_eof(c);
+    } else {
+      return traits::eof();
+    }
+  }
+
+  /* Hand everything between pbase() and pptr() to the sink and rewind the
+   * put pointer to the start of the buffer. */
+  int sync() override {
+    const auto len = static_cast(std::streambuf::pptr() -
+                                 std::streambuf::pbase());
+    std::streambuf::pbump(-len);
+    sink.write_data(std::streambuf::pbase(), len);
+    /* Always return success here. In case of failure write_data() will throw
+     * rgw::io::Exception. */
+    return 0;
+  }
+
+  BuffererSink& sink;
+  std::streambuf::char_type buffer[BufferSizeV];
+
+public:
+  explicit StaticOutputBufferer(BuffererSink& sink)
+    : sink(sink) {
+    /* The put area ends one char_type early; overflow() uses that last slot. */
+    constexpr size_t len = sizeof(buffer) - sizeof(std::streambuf::char_type);
+    std::streambuf::setp(buffer, buffer + len);
+  }
+};
+
+} /* namespace io */
+} /* namespace rgw */
+
+
+/* We're doing this nasty thing only because of extensive usage of templates
+ * to implement the static decorator pattern. 
C++ templates de facto enforce
+ * mixing interfaces with implementation. Additionally, those classes derive
+ * from RGWRestfulIO defined here. I believe that including in the middle of
+ * file is still better than polluting it directly. */
+#include "rgw_client_io_filters.h"
+
+
+/* RGWRestfulIO: high level interface to interact with RESTful clients. What
+ * differentiates it from rgw::io::RestfulClient is providing more specific APIs
+ * like rgw::io::Accounter or the AWS Auth v4 stuff implemented by filters
+ * while hiding the pipelined architecture from clients.
+ *
+ * rgw::io::Accounter came in as a part of rgw::io::AccountingFilter. */
+class RGWRestfulIO : public rgw::io::AccountingFilter {
+  /* Keeps the filters added through add_filter() alive. */
+  std::vector> filters;
+
+public:
+  ~RGWRestfulIO() override = default;
+
+  RGWRestfulIO(CephContext *_cx, rgw::io::RestfulClient* engine)
+    : AccountingFilter(_cx, std::move(engine)) {
+  }
+
+  /* Insert @new_filter directly below this object: the filter takes over
+   * the current decoratee and then becomes this object's decoratee. */
+  void add_filter(std::shared_ptr new_filter) {
+    new_filter->set_decoratee(this->get_decoratee());
+    this->set_decoratee(*new_filter);
+    filters.emplace_back(std::move(new_filter));
+  }
+}; /* RGWRestfulIO */
+
+
+/* Type conversions to work around lack of req_state type hierarchy matching
+ * (e.g.) REST backends (may be replaced w/dynamic typed req_state). 
*/
+/* Return s->cio as a RestfulClient; asserts that the dynamic type matches. */
+static inline rgw::io::RestfulClient* RESTFUL_IO(req_state* s) {
+  ceph_assert(dynamic_cast(s->cio) != nullptr);
+
+  return static_cast(s->cio);
+}
+
+/* Return s->cio as an Accounter; asserts that the cast succeeded. */
+static inline rgw::io::Accounter* ACCOUNTING_IO(req_state* s) {
+  auto ptr = dynamic_cast(s->cio);
+  ceph_assert(ptr != nullptr);
+
+  return ptr;
+}
+
+/* Return s->cio as an RGWRestfulIO; asserts that the dynamic type matches. */
+static inline RGWRestfulIO* AWS_AUTHv4_IO(const req_state* const s) {
+  ceph_assert(dynamic_cast(s->cio) != nullptr);
+
+  return static_cast(s->cio);
+}
+
+
+/* Input stream buffer that pulls request body data from an RGWRestfulIO.
+ *
+ * Buffer layout: [ putback area: putback_size | read window: window_size ].
+ * Fresh data is always received into the read window; the last bytes of the
+ * previous window are kept right in front of it so they can be put back. */
+class RGWClientIOStreamBuf : public std::streambuf {
+protected:
+  RGWRestfulIO &rio;
+  size_t const window_size;
+  size_t const putback_size;
+  std::vector buffer;
+
+public:
+  RGWClientIOStreamBuf(RGWRestfulIO &rio, size_t ws, size_t ps = 1)
+    : rio(rio),
+      window_size(ws),
+      putback_size(ps),
+      buffer(ws + ps)
+  {
+    setg(nullptr, nullptr, nullptr);
+  }
+
+  /* Refill the get area. Returns the next character without consuming it,
+   * or eof() when the client delivers no more data or recv_body() throws. */
+  std::streambuf::int_type underflow() override {
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    char * const base = buffer.data();
+    /* New data always lands behind the putback area. */
+    char * const start = base + putback_size;
+
+    /* Number of already-consumed bytes preserved for unget(): at most
+     * putback_size, and never more than the get area actually holds. On
+     * the first underflow there is no previous data, so it is zero. */
+    size_t keep = 0;
+    if (nullptr != eback()) {
+      const size_t avail = static_cast<size_t>(egptr() - eback());
+      keep = (avail < putback_size) ? avail : putback_size;
+      if (keep > 0) {
+        std::memmove(start - keep, egptr() - keep, keep);
+      }
+    }
+
+    size_t read_len = 0;
+    try {
+      /* Receive into the read window, NOT into base: writing at base would
+       * clobber the bytes just preserved and leave [start, start + read_len)
+       * holding stale data. */
+      read_len = rio.recv_body(start, window_size);
+    } catch (rgw::io::Exception&) {
+      return traits_type::eof();
+    }
+    if (0 == read_len) {
+      return traits_type::eof();
+    }
+
+    setg(start - keep, start, start + read_len);
+
+    return traits_type::to_int_type(*gptr());
+  }
+};
+
+class RGWClientIOStream : private RGWClientIOStreamBuf, public std::istream {
+/* Inheritance from RGWClientIOStreamBuf is a kind of shadow, indirect
+ * form of composition here. We cannot do that explicitly because istream
+ * ctor is being called prior to construction of any member of this class. 
*/
+
+public:
+  /* Builds the stream buffer base with ws = 1 and ps = 2. */
+  explicit RGWClientIOStream(RGWRestfulIO &s)
+    : RGWClientIOStreamBuf(s, 1, 2),
+      std::istream(static_cast(this)) {
+  }
+};
diff --git a/src/rgw/rgw_client_io_filters.h b/src/rgw/rgw_client_io_filters.h
new file mode 100644
index 000000000..55d405e1b
--- /dev/null
+++ b/src/rgw/rgw_client_io_filters.h
@@ -0,0 +1,454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include
+
+#include
+
+#include "rgw_common.h"
+#include "rgw_client_io.h"
+
+namespace rgw {
+namespace io {
+
+/* Decorator that counts the bytes reported by the decoratee. Every call is
+ * forwarded; its result is added to total_sent (or total_received for
+ * recv_body) only while accounting is enabled. The "total=" value in each
+ * log line is the counter before the current call is added. */
+template
+class AccountingFilter : public DecoratedRestfulClient,
+                         public Accounter {
+  bool enabled;
+  uint64_t total_sent;
+  uint64_t total_received;
+  CephContext *cct;
+
+public:
+  template
+  AccountingFilter(CephContext *cct, U&& decoratee)
+    : DecoratedRestfulClient(std::forward(decoratee)),
+      enabled(false),
+      total_sent(0),
+      total_received(0), cct(cct) {
+  }
+
+  size_t send_status(const int status,
+                     const char* const status_name) override {
+    const auto sent = DecoratedRestfulClient::send_status(status,
+                                                              status_name);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_status: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t send_100_continue() override {
+    const auto sent = DecoratedRestfulClient::send_100_continue();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_100_continue: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t send_header(const std::string_view& name,
+                     const std::string_view& value) override {
+    const auto sent = DecoratedRestfulClient::send_header(name, value);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_header: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t send_content_length(const uint64_t len) override {
+    const auto sent = DecoratedRestfulClient::send_content_length(len);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_content_length: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t send_chunked_transfer_encoding() override {
+    const auto sent = DecoratedRestfulClient::send_chunked_transfer_encoding();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_chunked_transfer_encoding: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t complete_header() override {
+    const auto sent = DecoratedRestfulClient::complete_header();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::complete_header: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t recv_body(char* buf, size_t max) override {
+    const auto received = DecoratedRestfulClient::recv_body(buf, max);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::recv_body: e="
+        << (enabled ? "1" : "0") << ", received=" << received << dendl;
+    if (enabled) {
+      total_received += received;
+    }
+    return received;
+  }
+
+  size_t send_body(const char* const buf,
+                   const size_t len) override {
+    const auto sent = DecoratedRestfulClient::send_body(buf, len);
+    lsubdout(cct, rgw, 30) << "AccountingFilter::send_body: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  size_t complete_request() override {
+    const auto sent = DecoratedRestfulClient::complete_request();
+    lsubdout(cct, rgw, 30) << "AccountingFilter::complete_request: e="
+        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
+        << total_sent << dendl;
+    if (enabled) {
+      total_sent += sent;
+    }
+    return sent;
+  }
+
+  uint64_t get_bytes_sent() const override {
+    return total_sent;
+  }
+
+  uint64_t get_bytes_received() const override {
+    return total_received;
+  }
+
+  /* The counters themselves are left untouched. */
+  void set_account(bool enabled) override {
+    this->enabled = enabled;
+    lsubdout(cct, rgw, 30) << "AccountingFilter::set_account: e="
+        << (enabled ? "1" : "0") << dendl;
+  }
+};
+
+
+/* Filter for in-memory buffering incoming data and calculating the content
+ * length header if it isn't present. */
+template
+class BufferingFilter : public DecoratedRestfulClient {
+  template friend class DecoratedRestfulClient;
+protected:
+  ceph::bufferlist data;
+
+  /* Set once either a content length or chunked encoding was announced. */
+  bool has_content_length;
+  /* While set, send_body() appends to `data` instead of forwarding. */
+  bool buffer_data;
+  CephContext *cct;
+
+public:
+  template
+  BufferingFilter(CephContext *cct, U&& decoratee)
+    : DecoratedRestfulClient(std::forward(decoratee)),
+      has_content_length(false),
+      buffer_data(false), cct(cct) {
+  }
+
+  size_t send_content_length(const uint64_t len) override;
+  size_t send_chunked_transfer_encoding() override;
+  size_t complete_header() override;
+  size_t send_body(const char* buf, size_t len) override;
+  size_t complete_request() override;
+};
+
+/* While buffering, the data is only stored and 0 bytes are reported. */
+template
+size_t BufferingFilter::send_body(const char* const buf,
+                                     const size_t len)
+{
+  if (buffer_data) {
+    data.append(buf, len);
+
+    lsubdout(cct, rgw, 30) << "BufferingFilter::send_body: defer count = "
+        << len << dendl;
+    return 0;
+  }
+
+  return DecoratedRestfulClient::send_body(buf, len);
+}
+
+template
+size_t BufferingFilter::send_content_length(const uint64_t len)
+{
+  has_content_length = true;
+  return DecoratedRestfulClient::send_content_length(len);
+}
+
+/* Chunked encoding also removes the need to compute a content length. */
+template
+size_t BufferingFilter::send_chunked_transfer_encoding()
+{
+  has_content_length = true;
+  return DecoratedRestfulClient::send_chunked_transfer_encoding();
+}
+
+/* Without a known length the header completion is postponed and body
+ * buffering starts; complete_request() finishes the header later. */
+template
+size_t BufferingFilter::complete_header()
+{
+  if (! has_content_length) {
+    /* We will dump everything in complete_request(). */
+    buffer_data = true;
+    lsubdout(cct, rgw, 30) << "BufferingFilter::complete_header: has_content_length="
+        << (has_content_length ? "1" : "0") << dendl;
+    return 0;
+  }
+
+  return DecoratedRestfulClient::complete_header();
+}
+
+/* Emit the deferred Content-Length and header completion if needed, then
+ * the buffered body, then complete the request on the decoratee. */
+template
+size_t BufferingFilter::complete_request()
+{
+  size_t sent = 0;
+
+  if (! has_content_length) {
+    /* It is not correct to count these bytes here,
+     * because they can only be part of the header.
+     * Therefore force count to 0.
+     */
+    sent += DecoratedRestfulClient::send_content_length(data.length());
+    sent += DecoratedRestfulClient::complete_header();
+    lsubdout(cct, rgw, 30) <<
+        "BufferingFilter::complete_request: !has_content_length: IGNORE: sent="
+        << sent << dendl;
+    sent = 0;
+  }
+
+  if (buffer_data) {
+    /* We are sending each buffer separately to avoid extra memory shuffling
+     * that would occur on data.c_str() to provide a continuous memory area. 
*/
+    for (const auto& ptr : data.buffers()) {
+      sent += DecoratedRestfulClient::send_body(ptr.c_str(),
+                                                   ptr.length());
+    }
+    data.clear();
+    buffer_data = false;
+    lsubdout(cct, rgw, 30) << "BufferingFilter::complete_request: buffer_data: sent="
+        << sent << dendl;
+  }
+
+  return sent + DecoratedRestfulClient::complete_request();
+}
+
+/* Wrap @t in a BufferingFilter. */
+template static inline
+BufferingFilter add_buffering(
+CephContext *cct,
+T&& t) {
+  return BufferingFilter(cct, std::forward(t));
+}
+
+
+/* Filter implementing the chunked transfer coding: once
+ * send_chunked_transfer_encoding() was called, every body part is framed as
+ * "<hex size>\r\n<data>\r\n" and complete_request() appends the terminating
+ * zero-sized chunk. */
+template
+class ChunkingFilter : public DecoratedRestfulClient {
+  template friend class DecoratedRestfulClient;
+protected:
+  bool chunking_enabled;
+
+public:
+  template
+  explicit ChunkingFilter(U&& decoratee)
+    : DecoratedRestfulClient(std::forward(decoratee)),
+      chunking_enabled(false) {
+  }
+
+  size_t send_chunked_transfer_encoding() override {
+    chunking_enabled = true;
+    return DecoratedRestfulClient::send_header("Transfer-Encoding",
+                                                   "chunked");
+  }
+
+  size_t send_body(const char* buf,
+                   const size_t len) override {
+    if (! chunking_enabled) {
+      return DecoratedRestfulClient::send_body(buf, len);
+    } else if (0 == len) {
+      /* A zero-sized chunk is the "last-chunk" marker that terminates the
+       * whole chunked body. It must be produced only by complete_request(),
+       * so an empty body part is dropped here instead of being framed. */
+      return 0;
+    } else {
+      static constexpr char HEADER_END[] = "\r\n";
+      /* https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.6.1 */
+      // TODO: we have no support for sending chunked-encoding
+      // extensions/trailing headers.
+      char chunk_size[32];
+      const auto chunk_size_len = snprintf(chunk_size, sizeof(chunk_size),
+                                           "%zx\r\n", len);
+      size_t sent = 0;
+
+      sent += DecoratedRestfulClient::send_body(chunk_size, chunk_size_len);
+      sent += DecoratedRestfulClient::send_body(buf, len);
+      sent += DecoratedRestfulClient::send_body(HEADER_END,
+                                                    sizeof(HEADER_END) - 1);
+      return sent;
+    }
+  }
+
+  size_t complete_request() override {
+    size_t sent = 0;
+
+    if (chunking_enabled) {
+      /* last-chunk followed by the empty trailer */
+      static constexpr char CHUNKED_RESP_END[] = "0\r\n\r\n";
+      sent += DecoratedRestfulClient::send_body(CHUNKED_RESP_END,
+                                                    sizeof(CHUNKED_RESP_END) - 1);
+    }
+
+    return sent + DecoratedRestfulClient::complete_request();
+  }
+};
+
+/* Wrap @t in a ChunkingFilter. */
+template static inline
+ChunkingFilter add_chunking(T&& t) {
+  return ChunkingFilter(std::forward(t));
+}
+
+
+/* Class that controls and inhibits the process of sending Content-Length HTTP
+ * header where RFC 7230 requests so. The cases worth our attention are 204 No
+ * Content as well as 304 Not Modified. */
+template
+class ConLenControllingFilter : public DecoratedRestfulClient {
+protected:
+  enum class ContentLengthAction {
+    FORWARD,
+    INHIBIT,
+    UNKNOWN   /* send_status() has not been called yet */
+  } action;
+
+public:
+  template
+  explicit ConLenControllingFilter(U&& decoratee)
+    : DecoratedRestfulClient(std::forward(decoratee)),
+      action(ContentLengthAction::UNKNOWN) {
+  }
+
+  /* Decide, based on the status code and the
+   * rgw_print_prohibited_content_length option, whether a later
+   * Content-Length is forwarded or suppressed. */
+  size_t send_status(const int status,
+                     const char* const status_name) override {
+    if ((204 == status || 304 == status) &&
+        ! g_conf()->rgw_print_prohibited_content_length) {
+      action = ContentLengthAction::INHIBIT;
+    } else {
+      action = ContentLengthAction::FORWARD;
+    }
+
+    return DecoratedRestfulClient::send_status(status, status_name);
+  }
+
+  /* Forward or suppress the Content-Length header as decided by
+   * send_status(). Returns the number of bytes generated (0 when
+   * suppressed). Throws rgw::io::Exception(EINVAL) when called before
+   * send_status(): the return type is unsigned, so a negative errno cannot
+   * be returned without being mistaken for a huge byte count. */
+  size_t send_content_length(const uint64_t len) override {
+    switch(action) {
+    case ContentLengthAction::FORWARD:
+      return DecoratedRestfulClient::send_content_length(len);
+    case ContentLengthAction::INHIBIT:
+      return 0;
+    case ContentLengthAction::UNKNOWN:
+    default:
+      throw Exception(EINVAL, std::system_category());
+    }
+  }
+};
+
+/* Wrap @t in a ConLenControllingFilter. */
+template static inline
+ConLenControllingFilter add_conlen_controlling(T&& t) {
+  return ConLenControllingFilter(std::forward(t));
+}
+
+
+/* Filter that rectifies the wrong behaviour of some clients of the RGWRestfulIO
+ * interface. Should be removed after fixing those clients. */
+template
+class ReorderingFilter : public DecoratedRestfulClient {
+protected:
+  enum class ReorderState {
+    RGW_EARLY_HEADERS,  /* Got headers sent before calling send_status. */
+    RGW_STATUS_SEEN,    /* Status has been seen. */
+    RGW_DATA            /* Header has been completed. 
*/ + } phase; + + boost::optional content_length; + + std::vector> headers; + + size_t send_header(const std::string_view& name, + const std::string_view& value) override { + switch (phase) { + case ReorderState::RGW_EARLY_HEADERS: + case ReorderState::RGW_STATUS_SEEN: + headers.emplace_back(std::make_pair(std::string(name.data(), name.size()), + std::string(value.data(), value.size()))); + return 0; + case ReorderState::RGW_DATA: + return DecoratedRestfulClient::send_header(name, value); + } + + return -EIO; + } + +public: + template + explicit ReorderingFilter(U&& decoratee) + : DecoratedRestfulClient(std::forward(decoratee)), + phase(ReorderState::RGW_EARLY_HEADERS) { + } + + size_t send_status(const int status, + const char* const status_name) override { + phase = ReorderState::RGW_STATUS_SEEN; + + return DecoratedRestfulClient::send_status(status, status_name); + } + + size_t send_content_length(const uint64_t len) override { + if (ReorderState::RGW_EARLY_HEADERS == phase) { + /* Oh great, someone tries to send content length before status. */ + content_length = len; + return 0; + } else { + return DecoratedRestfulClient::send_content_length(len); + } + } + + size_t complete_header() override { + size_t sent = 0; + + /* Change state in order to immediately send everything we get. */ + phase = ReorderState::RGW_DATA; + + /* Sent content length if necessary. */ + if (content_length) { + sent += DecoratedRestfulClient::send_content_length(*content_length); + } + + /* Header data in buffers are already counted. 
*/ + for (const auto& kv : headers) { + sent += DecoratedRestfulClient::send_header(kv.first, kv.second); + } + headers.clear(); + + return sent + DecoratedRestfulClient::complete_header(); + } +}; + +template static inline +ReorderingFilter add_reordering(T&& t) { + return ReorderingFilter(std::forward(t)); +} + +} /* namespace io */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc new file mode 100644 index 000000000..f5d7912ea --- /dev/null +++ b/src/rgw/rgw_common.cc @@ -0,0 +1,3075 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include +#include + +#include "json_spirit/json_spirit.h" +#include "common/ceph_json.h" +#include "common/Formatter.h" + +#include "rgw_op.h" +#include "rgw_common.h" +#include "rgw_acl.h" +#include "rgw_string.h" +#include "rgw_http_errors.h" +#include "rgw_arn.h" +#include "rgw_data_sync.h" + +#include "global/global_init.h" +#include "common/ceph_crypto.h" +#include "common/armor.h" +#include "common/errno.h" +#include "common/Clock.h" +#include "common/convenience.h" +#include "common/strtol.h" +#include "include/str_list.h" +#include "rgw_crypt_sanitize.h" +#include "rgw_bucket_sync.h" +#include "rgw_sync_policy.h" + +#include "services/svc_zone.h" + +#include + +#define dout_context g_ceph_context + +static constexpr auto dout_subsys = ceph_subsys_rgw; + +using rgw::ARN; +using rgw::IAM::Effect; +using rgw::IAM::op_to_perm; +using rgw::IAM::Policy; + +const uint32_t RGWBucketInfo::NUM_SHARDS_BLIND_BUCKET(UINT32_MAX); + +rgw_http_errors rgw_http_s3_errors({ + { 0, {200, "" }}, + { STATUS_CREATED, {201, "Created" }}, + { STATUS_ACCEPTED, {202, "Accepted" }}, + { STATUS_NO_CONTENT, {204, "NoContent" }}, + { STATUS_PARTIAL_CONTENT, {206, "" }}, + { ERR_PERMANENT_REDIRECT, {301, "PermanentRedirect" }}, + { ERR_WEBSITE_REDIRECT, {301, "WebsiteRedirect" }}, + { STATUS_REDIRECT, {303, "" }}, + { 
ERR_NOT_MODIFIED, {304, "NotModified" }}, + { EINVAL, {400, "InvalidArgument" }}, + { ERR_INVALID_REQUEST, {400, "InvalidRequest" }}, + { ERR_INVALID_DIGEST, {400, "InvalidDigest" }}, + { ERR_BAD_DIGEST, {400, "BadDigest" }}, + { ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }}, + { ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }}, + { ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }}, + { ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }}, + { ERR_UNRESOLVABLE_EMAIL, {400, "UnresolvableGrantByEmailAddress" }}, + { ERR_INVALID_PART, {400, "InvalidPart" }}, + { ERR_INVALID_PART_ORDER, {400, "InvalidPartOrder" }}, + { ERR_REQUEST_TIMEOUT, {400, "RequestTimeout" }}, + { ERR_TOO_LARGE, {400, "EntityTooLarge" }}, + { ERR_TOO_SMALL, {400, "EntityTooSmall" }}, + { ERR_TOO_MANY_BUCKETS, {400, "TooManyBuckets" }}, + { ERR_MALFORMED_XML, {400, "MalformedXML" }}, + { ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }}, + { ERR_MALFORMED_DOC, {400, "MalformedPolicyDocument"}}, + { ERR_INVALID_TAG, {400, "InvalidTag"}}, + { ERR_MALFORMED_ACL_ERROR, {400, "MalformedACLError" }}, + { ERR_INVALID_CORS_RULES_ERROR, {400, "InvalidRequest" }}, + { ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR, {400, "InvalidRequest" }}, + { ERR_INVALID_ENCRYPTION_ALGORITHM, {400, "InvalidEncryptionAlgorithmError" }}, + { ERR_INVALID_RETENTION_PERIOD,{400, "InvalidRetentionPeriod"}}, + { ERR_LIMIT_EXCEEDED, {400, "LimitExceeded" }}, + { ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }}, + { EACCES, {403, "AccessDenied" }}, + { EPERM, {403, "AccessDenied" }}, + { ERR_SIGNATURE_NO_MATCH, {403, "SignatureDoesNotMatch" }}, + { ERR_INVALID_ACCESS_KEY, {403, "InvalidAccessKeyId" }}, + { ERR_USER_SUSPENDED, {403, "UserSuspended" }}, + { ERR_REQUEST_TIME_SKEWED, {403, "RequestTimeTooSkewed" }}, + { ERR_QUOTA_EXCEEDED, {403, "QuotaExceeded" }}, + { ERR_MFA_REQUIRED, {403, "AccessDenied" }}, + { ENOENT, {404, 
"NoSuchKey" }}, + { ERR_NO_SUCH_BUCKET, {404, "NoSuchBucket" }}, + { ERR_NO_SUCH_WEBSITE_CONFIGURATION, {404, "NoSuchWebsiteConfiguration" }}, + { ERR_NO_SUCH_UPLOAD, {404, "NoSuchUpload" }}, + { ERR_NOT_FOUND, {404, "Not Found"}}, + { ERR_NO_SUCH_LC, {404, "NoSuchLifecycleConfiguration"}}, + { ERR_NO_SUCH_BUCKET_POLICY, {404, "NoSuchBucketPolicy"}}, + { ERR_NO_SUCH_USER, {404, "NoSuchUser"}}, + { ERR_NO_ROLE_FOUND, {404, "NoSuchEntity"}}, + { ERR_NO_CORS_FOUND, {404, "NoSuchCORSConfiguration"}}, + { ERR_NO_SUCH_SUBUSER, {404, "NoSuchSubUser"}}, + { ERR_NO_SUCH_ENTITY, {404, "NoSuchEntity"}}, + { ERR_NO_SUCH_CORS_CONFIGURATION, {404, "NoSuchCORSConfiguration"}}, + { ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION, {404, "ObjectLockConfigurationNotFoundError"}}, + { ERR_METHOD_NOT_ALLOWED, {405, "MethodNotAllowed" }}, + { ETIMEDOUT, {408, "RequestTimeout" }}, + { EEXIST, {409, "BucketAlreadyExists" }}, + { ERR_USER_EXIST, {409, "UserAlreadyExists" }}, + { ERR_EMAIL_EXIST, {409, "EmailExists" }}, + { ERR_KEY_EXIST, {409, "KeyExists"}}, + { ERR_TAG_CONFLICT, {409, "OperationAborted"}}, + { ERR_POSITION_NOT_EQUAL_TO_LENGTH, {409, "PositionNotEqualToLength"}}, + { ERR_OBJECT_NOT_APPENDABLE, {409, "ObjectNotAppendable"}}, + { ERR_INVALID_BUCKET_STATE, {409, "InvalidBucketState"}}, + { ERR_INVALID_OBJECT_STATE, {403, "InvalidObjectState"}}, + { ERR_INVALID_SECRET_KEY, {400, "InvalidSecretKey"}}, + { ERR_INVALID_KEY_TYPE, {400, "InvalidKeyType"}}, + { ERR_INVALID_CAP, {400, "InvalidCapability"}}, + { ERR_INVALID_TENANT_NAME, {400, "InvalidTenantName" }}, + { ENOTEMPTY, {409, "BucketNotEmpty" }}, + { ERR_PRECONDITION_FAILED, {412, "PreconditionFailed" }}, + { ERANGE, {416, "InvalidRange" }}, + { ERR_UNPROCESSABLE_ENTITY, {422, "UnprocessableEntity" }}, + { ERR_LOCKED, {423, "Locked" }}, + { ERR_INTERNAL_ERROR, {500, "InternalError" }}, + { ERR_NOT_IMPLEMENTED, {501, "NotImplemented" }}, + { ERR_SERVICE_UNAVAILABLE, {503, "ServiceUnavailable"}}, + { ERR_RATE_LIMITED, {503, 
"SlowDown"}}, + { ERR_ZERO_IN_URL, {400, "InvalidRequest" }}, + { ERR_NO_SUCH_TAG_SET, {404, "NoSuchTagSet"}}, + { ERR_NO_SUCH_BUCKET_ENCRYPTION_CONFIGURATION, {404, "ServerSideEncryptionConfigurationNotFoundError"}}, +}); + +rgw_http_errors rgw_http_swift_errors({ + { EACCES, {403, "AccessDenied" }}, + { EPERM, {401, "AccessDenied" }}, + { ENAMETOOLONG, {400, "Metadata name too long" }}, + { ERR_USER_SUSPENDED, {401, "UserSuspended" }}, + { ERR_INVALID_UTF8, {412, "Invalid UTF8" }}, + { ERR_BAD_URL, {412, "Bad URL" }}, + { ERR_NOT_SLO_MANIFEST, {400, "Not an SLO manifest" }}, + { ERR_QUOTA_EXCEEDED, {413, "QuotaExceeded" }}, + { ENOTEMPTY, {409, "There was a conflict when trying " + "to complete your request." }}, + /* FIXME(rzarzynski): we need to find a way to apply Swift's error handling + * procedures also for ERR_ZERO_IN_URL. This make a problem as the validation + * is performed very early, even before setting the req_state::proto_flags. */ + { ERR_ZERO_IN_URL, {412, "Invalid UTF8 or contains NULL"}}, + { ERR_RATE_LIMITED, {498, "Rate Limited"}}, +}); + +rgw_http_errors rgw_http_sts_errors({ + { ERR_PACKED_POLICY_TOO_LARGE, {400, "PackedPolicyTooLarge" }}, + { ERR_INVALID_IDENTITY_TOKEN, {400, "InvalidIdentityToken" }}, +}); + +rgw_http_errors rgw_http_iam_errors({ + { EINVAL, {400, "InvalidInput" }}, + { ENOENT, {404, "NoSuchEntity"}}, + { ERR_ROLE_EXISTS, {409, "EntityAlreadyExists"}}, + { ERR_DELETE_CONFLICT, {409, "DeleteConflict"}}, + { EEXIST, {409, "EntityAlreadyExists"}}, + { ERR_INTERNAL_ERROR, {500, "ServiceFailure" }}, +}); + +using namespace std; +using namespace ceph::crypto; + +thread_local bool is_asio_thread = false; + +rgw_err:: +rgw_err() +{ + clear(); +} + +void rgw_err:: +clear() +{ + http_ret = 200; + ret = 0; + err_code.clear(); +} + +bool rgw_err:: +is_clear() const +{ + return (http_ret == 200); +} + +bool rgw_err:: +is_err() const +{ + return !(http_ret >= 200 && http_ret <= 399); +} + +// The requestURI transferred from the frontend 
can be abs_path or absoluteURI +// If it is absoluteURI, we should adjust it to abs_path for the following +// S3 authorization and some other processes depending on the requestURI +// The absoluteURI can start with "http://", "https://", "ws://" or "wss://" +static string get_abs_path(const string& request_uri) { + const static string ABS_PREFIXS[] = {"http://", "https://", "ws://", "wss://"}; + bool isAbs = false; + for (int i = 0; i < 4; ++i) { + if (boost::algorithm::starts_with(request_uri, ABS_PREFIXS[i])) { + isAbs = true; + break; + } + } + if (!isAbs) { // it is not a valid absolute uri + return request_uri; + } + size_t beg_pos = request_uri.find("://") + 3; + size_t len = request_uri.size(); + beg_pos = request_uri.find('/', beg_pos); + if (beg_pos == string::npos) return request_uri; + return request_uri.substr(beg_pos, len - beg_pos); +} + +req_info::req_info(CephContext *cct, const class RGWEnv *env) : env(env) { + method = env->get("REQUEST_METHOD", ""); + script_uri = env->get("SCRIPT_URI", cct->_conf->rgw_script_uri.c_str()); + request_uri = env->get("REQUEST_URI", cct->_conf->rgw_request_uri.c_str()); + if (request_uri[0] != '/') { + request_uri = get_abs_path(request_uri); + } + auto pos = request_uri.find('?'); + if (pos != string::npos) { + request_params = request_uri.substr(pos + 1); + request_uri = request_uri.substr(0, pos); + } else { + request_params = env->get("QUERY_STRING", ""); + } + host = env->get("HTTP_HOST", ""); + + // strip off any trailing :port from host (added by CrossFTP and maybe others) + size_t colon_offset = host.find_last_of(':'); + if (colon_offset != string::npos) { + bool all_digits = true; + for (unsigned i = colon_offset + 1; i < host.size(); ++i) { + if (!isdigit(host[i])) { + all_digits = false; + break; + } + } + if (all_digits) { + host.resize(colon_offset); + } + } +} + +void req_info::rebuild_from(req_info& src) +{ + method = src.method; + script_uri = src.script_uri; + args = src.args; + if 
(src.effective_uri.empty()) { + request_uri = src.request_uri; + } else { + request_uri = src.effective_uri; + } + effective_uri.clear(); + host = src.host; + + x_meta_map = src.x_meta_map; + x_meta_map.erase("x-amz-date"); +} + + +req_state::req_state(CephContext* _cct, const RGWProcessEnv& penv, + RGWEnv* e, uint64_t id) + : cct(_cct), penv(penv), info(_cct, e), id(id) +{ + enable_ops_log = e->get_enable_ops_log(); + enable_usage_log = e->get_enable_usage_log(); + defer_to_bucket_acls = e->get_defer_to_bucket_acls(); + + time = Clock::now(); +} + +req_state::~req_state() { + delete formatter; +} + +std::ostream& req_state::gen_prefix(std::ostream& out) const +{ + auto p = out.precision(); + return out << "req " << id << ' ' + << std::setprecision(3) << std::fixed << time_elapsed() // '0.123s' + << std::setprecision(p) << std::defaultfloat << ' '; +} + +bool search_err(rgw_http_errors& errs, int err_no, int& http_ret, string& code) +{ + auto r = errs.find(err_no); + if (r != errs.end()) { + http_ret = r->second.first; + code = r->second.second; + return true; + } + return false; +} + +void set_req_state_err(struct rgw_err& err, /* out */ + int err_no, /* in */ + const int prot_flags) /* in */ +{ + if (err_no < 0) + err_no = -err_no; + + err.ret = -err_no; + + if (prot_flags & RGW_REST_SWIFT) { + if (search_err(rgw_http_swift_errors, err_no, err.http_ret, err.err_code)) + return; + } + + if (prot_flags & RGW_REST_STS) { + if (search_err(rgw_http_sts_errors, err_no, err.http_ret, err.err_code)) + return; + } + + if (prot_flags & RGW_REST_IAM) { + if (search_err(rgw_http_iam_errors, err_no, err.http_ret, err.err_code)) + return; + } + + //Default to searching in s3 errors + if (search_err(rgw_http_s3_errors, err_no, err.http_ret, err.err_code)) + return; + dout(0) << "WARNING: set_req_state_err err_no=" << err_no + << " resorting to 500" << dendl; + + err.http_ret = 500; + err.err_code = "UnknownError"; +} + +void set_req_state_err(req_state* s, int err_no, const 
string& err_msg) +{ + if (s) { + set_req_state_err(s, err_no); + if (s->prot_flags & RGW_REST_SWIFT && !err_msg.empty()) { + /* TODO(rzarzynski): there never ever should be a check like this one. + * It's here only for the sake of the patch's backportability. Further + * commits will move the logic to a per-RGWHandler replacement of + * the end_header() function. Alternativaly, we might consider making + * that just for the dump(). Please take a look on @cbodley's comments + * in PR #10690 (https://github.com/ceph/ceph/pull/10690). */ + s->err.err_code = err_msg; + } else { + s->err.message = err_msg; + } + } +} + +void set_req_state_err(req_state* s, int err_no) +{ + if (s) { + set_req_state_err(s->err, err_no, s->prot_flags); + } +} + +void dump(req_state* s) +{ + if (s->format != RGWFormat::HTML) + s->formatter->open_object_section("Error"); + if (!s->err.err_code.empty()) + s->formatter->dump_string("Code", s->err.err_code); + s->formatter->dump_string("Message", s->err.message); + if (!s->bucket_name.empty()) // TODO: connect to expose_bucket + s->formatter->dump_string("BucketName", s->bucket_name); + if (!s->trans_id.empty()) // TODO: connect to expose_bucket or another toggle + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->dump_string("HostId", s->host_id); + if (s->format != RGWFormat::HTML) + s->formatter->close_section(); +} + +struct str_len { + const char *str; + int len; +}; + +#define STR_LEN_ENTRY(s) { s, sizeof(s) - 1 } + +struct str_len meta_prefixes[] = { STR_LEN_ENTRY("HTTP_X_AMZ_"), + STR_LEN_ENTRY("HTTP_X_GOOG_"), + STR_LEN_ENTRY("HTTP_X_DHO_"), + STR_LEN_ENTRY("HTTP_X_RGW_"), + STR_LEN_ENTRY("HTTP_X_OBJECT_"), + STR_LEN_ENTRY("HTTP_X_CONTAINER_"), + STR_LEN_ENTRY("HTTP_X_ACCOUNT_"), + {NULL, 0} }; + +void req_info::init_meta_info(const DoutPrefixProvider *dpp, bool *found_bad_meta) +{ + x_meta_map.clear(); + crypt_attribute_map.clear(); + + for (const auto& kv: env->get_map()) { + const char *prefix; + const string& 
header_name = kv.first; + const string& val = kv.second; + for (int prefix_num = 0; (prefix = meta_prefixes[prefix_num].str) != NULL; prefix_num++) { + int len = meta_prefixes[prefix_num].len; + const char *p = header_name.c_str(); + if (strncmp(p, prefix, len) == 0) { + ldpp_dout(dpp, 10) << "meta>> " << p << dendl; + const char *name = p+len; /* skip the prefix */ + int name_len = header_name.size() - len; + + if (found_bad_meta && strncmp(name, "META_", name_len) == 0) + *found_bad_meta = true; + + char name_low[meta_prefixes[0].len + name_len + 1]; + snprintf(name_low, meta_prefixes[0].len - 5 + name_len + 1, "%s%s", meta_prefixes[0].str + 5 /* skip HTTP_ */, name); // normalize meta prefix + int j; + for (j = 0; name_low[j]; j++) { + if (name_low[j] == '_') + name_low[j] = '-'; + else if (name_low[j] == '-') + name_low[j] = '_'; + else + name_low[j] = tolower(name_low[j]); + } + name_low[j] = 0; + + auto it = x_meta_map.find(name_low); + if (it != x_meta_map.end()) { + string old = it->second; + boost::algorithm::trim_right(old); + old.append(","); + old.append(val); + x_meta_map[name_low] = old; + } else { + x_meta_map[name_low] = val; + } + if (strncmp(name_low, "x-amz-server-side-encryption", 20) == 0) { + crypt_attribute_map[name_low] = val; + } + } + } + } + for (const auto& kv: x_meta_map) { + ldpp_dout(dpp, 10) << "x>> " << kv.first << ":" << rgw::crypt_sanitize::x_meta_map{kv.first, kv.second} << dendl; + } +} + +std::ostream& operator<<(std::ostream& oss, const rgw_err &err) +{ + oss << "rgw_err(http_ret=" << err.http_ret << ", err_code='" << err.err_code << "') "; + return oss; +} + +void rgw_add_amz_meta_header( + meta_map_t& x_meta_map, + const std::string& k, + const std::string& v) +{ + auto it = x_meta_map.find(k); + if (it != x_meta_map.end()) { + std::string old = it->second; + boost::algorithm::trim_right(old); + old.append(","); + old.append(v); + x_meta_map[k] = old; + } else { + x_meta_map[k] = v; + } +} + +bool rgw_set_amz_meta_header( + 
meta_map_t& x_meta_map, + const std::string& k, + const std::string& v, + rgw_set_action_if_set a) +{ + auto it { x_meta_map.find(k) }; + bool r { it != x_meta_map.end() }; + switch(a) { + default: + ceph_assert(a == 0); + case DISCARD: + break; + case APPEND: + if (r) { + std::string old { it->second }; + boost::algorithm::trim_right(old); + old.append(","); + old.append(v); + x_meta_map[k] = old; + break; + } + /* fall through */ + case OVERWRITE: + x_meta_map[k] = v; + } + return r; +} + +string rgw_string_unquote(const string& s) +{ + if (s[0] != '"' || s.size() < 2) + return s; + + int len; + for (len = s.size(); len > 2; --len) { + if (s[len - 1] != ' ') + break; + } + + if (s[len-1] != '"') + return s; + + return s.substr(1, len - 2); +} + +static bool check_str_end(const char *s) +{ + if (!s) + return false; + + while (*s) { + if (!isspace(*s)) + return false; + s++; + } + return true; +} + +static bool check_gmt_end(const char *s) +{ + if (!s || !*s) + return false; + + while (isspace(*s)) { + ++s; + } + + /* check for correct timezone */ + if ((strncmp(s, "GMT", 3) != 0) && + (strncmp(s, "UTC", 3) != 0)) { + return false; + } + + return true; +} + +static bool parse_rfc850(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + return check_gmt_end(strptime(s, "%A, %d-%b-%y %H:%M:%S ", t)); +} + +static bool parse_asctime(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + return check_str_end(strptime(s, "%a %b %d %H:%M:%S %Y", t)); +} + +static bool parse_rfc1123(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. 
+ memset(t, 0, sizeof(*t)); + return check_gmt_end(strptime(s, "%a, %d %b %Y %H:%M:%S ", t)); +} + +static bool parse_rfc1123_alt(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + return check_str_end(strptime(s, "%a, %d %b %Y %H:%M:%S %z", t)); +} + +bool parse_rfc2616(const char *s, struct tm *t) +{ + return parse_rfc850(s, t) || parse_asctime(s, t) || parse_rfc1123(s, t) || parse_rfc1123_alt(s,t); +} + +bool parse_iso8601(const char *s, struct tm *t, uint32_t *pns, bool extended_format) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + const char *p; + + if (!s) + s = ""; + + if (extended_format) { + p = strptime(s, "%Y-%m-%dT%T", t); + if (!p) { + p = strptime(s, "%Y-%m-%d %T", t); + } + } else { + p = strptime(s, "%Y%m%dT%H%M%S", t); + } + if (!p) { + dout(0) << "parse_iso8601 failed" << dendl; + return false; + } + const std::string_view str = rgw_trim_whitespace(std::string_view(p)); + int len = str.size(); + + if (len == 0 || (len == 1 && str[0] == 'Z')) + return true; + + if (str[0] != '.' 
|| + str[len - 1] != 'Z') + return false; + + uint32_t ms; + std::string_view nsstr = str.substr(1, len - 2); + int r = stringtoul(std::string(nsstr), &ms); + if (r < 0) + return false; + + if (!pns) { + return true; + } + + if (nsstr.size() > 9) { + nsstr = nsstr.substr(0, 9); + } + + uint64_t mul_table[] = { 0, + 100000000LL, + 10000000LL, + 1000000LL, + 100000LL, + 10000LL, + 1000LL, + 100LL, + 10LL, + 1 }; + + + *pns = ms * mul_table[nsstr.size()]; + + return true; +} + +int parse_key_value(string& in_str, const char *delim, string& key, string& val) +{ + if (delim == NULL) + return -EINVAL; + + auto pos = in_str.find(delim); + if (pos == string::npos) + return -EINVAL; + + key = rgw_trim_whitespace(in_str.substr(0, pos)); + val = rgw_trim_whitespace(in_str.substr(pos + 1)); + + return 0; +} + +int parse_key_value(string& in_str, string& key, string& val) +{ + return parse_key_value(in_str, "=", key,val); +} + +boost::optional> +parse_key_value(const std::string_view& in_str, + const std::string_view& delim) +{ + const size_t pos = in_str.find(delim); + if (pos == std::string_view::npos) { + return boost::none; + } + + const auto key = rgw_trim_whitespace(in_str.substr(0, pos)); + const auto val = rgw_trim_whitespace(in_str.substr(pos + 1)); + + return std::make_pair(key, val); +} + +boost::optional> +parse_key_value(const std::string_view& in_str) +{ + return parse_key_value(in_str, "="); +} + +int parse_time(const char *time_str, real_time *time) +{ + struct tm tm; + uint32_t ns = 0; + + if (!parse_rfc2616(time_str, &tm) && !parse_iso8601(time_str, &tm, &ns)) { + return -EINVAL; + } + + time_t sec = internal_timegm(&tm); + *time = utime_t(sec, ns).to_real_time(); + + return 0; +} + +#define TIME_BUF_SIZE 128 + +void rgw_to_iso8601(const real_time& t, char *dest, int buf_size) +{ + utime_t ut(t); + + char buf[TIME_BUF_SIZE]; + struct tm result; + time_t epoch = ut.sec(); + struct tm *tmp = gmtime_r(&epoch, &result); + if (tmp == NULL) + return; + + if 
(strftime(buf, sizeof(buf), "%Y-%m-%dT%T", tmp) == 0) + return; + + snprintf(dest, buf_size, "%s.%03dZ", buf, (int)(ut.usec() / 1000)); +} + +void rgw_to_iso8601(const real_time& t, string *dest) +{ + char buf[TIME_BUF_SIZE]; + rgw_to_iso8601(t, buf, sizeof(buf)); + *dest = buf; +} + + +string rgw_to_asctime(const utime_t& t) +{ + stringstream s; + t.asctime(s); + return s.str(); +} + +/* + * calculate the sha1 value of a given msg and key + */ +void calc_hmac_sha1(const char *key, int key_len, + const char *msg, int msg_len, char *dest) +/* destination should be CEPH_CRYPTO_HMACSHA1_DIGESTSIZE bytes long */ +{ + HMACSHA1 hmac((const unsigned char *)key, key_len); + hmac.Update((const unsigned char *)msg, msg_len); + hmac.Final((unsigned char *)dest); +} + +/* + * calculate the sha256 value of a given msg and key + */ +void calc_hmac_sha256(const char *key, int key_len, + const char *msg, int msg_len, char *dest) +{ + char hash_sha256[CEPH_CRYPTO_HMACSHA256_DIGESTSIZE]; + + HMACSHA256 hmac((const unsigned char *)key, key_len); + hmac.Update((const unsigned char *)msg, msg_len); + hmac.Final((unsigned char *)hash_sha256); + + memcpy(dest, hash_sha256, CEPH_CRYPTO_HMACSHA256_DIGESTSIZE); +} + +using ceph::crypto::SHA256; + +/* + * calculate the sha256 hash value of a given msg + */ +sha256_digest_t calc_hash_sha256(const std::string_view& msg) +{ + sha256_digest_t hash; + + SHA256 hasher; + hasher.Update(reinterpret_cast(msg.data()), msg.size()); + hasher.Final(hash.v); + + return hash; +} + +SHA256* calc_hash_sha256_open_stream() +{ + return new SHA256; +} + +void calc_hash_sha256_update_stream(SHA256 *hash, const char *msg, int len) +{ + hash->Update((const unsigned char *)msg, len); +} + +string calc_hash_sha256_close_stream(SHA256 **phash) +{ + SHA256 *hash = *phash; + if (!hash) { + hash = calc_hash_sha256_open_stream(); + } + char hash_sha256[CEPH_CRYPTO_HMACSHA256_DIGESTSIZE]; + + hash->Final((unsigned char *)hash_sha256); + + char 
hex_str[(CEPH_CRYPTO_SHA256_DIGESTSIZE * 2) + 1]; + buf_to_hex((unsigned char *)hash_sha256, CEPH_CRYPTO_SHA256_DIGESTSIZE, hex_str); + + delete hash; + *phash = NULL; + + return std::string(hex_str); +} + +std::string calc_hash_sha256_restart_stream(SHA256 **phash) +{ + const auto hash = calc_hash_sha256_close_stream(phash); + *phash = calc_hash_sha256_open_stream(); + + return hash; +} + +int NameVal::parse() +{ + auto delim_pos = str.find('='); + int ret = 0; + + if (delim_pos == string::npos) { + name = str; + val = ""; + ret = 1; + } else { + name = str.substr(0, delim_pos); + val = str.substr(delim_pos + 1); + } + + return ret; +} + +int RGWHTTPArgs::parse(const DoutPrefixProvider *dpp) +{ + int pos = 0; + bool end = false; + + if (str.empty()) + return 0; + + if (str[pos] == '?') + pos++; + + while (!end) { + int fpos = str.find('&', pos); + if (fpos < pos) { + end = true; + fpos = str.size(); + } + std::string nameval = url_decode(str.substr(pos, fpos - pos), true); + NameVal nv(std::move(nameval)); + int ret = nv.parse(); + if (ret >= 0) { + string& name = nv.get_name(); + if (name.find("X-Amz-") != string::npos) { + std::for_each(name.begin(), + name.end(), + [](char &c){ + if (c != '-') { + c = ::tolower(static_cast(c)); + } + }); + } + string& val = nv.get_val(); + ldpp_dout(dpp, 10) << "name: " << name << " val: " << val << dendl; + append(name, val); + } + + pos = fpos + 1; + } + + return 0; +} + +void RGWHTTPArgs::remove(const string& name) +{ + auto val_iter = val_map.find(name); + if (val_iter != std::end(val_map)) { + val_map.erase(val_iter); + } + + auto sys_val_iter = sys_val_map.find(name); + if (sys_val_iter != std::end(sys_val_map)) { + sys_val_map.erase(sys_val_iter); + } + + auto subres_iter = sub_resources.find(name); + if (subres_iter != std::end(sub_resources)) { + sub_resources.erase(subres_iter); + } +} + +void RGWHTTPArgs::append(const string& name, const string& val) +{ + if (name.compare(0, sizeof(RGW_SYS_PARAM_PREFIX) - 1, 
RGW_SYS_PARAM_PREFIX) == 0) { + sys_val_map[name] = val; + } else { + val_map[name] = val; + } + +// when sub_resources exclusive by object are added, please remember to update obj_sub_resource in RGWHTTPArgs::exist_obj_excl_sub_resource(). + if ((name.compare("acl") == 0) || + (name.compare("cors") == 0) || + (name.compare("notification") == 0) || + (name.compare("location") == 0) || + (name.compare("logging") == 0) || + (name.compare("usage") == 0) || + (name.compare("lifecycle") == 0) || + (name.compare("delete") == 0) || + (name.compare("uploads") == 0) || + (name.compare("partNumber") == 0) || + (name.compare("uploadId") == 0) || + (name.compare("versionId") == 0) || + (name.compare("start-date") == 0) || + (name.compare("end-date") == 0) || + (name.compare("versions") == 0) || + (name.compare("versioning") == 0) || + (name.compare("website") == 0) || + (name.compare("requestPayment") == 0) || + (name.compare("torrent") == 0) || + (name.compare("tagging") == 0) || + (name.compare("append") == 0) || + (name.compare("position") == 0) || + (name.compare("policyStatus") == 0) || + (name.compare("publicAccessBlock") == 0)) { + sub_resources[name] = val; + } else if (name[0] == 'r') { // root of all evil + if ((name.compare("response-content-type") == 0) || + (name.compare("response-content-language") == 0) || + (name.compare("response-expires") == 0) || + (name.compare("response-cache-control") == 0) || + (name.compare("response-content-disposition") == 0) || + (name.compare("response-content-encoding") == 0)) { + sub_resources[name] = val; + has_resp_modifier = true; + } + } else if ((name.compare("subuser") == 0) || + (name.compare("key") == 0) || + (name.compare("caps") == 0) || + (name.compare("index") == 0) || + (name.compare("policy") == 0) || + (name.compare("quota") == 0) || + (name.compare("list") == 0) || + (name.compare("object") == 0) || + (name.compare("sync") == 0)) { + if (!admin_subresource_added) { + sub_resources[name] = ""; + 
admin_subresource_added = true; + } + } +} + +const string& RGWHTTPArgs::get(const string& name, bool *exists) const +{ + auto iter = val_map.find(name); + bool e = (iter != std::end(val_map)); + if (exists) + *exists = e; + if (e) + return iter->second; + return empty_str; +} + +boost::optional +RGWHTTPArgs::get_optional(const std::string& name) const +{ + bool exists; + const std::string& value = get(name, &exists); + if (exists) { + return value; + } else { + return boost::none; + } +} + +int RGWHTTPArgs::get_bool(const string& name, bool *val, bool *exists) const +{ + map::const_iterator iter; + iter = val_map.find(name); + bool e = (iter != val_map.end()); + if (exists) + *exists = e; + + if (e) { + const char *s = iter->second.c_str(); + + if (strcasecmp(s, "false") == 0) { + *val = false; + } else if (strcasecmp(s, "true") == 0) { + *val = true; + } else { + return -EINVAL; + } + } + + return 0; +} + +int RGWHTTPArgs::get_bool(const char *name, bool *val, bool *exists) const +{ + string s(name); + return get_bool(s, val, exists); +} + +void RGWHTTPArgs::get_bool(const char *name, bool *val, bool def_val) const +{ + bool exists = false; + if ((get_bool(name, val, &exists) < 0) || + !exists) { + *val = def_val; + } +} + +int RGWHTTPArgs::get_int(const char *name, int *val, int def_val) const +{ + bool exists = false; + string val_str; + val_str = get(name, &exists); + if (!exists) { + *val = def_val; + return 0; + } + + string err; + + *val = (int)strict_strtol(val_str.c_str(), 10, &err); + if (!err.empty()) { + *val = def_val; + return -EINVAL; + } + return 0; +} + +string RGWHTTPArgs::sys_get(const string& name, bool * const exists) const +{ + const auto iter = sys_val_map.find(name); + const bool e = (iter != sys_val_map.end()); + + if (exists) { + *exists = e; + } + + return e ? 
iter->second : string(); +} + +bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env) +{ + const auto& m = env.get_map(); + // frontend connected with ssl + if (m.count("SERVER_PORT_SECURE")) { + return true; + } + // ignore proxy headers unless explicitly enabled + if (!cct->_conf->rgw_trust_forwarded_https) { + return false; + } + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded + // Forwarded: by=; for=; host=; proto= + auto i = m.find("HTTP_FORWARDED"); + if (i != m.end() && i->second.find("proto=https") != std::string::npos) { + return true; + } + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto + i = m.find("HTTP_X_FORWARDED_PROTO"); + if (i != m.end() && i->second == "https") { + return true; + } + return false; +} + + +namespace { + +struct perm_state_from_req_state : public perm_state_base { + req_state * const s; + perm_state_from_req_state(req_state * const _s) + : perm_state_base(_s->cct, + _s->env, + _s->auth.identity.get(), + _s->bucket.get() ? 
_s->bucket->get_info() : RGWBucketInfo(), + _s->perm_mask, + _s->defer_to_bucket_acls, + _s->bucket_access_conf), + s(_s) {} + + std::optional get_request_payer() const override { + const char *request_payer = s->info.env->get("HTTP_X_AMZ_REQUEST_PAYER"); + if (!request_payer) { + bool exists; + request_payer = s->info.args.get("x-amz-request-payer", &exists).c_str(); + if (!exists) { + return false; + } + } + + if (strcasecmp(request_payer, "requester") == 0) { + return true; + } + + return std::nullopt; + } + + const char *get_referer() const override { + return s->info.env->get("HTTP_REFERER"); + } +}; + +Effect eval_or_pass(const DoutPrefixProvider* dpp, + const boost::optional& policy, + const rgw::IAM::Environment& env, + boost::optional id, + const uint64_t op, + const ARN& resource, + boost::optional princ_type=boost::none) { + if (!policy) + return Effect::Pass; + else + return policy->eval(env, id, op, resource, princ_type); +} + +} + +Effect eval_identity_or_session_policies(const DoutPrefixProvider* dpp, + const vector& policies, + const rgw::IAM::Environment& env, + const uint64_t op, + const ARN& arn) { + auto policy_res = Effect::Pass, prev_res = Effect::Pass; + for (auto& policy : policies) { + if (policy_res = eval_or_pass(dpp, policy, env, boost::none, op, arn); policy_res == Effect::Deny) + return policy_res; + else if (policy_res == Effect::Allow) + prev_res = Effect::Allow; + else if (policy_res == Effect::Pass && prev_res == Effect::Allow) + policy_res = Effect::Allow; + } + return policy_res; +} + +bool verify_user_permission(const DoutPrefixProvider* dpp, + perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + const vector& user_policies, + const vector& session_policies, + const rgw::ARN& res, + const uint64_t op, + bool mandatory_policy) +{ + auto identity_policy_res = eval_identity_or_session_policies(dpp, user_policies, s->env, op, res); + if (identity_policy_res == Effect::Deny) { + return false; + } + + if (! 
session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(dpp, session_policies, s->env, op, res); + if (session_policy_res == Effect::Deny) { + return false; + } + //Intersection of identity policies and session policies + if (identity_policy_res == Effect::Allow && session_policy_res == Effect::Allow) { + return true; + } + return false; + } + + if (identity_policy_res == Effect::Allow) { + return true; + } + + if (mandatory_policy) { + // no policies, and policy is mandatory + ldpp_dout(dpp, 20) << "no policies for a policy mandatory op " << op << dendl; + return false; + } + + auto perm = op_to_perm(op); + + return verify_user_permission_no_policy(dpp, s, user_acl, perm); +} + +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + const int perm) +{ + if (s->identity->get_identity_type() == TYPE_ROLE) + return false; + + /* S3 doesn't support account ACLs. */ + if (!user_acl) + return true; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + return user_acl->verify_permission(dpp, *s->identity, perm, perm); +} + +bool verify_user_permission(const DoutPrefixProvider* dpp, + req_state * const s, + const rgw::ARN& res, + const uint64_t op, + bool mandatory_policy) +{ + perm_state_from_req_state ps(s); + return verify_user_permission(dpp, &ps, s->user_acl.get(), s->iam_user_policies, s->session_policies, res, op, mandatory_policy); +} + +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + req_state * const s, + const int perm) +{ + perm_state_from_req_state ps(s); + return verify_user_permission_no_policy(dpp, &ps, s->user_acl.get(), perm); +} + +bool verify_requester_payer_permission(struct perm_state_base *s) +{ + if (!s->bucket_info.requester_pays) + return true; + + if (s->identity->is_owner_of(s->bucket_info.owner)) + return true; + + if (s->identity->is_anonymous()) { + return false; + } + + auto 
request_payer = s->get_request_payer(); + if (request_payer) { + return *request_payer; + } + + return false; +} + +bool verify_bucket_permission(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const vector& identity_policies, + const vector& session_policies, + const uint64_t op) +{ + if (!verify_requester_payer_permission(s)) + return false; + + auto identity_policy_res = eval_identity_or_session_policies(dpp, identity_policies, s->env, op, ARN(bucket)); + if (identity_policy_res == Effect::Deny) + return false; + + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + if (bucket_policy) { + ldpp_dout(dpp, 16) << __func__ << ": policy: " << bucket_policy.get() + << "resource: " << ARN(bucket) << dendl; + } + auto r = eval_or_pass(dpp, bucket_policy, s->env, *s->identity, + op, ARN(bucket), princ_type); + if (r == Effect::Deny) + return false; + + //Take into account session policies, if the identity making a request is a role + if (!session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(dpp, session_policies, s->env, op, ARN(bucket)); + if (session_policy_res == Effect::Deny) { + return false; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && r == Effect::Allow)) + return true; + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow) + return true; + } else if (princ_type == 
rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) + return true; + } + return false; + } + + if (r == Effect::Allow || identity_policy_res == Effect::Allow) + // It looks like S3 ACLs only GRANT permissions rather than + // denying them, so this should be safe. + return true; + + const auto perm = op_to_perm(op); + + return verify_bucket_permission_no_policy(dpp, s, user_acl, bucket_acl, perm); +} + +bool verify_bucket_permission(const DoutPrefixProvider* dpp, + req_state * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const vector& user_policies, + const vector& session_policies, + const uint64_t op) +{ + perm_state_from_req_state ps(s); + return verify_bucket_permission(dpp, &ps, bucket, + user_acl, bucket_acl, + bucket_policy, user_policies, + session_policies, op); +} + +bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const int perm) +{ + if (!bucket_acl) + return false; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + if (bucket_acl->verify_permission(dpp, *s->identity, perm, perm, + s->get_referer(), + s->bucket_access_conf && + s->bucket_access_conf->ignore_public_acls())) + return true; + + if (!user_acl) + return false; + + return user_acl->verify_permission(dpp, *s->identity, perm, perm); +} + +bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const int perm) +{ + perm_state_from_req_state ps(s); + return verify_bucket_permission_no_policy(dpp, + &ps, + user_acl, + bucket_acl, + perm); +} + +bool verify_bucket_permission_no_policy(const 
DoutPrefixProvider* dpp, req_state * const s, const int perm) +{ + perm_state_from_req_state ps(s); + + if (!verify_requester_payer_permission(&ps)) + return false; + + return verify_bucket_permission_no_policy(dpp, + &ps, + s->user_acl.get(), + s->bucket_acl.get(), + perm); +} + +bool verify_bucket_permission(const DoutPrefixProvider* dpp, req_state * const s, const uint64_t op) +{ + if (rgw::sal::Bucket::empty(s->bucket)) { + // request is missing a bucket name + return false; + } + + perm_state_from_req_state ps(s); + + return verify_bucket_permission(dpp, + &ps, + s->bucket->get_key(), + s->user_acl.get(), + s->bucket_acl.get(), + s->iam_policy, + s->iam_user_policies, + s->session_policies, + op); +} + +// Authorize anyone permitted by the bucket policy, identity policies, session policies and the bucket owner +// unless explicitly denied by the policy. + +int verify_bucket_owner_or_policy(req_state* const s, + const uint64_t op) +{ + auto identity_policy_res = eval_identity_or_session_policies(s, s->iam_user_policies, s->env, op, ARN(s->bucket->get_key())); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + auto e = eval_or_pass(s, s->iam_policy, + s->env, *s->auth.identity, + op, ARN(s->bucket->get_key()), princ_type); + if (e == Effect::Deny) { + return -EACCES; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(s, s->session_policies, s->env, op, + ARN(s->bucket->get_key())); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) + return 0; + } else if (princ_type == 
rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) + return 0; + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) + return 0; + } + return -EACCES; + } + + if (e == Effect::Allow || + identity_policy_res == Effect::Allow || + (e == Effect::Pass && + identity_policy_res == Effect::Pass && + s->auth.identity->is_owner_of(s->bucket_owner.get_id()))) { + return 0; + } else { + return -EACCES; + } +} + + +static inline bool check_deferred_bucket_perms(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const vector& identity_policies, + const vector& session_policies, + const uint8_t deferred_check, + const uint64_t op) +{ + return (s->defer_to_bucket_acls == deferred_check \ + && verify_bucket_permission(dpp, s, bucket, user_acl, bucket_acl, bucket_policy, identity_policies, session_policies,op)); +} + +static inline bool check_deferred_bucket_only_acl(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const uint8_t deferred_check, + const int perm) +{ + return (s->defer_to_bucket_acls == deferred_check \ + && verify_bucket_permission_no_policy(dpp, s, user_acl, bucket_acl, perm)); +} + +bool verify_object_permission(const DoutPrefixProvider* dpp, struct perm_state_base * const s, + const rgw_obj& obj, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const boost::optional& bucket_policy, + const vector& 
identity_policies, + const vector& session_policies, + const uint64_t op) +{ + if (!verify_requester_payer_permission(s)) + return false; + + auto identity_policy_res = eval_identity_or_session_policies(dpp, identity_policies, s->env, op, ARN(obj)); + if (identity_policy_res == Effect::Deny) + return false; + + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + auto r = eval_or_pass(dpp, bucket_policy, s->env, *s->identity, op, ARN(obj), princ_type); + if (r == Effect::Deny) + return false; + + if (!session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(dpp, session_policies, s->env, op, ARN(obj)); + if (session_policy_res == Effect::Deny) { + return false; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && r == Effect::Allow)) + return true; + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow) + return true; + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) + return true; + } + return false; + } + + if (r == Effect::Allow || identity_policy_res == Effect::Allow) + // It looks like S3 ACLs only GRANT permissions rather than + // denying them, so this should be safe. 
+ return true; + + const auto perm = op_to_perm(op); + + if (check_deferred_bucket_perms(dpp, s, obj.bucket, user_acl, bucket_acl, bucket_policy, + identity_policies, session_policies, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, op) || + check_deferred_bucket_perms(dpp, s, obj.bucket, user_acl, bucket_acl, bucket_policy, + identity_policies, session_policies, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, rgw::IAM::s3All)) { + return true; + } + + if (!object_acl) { + return false; + } + + bool ret = object_acl->verify_permission(dpp, *s->identity, s->perm_mask, perm, + nullptr, /* http_referrer */ + s->bucket_access_conf && + s->bucket_access_conf->ignore_public_acls()); + if (ret) { + return true; + } + + if (!s->cct->_conf->rgw_enforce_swift_acls) + return ret; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + int swift_perm = 0; + if (perm & (RGW_PERM_READ | RGW_PERM_READ_ACP)) + swift_perm |= RGW_PERM_READ_OBJS; + if (perm & RGW_PERM_WRITE) + swift_perm |= RGW_PERM_WRITE_OBJS; + + if (!swift_perm) + return false; + + /* we already verified the user mask above, so we pass swift_perm as the mask here, + otherwise the mask might not cover the swift permissions bits */ + if (bucket_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm, + s->get_referer())) + return true; + + if (!user_acl) + return false; + + return user_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm); +} + +bool verify_object_permission(const DoutPrefixProvider* dpp, req_state * const s, + const rgw_obj& obj, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const boost::optional& bucket_policy, + const vector& identity_policies, + const vector& session_policies, + const uint64_t op) +{ + perm_state_from_req_state ps(s); + return verify_object_permission(dpp, &ps, obj, + user_acl, bucket_acl, + object_acl, bucket_policy, + identity_policies, session_policies, op); +} + +bool 
verify_object_permission_no_policy(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const int perm) +{ + if (check_deferred_bucket_only_acl(dpp, s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) || + check_deferred_bucket_only_acl(dpp, s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) { + return true; + } + + if (!object_acl) { + return false; + } + + bool ret = object_acl->verify_permission(dpp, *s->identity, s->perm_mask, perm, + nullptr, /* http referrer */ + s->bucket_access_conf && + s->bucket_access_conf->ignore_public_acls()); + if (ret) { + return true; + } + + if (!s->cct->_conf->rgw_enforce_swift_acls) + return ret; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + int swift_perm = 0; + if (perm & (RGW_PERM_READ | RGW_PERM_READ_ACP)) + swift_perm |= RGW_PERM_READ_OBJS; + if (perm & RGW_PERM_WRITE) + swift_perm |= RGW_PERM_WRITE_OBJS; + + if (!swift_perm) + return false; + + /* we already verified the user mask above, so we pass swift_perm as the mask here, + otherwise the mask might not cover the swift permissions bits */ + if (bucket_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm, + s->get_referer())) + return true; + + if (!user_acl) + return false; + + return user_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm); +} + +bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp, req_state *s, int perm) +{ + perm_state_from_req_state ps(s); + + if (!verify_requester_payer_permission(&ps)) + return false; + + return verify_object_permission_no_policy(dpp, + &ps, + s->user_acl.get(), + s->bucket_acl.get(), + s->object_acl.get(), + perm); +} + +bool verify_object_permission(const DoutPrefixProvider* dpp, req_state *s, uint64_t op) +{ + perm_state_from_req_state ps(s); + + return 
verify_object_permission(dpp, + &ps, + rgw_obj(s->bucket->get_key(), s->object->get_key()), + s->user_acl.get(), + s->bucket_acl.get(), + s->object_acl.get(), + s->iam_policy, + s->iam_user_policies, + s->session_policies, + op); +} + + +int verify_object_lock(const DoutPrefixProvider* dpp, const rgw::sal::Attrs& attrs, const bool bypass_perm, const bool bypass_governance_mode) { + auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (aiter != attrs.end()) { + RGWObjectRetention obj_retention; + try { + decode(obj_retention, aiter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + return -EIO; + } + if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) > ceph_clock_now()) { + if (obj_retention.get_mode().compare("GOVERNANCE") != 0 || !bypass_perm || !bypass_governance_mode) { + return -EACCES; + } + } + } + aiter = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (aiter != attrs.end()) { + RGWObjectLegalHold obj_legal_hold; + try { + decode(obj_legal_hold, aiter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl; + return -EIO; + } + if (obj_legal_hold.is_enabled()) { + return -EACCES; + } + } + + return 0; +} + + +class HexTable +{ + char table[256]; + +public: + HexTable() { + // FIPS zeroization audit 20191115: this memset is not security related. 
+ memset(table, -1, sizeof(table)); + int i; + for (i = '0'; i<='9'; i++) + table[i] = i - '0'; + for (i = 'A'; i<='F'; i++) + table[i] = i - 'A' + 0xa; + for (i = 'a'; i<='f'; i++) + table[i] = i - 'a' + 0xa; + } + + char to_num(char c) { + return table[(int)c]; + } +}; + +static char hex_to_num(char c) +{ + static HexTable hex_table; + return hex_table.to_num(c); +} + +std::string url_decode(const std::string_view& src_str, bool in_query) +{ + std::string dest_str; + dest_str.reserve(src_str.length() + 1); + + for (auto src = std::begin(src_str); src != std::end(src_str); ++src) { + if (*src != '%') { + if (!in_query || *src != '+') { + if (*src == '?') { + in_query = true; + } + dest_str.push_back(*src); + } else { + dest_str.push_back(' '); + } + } else { + /* 3 == strlen("%%XX") */ + if (std::distance(src, std::end(src_str)) < 3) { + break; + } + + src++; + const char c1 = hex_to_num(*src++); + const char c2 = hex_to_num(*src); + if (c1 < 0 || c2 < 0) { + return std::string(); + } else { + dest_str.push_back(c1 << 4 | c2); + } + } + } + + return dest_str; +} + +void rgw_uri_escape_char(char c, string& dst) +{ + char buf[16]; + snprintf(buf, sizeof(buf), "%%%.2X", (int)(unsigned char)c); + dst.append(buf); +} + +static bool char_needs_url_encoding(char c) +{ + if (c <= 0x20 || c >= 0x7f) + return true; + + switch (c) { + case 0x22: + case 0x23: + case 0x25: + case 0x26: + case 0x2B: + case 0x2C: + case 0x2F: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3E: + case 0x3D: + case 0x3F: + case 0x40: + case 0x5B: + case 0x5D: + case 0x5C: + case 0x5E: + case 0x60: + case 0x7B: + case 0x7D: + return true; + } + return false; +} + +void url_encode(const string& src, string& dst, bool encode_slash) +{ + const char *p = src.c_str(); + for (unsigned i = 0; i < src.size(); i++, p++) { + if ((!encode_slash && *p == 0x2F) || !char_needs_url_encoding(*p)) { + dst.append(p, 1); + }else { + rgw_uri_escape_char(*p, dst); + } + } +} + +std::string url_encode(const std::string& 
src, bool encode_slash) +{ + std::string dst; + url_encode(src, dst, encode_slash); + + return dst; +} + +std::string url_remove_prefix(const std::string& url) +{ + std::string dst = url; + auto pos = dst.find("http://"); + if (pos == std::string::npos) { + pos = dst.find("https://"); + if (pos != std::string::npos) { + dst.erase(pos, 8); + } else { + pos = dst.find("www."); + if (pos != std::string::npos) { + dst.erase(pos, 4); + } + } + } else { + dst.erase(pos, 7); + } + + return dst; +} + +string rgw_trim_whitespace(const string& src) +{ + if (src.empty()) { + return string(); + } + + int start = 0; + for (; start != (int)src.size(); start++) { + if (!isspace(src[start])) + break; + } + + int end = src.size() - 1; + if (end < start) { + return string(); + } + + for (; end > start; end--) { + if (!isspace(src[end])) + break; + } + + return src.substr(start, end - start + 1); +} + +std::string_view rgw_trim_whitespace(const std::string_view& src) +{ + std::string_view res = src; + + while (res.size() > 0 && std::isspace(res.front())) { + res.remove_prefix(1); + } + while (res.size() > 0 && std::isspace(res.back())) { + res.remove_suffix(1); + } + return res; +} + +string rgw_trim_quotes(const string& val) +{ + string s = rgw_trim_whitespace(val); + if (s.size() < 2) + return s; + + int start = 0; + int end = s.size() - 1; + int quotes_count = 0; + + if (s[start] == '"') { + start++; + quotes_count++; + } + if (s[end] == '"') { + end--; + quotes_count++; + } + if (quotes_count == 2) { + return s.substr(start, end - start + 1); + } + return s; +} + +static struct rgw_name_to_flag cap_names[] = { {"*", RGW_CAP_ALL}, + {"read", RGW_CAP_READ}, + {"write", RGW_CAP_WRITE}, + {NULL, 0} }; + +static int rgw_parse_list_of_flags(struct rgw_name_to_flag *mapping, + const string& str, uint32_t *perm) +{ + list strs; + get_str_list(str, strs); + list::iterator iter; + uint32_t v = 0; + for (iter = strs.begin(); iter != strs.end(); ++iter) { + string& s = *iter; + for (int i = 
0; mapping[i].type_name; i++) { + if (s.compare(mapping[i].type_name) == 0) + v |= mapping[i].flag; + } + } + + *perm = v; + return 0; +} + +int RGWUserCaps::parse_cap_perm(const string& str, uint32_t *perm) +{ + return rgw_parse_list_of_flags(cap_names, str, perm); +} + +int RGWUserCaps::get_cap(const string& cap, string& type, uint32_t *pperm) +{ + int pos = cap.find('='); + if (pos >= 0) { + type = rgw_trim_whitespace(cap.substr(0, pos)); + } + + if (!is_valid_cap_type(type)) + return -ERR_INVALID_CAP; + + string cap_perm; + uint32_t perm = 0; + if (pos < (int)cap.size() - 1) { + cap_perm = cap.substr(pos + 1); + int r = RGWUserCaps::parse_cap_perm(cap_perm, &perm); + if (r < 0) + return r; + } + + *pperm = perm; + + return 0; +} + +int RGWUserCaps::add_cap(const string& cap) +{ + uint32_t perm; + string type; + + int r = get_cap(cap, type, &perm); + if (r < 0) + return r; + + caps[type] |= perm; + + return 0; +} + +int RGWUserCaps::remove_cap(const string& cap) +{ + uint32_t perm; + string type; + + int r = get_cap(cap, type, &perm); + if (r < 0) + return r; + + map::iterator iter = caps.find(type); + if (iter == caps.end()) + return 0; + + uint32_t& old_perm = iter->second; + old_perm &= ~perm; + if (!old_perm) + caps.erase(iter); + + return 0; +} + +int RGWUserCaps::add_from_string(const string& str) +{ + int start = 0; + do { + auto end = str.find(';', start); + if (end == string::npos) + end = str.size(); + + int r = add_cap(str.substr(start, end - start)); + if (r < 0) + return r; + + start = end + 1; + } while (start < (int)str.size()); + + return 0; +} + +int RGWUserCaps::remove_from_string(const string& str) +{ + int start = 0; + do { + auto end = str.find(';', start); + if (end == string::npos) + end = str.size(); + + int r = remove_cap(str.substr(start, end - start)); + if (r < 0) + return r; + + start = end + 1; + } while (start < (int)str.size()); + + return 0; +} + +void RGWUserCaps::dump(Formatter *f) const +{ + dump(f, "caps"); +} + +void 
RGWUserCaps::dump(Formatter *f, const char *name) const +{ + f->open_array_section(name); + map::const_iterator iter; + for (iter = caps.begin(); iter != caps.end(); ++iter) + { + f->open_object_section("cap"); + f->dump_string("type", iter->first); + uint32_t perm = iter->second; + string perm_str; + for (int i=0; cap_names[i].type_name; i++) { + if ((perm & cap_names[i].flag) == cap_names[i].flag) { + if (perm_str.size()) + perm_str.append(", "); + + perm_str.append(cap_names[i].type_name); + perm &= ~cap_names[i].flag; + } + } + if (perm_str.empty()) + perm_str = ""; + + f->dump_string("perm", perm_str); + f->close_section(); + } + + f->close_section(); +} + +struct RGWUserCap { + string type; + uint32_t perm; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("type", type, obj); + string perm_str; + JSONDecoder::decode_json("perm", perm_str, obj); + if (RGWUserCaps::parse_cap_perm(perm_str, &perm) < 0) { + throw JSONDecoder::err("failed to parse permissions"); + } + } +}; + +void RGWUserCaps::decode_json(JSONObj *obj) +{ + list caps_list; + decode_json_obj(caps_list, obj); + + list::iterator iter; + for (iter = caps_list.begin(); iter != caps_list.end(); ++iter) { + RGWUserCap& cap = *iter; + caps[cap.type] = cap.perm; + } +} + +int RGWUserCaps::check_cap(const string& cap, uint32_t perm) const +{ + auto iter = caps.find(cap); + + if ((iter == caps.end()) || + (iter->second & perm) != perm) { + return -EPERM; + } + + return 0; +} + +bool RGWUserCaps::is_valid_cap_type(const string& tp) +{ + static const char *cap_type[] = { "user", + "users", + "buckets", + "metadata", + "info", + "usage", + "zone", + "bilog", + "mdlog", + "datalog", + "roles", + "user-policy", + "amz-cache", + "oidc-provider", + "ratelimit"}; + + for (unsigned int i = 0; i < sizeof(cap_type) / sizeof(char *); ++i) { + if (tp.compare(cap_type[i]) == 0) { + return true; + } + } + + return false; +} + +void rgw_pool::from_str(const string& s) +{ + size_t pos = rgw_unescape_str(s, 0, 
'\\', ':', &name); + if (pos != string::npos) { + pos = rgw_unescape_str(s, pos, '\\', ':', &ns); + /* ignore return; if pos != string::npos it means that we had a colon + * in the middle of ns that wasn't escaped, we're going to stop there + */ + } +} + +string rgw_pool::to_str() const +{ + string esc_name; + rgw_escape_str(name, '\\', ':', &esc_name); + if (ns.empty()) { + return esc_name; + } + string esc_ns; + rgw_escape_str(ns, '\\', ':', &esc_ns); + return esc_name + ":" + esc_ns; +} + +void rgw_raw_obj::decode_from_rgw_obj(bufferlist::const_iterator& bl) +{ + using ceph::decode; + rgw_obj old_obj; + decode(old_obj, bl); + + get_obj_bucket_and_oid_loc(old_obj, oid, loc); + pool = old_obj.get_explicit_data_pool(); +} + +static struct rgw_name_to_flag op_type_mapping[] = { {"*", RGW_OP_TYPE_ALL}, + {"read", RGW_OP_TYPE_READ}, + {"write", RGW_OP_TYPE_WRITE}, + {"delete", RGW_OP_TYPE_DELETE}, + {NULL, 0} }; + + +int rgw_parse_op_type_list(const string& str, uint32_t *perm) +{ + return rgw_parse_list_of_flags(op_type_mapping, str, perm); +} + +bool match_policy(std::string_view pattern, std::string_view input, + uint32_t flag) +{ + const uint32_t flag2 = flag & (MATCH_POLICY_ACTION|MATCH_POLICY_ARN) ? + MATCH_CASE_INSENSITIVE : 0; + const bool colonblocks = !(flag & (MATCH_POLICY_RESOURCE | + MATCH_POLICY_STRING)); + + const auto npos = std::string_view::npos; + std::string_view::size_type last_pos_input = 0, last_pos_pattern = 0; + while (true) { + auto cur_pos_input = colonblocks ? input.find(":", last_pos_input) : npos; + auto cur_pos_pattern = + colonblocks ? 
pattern.find(":", last_pos_pattern) : npos; + + auto substr_input = input.substr(last_pos_input, cur_pos_input); + auto substr_pattern = pattern.substr(last_pos_pattern, cur_pos_pattern); + + if (!match_wildcards(substr_pattern, substr_input, flag2)) + return false; + + if (cur_pos_pattern == npos) + return cur_pos_input == npos; + if (cur_pos_input == npos) + return false; + + last_pos_pattern = cur_pos_pattern + 1; + last_pos_input = cur_pos_input + 1; + } +} + +/* + * make attrs look-like-this + * converts underscores to dashes + */ +string lowercase_dash_http_attr(const string& orig) +{ + const char *s = orig.c_str(); + char buf[orig.size() + 1]; + buf[orig.size()] = '\0'; + + for (size_t i = 0; i < orig.size(); ++i, ++s) { + switch (*s) { + case '_': + buf[i] = '-'; + break; + default: + buf[i] = tolower(*s); + } + } + return string(buf); +} + +/* + * make attrs Look-Like-This + * converts underscores to dashes + */ +string camelcase_dash_http_attr(const string& orig) +{ + const char *s = orig.c_str(); + char buf[orig.size() + 1]; + buf[orig.size()] = '\0'; + + bool last_sep = true; + + for (size_t i = 0; i < orig.size(); ++i, ++s) { + switch (*s) { + case '_': + case '-': + buf[i] = '-'; + last_sep = true; + break; + default: + if (last_sep) { + buf[i] = toupper(*s); + } else { + buf[i] = tolower(*s); + } + last_sep = false; + } + } + return string(buf); +} + +RGWBucketInfo::RGWBucketInfo() +{ +} + +RGWBucketInfo::~RGWBucketInfo() +{ +} + +void RGWBucketInfo::encode(bufferlist& bl) const { + ENCODE_START(23, 4, bl); + encode(bucket, bl); + encode(owner.id, bl); + encode(flags, bl); + encode(zonegroup, bl); + uint64_t ct = real_clock::to_time_t(creation_time); + encode(ct, bl); + encode(placement_rule, bl); + encode(has_instance_obj, bl); + encode(quota, bl); + encode(requester_pays, bl); + encode(owner.tenant, bl); + encode(has_website, bl); + if (has_website) { + encode(website_conf, bl); + } + encode(swift_versioning, bl); + if (swift_versioning) { + 
encode(swift_ver_location, bl); + } + encode(creation_time, bl); + encode(mdsearch_config, bl); + encode(reshard_status, bl); + encode(new_bucket_instance_id, bl); + if (obj_lock_enabled()) { + encode(obj_lock, bl); + } + bool has_sync_policy = !empty_sync_policy(); + encode(has_sync_policy, bl); + if (has_sync_policy) { + encode(*sync_policy, bl); + } + encode(layout, bl); + encode(owner.ns, bl); + ENCODE_FINISH(bl); +} + +void RGWBucketInfo::decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(23, 4, 4, bl); + decode(bucket, bl); + if (struct_v >= 2) { + string s; + decode(s, bl); + owner.from_str(s); + } + if (struct_v >= 3) + decode(flags, bl); + if (struct_v >= 5) + decode(zonegroup, bl); + if (struct_v >= 6) { + uint64_t ct; + decode(ct, bl); + if (struct_v < 17) + creation_time = ceph::real_clock::from_time_t((time_t)ct); + } + if (struct_v >= 7) + decode(placement_rule, bl); + if (struct_v >= 8) + decode(has_instance_obj, bl); + if (struct_v >= 9) + decode(quota, bl); + static constexpr uint8_t new_layout_v = 22; + if (struct_v >= 10 && struct_v < new_layout_v) + decode(layout.current_index.layout.normal.num_shards, bl); + if (struct_v >= 11 && struct_v < new_layout_v) + decode(layout.current_index.layout.normal.hash_type, bl); + if (struct_v >= 12) + decode(requester_pays, bl); + if (struct_v >= 13) + decode(owner.tenant, bl); + if (struct_v >= 14) { + decode(has_website, bl); + if (has_website) { + decode(website_conf, bl); + } else { + website_conf = RGWBucketWebsiteConf(); + } + } + if (struct_v >= 15 && struct_v < new_layout_v) { + uint32_t it; + decode(it, bl); + layout.current_index.layout.type = (rgw::BucketIndexType)it; + } else { + layout.current_index.layout.type = rgw::BucketIndexType::Normal; + } + swift_versioning = false; + swift_ver_location.clear(); + if (struct_v >= 16) { + decode(swift_versioning, bl); + if (swift_versioning) { + decode(swift_ver_location, bl); + } + } + if (struct_v >= 17) { + 
decode(creation_time, bl); + } + if (struct_v >= 18) { + decode(mdsearch_config, bl); + } + if (struct_v >= 19) { + decode(reshard_status, bl); + decode(new_bucket_instance_id, bl); + } + if (struct_v >= 20 && obj_lock_enabled()) { + decode(obj_lock, bl); + } + if (struct_v >= 21) { + decode(sync_policy, bl); + } + if (struct_v >= 22) { + decode(layout, bl); + } + if (struct_v >= 23) { + decode(owner.ns, bl); + } + + if (layout.logs.empty() && + layout.current_index.layout.type == rgw::BucketIndexType::Normal) { + layout.logs.push_back(rgw::log_layout_from_index(0, layout.current_index)); + } + DECODE_FINISH(bl); +} + +void RGWBucketInfo::set_sync_policy(rgw_sync_policy_info&& policy) +{ + sync_policy = std::move(policy); +} + +bool RGWBucketInfo::empty_sync_policy() const +{ + if (!sync_policy) { + return true; + } + + return sync_policy->empty(); +} + +struct rgw_pool; +struct rgw_placement_rule; +class RGWUserCaps; + +void decode_json_obj(rgw_pool& pool, JSONObj *obj) +{ + string s; + decode_json_obj(s, obj); + pool = rgw_pool(s); +} + +void encode_json(const char *name, const rgw_placement_rule& r, Formatter *f) +{ + encode_json(name, r.to_str(), f); +} + +void encode_json(const char *name, const rgw_pool& pool, Formatter *f) +{ + f->dump_string(name, pool.to_str()); +} + +void encode_json(const char *name, const RGWUserCaps& val, Formatter *f) +{ + val.dump(f, name); +} + +void RGWBucketEnt::generate_test_instances(list& o) +{ + RGWBucketEnt *e = new RGWBucketEnt; + init_bucket(&e->bucket, "tenant", "bucket", "pool", ".index_pool", "marker", "10"); + e->size = 1024; + e->size_rounded = 4096; + e->count = 1; + o.push_back(e); + o.push_back(new RGWBucketEnt); +} + +void RGWBucketEnt::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("size", size, f); + encode_json("size_rounded", size_rounded, f); + utime_t ut(creation_time); + encode_json("mtime", ut, f); /* mtime / creation time discrepency needed for backward compatibility */ + 
encode_json("count", count, f); + encode_json("placement_rule", placement_rule.to_str(), f); +} + +void rgw_obj::generate_test_instances(list& o) +{ + rgw_bucket b; + init_bucket(&b, "tenant", "bucket", "pool", ".index_pool", "marker", "10"); + rgw_obj *obj = new rgw_obj(b, "object"); + o.push_back(obj); + o.push_back(new rgw_obj); +} + +void rgw_bucket_placement::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("placement_rule", placement_rule, f); +} + +void RGWBucketInfo::generate_test_instances(list& o) +{ + // Since things without a log will have one synthesized on decode, + // ensure the things we attempt to encode will have one added so we + // round-trip properly. + auto gen_layout = [](rgw::BucketLayout& l) { + l.current_index.gen = 0; + l.current_index.layout.normal.hash_type = rgw::BucketHashType::Mod; + l.current_index.layout.type = rgw::BucketIndexType::Normal; + l.current_index.layout.normal.num_shards = 11; + l.logs.push_back(log_layout_from_index( + l.current_index.gen, + l.current_index)); + }; + + + RGWBucketInfo *i = new RGWBucketInfo; + init_bucket(&i->bucket, "tenant", "bucket", "pool", ".index_pool", "marker", "10"); + i->owner = "owner"; + i->flags = BUCKET_SUSPENDED; + gen_layout(i->layout); + o.push_back(i); + i = new RGWBucketInfo; + gen_layout(i->layout); + o.push_back(i); +} + +void RGWBucketInfo::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + utime_t ut(creation_time); + encode_json("creation_time", ut, f); + encode_json("owner", owner.to_str(), f); + encode_json("flags", flags, f); + encode_json("zonegroup", zonegroup, f); + encode_json("placement_rule", placement_rule, f); + encode_json("has_instance_obj", has_instance_obj, f); + encode_json("quota", quota, f); + encode_json("num_shards", layout.current_index.layout.normal.num_shards, f); + encode_json("bi_shard_hash_type", (uint32_t)layout.current_index.layout.normal.hash_type, f); + encode_json("requester_pays", requester_pays, f); + 
encode_json("has_website", has_website, f); + if (has_website) { + encode_json("website_conf", website_conf, f); + } + encode_json("swift_versioning", swift_versioning, f); + encode_json("swift_ver_location", swift_ver_location, f); + encode_json("index_type", (uint32_t)layout.current_index.layout.type, f); + encode_json("mdsearch_config", mdsearch_config, f); + encode_json("reshard_status", (int)reshard_status, f); + encode_json("new_bucket_instance_id", new_bucket_instance_id, f); + if (!empty_sync_policy()) { + encode_json("sync_policy", *sync_policy, f); + } +} + +void RGWBucketInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket", bucket, obj); + utime_t ut; + JSONDecoder::decode_json("creation_time", ut, obj); + creation_time = ut.to_real_time(); + JSONDecoder::decode_json("owner", owner, obj); + JSONDecoder::decode_json("flags", flags, obj); + JSONDecoder::decode_json("zonegroup", zonegroup, obj); + /* backward compatability with region */ + if (zonegroup.empty()) { + JSONDecoder::decode_json("region", zonegroup, obj); + } + string pr; + JSONDecoder::decode_json("placement_rule", pr, obj); + placement_rule.from_str(pr); + JSONDecoder::decode_json("has_instance_obj", has_instance_obj, obj); + JSONDecoder::decode_json("quota", quota, obj); + JSONDecoder::decode_json("num_shards", layout.current_index.layout.normal.num_shards, obj); + uint32_t hash_type; + JSONDecoder::decode_json("bi_shard_hash_type", hash_type, obj); + layout.current_index.layout.normal.hash_type = static_cast(hash_type); + JSONDecoder::decode_json("requester_pays", requester_pays, obj); + JSONDecoder::decode_json("has_website", has_website, obj); + if (has_website) { + JSONDecoder::decode_json("website_conf", website_conf, obj); + } + JSONDecoder::decode_json("swift_versioning", swift_versioning, obj); + JSONDecoder::decode_json("swift_ver_location", swift_ver_location, obj); + uint32_t it; + JSONDecoder::decode_json("index_type", it, obj); + layout.current_index.layout.type 
= (rgw::BucketIndexType)it; + JSONDecoder::decode_json("mdsearch_config", mdsearch_config, obj); + int rs; + JSONDecoder::decode_json("reshard_status", rs, obj); + reshard_status = (cls_rgw_reshard_status)rs; + + rgw_sync_policy_info sp; + JSONDecoder::decode_json("sync_policy", sp, obj); + if (!sp.empty()) { + set_sync_policy(std::move(sp)); + } +} + +void RGWUserInfo::generate_test_instances(list& o) +{ + RGWUserInfo *i = new RGWUserInfo; + i->user_id = "user_id"; + i->display_name = "display_name"; + i->user_email = "user@email"; + RGWAccessKey k1, k2; + k1.id = "id1"; + k1.key = "key1"; + k2.id = "id2"; + k2.subuser = "subuser"; + RGWSubUser u; + u.name = "id2"; + u.perm_mask = 0x1; + i->access_keys[k1.id] = k1; + i->swift_keys[k2.id] = k2; + i->subusers[u.name] = u; + o.push_back(i); + + o.push_back(new RGWUserInfo); +} + +static void user_info_dump_subuser(const char *name, const RGWSubUser& subuser, Formatter *f, void *parent) +{ + RGWUserInfo *info = static_cast(parent); + subuser.dump(f, info->user_id.to_str()); +} + +static void user_info_dump_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent) +{ + RGWUserInfo *info = static_cast(parent); + key.dump(f, info->user_id.to_str(), false); +} + +static void user_info_dump_swift_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent) +{ + RGWUserInfo *info = static_cast(parent); + key.dump(f, info->user_id.to_str(), true); +} + +static void decode_access_keys(map& m, JSONObj *o) +{ + RGWAccessKey k; + k.decode_json(o); + m[k.id] = k; +} + +static void decode_swift_keys(map& m, JSONObj *o) +{ + RGWAccessKey k; + k.decode_json(o, true); + m[k.id] = k; +} + +static void decode_subusers(map& m, JSONObj *o) +{ + RGWSubUser u; + u.decode_json(o); + m[u.name] = u; +} + + +struct rgw_flags_desc { + uint32_t mask; + const char *str; +}; + +static struct rgw_flags_desc rgw_perms[] = { + { RGW_PERM_FULL_CONTROL, "full-control" }, + { RGW_PERM_READ | RGW_PERM_WRITE, "read-write" }, 
/*
 * Render the bits of `mask` as a ", "-separated list of flag names.
 *
 * mask_list: array of descriptors, each exposing `.mask` (uint32_t bit
 *            pattern) and `.str` (name); the array ends at the first entry
 *            whose `.mask` is 0. Entries are tried in array order and an
 *            entry matches only when all of its bits are set in `mask`.
 * mask:      bits to describe; bits matched by an entry are cleared before
 *            the next entry is tried.
 * buf/len:   destination buffer and its size in bytes. Output that does not
 *            fit is truncated; the buffer stays NUL-terminated.
 *
 * Bits that no descriptor matches are ignored.
 */
template <class T>
static void mask_to_str(T *mask_list, uint32_t mask, char *buf, int len)
{
  if (!buf || len <= 0) {
    return;  // nowhere to write, not even a terminator
  }

  if (!mask) {
    // NOTE(review): the literal is empty in the text this block was taken
    // from, which appears to have lost some bracketed tokens elsewhere;
    // confirm against the tree whether a placeholder string belongs here.
    snprintf(buf, len, "");
    return;
  }

  const char *sep = "";
  int pos = 0;

  while (mask) {
    const uint32_t orig_mask = mask;
    for (int i = 0; mask_list[i].mask; i++) {
      T *desc = &mask_list[i];
      if ((mask & desc->mask) != desc->mask) {
        continue;
      }

      const int room = len - pos;
      const int n = snprintf(buf + pos, room, "%s%s", sep, desc->str);
      // snprintf() reports the length it *wanted* to write. Anything
      // >= room means the output was truncated and the buffer is full;
      // advancing pos past len would turn the next size argument negative
      // (i.e. huge once converted to size_t) and overrun buf.
      if (n < 0 || n >= room) {
        return;
      }
      pos += n;

      sep = ", ";
      mask &= ~desc->mask;
      if (!mask) {
        return;
      }
    }
    if (mask == orig_mask) // no change
      break;
  }
}
op_type_to_str(uint32_t mask, char *buf, int len) +{ + return mask_to_str(op_type_flags, mask, buf, len); +} + +void RGWRateLimitInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("max_read_ops", max_read_ops, obj); + JSONDecoder::decode_json("max_write_ops", max_write_ops, obj); + JSONDecoder::decode_json("max_read_bytes", max_read_bytes, obj); + JSONDecoder::decode_json("max_write_bytes", max_write_bytes, obj); + JSONDecoder::decode_json("enabled", enabled, obj); +} + +void RGWRateLimitInfo::dump(Formatter *f) const +{ + f->dump_int("max_read_ops", max_read_ops); + f->dump_int("max_write_ops", max_write_ops); + f->dump_int("max_read_bytes", max_read_bytes); + f->dump_int("max_write_bytes", max_write_bytes); + f->dump_bool("enabled", enabled); +} + +void RGWUserInfo::dump(Formatter *f) const +{ + + encode_json("user_id", user_id.to_str(), f); + encode_json("display_name", display_name, f); + encode_json("email", user_email, f); + encode_json("suspended", (int)suspended, f); + encode_json("max_buckets", (int)max_buckets, f); + + encode_json_map("subusers", NULL, "subuser", NULL, user_info_dump_subuser,(void *)this, subusers, f); + encode_json_map("keys", NULL, "key", NULL, user_info_dump_key,(void *)this, access_keys, f); + encode_json_map("swift_keys", NULL, "key", NULL, user_info_dump_swift_key,(void *)this, swift_keys, f); + + encode_json("caps", caps, f); + + char buf[256]; + op_type_to_str(op_mask, buf, sizeof(buf)); + encode_json("op_mask", (const char *)buf, f); + + if (system) { /* no need to show it for every user */ + encode_json("system", (bool)system, f); + } + if (admin) { + encode_json("admin", (bool)admin, f); + } + encode_json("default_placement", default_placement.name, f); + encode_json("default_storage_class", default_placement.storage_class, f); + encode_json("placement_tags", placement_tags, f); + encode_json("bucket_quota", quota.bucket_quota, f); + encode_json("user_quota", quota.user_quota, f); + encode_json("temp_url_keys", 
temp_url_keys, f); + + string user_source_type; + switch ((RGWIdentityType)type) { + case TYPE_RGW: + user_source_type = "rgw"; + break; + case TYPE_KEYSTONE: + user_source_type = "keystone"; + break; + case TYPE_LDAP: + user_source_type = "ldap"; + break; + case TYPE_NONE: + user_source_type = "none"; + break; + default: + user_source_type = "none"; + break; + } + encode_json("type", user_source_type, f); + encode_json("mfa_ids", mfa_ids, f); +} + +void RGWUserInfo::decode_json(JSONObj *obj) +{ + string uid; + + JSONDecoder::decode_json("user_id", uid, obj, true); + user_id.from_str(uid); + + JSONDecoder::decode_json("display_name", display_name, obj); + JSONDecoder::decode_json("email", user_email, obj); + bool susp = false; + JSONDecoder::decode_json("suspended", susp, obj); + suspended = (__u8)susp; + JSONDecoder::decode_json("max_buckets", max_buckets, obj); + + JSONDecoder::decode_json("keys", access_keys, decode_access_keys, obj); + JSONDecoder::decode_json("swift_keys", swift_keys, decode_swift_keys, obj); + JSONDecoder::decode_json("subusers", subusers, decode_subusers, obj); + + JSONDecoder::decode_json("caps", caps, obj); + + string mask_str; + JSONDecoder::decode_json("op_mask", mask_str, obj); + rgw_parse_op_type_list(mask_str, &op_mask); + + bool sys = false; + JSONDecoder::decode_json("system", sys, obj); + system = (__u8)sys; + bool ad = false; + JSONDecoder::decode_json("admin", ad, obj); + admin = (__u8)ad; + JSONDecoder::decode_json("default_placement", default_placement.name, obj); + JSONDecoder::decode_json("default_storage_class", default_placement.storage_class, obj); + JSONDecoder::decode_json("placement_tags", placement_tags, obj); + JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj); + JSONDecoder::decode_json("user_quota", quota.user_quota, obj); + JSONDecoder::decode_json("temp_url_keys", temp_url_keys, obj); + + string user_source_type; + JSONDecoder::decode_json("type", user_source_type, obj); + if (user_source_type == 
"rgw") { + type = TYPE_RGW; + } else if (user_source_type == "keystone") { + type = TYPE_KEYSTONE; + } else if (user_source_type == "ldap") { + type = TYPE_LDAP; + } else if (user_source_type == "none") { + type = TYPE_NONE; + } + JSONDecoder::decode_json("mfa_ids", mfa_ids, obj); +} + + +void RGWSubUser::generate_test_instances(list& o) +{ + RGWSubUser *u = new RGWSubUser; + u->name = "name"; + u->perm_mask = 0xf; + o.push_back(u); + o.push_back(new RGWSubUser); +} + +void RGWSubUser::dump(Formatter *f) const +{ + encode_json("id", name, f); + char buf[256]; + perm_to_str(perm_mask, buf, sizeof(buf)); + encode_json("permissions", (const char *)buf, f); +} + +void RGWSubUser::dump(Formatter *f, const string& user) const +{ + string s = user; + s.append(":"); + s.append(name); + encode_json("id", s, f); + char buf[256]; + perm_to_str(perm_mask, buf, sizeof(buf)); + encode_json("permissions", (const char *)buf, f); +} + +uint32_t str_to_perm(const string& s) +{ + if (s.compare("read") == 0) + return RGW_PERM_READ; + else if (s.compare("write") == 0) + return RGW_PERM_WRITE; + else if (s.compare("read-write") == 0) + return RGW_PERM_READ | RGW_PERM_WRITE; + else if (s.compare("full-control") == 0) + return RGW_PERM_FULL_CONTROL; + return 0; +} + +void RGWSubUser::decode_json(JSONObj *obj) +{ + string uid; + JSONDecoder::decode_json("id", uid, obj); + int pos = uid.find(':'); + if (pos >= 0) + name = uid.substr(pos + 1); + string perm_str; + JSONDecoder::decode_json("permissions", perm_str, obj); + perm_mask = str_to_perm(perm_str); +} + +void RGWAccessKey::generate_test_instances(list& o) +{ + RGWAccessKey *k = new RGWAccessKey; + k->id = "id"; + k->key = "key"; + k->subuser = "subuser"; + o.push_back(k); + o.push_back(new RGWAccessKey); +} + +void RGWAccessKey::dump(Formatter *f) const +{ + encode_json("access_key", id, f); + encode_json("secret_key", key, f); + encode_json("subuser", subuser, f); +} + +void RGWAccessKey::dump_plain(Formatter *f) const +{ + 
encode_json("access_key", id, f); + encode_json("secret_key", key, f); +} + +void RGWAccessKey::dump(Formatter *f, const string& user, bool swift) const +{ + string u = user; + if (!subuser.empty()) { + u.append(":"); + u.append(subuser); + } + encode_json("user", u, f); + if (!swift) { + encode_json("access_key", id, f); + } + encode_json("secret_key", key, f); +} + +void RGWAccessKey::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("access_key", id, obj, true); + JSONDecoder::decode_json("secret_key", key, obj, true); + if (!JSONDecoder::decode_json("subuser", subuser, obj)) { + string user; + JSONDecoder::decode_json("user", user, obj); + int pos = user.find(':'); + if (pos >= 0) { + subuser = user.substr(pos + 1); + } + } +} + +void RGWAccessKey::decode_json(JSONObj *obj, bool swift) { + if (!swift) { + decode_json(obj); + return; + } + + if (!JSONDecoder::decode_json("subuser", subuser, obj)) { + JSONDecoder::decode_json("user", id, obj, true); + int pos = id.find(':'); + if (pos >= 0) { + subuser = id.substr(pos + 1); + } + } + JSONDecoder::decode_json("secret_key", key, obj, true); +} + +void RGWStorageStats::dump(Formatter *f) const +{ + encode_json("size", size, f); + encode_json("size_actual", size_rounded, f); + if (dump_utilized) { + encode_json("size_utilized", size_utilized, f); + } + encode_json("size_kb", rgw_rounded_kb(size), f); + encode_json("size_kb_actual", rgw_rounded_kb(size_rounded), f); + if (dump_utilized) { + encode_json("size_kb_utilized", rgw_rounded_kb(size_utilized), f); + } + encode_json("num_objects", num_objects, f); +} + +void rgw_obj_key::dump(Formatter *f) const +{ + encode_json("name", name, f); + encode_json("instance", instance, f); + encode_json("ns", ns, f); +} + +void rgw_obj_key::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("instance", instance, obj); + JSONDecoder::decode_json("ns", ns, obj); +} + +void rgw_raw_obj::dump(Formatter *f) const +{ + 
encode_json("pool", pool, f); + encode_json("oid", oid, f); + encode_json("loc", loc, f); +} + +void rgw_raw_obj::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("oid", oid, obj); + JSONDecoder::decode_json("loc", loc, obj); +} + +void rgw_obj::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("key", key, f); +} + +int rgw_bucket_parse_bucket_instance(const string& bucket_instance, string *bucket_name, string *bucket_id, int *shard_id) +{ + auto pos = bucket_instance.rfind(':'); + if (pos == string::npos) { + return -EINVAL; + } + + string first = bucket_instance.substr(0, pos); + string second = bucket_instance.substr(pos + 1); + + pos = first.find(':'); + + if (pos == string::npos) { + *shard_id = -1; + *bucket_name = first; + *bucket_id = second; + return 0; + } + + *bucket_name = first.substr(0, pos); + *bucket_id = first.substr(pos + 1); + + string err; + *shard_id = strict_strtol(second.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + + return 0; +} + +boost::intrusive_ptr +rgw_global_init(const std::map *defaults, + std::vector < const char* >& args, + uint32_t module_type, code_environment_t code_env, + int flags) +{ + // Load the config from the files, but not the mon + global_pre_init(defaults, args, module_type, code_env, flags); + + // Get the store backend + const auto& config_store = g_conf().get_val("rgw_backend_store"); + + if ((config_store == "dbstore") || + (config_store == "motr") || + (config_store == "daos")) { + // These stores don't use the mon + flags |= CINIT_FLAG_NO_MON_CONFIG; + } + + // Finish global init, indicating we already ran pre-init + return global_init(defaults, args, module_type, code_env, flags, false); +} + +void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct) +{ + write_version.ver = 1; +#define TAG_LEN 24 + + write_version.tag.clear(); + append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN); +} 
+ diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h new file mode 100644 index 000000000..648b2e087 --- /dev/null +++ b/src/rgw/rgw_common.h @@ -0,0 +1,1842 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2009 Sage Weil + * Copyright (C) 2015 Yehuda Sadeh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include "common/ceph_crypto.h" +#include "common/random_string.h" +#include "rgw_acl.h" +#include "rgw_bucket_layout.h" +#include "rgw_cors.h" +#include "rgw_basic_types.h" +#include "rgw_iam_policy.h" +#include "rgw_quota_types.h" +#include "rgw_string.h" +#include "common/async/yield_context.h" +#include "rgw_website.h" +#include "rgw_object_lock.h" +#include "rgw_tag.h" +#include "rgw_op_type.h" +#include "rgw_sync_policy.h" +#include "cls/version/cls_version_types.h" +#include "cls/user/cls_user_types.h" +#include "cls/rgw/cls_rgw_types.h" +#include "include/rados/librados.hpp" +#include "rgw_public_access.h" +#include "common/tracer.h" +#include "rgw_sal_fwd.h" + +namespace ceph { + class Formatter; +} + +namespace rgw::sal { + using Attrs = std::map; +} + +namespace rgw::lua { + class Background; +} + +struct RGWProcessEnv; + +using ceph::crypto::MD5; + +#define RGW_ATTR_PREFIX "user.rgw." 
+ +#define RGW_HTTP_RGWX_ATTR_PREFIX "RGWX_ATTR_" +#define RGW_HTTP_RGWX_ATTR_PREFIX_OUT "Rgwx-Attr-" + +#define RGW_AMZ_PREFIX "x-amz-" +#define RGW_AMZ_META_PREFIX RGW_AMZ_PREFIX "meta-" +#define RGW_AMZ_WEBSITE_REDIRECT_LOCATION RGW_AMZ_PREFIX "website-redirect-location" +#define RGW_AMZ_TAG_COUNT RGW_AMZ_PREFIX "tagging-count" + +#define RGW_SYS_PARAM_PREFIX "rgwx-" + +#define RGW_ATTR_ACL RGW_ATTR_PREFIX "acl" +#define RGW_ATTR_RATELIMIT RGW_ATTR_PREFIX "ratelimit" +#define RGW_ATTR_LC RGW_ATTR_PREFIX "lc" +#define RGW_ATTR_CORS RGW_ATTR_PREFIX "cors" +#define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag" +#define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets" +#define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX +#define RGW_ATTR_CONTENT_TYPE RGW_ATTR_PREFIX "content_type" +#define RGW_ATTR_CACHE_CONTROL RGW_ATTR_PREFIX "cache_control" +#define RGW_ATTR_CONTENT_DISP RGW_ATTR_PREFIX "content_disposition" +#define RGW_ATTR_CONTENT_ENC RGW_ATTR_PREFIX "content_encoding" +#define RGW_ATTR_CONTENT_LANG RGW_ATTR_PREFIX "content_language" +#define RGW_ATTR_EXPIRES RGW_ATTR_PREFIX "expires" +#define RGW_ATTR_DELETE_AT RGW_ATTR_PREFIX "delete_at" +#define RGW_ATTR_ID_TAG RGW_ATTR_PREFIX "idtag" +#define RGW_ATTR_TAIL_TAG RGW_ATTR_PREFIX "tail_tag" +#define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name" +#define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest" +#define RGW_ATTR_USER_MANIFEST RGW_ATTR_PREFIX "user_manifest" +#define RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION RGW_ATTR_PREFIX RGW_AMZ_WEBSITE_REDIRECT_LOCATION +#define RGW_ATTR_SLO_MANIFEST RGW_ATTR_PREFIX "slo_manifest" +/* Information whether an object is SLO or not must be exposed to + * user through custom HTTP header named X-Static-Large-Object. 
*/ +#define RGW_ATTR_SLO_UINDICATOR RGW_ATTR_META_PREFIX "static-large-object" +#define RGW_ATTR_X_ROBOTS_TAG RGW_ATTR_PREFIX "x-robots-tag" +#define RGW_ATTR_STORAGE_CLASS RGW_ATTR_PREFIX "storage_class" + +/* S3 Object Lock*/ +#define RGW_ATTR_OBJECT_LOCK RGW_ATTR_PREFIX "object-lock" +#define RGW_ATTR_OBJECT_RETENTION RGW_ATTR_PREFIX "object-retention" +#define RGW_ATTR_OBJECT_LEGAL_HOLD RGW_ATTR_PREFIX "object-legal-hold" + + +#define RGW_ATTR_PG_VER RGW_ATTR_PREFIX "pg_ver" +#define RGW_ATTR_SOURCE_ZONE RGW_ATTR_PREFIX "source_zone" +#define RGW_ATTR_TAGS RGW_ATTR_PREFIX RGW_AMZ_PREFIX "tagging" + +#define RGW_ATTR_TEMPURL_KEY1 RGW_ATTR_META_PREFIX "temp-url-key" +#define RGW_ATTR_TEMPURL_KEY2 RGW_ATTR_META_PREFIX "temp-url-key-2" + +/* Account/container quota of the Swift API. */ +#define RGW_ATTR_QUOTA_NOBJS RGW_ATTR_META_PREFIX "quota-count" +#define RGW_ATTR_QUOTA_MSIZE RGW_ATTR_META_PREFIX "quota-bytes" + +/* Static Web Site of Swift API. */ +#define RGW_ATTR_WEB_INDEX RGW_ATTR_META_PREFIX "web-index" +#define RGW_ATTR_WEB_ERROR RGW_ATTR_META_PREFIX "web-error" +#define RGW_ATTR_WEB_LISTINGS RGW_ATTR_META_PREFIX "web-listings" +#define RGW_ATTR_WEB_LIST_CSS RGW_ATTR_META_PREFIX "web-listings-css" +#define RGW_ATTR_SUBDIR_MARKER RGW_ATTR_META_PREFIX "web-directory-type" + +#define RGW_ATTR_OLH_PREFIX RGW_ATTR_PREFIX "olh." + +#define RGW_ATTR_OLH_INFO RGW_ATTR_OLH_PREFIX "info" +#define RGW_ATTR_OLH_VER RGW_ATTR_OLH_PREFIX "ver" +#define RGW_ATTR_OLH_ID_TAG RGW_ATTR_OLH_PREFIX "idtag" +#define RGW_ATTR_OLH_PENDING_PREFIX RGW_ATTR_OLH_PREFIX "pending." + +#define RGW_ATTR_COMPRESSION RGW_ATTR_PREFIX "compression" + +#define RGW_ATTR_APPEND_PART_NUM RGW_ATTR_PREFIX "append_part_num" + +/* Attrs to store cloudtier config information. These are used internally + * for the replication of cloudtiered objects but not stored as xattrs in + * the head object. 
*/ +#define RGW_ATTR_CLOUD_TIER_TYPE RGW_ATTR_PREFIX "cloud_tier_type" +#define RGW_ATTR_CLOUD_TIER_CONFIG RGW_ATTR_PREFIX "cloud_tier_config" + +#define RGW_ATTR_OBJ_REPLICATION_STATUS RGW_ATTR_PREFIX "amz-replication-status" +#define RGW_ATTR_OBJ_REPLICATION_TRACE RGW_ATTR_PREFIX "replication-trace" + +/* IAM Policy */ +#define RGW_ATTR_IAM_POLICY RGW_ATTR_PREFIX "iam-policy" +#define RGW_ATTR_USER_POLICY RGW_ATTR_PREFIX "user-policy" +#define RGW_ATTR_PUBLIC_ACCESS RGW_ATTR_PREFIX "public-access" + +/* RGW File Attributes */ +#define RGW_ATTR_UNIX_KEY1 RGW_ATTR_PREFIX "unix-key1" +#define RGW_ATTR_UNIX1 RGW_ATTR_PREFIX "unix1" + +#define RGW_ATTR_CRYPT_PREFIX RGW_ATTR_PREFIX "crypt." +#define RGW_ATTR_CRYPT_MODE RGW_ATTR_CRYPT_PREFIX "mode" +#define RGW_ATTR_CRYPT_KEYMD5 RGW_ATTR_CRYPT_PREFIX "keymd5" +#define RGW_ATTR_CRYPT_KEYID RGW_ATTR_CRYPT_PREFIX "keyid" +#define RGW_ATTR_CRYPT_KEYSEL RGW_ATTR_CRYPT_PREFIX "keysel" +#define RGW_ATTR_CRYPT_CONTEXT RGW_ATTR_CRYPT_PREFIX "context" +#define RGW_ATTR_CRYPT_DATAKEY RGW_ATTR_CRYPT_PREFIX "datakey" +#define RGW_ATTR_CRYPT_PARTS RGW_ATTR_CRYPT_PREFIX "part-lengths" + +/* SSE-S3 Encryption Attributes */ +#define RGW_ATTR_BUCKET_ENCRYPTION_PREFIX RGW_ATTR_PREFIX "sse-s3." 
enum class RGWFormat : int8_t {
  BAD_FORMAT = -1,
  PLAIN = 0,
  XML,
  JSON,
  HTML,
};

/*
 * Map an RGWFormat to the MIME type string used for it.
 *
 * Returns a pointer to a string literal (never null). Any value without a
 * dedicated case - including RGWFormat::BAD_FORMAT - yields
 * "invalid format".
 */
static inline const char* to_mime_type(const RGWFormat f)
{
  // every case returns directly, so no break statements are needed
  switch (f) {
  case RGWFormat::XML:
    return "application/xml";
  case RGWFormat::JSON:
    return "application/json";
  case RGWFormat::HTML:
    return "text/html";
  case RGWFormat::PLAIN:
    return "text/plain";
  default:
    return "invalid format";
  }
}
ERR_REQUEST_TIMEOUT 2010 +#define ERR_LENGTH_REQUIRED 2011 +#define ERR_REQUEST_TIME_SKEWED 2012 +#define ERR_BUCKET_EXISTS 2013 +#define ERR_BAD_URL 2014 +#define ERR_PRECONDITION_FAILED 2015 +#define ERR_NOT_MODIFIED 2016 +#define ERR_INVALID_UTF8 2017 +#define ERR_UNPROCESSABLE_ENTITY 2018 +#define ERR_TOO_LARGE 2019 +#define ERR_TOO_MANY_BUCKETS 2020 +#define ERR_INVALID_REQUEST 2021 +#define ERR_TOO_SMALL 2022 +#define ERR_NOT_FOUND 2023 +#define ERR_PERMANENT_REDIRECT 2024 +#define ERR_LOCKED 2025 +#define ERR_QUOTA_EXCEEDED 2026 +#define ERR_SIGNATURE_NO_MATCH 2027 +#define ERR_INVALID_ACCESS_KEY 2028 +#define ERR_MALFORMED_XML 2029 +#define ERR_USER_EXIST 2030 +#define ERR_NOT_SLO_MANIFEST 2031 +#define ERR_EMAIL_EXIST 2032 +#define ERR_KEY_EXIST 2033 +#define ERR_INVALID_SECRET_KEY 2034 +#define ERR_INVALID_KEY_TYPE 2035 +#define ERR_INVALID_CAP 2036 +#define ERR_INVALID_TENANT_NAME 2037 +#define ERR_WEBSITE_REDIRECT 2038 +#define ERR_NO_SUCH_WEBSITE_CONFIGURATION 2039 +#define ERR_AMZ_CONTENT_SHA256_MISMATCH 2040 +#define ERR_NO_SUCH_LC 2041 +#define ERR_NO_SUCH_USER 2042 +#define ERR_NO_SUCH_SUBUSER 2043 +#define ERR_MFA_REQUIRED 2044 +#define ERR_NO_SUCH_CORS_CONFIGURATION 2045 +#define ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION 2046 +#define ERR_INVALID_RETENTION_PERIOD 2047 +#define ERR_NO_SUCH_BUCKET_ENCRYPTION_CONFIGURATION 2048 +#define ERR_USER_SUSPENDED 2100 +#define ERR_INTERNAL_ERROR 2200 +#define ERR_NOT_IMPLEMENTED 2201 +#define ERR_SERVICE_UNAVAILABLE 2202 +#define ERR_ROLE_EXISTS 2203 +#define ERR_MALFORMED_DOC 2204 +#define ERR_NO_ROLE_FOUND 2205 +#define ERR_DELETE_CONFLICT 2206 +#define ERR_NO_SUCH_BUCKET_POLICY 2207 +#define ERR_INVALID_LOCATION_CONSTRAINT 2208 +#define ERR_TAG_CONFLICT 2209 +#define ERR_INVALID_TAG 2210 +#define ERR_ZERO_IN_URL 2211 +#define ERR_MALFORMED_ACL_ERROR 2212 +#define ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION 2213 +#define ERR_INVALID_ENCRYPTION_ALGORITHM 2214 +#define ERR_INVALID_CORS_RULES_ERROR 2215 
/* Helper class used for RGWHTTPArgs parsing */
class NameVal
{
  // copy of the text handed to the constructor; const, so it never changes
  // after construction
  const std::string str;
  // results exposed through get_name() / get_val(); both start out empty
  std::string name;
  std::string val;
 public:
  // Stores a copy of `nv`; nothing is parsed until parse() is called.
  explicit NameVal(const std::string& nv) : str(nv) {}

  // Defined outside this header.
  // NOTE(review): presumably splits `str` into `name` and `val` and returns
  // 0 / a negative error code - confirm against the definition.
  int parse();

  // Mutable references to the stored name and value.
  std::string& get_name() { return name; }
  std::string& get_val() { return val; }
};
bool *val, bool *exists) const; + void get_bool(const char *name, bool *val, bool def_val) const; + int get_int(const char *name, int *val, int def_val) const; + + /** Get the value for specific system argument parameter */ + std::string sys_get(const std::string& name, bool *exists = nullptr) const; + + /** see if a parameter is contained in this RGWHTTPArgs */ + bool exists(const char *name) const { + return (val_map.find(name) != std::end(val_map)); + } + /** see if a sub-resource with this name is contained in sub_resources */ bool sub_resource_exists(const char *name) const { + return (sub_resources.find(name) != std::end(sub_resources)); + } + /** true if at least one of the sub-resources listed in obj_sub_resource below is present */ bool exist_obj_excl_sub_resource() const { + const char* const obj_sub_resource[] = {"append", "torrent", "uploadId", + "partNumber", "versionId"}; + for (unsigned i = 0; i != std::size(obj_sub_resource); i++) { + if (sub_resource_exists(obj_sub_resource[i])) return true; + } + return false; + } + + std::map& get_params() { + return val_map; + } + const std::map& get_params() const { + return val_map; + } + std::map& get_sys_params() { + return sys_val_map; + } + const std::map& get_sys_params() const { + return sys_val_map; + } + const std::map& get_sub_resources() const { + return sub_resources; + } + /** number of entries in val_map, returned as unsigned */ unsigned get_num_params() const { + return val_map.size(); + } + bool has_response_modifier() const { + return has_resp_modifier; + } + void set_system() { /* make all system params visible: every sys_val_map entry is copied into val_map, replacing any entry with the same key */ + std::map::iterator iter; + for (iter = sys_val_map.begin(); iter != sys_val_map.end(); ++iter) { + val_map[iter->first] = iter->second; + } + } + /** the raw string last passed to set() */ const std::string& get_str() { + return str; + } +}; // RGWHTTPArgs + +const char *rgw_conf_get(const std::map& conf_map, const char *name, const char *def_val); +boost::optional rgw_conf_get_optional(const std::map& conf_map, const std::string& name); +int rgw_conf_get_int(const std::map& conf_map, const char *name, int def_val); +bool rgw_conf_get_bool(const std::map& conf_map, const char *name, bool def_val); + +class RGWEnv; + +class RGWConf { + friend 
class RGWEnv; + /* the constructor below defaults both log switches to 1 and defer_to_bucket_acls to 0 */ int enable_ops_log; + int enable_usage_log; + uint8_t defer_to_bucket_acls; + void init(CephContext *cct); +public: + RGWConf() + : enable_ops_log(1), + enable_usage_log(1), + defer_to_bucket_acls(0) { + } +}; + +/* Holds env_map plus an RGWConf; get_enable_ops_log(), get_enable_usage_log() and get_defer_to_bucket_acls() return the RGWConf fields */ class RGWEnv { + std::map env_map; + RGWConf conf; +public: + void init(CephContext *cct); + void init(CephContext *cct, char **envp); + void set(std::string name, std::string val); + const char *get(const char *name, const char *def_val = nullptr) const; + boost::optional + get_optional(const std::string& name) const; + int get_int(const char *name, int def_val = 0) const; + /* NOTE(review): the only getter in this class that is not const-qualified */ bool get_bool(const char *name, bool def_val = 0); + size_t get_size(const char *name, size_t def_val = 0) const; + bool exists(const char *name) const; + bool exists_prefix(const char *prefix) const; + void remove(const char *name); + const std::map& get_map() const { return env_map; } + int get_enable_ops_log() const { + return conf.enable_ops_log; + } + + int get_enable_usage_log() const { + return conf.enable_usage_log; + } + + int get_defer_to_bucket_acls() const { + return conf.defer_to_bucket_acls; + } +}; + +// return true if the connection is secure. 
this either means that the +// connection arrived via ssl, or was forwarded as https by a trusted proxy +bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env); + +/* HTTP request methods; req_state::op starts out as OP_UNKNOWN */ enum http_op { + OP_GET, + OP_PUT, + OP_DELETE, + OP_HEAD, + OP_POST, + OP_COPY, + OP_OPTIONS, + OP_UNKNOWN, +}; + +class RGWAccessControlPolicy; +class JSONObj; + +void encode_json(const char *name, const obj_version& v, Formatter *f); +void encode_json(const char *name, const RGWUserCaps& val, Formatter *f); + +void decode_json_obj(obj_version& v, JSONObj *obj); + +/* Identity kinds; RGWUserInfo::type is initialised to TYPE_NONE */ enum RGWIdentityType +{ + TYPE_NONE=0, + TYPE_RGW=1, + TYPE_KEYSTONE=2, + TYPE_LDAP=3, + TYPE_ROLE=4, + TYPE_WEB=5, +}; + +void encode_json(const char *name, const rgw_placement_rule& val, ceph::Formatter *f); +void decode_json_obj(rgw_placement_rule& v, JSONObj *obj); + +/* Streams the rule using its to_str() form */ inline std::ostream& operator<<(std::ostream& out, const rgw_placement_rule& rule) { + return out << rule.to_str(); +} + +class RateLimiter; +/* Rate limit settings: four int64_t limits (all 0 by default) and an enabled flag (false by default). encode() and decode() handle the five fields in the same order under struct version 1. */ struct RGWRateLimitInfo { + int64_t max_write_ops; + int64_t max_read_ops; + int64_t max_write_bytes; + int64_t max_read_bytes; + bool enabled = false; + RGWRateLimitInfo() + : max_write_ops(0), max_read_ops(0), max_write_bytes(0), max_read_bytes(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(max_write_ops, bl); + encode(max_read_ops, bl); + encode(max_write_bytes, bl); + encode(max_read_bytes, bl); + encode(enabled, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(max_write_ops,bl); + decode(max_read_ops, bl); + decode(max_write_bytes,bl); + decode(max_read_bytes, bl); + decode(enabled, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + + void decode_json(JSONObj *obj); + +}; +WRITE_CLASS_ENCODER(RGWRateLimitInfo) + +struct RGWUserInfo +{ + rgw_user user_id; + std::string display_name; + std::string user_email; + std::map access_keys; + std::map swift_keys; + std::map subusers; + __u8 suspended; + int32_t 
max_buckets; + uint32_t op_mask; + RGWUserCaps caps; + __u8 admin; + __u8 system; + rgw_placement_rule default_placement; + std::list placement_tags; + std::map temp_url_keys; + RGWQuota quota; + uint32_t type; + std::set mfa_ids; + + RGWUserInfo() + : suspended(0), + max_buckets(RGW_DEFAULT_MAX_BUCKETS), + op_mask(RGW_OP_TYPE_ALL), + admin(0), + system(0), + type(TYPE_NONE) { + } + + /** Look up an access key by its id; returns a pointer to the entry stored in access_keys, or nullptr when there is no such key */ RGWAccessKey* get_key(const std::string& access_key) { + if (access_keys.empty()) + return nullptr; + + auto k = access_keys.find(access_key); + if (k == access_keys.end()) + return nullptr; + else + return &(k->second); + } + + /** Serialise as struct version 22 (compat 9). The standalone access_key/secret_key and swift_name/swift_key strings written ahead of the full maps are taken from the first entry of access_keys and swift_keys, or are empty when the map is empty. The field order must stay in step with decode(). */ void encode(bufferlist& bl) const { + ENCODE_START(22, 9, bl); + encode((uint64_t)0, bl); // old auid + std::string access_key; + std::string secret_key; + if (!access_keys.empty()) { + std::map::const_iterator iter = access_keys.begin(); + const RGWAccessKey& k = iter->second; + access_key = k.id; + secret_key = k.key; + } + encode(access_key, bl); + encode(secret_key, bl); + encode(display_name, bl); + encode(user_email, bl); + std::string swift_name; + std::string swift_key; + if (!swift_keys.empty()) { + std::map::const_iterator iter = swift_keys.begin(); + const RGWAccessKey& k = iter->second; + swift_name = k.id; + swift_key = k.key; + } + encode(swift_name, bl); + encode(swift_key, bl); + encode(user_id.id, bl); + encode(access_keys, bl); + encode(subusers, bl); + encode(suspended, bl); + encode(swift_keys, bl); + encode(max_buckets, bl); + encode(caps, bl); + encode(op_mask, bl); + encode(system, bl); + encode(default_placement, bl); + encode(placement_tags, bl); + encode(quota.bucket_quota, bl); + encode(temp_url_keys, bl); + encode(quota.user_quota, bl); + encode(user_id.tenant, bl); + encode(admin, bl); + encode(type, bl); + encode(mfa_ids, bl); + { + std::string assumed_role_arn; // removed + encode(assumed_role_arn, bl); + } + encode(user_id.ns, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + 
DECODE_START_LEGACY_COMPAT_LEN_32(22, 9, 9, bl); + if (struct_v >= 2) { + uint64_t old_auid; + decode(old_auid, bl); + } + std::string access_key; + std::string secret_key; + decode(access_key, bl); + decode(secret_key, bl); + /* encodings older than v6 carry no access_keys map, so the single decoded key pair becomes its only entry */ if (struct_v < 6) { + RGWAccessKey k; + k.id = access_key; + k.key = secret_key; + access_keys[access_key] = k; + } + decode(display_name, bl); + decode(user_email, bl); + /* We populate swift_keys map later nowadays, but we have to decode. */ + std::string swift_name; + std::string swift_key; + if (struct_v >= 3) decode(swift_name, bl); + if (struct_v >= 4) decode(swift_key, bl); + /* before v5 there is no separate id field: the access key doubles as the user id */ if (struct_v >= 5) + decode(user_id.id, bl); + else + user_id.id = access_key; + if (struct_v >= 6) { + decode(access_keys, bl); + decode(subusers, bl); + } + suspended = 0; + if (struct_v >= 7) { + decode(suspended, bl); + } + if (struct_v >= 8) { + decode(swift_keys, bl); + } + if (struct_v >= 10) { + decode(max_buckets, bl); + } else { + max_buckets = RGW_DEFAULT_MAX_BUCKETS; + } + /* NOTE: fields decoded without an else branch (caps, system, default_placement, quota, admin, type, mfa_ids, ...) keep whatever value this object already held when the encoding predates them */ if (struct_v >= 11) { + decode(caps, bl); + } + if (struct_v >= 12) { + decode(op_mask, bl); + } else { + op_mask = RGW_OP_TYPE_ALL; + } + if (struct_v >= 13) { + decode(system, bl); + decode(default_placement, bl); + decode(placement_tags, bl); /* tags of allowed placement rules */ + } + if (struct_v >= 14) { + decode(quota.bucket_quota, bl); + } + if (struct_v >= 15) { + decode(temp_url_keys, bl); + } + if (struct_v >= 16) { + decode(quota.user_quota, bl); + } + if (struct_v >= 17) { + decode(user_id.tenant, bl); + } else { + user_id.tenant.clear(); + } + if (struct_v >= 18) { + decode(admin, bl); + } + if (struct_v >= 19) { + decode(type, bl); + } + if (struct_v >= 20) { + decode(mfa_ids, bl); + } + if (struct_v >= 21) { + std::string assumed_role_arn; // removed + decode(assumed_role_arn, bl); + } + if (struct_v >= 22) { + decode(user_id.ns, bl); + } else { + user_id.ns.clear(); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void 
generate_test_instances(std::list& o); + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWUserInfo) + +/// `RGWObjVersionTracker` +/// ====================== +/// +/// What and why is this? +/// --------------------- +/// +/// This is a wrapper around `cls_version` functionality. If two RGWs +/// (or two non-synchronized threads in the same RGW) are accessing +/// the same object, they may race and overwrite each other's work. +/// +/// This class solves this issue by tracking and recording an object's +/// version in the extended attributes. Operations fail with +/// ECANCELED if the version is not what we expect. +/// +/// How to Use It +/// ------------- +/// +/// When preparing a read operation, call `prepare_op_for_read`. +/// For a write, call `prepare_op_for_write` when preparing the +/// operation, and `apply_write` after it succeeds. +/// +/// Adhere to the following guidelines: +/// +/// - Each RGWObjVersionTracker should be used with only one object. +/// +/// - If you receive `ECANCELED`, throw away whatever you were doing +/// based on the content of the versioned object, re-read, and +/// restart as appropriate. +/// +/// - If one code path uses RGWObjVersionTracker, then they all +/// should. In a situation where a writer should unconditionally +/// overwrite an object, call `generate_new_write_ver` on a default +/// constructed `RGWObjVersionTracker`. +/// +/// - If we have a version from a previous read, we will check against +/// it and fail the read if it doesn't match. Thus, if we want to +/// re-read a new version of the object, call `clear()` on the +/// `RGWObjVersionTracker`. +/// +/// - This type is not thread-safe. Every thread must have its own +/// instance. +/// +struct RGWObjVersionTracker { + obj_version read_version; //< The version read from an object. If + // set (its `ver` is non-zero), this value + // is used to check the stored version. + obj_version write_version; //< Set the object to this version on + // write, if set (its `ver` is non-zero). 
+ + /// Pointer to the read version. + obj_version* version_for_read() { + return &read_version; + } + + /// If we have a write version (its `ver` is non-zero), return a + /// pointer to it. Otherwise return null. This is used in + /// `prepare_op_for_write` to treat the `write_version` as + /// effectively an `option` type. + obj_version* version_for_write() { + if (write_version.ver == 0) + return nullptr; + + return &write_version; + } + + /// If read_version is non-empty (its `ver` is non-zero), return a + /// pointer to it, otherwise null. This is used internally by + /// `prepare_op_for_read` and `prepare_op_for_write` to treat the + /// `read_version` as effectively an `option` type. + obj_version* version_for_check() { + if (read_version.ver == 0) + return nullptr; + + return &read_version; + } + + /// This function is to be called on any read operation. If we have + /// a non-empty `read_version`, assert on the OSD that the object + /// has the same version. Also reads the version into `read_version`. + /// + /// This function is defined in `rgw_rados.cc` rather than `rgw_common.cc`. + void prepare_op_for_read(librados::ObjectReadOperation* op); + + /// This function is to be called on any write operation. If we have + /// a non-empty `read_version`, assert on the OSD that the object + /// has the same version. If we have a non-empty `write_version`, + /// set the object to it. Otherwise increment the version on the OSD. + /// + /// This function is defined in `rgw_rados.cc` rather than + /// `rgw_common.cc`. + void prepare_op_for_write(librados::ObjectWriteOperation* op); + + /// This function is to be called after the completion of any write + /// operation on which `prepare_op_for_write` was called. If we did + /// not set the write version explicitly, it increments + /// `read_version`. If we did, it sets `read_version` to + /// `write_version`. In either case, it clears `write_version`. 
+ /// + /// RADOS write operations, at least those not using the relatively + /// new RETURNVEC flag, cannot return more information than an error + /// code. Thus, write operations can't simply fill in the read + /// version the way read operations can, so `prepare_op_for_write` + /// instructs the OSD to increment the object's version as stored in + /// RADOS and `apply_write` increments our `read_version` in RAM. + /// + /// This function is defined in `rgw_rados.cc` rather than + /// `rgw_common.cc`. + void apply_write(); + + /// Clear `read_version` and `write_version`, making the instance + /// identical to a default-constructed instance. + void clear() { + read_version = obj_version(); + write_version = obj_version(); + } + + /// Set `write_version` to a new, unique version. + /// + /// An `obj_version` contains an opaque, random tag and a + /// sequence. If the tags of two `obj_version`s don't match, the + /// versions are unordered and unequal. This function creates a + /// version with a new tag, ensuring that any other process + /// operating on the object will receive `ECANCELED` and will know + /// to re-read the object and restart whatever it was doing. 
+ void generate_new_write_ver(CephContext* cct); +}; + +/* Prints an obj_version as its tag, a colon, then its ver */ inline std::ostream& operator<<(std::ostream& out, const obj_version &v) +{ + out << v.tag << ":" << v.ver; + return out; +} + +/* Prints a tracker as {r=read_version,w=write_version}, using the obj_version formatter above */ inline std::ostream& operator<<(std::ostream& out, const RGWObjVersionTracker &ot) +{ + out << "{r=" << ot.read_version << ",w=" << ot.write_version << "}"; + return out; +} + +/* Single-bit flags; the RGWBucketInfo predicates (versioned(), mfa_enabled(), ...) test them against RGWBucketInfo::flags */ enum RGWBucketFlags { + BUCKET_SUSPENDED = 0x1, + BUCKET_VERSIONED = 0x2, + BUCKET_VERSIONS_SUSPENDED = 0x4, + BUCKET_DATASYNC_DISABLED = 0X8, + BUCKET_MFA_ENABLED = 0X10, + BUCKET_OBJ_LOCK_ENABLED = 0X20, +}; + +class RGWSI_Zone; + +struct RGWBucketInfo { + rgw_bucket bucket; + rgw_user owner; + uint32_t flags{0}; + std::string zonegroup; + ceph::real_time creation_time; + rgw_placement_rule placement_rule; + bool has_instance_obj{false}; + RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */ + RGWQuotaInfo quota; + + // layout of bucket index objects + rgw::BucketLayout layout; + + // Represents the shard number for blind bucket. 
+ const static uint32_t NUM_SHARDS_BLIND_BUCKET; + + bool requester_pays{false}; + + bool has_website{false}; + RGWBucketWebsiteConf website_conf; + + bool swift_versioning{false}; + std::string swift_ver_location; + + std::map mdsearch_config; + + // resharding + cls_rgw_reshard_status reshard_status{cls_rgw_reshard_status::NOT_RESHARDING}; + std::string new_bucket_instance_id; + + RGWObjectLock obj_lock; + + std::optional sync_policy; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + void decode_json(JSONObj *obj); + + /* Flag predicates. versioned() is true whenever BUCKET_VERSIONED is set, even while versioning is suspended; versioning_enabled() additionally requires BUCKET_VERSIONS_SUSPENDED to be clear. versioning_status() returns the VERSIONED, VERSIONS_SUSPENDED and MFA_ENABLED bits only. datasync_flag_enabled() is true when BUCKET_DATASYNC_DISABLED is clear. */ bool versioned() const { return (flags & BUCKET_VERSIONED) != 0; } + int versioning_status() const { return flags & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED | BUCKET_MFA_ENABLED); } + bool versioning_enabled() const { return (versioning_status() & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED)) == BUCKET_VERSIONED; } + bool mfa_enabled() const { return (versioning_status() & BUCKET_MFA_ENABLED) != 0; } + bool datasync_flag_enabled() const { return (flags & BUCKET_DATASYNC_DISABLED) == 0; } + bool obj_lock_enabled() const { return (flags & BUCKET_OBJ_LOCK_ENABLED) != 0; } + + bool has_swift_versioning() const { + /* A bucket may be versioned through one mechanism only. 
*/ + return swift_versioning && !versioned(); + } + + void set_sync_policy(rgw_sync_policy_info&& policy); + + bool empty_sync_policy() const; + + bool is_indexless() const { + return rgw::is_layout_indexless(layout.current_index); + } + const rgw::bucket_index_layout_generation& get_current_index() const { + return layout.current_index; + } + rgw::bucket_index_layout_generation& get_current_index() { + return layout.current_index; + } + + RGWBucketInfo(); + ~RGWBucketInfo(); +}; +WRITE_CLASS_ENCODER(RGWBucketInfo) + +struct RGWBucketEntryPoint +{ + rgw_bucket bucket; + rgw_user owner; + ceph::real_time creation_time; + bool linked; + + bool has_bucket_info; + RGWBucketInfo old_bucket_info; + + RGWBucketEntryPoint() : linked(false), has_bucket_info(false) {} + + /* Written as struct version 10 (compat 8): owner.id and a whole-second ctime come first, followed by the full owner and creation_time that decode() reads for struct_v >= 9 and >= 10 respectively */ void encode(bufferlist& bl) const { + ENCODE_START(10, 8, bl); + encode(bucket, bl); + encode(owner.id, bl); + encode(linked, bl); + uint64_t ctime = (uint64_t)real_clock::to_time_t(creation_time); + encode(ctime, bl); + encode(owner, bl); + encode(creation_time, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + auto orig_iter = bl; + DECODE_START_LEGACY_COMPAT_LEN_32(10, 4, 4, bl); + if (struct_v < 8) { + /* ouch, old entry, contains the bucket info itself: decode it again from the saved start position as an RGWBucketInfo and return without reaching DECODE_FINISH */ + old_bucket_info.decode(orig_iter); + has_bucket_info = true; + return; + } + has_bucket_info = false; + decode(bucket, bl); + decode(owner.id, bl); + decode(linked, bl); + uint64_t ctime; + decode(ctime, bl); + /* the whole-second ctime is only used when the full creation_time (v10) is absent */ if (struct_v < 10) { + creation_time = real_clock::from_time_t((time_t)ctime); + } + if (struct_v >= 9) { + decode(owner, bl); + } + if (struct_v >= 10) { + decode(creation_time, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(RGWBucketEntryPoint) + +struct RGWStorageStats +{ + RGWObjCategory category; + uint64_t size; + uint64_t size_rounded; + uint64_t num_objects; + uint64_t 
size_utilized{0}; //< size after compression, encryption + bool dump_utilized; // whether dump should include utilized values + + /* NOTE(review): this one-argument constructor is not explicit, so a bool converts to RGWStorageStats implicitly; confirm that is wanted. All counters start at 0 and category at RGWObjCategory::None. */ RGWStorageStats(bool _dump_utilized=true) + : category(RGWObjCategory::None), + size(0), + size_rounded(0), + num_objects(0), + dump_utilized(_dump_utilized) + {} + + void dump(Formatter *f) const; +}; // RGWStorageStats + +class RGWEnv; + +/* Namespaced forward declarations. */ +namespace rgw { + namespace auth { + namespace s3 { + class AWSBrowserUploadAbstractor; + class STSEngine; + } + class Completer; + } + namespace io { + class BasicClient; + } +} + +using meta_map_t = boost::container::flat_map ; + +/* Per-request information: a pointer to the RGWEnv, the RGWHTTPArgs, two meta_map_t maps, and the host, method and URI strings */ struct req_info { + const RGWEnv *env; + RGWHTTPArgs args; + meta_map_t x_meta_map; + meta_map_t crypt_attribute_map; + + std::string host; + const char *method; + std::string script_uri; + std::string request_uri; + std::string request_uri_aws4; + std::string effective_uri; + std::string request_params; + std::string domain; + std::string storage_class; + + req_info(CephContext *cct, const RGWEnv *env); + void rebuild_from(req_info& src); + void init_meta_info(const DoutPrefixProvider *dpp, bool *found_bad_meta); +}; + +struct req_init_state { + /* Keeps [[tenant]:]bucket until we parse the token. 
*/ + std::string url_bucket; + std::string src_bucket; +}; + +#include "rgw_auth.h" + +class RGWObjectCtx; + +/** Store all the state necessary to complete and respond to an HTTP request */ +struct req_state : DoutPrefixProvider { + CephContext *cct; + const RGWProcessEnv& penv; + rgw::io::BasicClient *cio{nullptr}; + http_op op{OP_UNKNOWN}; + RGWOpType op_type{}; + std::shared_ptr ratelimit_data; + RGWRateLimitInfo user_ratelimit; + RGWRateLimitInfo bucket_ratelimit; + std::string ratelimit_bucket_marker; + std::string ratelimit_user_name; + bool content_started{false}; + RGWFormat format{RGWFormat::PLAIN}; + ceph::Formatter *formatter{nullptr}; + std::string decoded_uri; + std::string relative_uri; + const char *length{nullptr}; + int64_t content_length{0}; + std::map generic_attrs; + rgw_err err; + bool expect_cont{false}; + uint64_t obj_size{0}; + /* NOTE(review): the next three members have no default member initializer, unlike their neighbours; presumably the constructor (defined elsewhere) sets them — confirm */ bool enable_ops_log; + bool enable_usage_log; + uint8_t defer_to_bucket_acls; + uint32_t perm_mask{0}; + + /* Set once when url_bucket is parsed and not violated thereafter. */ + std::string account_name; + + std::string bucket_tenant; + std::string bucket_name; + + /* bucket is only created in rgw_build_bucket_policies() and should never be + * overwritten */ + std::unique_ptr bucket; + std::unique_ptr object; + std::string src_tenant_name; + std::string src_bucket_name; + std::unique_ptr src_object; + ACLOwner bucket_owner; + ACLOwner owner; + + std::string zonegroup_name; + std::string zonegroup_endpoint; + std::string bucket_instance_id; + int bucket_instance_shard_id{-1}; + std::string redirect_zone_endpoint; + + std::string redirect; + + real_time bucket_mtime; + std::map bucket_attrs; + bool bucket_exists{false}; + rgw_placement_rule dest_placement; + + bool has_bad_meta{false}; + + std::unique_ptr user; + + struct { + /* TODO(rzarzynski): switch out to the static_ptr for both members. 
*/ + + /* Object having the knowledge about an authenticated identity and allowing + * to apply it during the authorization phase (verify_permission() methods + * of a given RGWOp). Thus, it binds authentication and authorization steps + * through a well-defined interface. For more details, see rgw_auth.h. */ + std::unique_ptr identity; + + std::shared_ptr completer; + + /* A container for credentials of the S3's browser upload. It's necessary + * because: 1) the ::authenticate() method of auth engines and strategies + * takes req_state only; 2) auth strategies live much longer than RGWOps - + * there is no way to pass additional data dependencies through ctors. */ + class { + /* Writer. */ + friend class RGWPostObj_ObjStore_S3; + /* Reader. */ + friend class rgw::auth::s3::AWSBrowserUploadAbstractor; + friend class rgw::auth::s3::STSEngine; + + std::string access_key; + std::string signature; + std::string x_amz_algorithm; + std::string x_amz_credential; + std::string x_amz_date; + std::string x_amz_security_token; + ceph::bufferlist encoded_policy; + } s3_postobj_creds; + } auth; + + std::unique_ptr user_acl; + std::unique_ptr bucket_acl; + std::unique_ptr object_acl; + + rgw::IAM::Environment env; + boost::optional iam_policy; + boost::optional bucket_access_conf; + std::vector iam_user_policies; + + /* Is the request made by a user marked as a system one? + * Being a system user means we also have the admin status. */ + bool system_request{false}; + + std::string canned_acl; + bool has_acl_header{false}; + bool local_source{false}; /* source is local */ + + int prot_flags{0}; + + /* Content-Disposition override for TempURL of Swift API. 
*/ + struct { + std::string override; + std::string fallback; + } content_disp; + + std::string host_id; + + req_info info; + req_init_state init_state; + + using Clock = ceph::coarse_real_clock; + Clock::time_point time; + + /* Clock::now() minus the stored `time` member */ Clock::duration time_elapsed() const { return Clock::now() - time; } + + std::string dialect; + std::string req_id; + std::string trans_id; + uint64_t id; + + RGWObjTags tagset; + + bool mfa_verified{false}; + + /// optional coroutine context + optional_yield yield{null_yield}; + + //token claims from STS token for ops log (can be used for Keystone token also) + std::vector token_claims; + + std::vector session_policies; + + jspan trace; + bool trace_enabled = false; + + //Principal tags that come in as part of AssumeRoleWithWebIdentity + std::vector> principal_tags; + + req_state(CephContext* _cct, const RGWProcessEnv& penv, RGWEnv* e, uint64_t id); + ~req_state(); + + + /* swaps rather than assigns: on return u owns whatever `user` held before the call */ void set_user(std::unique_ptr& u) { user.swap(u); } + bool is_err() const { return err.is_err(); } + + // implements DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const override; + CephContext* get_cct() const override { return cct; } + unsigned get_subsys() const override { return ceph_subsys_rgw; } +}; + +void set_req_state_err(req_state*, int); +void set_req_state_err(req_state*, int, const std::string&); +void set_req_state_err(struct rgw_err&, int, const int); +void dump(req_state*); + +/** Store basic data on bucket */ +struct RGWBucketEnt { + rgw_bucket bucket; + size_t size; + size_t size_rounded; + ceph::real_time creation_time; + uint64_t count; + + /* The placement_rule is necessary to calculate per-storage-policy statistics + * of the Swift API. Although the info is available in RGWBucketInfo, we need + * to duplicate it here to not affect the performance of buckets listing. 
*/ + rgw_placement_rule placement_rule; + + RGWBucketEnt() + : size(0), + size_rounded(0), + count(0) { + } + RGWBucketEnt(const RGWBucketEnt&) = default; + RGWBucketEnt(RGWBucketEnt&&) = default; + /* Builds an entry from a cls_user_bucket_entry: the bucket is moved out of e, the remaining fields are copied, and placement_rule is left default-constructed */ explicit RGWBucketEnt(const rgw_user& u, cls_user_bucket_entry&& e) + : bucket(u, std::move(e.bucket)), + size(e.size), + size_rounded(e.size_rounded), + creation_time(e.creation_time), + count(e.count) { + } + + RGWBucketEnt& operator=(const RGWBucketEnt&) = default; + + /* Copies bucket, size, size_rounded, creation_time and count into *b; placement_rule is not copied */ void convert(cls_user_bucket_entry *b) const { + bucket.convert(&b->bucket); + b->size = size; + b->size_rounded = size_rounded; + b->creation_time = creation_time; + b->count = count; + } + + /* Written as struct version 7 (compat 5). mt is the creation time as whole seconds stored in 32 bits; the full creation_time is encoded again further down. */ void encode(bufferlist& bl) const { + ENCODE_START(7, 5, bl); + uint64_t s = size; + __u32 mt = ceph::real_clock::to_time_t(creation_time); + std::string empty_str; // originally had the bucket name here, but we encode bucket later + encode(empty_str, bl); + encode(s, bl); + encode(mt, bl); + encode(count, bl); + encode(bucket, bl); + s = size_rounded; + encode(s, bl); + encode(creation_time, bl); + encode(placement_rule, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + __u32 mt; + uint64_t s; + std::string empty_str; // backward compatibility + decode(empty_str, bl); + decode(s, bl); + decode(mt, bl); + size = s; + if (struct_v < 6) { + creation_time = ceph::real_clock::from_time_t(mt); + } + if (struct_v >= 2) + decode(count, bl); + if (struct_v >= 3) + decode(bucket, bl); + if (struct_v >= 4) + decode(s, bl); + /* when struct_v < 4, s still holds the size decoded above, so size_rounded falls back to size */ size_rounded = s; + if (struct_v >= 6) + decode(creation_time, bl); + if (struct_v >= 7) + decode(placement_rule, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(RGWBucketEnt) + +struct rgw_cache_entry_info { + std::string cache_locator; + uint64_t gen; + + rgw_cache_entry_info() : gen(0) {} +}; + +inline std::ostream& 
operator<<(std::ostream& out, const rgw_obj &o) { + return out << o.bucket.name << ":" << o.get_oid(); +} + +struct multipart_upload_info +{ + rgw_placement_rule dest_placement; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(dest_placement, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(dest_placement, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(multipart_upload_info) + +static inline void buf_to_hex(const unsigned char* const buf, + const size_t len, + char* const str) +{ + str[0] = '\0'; + for (size_t i = 0; i < len; i++) { + ::sprintf(&str[i*2], "%02x", static_cast(buf[i])); + } +} + +template static inline std::array +buf_to_hex(const std::array& buf) +{ + static_assert(N > 0, "The input array must be at least one element long"); + + std::array hex_dest; + buf_to_hex(buf.data(), N, hex_dest.data()); + return hex_dest; +} + +static inline int hexdigit(char c) +{ + if (c >= '0' && c <= '9') + return (c - '0'); + c = toupper(c); + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xa; + return -EINVAL; +} + +static inline int hex_to_buf(const char *hex, char *buf, int len) +{ + int i = 0; + const char *p = hex; + while (*p) { + if (i >= len) + return -EINVAL; + buf[i] = 0; + int d = hexdigit(*p); + if (d < 0) + return d; + buf[i] = d << 4; + p++; + if (!*p) + return -EINVAL; + d = hexdigit(*p); + if (d < 0) + return d; + buf[i] += d; + i++; + p++; + } + return i; +} + +static inline int rgw_str_to_bool(const char *s, int def_val) +{ + if (!s) + return def_val; + + return (strcasecmp(s, "true") == 0 || + strcasecmp(s, "on") == 0 || + strcasecmp(s, "yes") == 0 || + strcasecmp(s, "1") == 0); +} + +static inline void append_rand_alpha(CephContext *cct, const std::string& src, std::string& dest, int len) +{ + dest = src; + char buf[len + 1]; + gen_rand_alphanumeric(cct, buf, len); + dest.append("_"); + dest.append(buf); +} + +static inline uint64_t rgw_rounded_kb(uint64_t 
bytes) +{ + /* bytes divided by 1024, rounded up; the addition wraps for inputs within 1023 of the uint64_t maximum */ return (bytes + 1023) / 1024; +} + +/* Rounds bytes up to the next multiple of 4096 */ static inline uint64_t rgw_rounded_objsize(uint64_t bytes) +{ + return ((bytes + 4095) & ~4095); +} + +/* Same rounding up to a multiple of 4096, with the result expressed in units of 1024 bytes */ static inline uint64_t rgw_rounded_objsize_kb(uint64_t bytes) +{ + return ((bytes + 4095) & ~4095) / 1024; +} + +/* implement combining step, S3 header canonicalization; k is a + * valid header and in lc form */ +void rgw_add_amz_meta_header( + meta_map_t& x_meta_map, + const std::string& k, + const std::string& v); + +/* Action selector passed as the last argument of rgw_set_amz_meta_header() below */ enum rgw_set_action_if_set { + DISCARD=0, OVERWRITE, APPEND +}; + +bool rgw_set_amz_meta_header( + meta_map_t& x_meta_map, + const std::string& k, + const std::string& v, rgw_set_action_if_set f); + +extern std::string rgw_string_unquote(const std::string& s); +extern void parse_csv_string(const std::string& ival, std::vector& ovals); +extern int parse_key_value(std::string& in_str, std::string& key, std::string& val); +extern int parse_key_value(std::string& in_str, const char *delim, std::string& key, std::string& val); + +extern boost::optional> +parse_key_value(const std::string_view& in_str, + const std::string_view& delim); +extern boost::optional> +parse_key_value(const std::string_view& in_str); + +struct rgw_name_to_flag { + const char *type_name; + uint32_t flag; +}; + +/** time parsing */ +extern int parse_time(const char *time_str, real_time *time); +extern bool parse_rfc2616(const char *s, struct tm *t); +extern bool parse_iso8601(const char *s, struct tm *t, uint32_t *pns = NULL, bool extended_format = true); +extern std::string rgw_trim_whitespace(const std::string& src); +extern std::string_view rgw_trim_whitespace(const std::string_view& src); +extern std::string rgw_trim_quotes(const std::string& val); + +extern void rgw_to_iso8601(const real_time& t, char *dest, int buf_size); +extern void rgw_to_iso8601(const real_time& t, std::string *dest); +extern std::string rgw_to_asctime(const utime_t& t); + +struct perm_state_base { + CephContext *cct; + const rgw::IAM::Environment& env; + 
rgw::auth::Identity *identity; + const RGWBucketInfo bucket_info; + int perm_mask; + bool defer_to_bucket_acls; + boost::optional bucket_access_conf; + + /* env is bound as a reference and identity is stored as the raw pointer passed in, so the caller keeps ownership of both; bucket_info is copied into this object */ perm_state_base(CephContext *_cct, + const rgw::IAM::Environment& _env, + rgw::auth::Identity *_identity, + const RGWBucketInfo& _bucket_info, + int _perm_mask, + bool _defer_to_bucket_acls, + boost::optional _bucket_acess_conf = boost::none) : + cct(_cct), + env(_env), + identity(_identity), + bucket_info(_bucket_info), + perm_mask(_perm_mask), + defer_to_bucket_acls(_defer_to_bucket_acls), + bucket_access_conf(_bucket_acess_conf) + {} + + virtual ~perm_state_base() {} + + virtual const char *get_referer() const = 0; + virtual std::optional get_request_payer() const = 0; /* + * empty state means that request_payer param was not passed in + */ + +}; + +/* Concrete perm_state_base that answers get_referer() and get_request_payer() from two plain members; bucket_access_conf keeps the base-class default of boost::none */ struct perm_state : public perm_state_base { + const char *referer; + bool request_payer; + + perm_state(CephContext *_cct, + const rgw::IAM::Environment& _env, + rgw::auth::Identity *_identity, + const RGWBucketInfo& _bucket_info, + int _perm_mask, + bool _defer_to_bucket_acls, + const char *_referer, + bool _request_payer) : perm_state_base(_cct, + _env, + _identity, + _bucket_info, + _perm_mask, + _defer_to_bucket_acls), + referer(_referer), + request_payer(_request_payer) {} + + const char *get_referer() const override { + return referer; + } + + /* always returns an engaged optional holding request_payer, never the empty state */ std::optional get_request_payer() const override { + return request_payer; + } +}; + +/** Check if the req_state's user has the necessary permissions + * to do the requested action */ +bool verify_bucket_permission_no_policy( + const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const int perm); + +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + const int perm); + +bool verify_object_permission_no_policy(const 
DoutPrefixProvider* dpp, + struct perm_state_base * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const int perm); + +/** Check if the req_state's user has the necessary permissions + * to do the requested action */ +rgw::IAM::Effect eval_identity_or_session_policies(const DoutPrefixProvider* dpp, + const std::vector& user_policies, + const rgw::IAM::Environment& env, + const uint64_t op, + const rgw::ARN& arn); +bool verify_user_permission(const DoutPrefixProvider* dpp, + req_state * const s, + RGWAccessControlPolicy * const user_acl, + const std::vector& user_policies, + const std::vector& session_policies, + const rgw::ARN& res, + const uint64_t op, + bool mandatory_policy=true); +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + req_state * const s, + RGWAccessControlPolicy * const user_acl, + const int perm); +bool verify_user_permission(const DoutPrefixProvider* dpp, + req_state * const s, + const rgw::ARN& res, + const uint64_t op, + bool mandatory_policy=true); +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + req_state * const s, + int perm); +bool verify_bucket_permission( + const DoutPrefixProvider* dpp, + req_state * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const std::vector& identity_policies, + const std::vector& session_policies, + const uint64_t op); +bool verify_bucket_permission(const DoutPrefixProvider* dpp, req_state * const s, const uint64_t op); +bool verify_bucket_permission_no_policy( + const DoutPrefixProvider* dpp, + req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const int perm); +bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, + req_state * const s, + const int perm); +int 
verify_bucket_owner_or_policy(req_state* const s, + const uint64_t op); +extern bool verify_object_permission( + const DoutPrefixProvider* dpp, + req_state * const s, + const rgw_obj& obj, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const boost::optional& bucket_policy, + const std::vector& identity_policies, + const std::vector& session_policies, + const uint64_t op); +extern bool verify_object_permission(const DoutPrefixProvider* dpp, req_state *s, uint64_t op); +extern bool verify_object_permission_no_policy( + const DoutPrefixProvider* dpp, + req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + int perm); +extern bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp, req_state *s, + int perm); +extern int verify_object_lock( + const DoutPrefixProvider* dpp, + const rgw::sal::Attrs& attrs, + const bool bypass_perm, + const bool bypass_governance_mode); + +/** Convert an input URL into a sane object name + * by converting %-escaped std::strings into characters, etc*/ +extern void rgw_uri_escape_char(char c, std::string& dst); +extern std::string url_decode(const std::string_view& src_str, + bool in_query = false); +extern void url_encode(const std::string& src, std::string& dst, + bool encode_slash = true); +extern std::string url_encode(const std::string& src, bool encode_slash = true); +extern std::string url_remove_prefix(const std::string& url); // Removes hhtp, https and www from url +/* destination should be CEPH_CRYPTO_HMACSHA1_DIGESTSIZE bytes long */ +extern void calc_hmac_sha1(const char *key, int key_len, + const char *msg, int msg_len, char *dest); + +static inline sha1_digest_t +calc_hmac_sha1(const std::string_view& key, const std::string_view& msg) { + sha1_digest_t dest; + calc_hmac_sha1(key.data(), key.size(), msg.data(), msg.size(), + 
reinterpret_cast(dest.v)); + return dest; +} + +/* destination should be CEPH_CRYPTO_HMACSHA256_DIGESTSIZE bytes long */ +extern void calc_hmac_sha256(const char *key, int key_len, + const char *msg, int msg_len, + char *dest); + +static inline sha256_digest_t +calc_hmac_sha256(const char *key, const int key_len, + const char *msg, const int msg_len) { + sha256_digest_t dest; + calc_hmac_sha256(key, key_len, msg, msg_len, + reinterpret_cast(dest.v)); + return dest; +} + +static inline sha256_digest_t +calc_hmac_sha256(const std::string_view& key, const std::string_view& msg) { + sha256_digest_t dest; + calc_hmac_sha256(key.data(), key.size(), + msg.data(), msg.size(), + reinterpret_cast(dest.v)); + return dest; +} + +static inline sha256_digest_t +calc_hmac_sha256(const sha256_digest_t &key, + const std::string_view& msg) { + sha256_digest_t dest; + calc_hmac_sha256(reinterpret_cast(key.v), sha256_digest_t::SIZE, + msg.data(), msg.size(), + reinterpret_cast(dest.v)); + return dest; +} + +static inline sha256_digest_t +calc_hmac_sha256(const std::vector& key, + const std::string_view& msg) { + sha256_digest_t dest; + calc_hmac_sha256(reinterpret_cast(key.data()), key.size(), + msg.data(), msg.size(), + reinterpret_cast(dest.v)); + return dest; +} + +template +static inline sha256_digest_t +calc_hmac_sha256(const std::array& key, + const std::string_view& msg) { + sha256_digest_t dest; + calc_hmac_sha256(reinterpret_cast(key.data()), key.size(), + msg.data(), msg.size(), + reinterpret_cast(dest.v)); + return dest; +} + +extern sha256_digest_t calc_hash_sha256(const std::string_view& msg); + +extern ceph::crypto::SHA256* calc_hash_sha256_open_stream(); +extern void calc_hash_sha256_update_stream(ceph::crypto::SHA256* hash, + const char* msg, + int len); +extern std::string calc_hash_sha256_close_stream(ceph::crypto::SHA256** phash); +extern std::string calc_hash_sha256_restart_stream(ceph::crypto::SHA256** phash); + +extern int rgw_parse_op_type_list(const 
/**
 * Escape a string so that special_char can later be used as a delimiter.
 *
 * Every occurrence of esc_char or special_char in s is prefixed with
 * esc_char; all other characters are copied unchanged.
 *
 * The result is built in a std::string instead of a variable-length stack
 * array: VLAs are not standard C++ and a large input would overflow the
 * stack. As a consequence embedded NUL bytes are preserved rather than
 * truncating the output.
 *
 * @param s             input string
 * @param esc_char      escape character
 * @param special_char  delimiter character that must be escaped
 * @param dest          receives the escaped string (may alias s)
 */
static inline void rgw_escape_str(const std::string& s, char esc_char,
                                  char special_char, std::string *dest)
{
  std::string escaped;
  escaped.reserve(s.size() * 2);  // worst case: every character gets a prefix

  for (const char c : s) {
    if (c == esc_char || c == special_char) {
      escaped.push_back(esc_char);
    }
    escaped.push_back(c);
  }
  // Publish only after the whole input has been read, so dest may alias s.
  dest->swap(escaped);
}

/**
 * Extract one escaped token from s, starting at offset ofs.
 *
 * Scanning stops at the first special_char that is not preceded by an
 * unconsumed esc_char. An esc_char is dropped and the character after it is
 * copied literally.
 *
 * @param s             escaped input
 * @param ofs           offset in s at which to start scanning
 * @param esc_char      escape character
 * @param special_char  delimiter character
 * @param dest          receives the unescaped token (may alias s)
 * @return offset just past the delimiter that ended the token, or
 *         std::string::npos converted to ssize_t (a negative value) when the
 *         end of s was reached without finding a delimiter
 */
static inline ssize_t rgw_unescape_str(const std::string& s, ssize_t ofs,
                                       char esc_char, char special_char,
                                       std::string *dest)
{
  std::string token;
  bool esc = false;

  // A negative ofs converts to a huge size_t, so the loop is skipped and an
  // empty token is returned, exactly as before.
  for (size_t i = static_cast<size_t>(ofs); i < s.size(); i++) {
    const char c = s[i];
    if (!esc && c == esc_char) {
      esc = true;
      continue;
    }
    if (!esc && c == special_char) {
      dest->swap(token);
      return static_cast<ssize_t>(i) + 1;
    }
    token.push_back(c);
    esc = false;
  }
  dest->swap(token);
  return static_cast<ssize_t>(std::string::npos);
}
rgw_bucket_parse_bucket_instance(const std::string& bucket_instance, std::string *bucket_name, std::string *bucket_id, int *shard_id); + +boost::intrusive_ptr +rgw_global_init(const std::map *defaults, + std::vector < const char* >& args, + uint32_t module_type, code_environment_t code_env, + int flags); diff --git a/src/rgw/rgw_compression.cc b/src/rgw/rgw_compression.cc new file mode 100644 index 000000000..8306e766a --- /dev/null +++ b/src/rgw/rgw_compression.cc @@ -0,0 +1,236 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_compression.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int rgw_compression_info_from_attr(const bufferlist& attr, + bool& need_decompress, + RGWCompressionInfo& cs_info) +{ + auto bliter = attr.cbegin(); + try { + decode(cs_info, bliter); + } catch (buffer::error& err) { + return -EIO; + } + if (cs_info.blocks.size() == 0) { + return -EIO; + } + if (cs_info.compression_type != "none") + need_decompress = true; + else + need_decompress = false; + return 0; +} + +int rgw_compression_info_from_attrset(const map& attrs, + bool& need_decompress, + RGWCompressionInfo& cs_info) +{ + auto value = attrs.find(RGW_ATTR_COMPRESSION); + if (value == attrs.end()) { + need_decompress = false; + return 0; + } + return rgw_compression_info_from_attr(value->second, need_decompress, cs_info); +} + +//------------RGWPutObj_Compress--------------- + +int RGWPutObj_Compress::process(bufferlist&& in, uint64_t logical_offset) +{ + bufferlist out; + compressed_ofs = logical_offset; + + if (in.length() > 0) { + // compression stuff + if ((logical_offset > 0 && compressed) || // if previous part was compressed + (logical_offset == 0)) { // or it's the first part + ldout(cct, 10) << "Compression for rgw is enabled, compress part " << in.length() << dendl; + int cr = compressor->compress(in, out, compressor_message); + if (cr < 0) { + if (logical_offset > 0) { + 
lderr(cct) << "Compression failed with exit code " << cr + << " for next part, compression process failed" << dendl; + return -EIO; + } + compressed = false; + ldout(cct, 5) << "Compression failed with exit code " << cr + << " for first part, storing uncompressed" << dendl; + out = std::move(in); + } else { + compressed = true; + + compression_block newbl; + size_t bs = blocks.size(); + newbl.old_ofs = logical_offset; + newbl.new_ofs = bs > 0 ? blocks[bs-1].len + blocks[bs-1].new_ofs : 0; + newbl.len = out.length(); + blocks.push_back(newbl); + + compressed_ofs = newbl.new_ofs; + } + } else { + compressed = false; + out = std::move(in); + } + // end of compression stuff + } else { + size_t bs = blocks.size(); + compressed_ofs = bs > 0 ? blocks[bs-1].len + blocks[bs-1].new_ofs : logical_offset; + } + + return Pipe::process(std::move(out), compressed_ofs); +} + +//----------------RGWGetObj_Decompress--------------------- +RGWGetObj_Decompress::RGWGetObj_Decompress(CephContext* cct_, + RGWCompressionInfo* cs_info_, + bool partial_content_, + RGWGetObj_Filter* next): RGWGetObj_Filter(next), + cct(cct_), + cs_info(cs_info_), + partial_content(partial_content_), + q_ofs(0), + q_len(0), + cur_ofs(0) +{ + compressor = Compressor::create(cct, cs_info->compression_type); + if (!compressor.get()) + lderr(cct) << "Cannot load compressor of type " << cs_info->compression_type << dendl; +} + +int RGWGetObj_Decompress::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + ldout(cct, 10) << "Compression for rgw is enabled, decompress part " + << "bl_ofs=" << bl_ofs << ", bl_len=" << bl_len << dendl; + + if (!compressor.get()) { + // if compressor isn't available - error, because cannot return decompressed data? 
+ lderr(cct) << "Cannot load compressor of type " << cs_info->compression_type << dendl; + return -EIO; + } + bufferlist out_bl, in_bl, temp_in_bl; + bl.begin(bl_ofs).copy(bl_len, temp_in_bl); + bl_ofs = 0; + int r = 0; + if (waiting.length() != 0) { + in_bl.append(waiting); + in_bl.append(temp_in_bl); + waiting.clear(); + } else { + in_bl = std::move(temp_in_bl); + } + bl_len = in_bl.length(); + + auto iter_in_bl = in_bl.cbegin(); + while (first_block <= last_block) { + bufferlist tmp; + off_t ofs_in_bl = first_block->new_ofs - cur_ofs; + if (ofs_in_bl + (off_t)first_block->len > bl_len) { + // not complete block, put it to waiting + unsigned tail = bl_len - ofs_in_bl; + if (iter_in_bl.get_off() != ofs_in_bl) { + iter_in_bl.seek(ofs_in_bl); + } + iter_in_bl.copy(tail, waiting); + cur_ofs -= tail; + break; + } + if (iter_in_bl.get_off() != ofs_in_bl) { + iter_in_bl.seek(ofs_in_bl); + } + iter_in_bl.copy(first_block->len, tmp); + int cr = compressor->decompress(tmp, out_bl, cs_info->compressor_message); + if (cr < 0) { + lderr(cct) << "Decompression failed with exit code " << cr << dendl; + return cr; + } + ++first_block; + while (out_bl.length() - q_ofs >= + static_cast(cct->_conf->rgw_max_chunk_size)) { + off_t ch_len = std::min(cct->_conf->rgw_max_chunk_size, q_len); + q_len -= ch_len; + r = next->handle_data(out_bl, q_ofs, ch_len); + if (r < 0) { + lsubdout(cct, rgw, 0) << "handle_data failed with exit code " << r << dendl; + return r; + } + out_bl.splice(0, q_ofs + ch_len); + q_ofs = 0; + } + } + + cur_ofs += bl_len; + off_t ch_len = std::min(out_bl.length() - q_ofs, q_len); + if (ch_len > 0) { + r = next->handle_data(out_bl, q_ofs, ch_len); + if (r < 0) { + lsubdout(cct, rgw, 0) << "handle_data failed with exit code " << r << dendl; + return r; + } + out_bl.splice(0, q_ofs + ch_len); + q_len -= ch_len; + q_ofs = 0; + } + return r; +} + +int RGWGetObj_Decompress::fixup_range(off_t& ofs, off_t& end) +{ + if (partial_content) { + // if user set range, we need to 
calculate it in decompressed data + first_block = cs_info->blocks.begin(); last_block = cs_info->blocks.begin(); + if (cs_info->blocks.size() > 1) { + vector::iterator fb, lb; + // not bad to use auto for lambda, I think + auto cmp_u = [] (off_t ofs, const compression_block& e) { return (uint64_t)ofs < e.old_ofs; }; + auto cmp_l = [] (const compression_block& e, off_t ofs) { return e.old_ofs <= (uint64_t)ofs; }; + fb = upper_bound(cs_info->blocks.begin()+1, + cs_info->blocks.end(), + ofs, + cmp_u); + first_block = fb - 1; + lb = lower_bound(fb, + cs_info->blocks.end(), + end, + cmp_l); + last_block = lb - 1; + } + } else { + first_block = cs_info->blocks.begin(); last_block = cs_info->blocks.end() - 1; + } + + q_ofs = ofs - first_block->old_ofs; + q_len = end + 1 - ofs; + + ofs = first_block->new_ofs; + end = last_block->new_ofs + last_block->len - 1; + + cur_ofs = ofs; + waiting.clear(); + + return next->fixup_range(ofs, end); +} + +void compression_block::dump(Formatter *f) const +{ + f->dump_unsigned("old_ofs", old_ofs); + f->dump_unsigned("new_ofs", new_ofs); + f->dump_unsigned("len", len); +} + +void RGWCompressionInfo::dump(Formatter *f) const +{ + f->dump_string("compression_type", compression_type); + f->dump_unsigned("orig_size", orig_size); + if (compressor_message) { + f->dump_int("compressor_message", *compressor_message); + } + ::encode_json("blocks", blocks, f); +} + diff --git a/src/rgw/rgw_compression.h b/src/rgw/rgw_compression.h new file mode 100644 index 000000000..84250bfe4 --- /dev/null +++ b/src/rgw/rgw_compression.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "compressor/Compressor.h" +#include "rgw_putobj.h" +#include "rgw_op.h" +#include "rgw_compression_types.h" + +int rgw_compression_info_from_attr(const bufferlist& attr, + bool& need_decompress, + RGWCompressionInfo& cs_info); +int 
rgw_compression_info_from_attrset(const std::map& attrs, + bool& need_decompress, + RGWCompressionInfo& cs_info); + +class RGWGetObj_Decompress : public RGWGetObj_Filter +{ + CephContext* cct; + CompressorRef compressor; + RGWCompressionInfo* cs_info; + bool partial_content; + std::vector::iterator first_block, last_block; + off_t q_ofs, q_len; + uint64_t cur_ofs; + bufferlist waiting; +public: + RGWGetObj_Decompress(CephContext* cct_, + RGWCompressionInfo* cs_info_, + bool partial_content_, + RGWGetObj_Filter* next); + virtual ~RGWGetObj_Decompress() override {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override; + int fixup_range(off_t& ofs, off_t& end) override; + +}; + +class RGWPutObj_Compress : public rgw::putobj::Pipe +{ + CephContext* cct; + bool compressed{false}; + CompressorRef compressor; + std::optional compressor_message; + std::vector blocks; + uint64_t compressed_ofs{0}; +public: + RGWPutObj_Compress(CephContext* cct_, CompressorRef compressor, + rgw::sal::DataProcessor *next) + : Pipe(next), cct(cct_), compressor(compressor) {} + virtual ~RGWPutObj_Compress() override {}; + + int process(bufferlist&& data, uint64_t logical_offset) override; + + bool is_compressed() { return compressed; } + std::vector& get_compression_blocks() { return blocks; } + std::optional get_compressor_message() { return compressor_message; } + +}; /* RGWPutObj_Compress */ diff --git a/src/rgw/rgw_compression_types.h b/src/rgw/rgw_compression_types.h new file mode 100644 index 000000000..efc002efb --- /dev/null +++ b/src/rgw/rgw_compression_types.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +struct compression_block { + uint64_t old_ofs; + uint64_t new_ofs; + uint64_t len; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(old_ofs, bl); + encode(new_ofs, bl); + encode(len, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(old_ofs, bl); + decode(new_ofs, bl); + decode(len, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(compression_block) + +struct RGWCompressionInfo { + std::string compression_type; + uint64_t orig_size; + std::optional compressor_message; + std::vector blocks; + + RGWCompressionInfo() : compression_type("none"), orig_size(0) {} + RGWCompressionInfo(const RGWCompressionInfo& cs_info) : compression_type(cs_info.compression_type), + orig_size(cs_info.orig_size), + compressor_message(cs_info.compressor_message), + blocks(cs_info.blocks) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(compression_type, bl); + encode(orig_size, bl); + encode(compressor_message, bl); + encode(blocks, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(compression_type, bl); + decode(orig_size, bl); + if (struct_v >= 2) { + decode(compressor_message, bl); + } + decode(blocks, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWCompressionInfo) + diff --git a/src/rgw/rgw_coroutine.cc b/src/rgw/rgw_coroutine.cc new file mode 100644 index 000000000..a9c9c38e3 --- /dev/null +++ b/src/rgw/rgw_coroutine.cc @@ -0,0 +1,1130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/Context.h" +#include 
"common/ceph_json.h" +#include "rgw_coroutine.h" + +// re-include our assert to clobber the system one; fix dout: +#include "include/ceph_assert.h" + +#include + +#define dout_subsys ceph_subsys_rgw +#define dout_context g_ceph_context + +using namespace std; + +class RGWCompletionManager::WaitContext : public Context { + RGWCompletionManager *manager; + void *opaque; +public: + WaitContext(RGWCompletionManager *_cm, void *_opaque) : manager(_cm), opaque(_opaque) {} + void finish(int r) override { + manager->_wakeup(opaque); + } +}; + +RGWCompletionManager::RGWCompletionManager(CephContext *_cct) : cct(_cct), + timer(cct, lock) +{ + timer.init(); +} + +RGWCompletionManager::~RGWCompletionManager() +{ + std::lock_guard l{lock}; + timer.cancel_all_events(); + timer.shutdown(); +} + +void RGWCompletionManager::complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info) +{ + std::lock_guard l{lock}; + _complete(cn, io_id, user_info); +} + +void RGWCompletionManager::register_completion_notifier(RGWAioCompletionNotifier *cn) +{ + std::lock_guard l{lock}; + if (cn) { + cns.insert(cn); + } +} + +void RGWCompletionManager::unregister_completion_notifier(RGWAioCompletionNotifier *cn) +{ + std::lock_guard l{lock}; + if (cn) { + cns.erase(cn); + } +} + +void RGWCompletionManager::_complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info) +{ + if (cn) { + cns.erase(cn); + } + + if (complete_reqs_set.find(io_id) != complete_reqs_set.end()) { + /* already have completion for this io_id, don't allow multiple completions for it */ + return; + } + complete_reqs.push_back(io_completion{io_id, user_info}); + cond.notify_all(); +} + +int RGWCompletionManager::get_next(io_completion *io) +{ + std::unique_lock l{lock}; + while (complete_reqs.empty()) { + if (going_down) { + return -ECANCELED; + } + cond.wait(l); + } + *io = complete_reqs.front(); + complete_reqs_set.erase(io->io_id); + complete_reqs.pop_front(); + return 0; +} + +bool 
RGWCompletionManager::try_get_next(io_completion *io) +{ + std::lock_guard l{lock}; + if (complete_reqs.empty()) { + return false; + } + *io = complete_reqs.front(); + complete_reqs_set.erase(io->io_id); + complete_reqs.pop_front(); + return true; +} + +void RGWCompletionManager::go_down() +{ + std::lock_guard l{lock}; + for (auto cn : cns) { + cn->unregister(); + } + going_down = true; + cond.notify_all(); +} + +void RGWCompletionManager::wait_interval(void *opaque, const utime_t& interval, void *user_info) +{ + std::lock_guard l{lock}; + ceph_assert(waiters.find(opaque) == waiters.end()); + waiters[opaque] = user_info; + timer.add_event_after(interval, new WaitContext(this, opaque)); +} + +void RGWCompletionManager::wakeup(void *opaque) +{ + std::lock_guard l{lock}; + _wakeup(opaque); +} + +void RGWCompletionManager::_wakeup(void *opaque) +{ + map::iterator iter = waiters.find(opaque); + if (iter != waiters.end()) { + void *user_id = iter->second; + waiters.erase(iter); + _complete(NULL, rgw_io_id{0, -1} /* no IO id */, user_id); + } +} + +RGWCoroutine::~RGWCoroutine() { + for (auto stack : spawned.entries) { + stack->put(); + } +} + +void RGWCoroutine::init_new_io(RGWIOProvider *io_provider) +{ + ceph_assert(stack); // if there's no stack, io_provider won't be uninitialized + stack->init_new_io(io_provider); +} + +void RGWCoroutine::set_io_blocked(bool flag) { + if (stack) { + stack->set_io_blocked(flag); + } +} + +void RGWCoroutine::set_sleeping(bool flag) { + if (stack) { + stack->set_sleeping(flag); + } +} + +int RGWCoroutine::io_block(int ret, int64_t io_id) { + return io_block(ret, rgw_io_id{io_id, -1}); +} + +int RGWCoroutine::io_block(int ret, const rgw_io_id& io_id) { + if (!stack) { + return 0; + } + if (stack->consume_io_finish(io_id)) { + return 0; + } + set_io_blocked(true); + stack->set_io_blocked_id(io_id); + return ret; +} + +void RGWCoroutine::io_complete(const rgw_io_id& io_id) { + if (stack) { + stack->io_complete(io_id); + } +} + +void 
RGWCoroutine::StatusItem::dump(Formatter *f) const { + ::encode_json("timestamp", timestamp, f); + ::encode_json("status", status, f); +} + +stringstream& RGWCoroutine::Status::set_status() +{ + std::unique_lock l{lock}; + string s = status.str(); + status.str(string()); + if (!timestamp.is_zero()) { + history.push_back(StatusItem(timestamp, s)); + } + if (history.size() > (size_t)max_history) { + history.pop_front(); + } + timestamp = ceph_clock_now(); + + return status; +} + +RGWCoroutinesManager::~RGWCoroutinesManager() { + stop(); + completion_mgr->put(); + if (cr_registry) { + cr_registry->remove(this); + } +} + +int64_t RGWCoroutinesManager::get_next_io_id() +{ + return (int64_t)++max_io_id; +} + +uint64_t RGWCoroutinesManager::get_next_stack_id() { + return (uint64_t)++max_stack_id; +} + +RGWCoroutinesStack::RGWCoroutinesStack(CephContext *_cct, RGWCoroutinesManager *_ops_mgr, RGWCoroutine *start) : cct(_cct), ops_mgr(_ops_mgr), + done_flag(false), error_flag(false), blocked_flag(false), + sleep_flag(false), interval_wait_flag(false), is_scheduled(false), is_waiting_for_child(false), + retcode(0), run_count(0), + env(NULL), parent(NULL) +{ + id = ops_mgr->get_next_stack_id(); + if (start) { + ops.push_back(start); + } + pos = ops.begin(); +} + +RGWCoroutinesStack::~RGWCoroutinesStack() +{ + for (auto op : ops) { + op->put(); + } + + for (auto stack : spawned.entries) { + stack->put(); + } +} + +int RGWCoroutinesStack::operate(const DoutPrefixProvider *dpp, RGWCoroutinesEnv *_env) +{ + env = _env; + RGWCoroutine *op = *pos; + op->stack = this; + ldpp_dout(dpp, 20) << *op << ": operate()" << dendl; + int r = op->operate_wrapper(dpp); + if (r < 0) { + ldpp_dout(dpp, 20) << *op << ": operate() returned r=" << r << dendl; + } + + error_flag = op->is_error(); + + if (op->is_done()) { + int op_retcode = r; + r = unwind(op_retcode); + op->put(); + done_flag = (pos == ops.end()); + blocked_flag &= !done_flag; + if (done_flag) { + retcode = op_retcode; + } + return r; 
+ } + + /* should r ever be negative at this point? */ + ceph_assert(r >= 0); + + return 0; +} + +string RGWCoroutinesStack::error_str() +{ + if (pos != ops.end()) { + return (*pos)->error_str(); + } + return string(); +} + +void RGWCoroutinesStack::call(RGWCoroutine *next_op) { + if (!next_op) { + return; + } + ops.push_back(next_op); + if (pos != ops.end()) { + ++pos; + } else { + pos = ops.begin(); + } +} + +void RGWCoroutinesStack::schedule() +{ + env->manager->schedule(env, this); +} + +void RGWCoroutinesStack::_schedule() +{ + env->manager->_schedule(env, this); +} + +RGWCoroutinesStack *RGWCoroutinesStack::spawn(RGWCoroutine *source_op, RGWCoroutine *op, bool wait) +{ + if (!op) { + return NULL; + } + + rgw_spawned_stacks *s = (source_op ? &source_op->spawned : &spawned); + + RGWCoroutinesStack *stack = env->manager->allocate_stack(); + s->add_pending(stack); + stack->parent = this; + + stack->get(); /* we'll need to collect the stack */ + stack->call(op); + + env->manager->schedule(env, stack); + + if (wait) { + set_blocked_by(stack); + } + + return stack; +} + +RGWCoroutinesStack *RGWCoroutinesStack::spawn(RGWCoroutine *op, bool wait) +{ + return spawn(NULL, op, wait); +} + +int RGWCoroutinesStack::wait(const utime_t& interval) +{ + RGWCompletionManager *completion_mgr = env->manager->get_completion_mgr(); + completion_mgr->wait_interval((void *)this, interval, (void *)this); + set_io_blocked(true); + set_interval_wait(true); + return 0; +} + +void RGWCoroutinesStack::wakeup() +{ + RGWCompletionManager *completion_mgr = env->manager->get_completion_mgr(); + completion_mgr->wakeup((void *)this); +} + +void RGWCoroutinesStack::io_complete(const rgw_io_id& io_id) +{ + RGWCompletionManager *completion_mgr = env->manager->get_completion_mgr(); + completion_mgr->complete(nullptr, io_id, (void *)this); +} + +int RGWCoroutinesStack::unwind(int retcode) +{ + rgw_spawned_stacks *src_spawned = &(*pos)->spawned; + + if (pos == ops.begin()) { + ldout(cct, 15) << "stack 
" << (void *)this << " end" << dendl; + spawned.inherit(src_spawned); + ops.clear(); + pos = ops.end(); + return retcode; + } + + --pos; + ops.pop_back(); + RGWCoroutine *op = *pos; + op->set_retcode(retcode); + op->spawned.inherit(src_spawned); + return 0; +} + +void RGWCoroutinesStack::cancel() +{ + while (!ops.empty()) { + RGWCoroutine *op = *pos; + unwind(-ECANCELED); + op->put(); + } + put(); +} + +bool RGWCoroutinesStack::collect(RGWCoroutine *op, int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id) /* returns true if needs to be called again */ +{ + bool need_retry = false; + rgw_spawned_stacks *s = (op ? &op->spawned : &spawned); + *ret = 0; + vector new_list; + + for (vector::iterator iter = s->entries.begin(); iter != s->entries.end(); ++iter) { + RGWCoroutinesStack *stack = *iter; + if (stack == skip_stack || !stack->is_done()) { + new_list.push_back(stack); + if (!stack->is_done()) { + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " is still running" << dendl; + } else if (stack == skip_stack) { + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " explicitly skipping stack" << dendl; + } + continue; + } + if (stack_id) { + *stack_id = stack->get_id(); + } + int r = stack->get_ret_status(); + stack->put(); + if (r < 0) { + *ret = r; + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " encountered error (r=" << r << "), skipping next stacks" << dendl; + new_list.insert(new_list.end(), ++iter, s->entries.end()); + need_retry = (iter != s->entries.end()); + break; + } + + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " is complete" << dendl; + } + + s->entries.swap(new_list); + return need_retry; +} + +bool RGWCoroutinesStack::collect_next(RGWCoroutine *op, int *ret, RGWCoroutinesStack **collected_stack) /* returns true if found a stack to collect */ +{ + rgw_spawned_stacks *s = (op ? 
&op->spawned : &spawned); + *ret = 0; + + if (collected_stack) { + *collected_stack = NULL; + } + + for (vector::iterator iter = s->entries.begin(); iter != s->entries.end(); ++iter) { + RGWCoroutinesStack *stack = *iter; + if (!stack->is_done()) { + continue; + } + int r = stack->get_ret_status(); + if (r < 0) { + *ret = r; + } + + if (collected_stack) { + *collected_stack = stack; + } + stack->put(); + + s->entries.erase(iter); + return true; + } + + return false; +} + +bool RGWCoroutinesStack::collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id) /* returns true if needs to be called again */ +{ + return collect(NULL, ret, skip_stack, stack_id); +} + +static void _aio_completion_notifier_cb(librados::completion_t cb, void *arg) +{ + (static_cast(arg))->cb(); +} + +RGWAioCompletionNotifier::RGWAioCompletionNotifier(RGWCompletionManager *_mgr, const rgw_io_id& _io_id, void *_user_data) : completion_mgr(_mgr), + io_id(_io_id), + user_data(_user_data), registered(true) { + c = librados::Rados::aio_create_completion(this, _aio_completion_notifier_cb); +} + +RGWAioCompletionNotifier *RGWCoroutinesStack::create_completion_notifier() +{ + return ops_mgr->create_completion_notifier(this); +} + +RGWCompletionManager *RGWCoroutinesStack::get_completion_mgr() +{ + return ops_mgr->get_completion_mgr(); +} + +bool RGWCoroutinesStack::unblock_stack(RGWCoroutinesStack **s) +{ + if (blocking_stacks.empty()) { + return false; + } + + set::iterator iter = blocking_stacks.begin(); + *s = *iter; + blocking_stacks.erase(iter); + (*s)->blocked_by_stack.erase(this); + + return true; +} + +void RGWCoroutinesManager::report_error(RGWCoroutinesStack *op) +{ + if (!op) { + return; + } + string err = op->error_str(); + if (err.empty()) { + return; + } + lderr(cct) << "ERROR: failed operation: " << op->error_str() << dendl; +} + +void RGWCoroutinesStack::dump(Formatter *f) const { + stringstream ss; + ss << (void *)this; + ::encode_json("stack", ss.str(), f); + 
::encode_json("run_count", run_count, f); + f->open_array_section("ops"); + for (auto& i : ops) { + encode_json("op", *i, f); + } + f->close_section(); +} + +void RGWCoroutinesStack::init_new_io(RGWIOProvider *io_provider) +{ + io_provider->set_io_user_info((void *)this); + io_provider->assign_io(env->manager->get_io_id_provider()); +} + +bool RGWCoroutinesStack::try_io_unblock(const rgw_io_id& io_id) +{ + if (!can_io_unblock(io_id)) { + auto p = io_finish_ids.emplace(io_id.id, io_id); + auto& iter = p.first; + bool inserted = p.second; + if (!inserted) { /* could not insert, entry already existed, add channel to completion mask */ + iter->second.channels |= io_id.channels; + } + return false; + } + + return true; +} + +bool RGWCoroutinesStack::consume_io_finish(const rgw_io_id& io_id) +{ + auto iter = io_finish_ids.find(io_id.id); + if (iter == io_finish_ids.end()) { + return false; + } + int finish_mask = iter->second.channels; + bool found = (finish_mask & io_id.channels) != 0; + + finish_mask &= ~(finish_mask & io_id.channels); + + if (finish_mask == 0) { + io_finish_ids.erase(iter); + } + return found; +} + + +void RGWCoroutinesManager::handle_unblocked_stack(set& context_stacks, list& scheduled_stacks, + RGWCompletionManager::io_completion& io, int *blocked_count, int *interval_wait_count) +{ + ceph_assert(ceph_mutex_is_wlocked(lock)); + RGWCoroutinesStack *stack = static_cast(io.user_info); + if (context_stacks.find(stack) == context_stacks.end()) { + return; + } + if (!stack->try_io_unblock(io.io_id)) { + return; + } + if (stack->is_io_blocked()) { + --(*blocked_count); + stack->set_io_blocked(false); + if (stack->is_interval_waiting()) { + --(*interval_wait_count); + } + } + stack->set_interval_wait(false); + if (!stack->is_done()) { + if (!stack->is_scheduled) { + scheduled_stacks.push_back(stack); + stack->set_is_scheduled(true); + } + } else { + context_stacks.erase(stack); + stack->put(); + } +} + +void RGWCoroutinesManager::schedule(RGWCoroutinesEnv 
*env, RGWCoroutinesStack *stack) +{ + std::unique_lock wl{lock}; + _schedule(env, stack); +} + +void RGWCoroutinesManager::_schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack) +{ + ceph_assert(ceph_mutex_is_wlocked(lock)); + if (!stack->is_scheduled) { + env->scheduled_stacks->push_back(stack); + stack->set_is_scheduled(true); + } + set& context_stacks = run_contexts[env->run_context]; + context_stacks.insert(stack); +} + +void RGWCoroutinesManager::set_sleeping(RGWCoroutine *cr, bool flag) +{ + cr->set_sleeping(flag); +} + +void RGWCoroutinesManager::io_complete(RGWCoroutine *cr, const rgw_io_id& io_id) +{ + cr->io_complete(io_id); +} + +int RGWCoroutinesManager::run(const DoutPrefixProvider *dpp, list& stacks) +{ + int ret = 0; + int blocked_count = 0; + int interval_wait_count = 0; + bool canceled = false; // set on going_down + RGWCoroutinesEnv env; + bool op_not_blocked; + + uint64_t run_context = ++run_context_count; + + lock.lock(); + set& context_stacks = run_contexts[run_context]; + list scheduled_stacks; + for (auto& st : stacks) { + context_stacks.insert(st); + scheduled_stacks.push_back(st); + st->set_is_scheduled(true); + } + env.run_context = run_context; + env.manager = this; + env.scheduled_stacks = &scheduled_stacks; + + for (list::iterator iter = scheduled_stacks.begin(); iter != scheduled_stacks.end() && !going_down;) { + RGWCompletionManager::io_completion io; + RGWCoroutinesStack *stack = *iter; + ++iter; + scheduled_stacks.pop_front(); + + if (context_stacks.find(stack) == context_stacks.end()) { + /* stack was probably schedule more than once due to IO, but was since complete */ + goto next; + } + env.stack = stack; + + lock.unlock(); + + ret = stack->operate(dpp, &env); + + lock.lock(); + + stack->set_is_scheduled(false); + if (ret < 0) { + ldpp_dout(dpp, 20) << "stack->operate() returned ret=" << ret << dendl; + } + + if (stack->is_error()) { + report_error(stack); + } + + op_not_blocked = false; + + if (stack->is_io_blocked()) { + 
ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is io blocked" << dendl; + if (stack->is_interval_waiting()) { + interval_wait_count++; + } + blocked_count++; + } else if (stack->is_blocked()) { + /* do nothing, we'll re-add the stack when the blocking stack is done, + * or when we're awaken + */ + ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is_blocked_by_stack()=" << stack->is_blocked_by_stack() + << " is_sleeping=" << stack->is_sleeping() << " waiting_for_child()=" << stack->waiting_for_child() << dendl; + } else if (stack->is_done()) { + ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is done" << dendl; + RGWCoroutinesStack *s; + /* release every stack that was blocked on this finished stack */ while (stack->unblock_stack(&s)) { + if (!s->is_blocked_by_stack() && !s->is_done()) { + if (s->is_io_blocked()) { + /* NOTE(review): the interval-wait test below is made on stack (the finished one), not on s -- looks unintended; confirm */ if (stack->is_interval_waiting()) { + interval_wait_count++; + } + blocked_count++; + } else { + s->_schedule(); + } + } + } + /* wake the parent if it was waiting for a child */ if (stack->parent && stack->parent->waiting_for_child()) { + stack->parent->set_wait_for_child(false); + stack->parent->_schedule(); + } + /* the finished stack leaves this run context; stack is nulled so the run_count reset below skips it */ context_stacks.erase(stack); + stack->put(); + stack = NULL; + } else { + /* not blocked and not done: count the consecutive run and queue the stack again */ op_not_blocked = true; + stack->run_count++; + stack->_schedule(); + } + + if (!op_not_blocked && stack) { + stack->run_count = 0; + } + + /* drain completions that are available without waiting */ while (completion_mgr->try_get_next(&io)) { + handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count, &interval_wait_count); + } + + /* + * only account blocked operations that are not in interval_wait, these are stacks that + * were put on a wait without any real IO operations. 
While we mark these as io_blocked, + * these aren't really waiting for IOs + */ + while (blocked_count - interval_wait_count >= ops_window) { + lock.unlock(); + ret = completion_mgr->get_next(&io); + lock.lock(); + if (ret < 0) { + ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl; + } + handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count, &interval_wait_count); + } + +next: + while (scheduled_stacks.empty() && blocked_count > 0) { + lock.unlock(); + ret = completion_mgr->get_next(&io); + lock.lock(); + if (ret < 0) { + ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl; + } + if (going_down) { + ldout(cct, 5) << __func__ << "(): was stopped, exiting" << dendl; + ret = -ECANCELED; + canceled = true; + break; + } + handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count, &interval_wait_count); + iter = scheduled_stacks.begin(); + } + if (canceled) { + break; + } + + if (iter == scheduled_stacks.end()) { + iter = scheduled_stacks.begin(); + } + } + + if (!context_stacks.empty() && !going_down) { + JSONFormatter formatter(true); + formatter.open_array_section("context_stacks"); + for (auto& s : context_stacks) { + ::encode_json("entry", *s, &formatter); + } + formatter.close_section(); + lderr(cct) << __func__ << "(): ERROR: deadlock detected, dumping remaining coroutines:\n"; + formatter.flush(*_dout); + *_dout << dendl; + ceph_assert(context_stacks.empty() || going_down); // assert on deadlock + } + + for (auto stack : context_stacks) { + ldout(cct, 20) << "clearing stack on run() exit: stack=" << (void *)stack << " nref=" << stack->get_nref() << dendl; + stack->cancel(); + } + run_contexts.erase(run_context); + lock.unlock(); + + return ret; +} + +int RGWCoroutinesManager::run(const DoutPrefixProvider *dpp, RGWCoroutine *op) +{ + if (!op) { + return 0; + } + list stacks; + RGWCoroutinesStack *stack = allocate_stack(); + op->get(); + stack->call(op); + + 
stacks.push_back(stack); + + int r = run(dpp, stacks); + if (r < 0) { + ldpp_dout(dpp, 20) << "run(stacks) returned r=" << r << dendl; + } else { + r = op->get_ret_status(); + } + op->put(); + + return r; +} + +RGWAioCompletionNotifier *RGWCoroutinesManager::create_completion_notifier(RGWCoroutinesStack *stack) +{ + rgw_io_id io_id{get_next_io_id(), -1}; + RGWAioCompletionNotifier *cn = new RGWAioCompletionNotifier(completion_mgr, io_id, (void *)stack); + completion_mgr->register_completion_notifier(cn); + return cn; +} + +void RGWCoroutinesManager::dump(Formatter *f) const { + std::shared_lock rl{lock}; + + f->open_array_section("run_contexts"); + for (auto& i : run_contexts) { + f->open_object_section("context"); + ::encode_json("id", i.first, f); + f->open_array_section("entries"); + for (auto& s : i.second) { + ::encode_json("entry", *s, f); + } + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +RGWCoroutinesStack *RGWCoroutinesManager::allocate_stack() { + return new RGWCoroutinesStack(cct, this); +} + +string RGWCoroutinesManager::get_id() +{ + if (!id.empty()) { + return id; + } + stringstream ss; + ss << (void *)this; + return ss.str(); +} + +void RGWCoroutinesManagerRegistry::add(RGWCoroutinesManager *mgr) +{ + std::unique_lock wl{lock}; + if (managers.find(mgr) == managers.end()) { + managers.insert(mgr); + get(); + } +} + +void RGWCoroutinesManagerRegistry::remove(RGWCoroutinesManager *mgr) +{ + std::unique_lock wl{lock}; + if (managers.find(mgr) != managers.end()) { + managers.erase(mgr); + put(); + } +} + +RGWCoroutinesManagerRegistry::~RGWCoroutinesManagerRegistry() +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + if (!admin_command.empty()) { + admin_socket->unregister_commands(this); + } +} + +int RGWCoroutinesManagerRegistry::hook_to_admin_command(const string& command) +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + if (!admin_command.empty()) { + admin_socket->unregister_commands(this); + } + 
admin_command = command; + int r = admin_socket->register_command(admin_command, this, + "dump current coroutines stack state"); + if (r < 0) { + lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl; + return r; + } + return 0; +} + +int RGWCoroutinesManagerRegistry::call(std::string_view command, + const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) { + std::shared_lock rl{lock}; + ::encode_json("cr_managers", *this, f); + return 0; +} + +void RGWCoroutinesManagerRegistry::dump(Formatter *f) const { + f->open_array_section("coroutine_managers"); + for (auto m : managers) { + ::encode_json("entry", *m, f); + } + f->close_section(); +} + +void RGWCoroutine::call(RGWCoroutine *op) +{ + if (op) { + stack->call(op); + } else { + // the call()er expects this to set a retcode + retcode = 0; + } +} + +RGWCoroutinesStack *RGWCoroutine::spawn(RGWCoroutine *op, bool wait) +{ + return stack->spawn(this, op, wait); +} + +bool RGWCoroutine::collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id) /* returns true if needs to be called again */ +{ + return stack->collect(this, ret, skip_stack, stack_id); +} + +bool RGWCoroutine::collect_next(int *ret, RGWCoroutinesStack **collected_stack) /* returns true if found a stack to collect */ +{ + return stack->collect_next(this, ret, collected_stack); +} + +int RGWCoroutine::wait(const utime_t& interval) +{ + return stack->wait(interval); +} + +void RGWCoroutine::wait_for_child() +{ + /* should only wait for child if there is a child that is not done yet, and no complete children */ + if (spawned.entries.empty()) { + return; + } + for (vector::iterator iter = spawned.entries.begin(); iter != spawned.entries.end(); ++iter) { + if ((*iter)->is_done()) { + return; + } + } + stack->set_wait_for_child(true); +} + +string RGWCoroutine::to_str() const +{ + return typeid(*this).name(); +} + +ostream& operator<<(ostream& out, const RGWCoroutine& cr) +{ 
+ out << "cr:s=" << (void *)cr.get_stack() << ":op=" << (void *)&cr << ":" << typeid(cr).name(); + return out; +} + +bool RGWCoroutine::drain_children(int num_cr_left, + RGWCoroutinesStack *skip_stack, + std::optional > cb) +{ + bool done = false; + ceph_assert(num_cr_left >= 0); + if (num_cr_left == 0 && skip_stack) { + num_cr_left = 1; + } + reenter(&drain_status.cr) { + while (num_spawned() > (size_t)num_cr_left) { + yield wait_for_child(); + int ret; + uint64_t stack_id; + bool again = false; + do { + again = collect(&ret, skip_stack, &stack_id); + if (ret < 0) { + ldout(cct, 10) << "collect() returned ret=" << ret << dendl; + /* we should have reported this error */ + log_error() << "ERROR: collect() returned error (ret=" << ret << ")"; + } + if (cb) { + (*cb)(stack_id, ret); + } + } while (again); + } + done = true; + } + return done; +} + +bool RGWCoroutine::drain_children(int num_cr_left, + std::optional > cb) +{ + bool done = false; + ceph_assert(num_cr_left >= 0); + + reenter(&drain_status.cr) { + while (num_spawned() > (size_t)num_cr_left) { + yield wait_for_child(); + int ret; + uint64_t stack_id; + bool again = false; + do { + again = collect(&ret, nullptr, &stack_id); + if (ret < 0) { + ldout(cct, 10) << "collect() returned ret=" << ret << dendl; + /* we should have reported this error */ + log_error() << "ERROR: collect() returned error (ret=" << ret << ")"; + } + if (cb && !drain_status.should_exit) { + int r = (*cb)(stack_id, ret); + if (r < 0) { + drain_status.ret = r; + drain_status.should_exit = true; + num_cr_left = 0; /* need to drain all */ + } + } + } while (again); + } + done = true; + } + return done; +} + +void RGWCoroutine::wakeup() +{ + if (stack) { + stack->wakeup(); + } +} + +RGWCoroutinesEnv *RGWCoroutine::get_env() const +{ + return stack->get_env(); +} + +void RGWCoroutine::dump(Formatter *f) const { + if (!description.str().empty()) { + encode_json("description", description.str(), f); + } + encode_json("type", to_str(), f); + if 
(!spawned.entries.empty()) { + f->open_array_section("spawned"); + for (auto& i : spawned.entries) { + char buf[32]; + snprintf(buf, sizeof(buf), "%p", (void *)i); + encode_json("stack", string(buf), f); + } + f->close_section(); + } + if (!status.history.empty()) { + encode_json("history", status.history, f); + } + + if (!status.status.str().empty()) { + f->open_object_section("status"); + encode_json("status", status.status.str(), f); + encode_json("timestamp", status.timestamp, f); + f->close_section(); + } +} + +RGWSimpleCoroutine::~RGWSimpleCoroutine() +{ + if (!called_cleanup) { + request_cleanup(); + } +} + +void RGWSimpleCoroutine::call_cleanup() +{ + called_cleanup = true; + request_cleanup(); +} + +int RGWSimpleCoroutine::operate(const DoutPrefixProvider *dpp) +{ + int ret = 0; + reenter(this) { + yield return state_init(); + yield return state_send_request(dpp); + yield return state_request_complete(); + yield return state_all_complete(); + drain_all(); + call_cleanup(); + return set_state(RGWCoroutine_Done, ret); + } + return 0; +} + +int RGWSimpleCoroutine::state_init() +{ + int ret = init(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return 0; +} + +int RGWSimpleCoroutine::state_send_request(const DoutPrefixProvider *dpp) +{ + int ret = send_request(dpp); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return io_block(0); +} + +int RGWSimpleCoroutine::state_request_complete() +{ + int ret = request_complete(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return 0; +} + +int RGWSimpleCoroutine::state_all_complete() +{ + int ret = finish(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return 0; +} + + diff --git a/src/rgw/rgw_coroutine.h b/src/rgw/rgw_coroutine.h new file mode 100644 index 000000000..eb3216640 --- /dev/null +++ b/src/rgw/rgw_coroutine.h @@ -0,0 +1,722 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#ifdef _ASSERT_H +#define NEED_ASSERT_H +#pragma push_macro("_ASSERT_H") +#endif + +#include +#include + +#ifdef NEED_ASSERT_H +#pragma pop_macro("_ASSERT_H") +#endif + +#include "include/utime.h" +#include "common/RefCountedObj.h" +#include "common/debug.h" +#include "common/Timer.h" +#include "common/admin_socket.h" + +#include "rgw_common.h" +#include "rgw_http_client_types.h" + +#include + +#include + +#define RGW_ASYNC_OPS_MGR_WINDOW 100 + +class RGWCoroutinesStack; +class RGWCoroutinesManager; +class RGWAioCompletionNotifier; + +class RGWCompletionManager : public RefCountedObject { + friend class RGWCoroutinesManager; + + CephContext *cct; + + struct io_completion { + rgw_io_id io_id; + void *user_info; + }; + std::list complete_reqs; + std::set complete_reqs_set; + using NotifierRef = boost::intrusive_ptr; + std::set cns; + + ceph::mutex lock = ceph::make_mutex("RGWCompletionManager::lock"); + ceph::condition_variable cond; + + SafeTimer timer; + + std::atomic going_down = { false }; + + std::map waiters; + + class WaitContext; + +protected: + void _wakeup(void *opaque); + void _complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info); +public: + explicit RGWCompletionManager(CephContext *_cct); + virtual ~RGWCompletionManager() override; + + void complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info); + int get_next(io_completion *io); + bool try_get_next(io_completion *io); + + void go_down(); + + /* + * wait for interval length to complete user_info + */ + void wait_interval(void *opaque, const utime_t& interval, void *user_info); + void wakeup(void *opaque); + + void register_completion_notifier(RGWAioCompletionNotifier *cn); + void unregister_completion_notifier(RGWAioCompletionNotifier *cn); +}; + +/* a single use librados aio completion notifier that hooks into the RGWCompletionManager */ +class 
RGWAioCompletionNotifier : public RefCountedObject { + librados::AioCompletion *c; + RGWCompletionManager *completion_mgr; + rgw_io_id io_id; + void *user_data; + ceph::mutex lock = ceph::make_mutex("RGWAioCompletionNotifier"); + bool registered; + +public: + RGWAioCompletionNotifier(RGWCompletionManager *_mgr, const rgw_io_id& _io_id, void *_user_data); + virtual ~RGWAioCompletionNotifier() override { + c->release(); + lock.lock(); + bool need_unregister = registered; + if (registered) { + completion_mgr->get(); + } + registered = false; + lock.unlock(); + if (need_unregister) { + completion_mgr->unregister_completion_notifier(this); + completion_mgr->put(); + } + } + + librados::AioCompletion *completion() { + return c; + } + + void unregister() { + std::lock_guard l{lock}; + if (!registered) { + return; + } + registered = false; + } + + void cb() { + lock.lock(); + if (!registered) { + lock.unlock(); + put(); + return; + } + completion_mgr->get(); + registered = false; + lock.unlock(); + completion_mgr->complete(this, io_id, user_data); + completion_mgr->put(); + put(); + } +}; + +// completion notifier with opaque payload (ie a reference-counted pointer) +template +class RGWAioCompletionNotifierWith : public RGWAioCompletionNotifier { + T value; +public: + RGWAioCompletionNotifierWith(RGWCompletionManager *mgr, + const rgw_io_id& io_id, void *user_data, + T value) + : RGWAioCompletionNotifier(mgr, io_id, user_data), value(std::move(value)) + {} +}; + +struct RGWCoroutinesEnv { + uint64_t run_context; + RGWCoroutinesManager *manager; + std::list *scheduled_stacks; + RGWCoroutinesStack *stack; + + RGWCoroutinesEnv() : run_context(0), manager(NULL), scheduled_stacks(NULL), stack(NULL) {} +}; + +enum RGWCoroutineState { + RGWCoroutine_Error = -2, + RGWCoroutine_Done = -1, + RGWCoroutine_Run = 0, +}; + +struct rgw_spawned_stacks { + std::vector entries; + + rgw_spawned_stacks() {} + + void add_pending(RGWCoroutinesStack *s) { + entries.push_back(s); + } + + void 
inherit(rgw_spawned_stacks *source) { + for (auto* entry : source->entries) { + add_pending(entry); + } + source->entries.clear(); + } +}; + + + +class RGWCoroutine : public RefCountedObject, public boost::asio::coroutine { + friend class RGWCoroutinesStack; + + struct StatusItem { + utime_t timestamp; + std::string status; + + StatusItem(utime_t& t, const std::string& s) : timestamp(t), status(s) {} + + void dump(Formatter *f) const; + }; + +#define MAX_COROUTINE_HISTORY 10 + + struct Status { + CephContext *cct; + ceph::shared_mutex lock = + ceph::make_shared_mutex("RGWCoroutine::Status::lock"); + int max_history; + + utime_t timestamp; + std::stringstream status; + + explicit Status(CephContext *_cct) : cct(_cct), max_history(MAX_COROUTINE_HISTORY) {} + + std::deque history; + + std::stringstream& set_status(); + } status; + + std::stringstream description; + +protected: + bool _yield_ret; + + struct { + boost::asio::coroutine cr; + bool should_exit{false}; + int ret{0}; + + void init() { + cr = boost::asio::coroutine(); + should_exit = false; + ret = 0; + } + } drain_status; + + CephContext *cct; + + RGWCoroutinesStack *stack; + int retcode; + int state; + + rgw_spawned_stacks spawned; + + std::stringstream error_stream; + + int set_state(int s, int ret = 0) { + retcode = ret; + state = s; + return ret; + } + int set_cr_error(int ret) { + return set_state(RGWCoroutine_Error, ret); + } + int set_cr_done() { + return set_state(RGWCoroutine_Done, 0); + } + void set_io_blocked(bool flag); + + void reset_description() { + description.str(std::string()); + } + + std::stringstream& set_description() { + return description; + } + std::stringstream& set_status() { + return status.set_status(); + } + + std::stringstream& set_status(const std::string& s) { + std::stringstream& status = set_status(); + status << s; + return status; + } + + virtual int operate_wrapper(const DoutPrefixProvider *dpp) { + return operate(dpp); + } +public: + RGWCoroutine(CephContext *_cct) : 
status(_cct), _yield_ret(false), cct(_cct), stack(NULL), retcode(0), state(RGWCoroutine_Run) {} + virtual ~RGWCoroutine() override; + + virtual int operate(const DoutPrefixProvider *dpp) = 0; + + bool is_done() { return (state == RGWCoroutine_Done || state == RGWCoroutine_Error); } + bool is_error() { return (state == RGWCoroutine_Error); } + + std::stringstream& log_error() { return error_stream; } + std::string error_str() { + return error_stream.str(); + } + + void set_retcode(int r) { + retcode = r; + } + + int get_ret_status() { + return retcode; + } + + void call(RGWCoroutine *op); /* call at the same stack we're in */ + RGWCoroutinesStack *spawn(RGWCoroutine *op, bool wait); /* execute on a different stack */ + bool collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id = nullptr); /* returns true if needs to be called again */ + bool collect_next(int *ret, RGWCoroutinesStack **collected_stack = NULL); /* returns true if found a stack to collect */ + + int wait(const utime_t& interval); + bool drain_children(int num_cr_left, + RGWCoroutinesStack *skip_stack = nullptr, + std::optional > cb = std::nullopt); /* returns true if needed to be called again, + cb will be called on completion of every + completion. */ + bool drain_children(int num_cr_left, + std::optional > cb); /* returns true if needed to be called again, + cb will be called on every completion, can filter errors. 
+ A negative return value from cb means that current cr + will need to exit */ + void wakeup(); + void set_sleeping(bool flag); /* put in sleep, or wakeup from sleep */ + + size_t num_spawned() { + return spawned.entries.size(); + } + + void wait_for_child(); + + virtual std::string to_str() const; + + RGWCoroutinesStack *get_stack() const { + return stack; + } + + RGWCoroutinesEnv *get_env() const; + + void dump(Formatter *f) const; + + void init_new_io(RGWIOProvider *io_provider); /* only links the default io id */ + + int io_block(int ret = 0) { + return io_block(ret, -1); + } + int io_block(int ret, int64_t io_id); + int io_block(int ret, const rgw_io_id& io_id); + void io_complete() { + io_complete(rgw_io_id{}); + } + void io_complete(const rgw_io_id& io_id); +}; + +std::ostream& operator<<(std::ostream& out, const RGWCoroutine& cr); + +#define yield_until_true(x) \ +do { \ + do { \ + yield _yield_ret = x; \ + } while (!_yield_ret); \ + _yield_ret = false; \ +} while (0) + +#define drain_all() \ + drain_status.init(); \ + yield_until_true(drain_children(0)) + +#define drain_all_but(n) \ + drain_status.init(); \ + yield_until_true(drain_children(n)) + +#define drain_all_but_stack(stack) \ + drain_status.init(); \ + yield_until_true(drain_children(1, stack)) + +#define drain_all_but_stack_cb(stack, cb) \ + drain_status.init(); \ + yield_until_true(drain_children(1, stack, cb)) + +#define drain_with_cb(n, cb) \ + drain_status.init(); \ + yield_until_true(drain_children(n, cb)); \ + if (drain_status.should_exit) { \ + return set_cr_error(drain_status.ret); \ + } + +#define drain_all_cb(cb) \ + drain_with_cb(0, cb) + +#define yield_spawn_window(cr, n, cb) \ + do { \ + spawn(cr, false); \ + drain_with_cb(n, cb); /* this is guaranteed to yield */ \ + } while (0) + + + +template +class RGWConsumerCR : public RGWCoroutine { + std::list product; + +public: + explicit RGWConsumerCR(CephContext *_cct) : RGWCoroutine(_cct) {} + + bool has_product() { + return 
!product.empty(); + } + + void wait_for_product() { + if (!has_product()) { + set_sleeping(true); + } + } + + bool consume(T *p) { + if (product.empty()) { + return false; + } + *p = product.front(); + product.pop_front(); + return true; + } + + void receive(const T& p, bool wakeup = true); + void receive(std::list& l, bool wakeup = true); +}; + +class RGWCoroutinesStack : public RefCountedObject { + friend class RGWCoroutine; + friend class RGWCoroutinesManager; + + CephContext *cct; + + int64_t id{-1}; + + RGWCoroutinesManager *ops_mgr; + + std::list ops; + std::list::iterator pos; + + rgw_spawned_stacks spawned; + + std::set blocked_by_stack; + std::set blocking_stacks; + + std::map io_finish_ids; + rgw_io_id io_blocked_id; + + bool done_flag; + bool error_flag; + bool blocked_flag; + bool sleep_flag; + bool interval_wait_flag; + + bool is_scheduled; + + bool is_waiting_for_child; + + int retcode; + + uint64_t run_count; + +protected: + RGWCoroutinesEnv *env; + RGWCoroutinesStack *parent; + + RGWCoroutinesStack *spawn(RGWCoroutine *source_op, RGWCoroutine *next_op, bool wait); + bool collect(RGWCoroutine *op, int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id); /* returns true if needs to be called again */ + bool collect_next(RGWCoroutine *op, int *ret, RGWCoroutinesStack **collected_stack); /* returns true if found a stack to collect */ +public: + RGWCoroutinesStack(CephContext *_cct, RGWCoroutinesManager *_ops_mgr, RGWCoroutine *start = NULL); + virtual ~RGWCoroutinesStack() override; + + int64_t get_id() const { + return id; + } + + int operate(const DoutPrefixProvider *dpp, RGWCoroutinesEnv *env); + + bool is_done() { + return done_flag; + } + bool is_error() { + return error_flag; + } + bool is_blocked_by_stack() { + return !blocked_by_stack.empty(); + } + void set_io_blocked(bool flag) { + blocked_flag = flag; + } + void set_io_blocked_id(const rgw_io_id& io_id) { + io_blocked_id = io_id; + } + bool is_io_blocked() { + return blocked_flag && 
!done_flag; + } + bool can_io_unblock(const rgw_io_id& io_id) { + return ((io_blocked_id.id < 0) || + io_blocked_id.intersects(io_id)); + } + bool try_io_unblock(const rgw_io_id& io_id); + bool consume_io_finish(const rgw_io_id& io_id); + void set_interval_wait(bool flag) { + interval_wait_flag = flag; + } + bool is_interval_waiting() { + return interval_wait_flag; + } + void set_sleeping(bool flag) { + bool wakeup = sleep_flag & !flag; + sleep_flag = flag; + if (wakeup) { + schedule(); + } + } + bool is_sleeping() { + return sleep_flag; + } + void set_is_scheduled(bool flag) { + is_scheduled = flag; + } + + bool is_blocked() { + return is_blocked_by_stack() || is_sleeping() || + is_io_blocked() || waiting_for_child() ; + } + + void schedule(); + void _schedule(); + + int get_ret_status() { + return retcode; + } + + std::string error_str(); + + void call(RGWCoroutine *next_op); + RGWCoroutinesStack *spawn(RGWCoroutine *next_op, bool wait); + int unwind(int retcode); + + int wait(const utime_t& interval); + void wakeup(); + void io_complete() { + io_complete(rgw_io_id{}); + } + void io_complete(const rgw_io_id& io_id); + + bool collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id); /* returns true if needs to be called again */ + + void cancel(); + + RGWAioCompletionNotifier *create_completion_notifier(); + template + RGWAioCompletionNotifier *create_completion_notifier(T value); + RGWCompletionManager *get_completion_mgr(); + + void set_blocked_by(RGWCoroutinesStack *s) { + blocked_by_stack.insert(s); + s->blocking_stacks.insert(this); + } + + void set_wait_for_child(bool flag) { + is_waiting_for_child = flag; + } + + bool waiting_for_child() { + return is_waiting_for_child; + } + + bool unblock_stack(RGWCoroutinesStack **s); + + RGWCoroutinesEnv *get_env() const { return env; } + + void dump(Formatter *f) const; + + void init_new_io(RGWIOProvider *io_provider); +}; + +template +void RGWConsumerCR::receive(std::list& l, bool wakeup) +{ + 
product.splice(product.end(), l); + if (wakeup) { + set_sleeping(false); + } +} + + +template +void RGWConsumerCR::receive(const T& p, bool wakeup) +{ + product.push_back(p); + if (wakeup) { + set_sleeping(false); + } +} + +class RGWCoroutinesManagerRegistry : public RefCountedObject, public AdminSocketHook { + CephContext *cct; + + std::set managers; + ceph::shared_mutex lock = + ceph::make_shared_mutex("RGWCoroutinesRegistry::lock"); + + std::string admin_command; + +public: + explicit RGWCoroutinesManagerRegistry(CephContext *_cct) : cct(_cct) {} + virtual ~RGWCoroutinesManagerRegistry() override; + + void add(RGWCoroutinesManager *mgr); + void remove(RGWCoroutinesManager *mgr); + + int hook_to_admin_command(const std::string& command); + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) override; + + void dump(Formatter *f) const; +}; + +class RGWCoroutinesManager { + CephContext *cct; + std::atomic going_down = { false }; + + std::atomic run_context_count = { 0 }; + std::map > run_contexts; + + std::atomic max_io_id = { 0 }; + std::atomic max_stack_id = { 0 }; + + mutable ceph::shared_mutex lock = + ceph::make_shared_mutex("RGWCoroutinesManager::lock"); + + RGWIOIDProvider io_id_provider; + + void handle_unblocked_stack(std::set& context_stacks, std::list& scheduled_stacks, + RGWCompletionManager::io_completion& io, int *waiting_count, int *interval_wait_count); +protected: + RGWCompletionManager *completion_mgr; + RGWCoroutinesManagerRegistry *cr_registry; + + int ops_window; + + std::string id; + + void put_completion_notifier(RGWAioCompletionNotifier *cn); +public: + RGWCoroutinesManager(CephContext *_cct, RGWCoroutinesManagerRegistry *_cr_registry) : cct(_cct), + cr_registry(_cr_registry), ops_window(RGW_ASYNC_OPS_MGR_WINDOW) { + completion_mgr = new RGWCompletionManager(cct); + if (cr_registry) { + cr_registry->add(this); + } + } + virtual ~RGWCoroutinesManager(); + + int 
run(const DoutPrefixProvider *dpp, std::list& ops); + int run(const DoutPrefixProvider *dpp, RGWCoroutine *op); + void stop() { + bool expected = false; + if (going_down.compare_exchange_strong(expected, true)) { + completion_mgr->go_down(); + } + } + + virtual void report_error(RGWCoroutinesStack *op); + + RGWAioCompletionNotifier *create_completion_notifier(RGWCoroutinesStack *stack); + template + RGWAioCompletionNotifier *create_completion_notifier(RGWCoroutinesStack *stack, T value); + RGWCompletionManager *get_completion_mgr() { return completion_mgr; } + + void schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack); + void _schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack); + RGWCoroutinesStack *allocate_stack(); + + int64_t get_next_io_id(); + uint64_t get_next_stack_id(); + + void set_sleeping(RGWCoroutine *cr, bool flag); + void io_complete(RGWCoroutine *cr, const rgw_io_id& io_id); + + virtual std::string get_id(); + void dump(Formatter *f) const; + + RGWIOIDProvider& get_io_id_provider() { + return io_id_provider; + } +}; + +template +RGWAioCompletionNotifier *RGWCoroutinesManager::create_completion_notifier(RGWCoroutinesStack *stack, T value) +{ + rgw_io_id io_id{get_next_io_id(), -1}; + RGWAioCompletionNotifier *cn = new RGWAioCompletionNotifierWith(completion_mgr, io_id, (void *)stack, std::move(value)); + completion_mgr->register_completion_notifier(cn); + return cn; +} + +template +RGWAioCompletionNotifier *RGWCoroutinesStack::create_completion_notifier(T value) +{ + return ops_mgr->create_completion_notifier(this, std::move(value)); +} + +class RGWSimpleCoroutine : public RGWCoroutine { + bool called_cleanup; + + int operate(const DoutPrefixProvider *dpp) override; + + int state_init(); + int state_send_request(const DoutPrefixProvider *dpp); + int state_request_complete(); + int state_all_complete(); + + void call_cleanup(); + +public: + RGWSimpleCoroutine(CephContext *_cct) : RGWCoroutine(_cct), called_cleanup(false) {} + virtual 
~RGWSimpleCoroutine() override; + + virtual int init() { return 0; } + virtual int send_request(const DoutPrefixProvider *dpp) = 0; + virtual int request_complete() = 0; + virtual int finish() { return 0; } + virtual void request_cleanup() {} +}; diff --git a/src/rgw/rgw_cors.cc b/src/rgw/rgw_cors.cc new file mode 100644 index 000000000..83ba079b2 --- /dev/null +++ b/src/rgw/rgw_cors.cc @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include + +#include +#include + +#include + +#include "include/types.h" +#include "common/debug.h" +#include "include/str_list.h" +#include "common/Formatter.h" + +#include "rgw_cors.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWCORSRule::dump_origins() { + unsigned num_origins = allowed_origins.size(); + dout(10) << "Allowed origins : " << num_origins << dendl; + for(auto& origin : allowed_origins) { + dout(10) << origin << "," << dendl; + } +} + +void RGWCORSRule::erase_origin_if_present(string& origin, bool *rule_empty) { + set::iterator it = allowed_origins.find(origin); + if (!rule_empty) + return; + *rule_empty = false; + if (it != allowed_origins.end()) { + dout(10) << "Found origin " << origin << ", set size:" << + allowed_origins.size() << dendl; + allowed_origins.erase(it); + *rule_empty = (allowed_origins.empty()); + } +} + +/* + * make attrs look-like-this + * does not convert underscores or dashes + * + * Per CORS specification, section 3: + * === + * "Converting a string to ASCII lowercase" means replacing all characters in the + * range U+0041 LATIN CAPITAL LETTER 
A to U+005A LATIN CAPITAL LETTER Z with + * the corresponding characters in the range U+0061 LATIN SMALL LETTER A to + * U+007A LATIN SMALL LETTER Z). + * === + * + * @todo When UTF-8 is allowed in HTTP headers, this function will need to change + */ +string lowercase_http_attr(const string& orig) +{ + const char *s = orig.c_str(); + char buf[orig.size() + 1]; + buf[orig.size()] = '\0'; + + for (size_t i = 0; i < orig.size(); ++i, ++s) { + buf[i] = tolower(*s); + } + return string(buf); +} + + +static bool is_string_in_set(set& s, string h) { + if ((s.find("*") != s.end()) || + (s.find(h) != s.end())) { + return true; + } + /* The header can be Content-*-type, or Content-* */ + for(set::iterator it = s.begin(); + it != s.end(); ++it) { + size_t off; + if ((off = (*it).find("*"))!=string::npos) { + list ssplit; + unsigned flen = 0; + + get_str_list((*it), "* \t", ssplit); + if (off != 0) { + string sl = ssplit.front(); + flen = sl.length(); + dout(10) << "Finding " << sl << ", in " << h << ", at offset 0" << dendl; + if (!boost::algorithm::starts_with(h,sl)) + continue; + ssplit.pop_front(); + } + if (off != ((*it).length() - 1)) { + string sl = ssplit.front(); + dout(10) << "Finding " << sl << ", in " << h + << ", at offset not less than " << flen << dendl; + if (h.size() < sl.size() || + h.compare((h.size() - sl.size()), sl.size(), sl) != 0) + continue; + ssplit.pop_front(); + } + if (!ssplit.empty()) + continue; + return true; + } + } + return false; +} + +bool RGWCORSRule::has_wildcard_origin() { + if (allowed_origins.find("*") != allowed_origins.end()) + return true; + + return false; +} + +bool RGWCORSRule::is_origin_present(const char *o) { + string origin = o; + return is_string_in_set(allowed_origins, origin); +} + +bool RGWCORSRule::is_header_allowed(const char *h, size_t len) { + string hdr(h, len); + if(lowercase_allowed_hdrs.empty()) { + set::iterator iter; + for (iter = allowed_hdrs.begin(); iter != allowed_hdrs.end(); ++iter) { + 
lowercase_allowed_hdrs.insert(lowercase_http_attr(*iter)); + } + } + return is_string_in_set(lowercase_allowed_hdrs, lowercase_http_attr(hdr)); +} + +void RGWCORSRule::format_exp_headers(string& s) { + s = ""; + for (const auto& header : exposable_hdrs) { + if (s.length() > 0) + s.append(","); + // these values are sent to clients in a 'Access-Control-Expose-Headers' + // response header, so we escape '\n' to avoid header injection + boost::replace_all_copy(std::back_inserter(s), header, "\n", "\\n"); + } +} + +RGWCORSRule * RGWCORSConfiguration::host_name_rule(const char *origin) { + for(list::iterator it_r = rules.begin(); + it_r != rules.end(); ++it_r) { + RGWCORSRule& r = (*it_r); + if (r.is_origin_present(origin)) + return &r; + } + return NULL; +} + +void RGWCORSConfiguration::erase_host_name_rule(string& origin) { + bool rule_empty; + unsigned loop = 0; + /*Erase the host name from that rule*/ + dout(10) << "Num of rules : " << rules.size() << dendl; + for(list::iterator it_r = rules.begin(); + it_r != rules.end(); ++it_r, loop++) { + RGWCORSRule& r = (*it_r); + r.erase_origin_if_present(origin, &rule_empty); + dout(10) << "Origin:" << origin << ", rule num:" + << loop << ", emptying now:" << rule_empty << dendl; + if (rule_empty) { + rules.erase(it_r); + break; + } + } +} + +void RGWCORSConfiguration::dump() { + unsigned loop = 1; + unsigned num_rules = rules.size(); + dout(10) << "Number of rules: " << num_rules << dendl; + for(list::iterator it = rules.begin(); + it!= rules.end(); ++it, loop++) { + dout(10) << " <<<<<<< Rule " << loop << " >>>>>>> " << dendl; + (*it).dump_origins(); + } +} diff --git a/src/rgw/rgw_cors.h b/src/rgw/rgw_cors.h new file mode 100644 index 000000000..c7a2ed5bd --- /dev/null +++ b/src/rgw/rgw_cors.h @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is 
free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include + +#define RGW_CORS_GET 0x1 +#define RGW_CORS_PUT 0x2 +#define RGW_CORS_HEAD 0x4 +#define RGW_CORS_POST 0x8 +#define RGW_CORS_DELETE 0x10 +#define RGW_CORS_COPY 0x20 +#define RGW_CORS_ALL (RGW_CORS_GET | \ + RGW_CORS_PUT | \ + RGW_CORS_HEAD | \ + RGW_CORS_POST | \ + RGW_CORS_DELETE | \ + RGW_CORS_COPY) + +#define CORS_MAX_AGE_INVALID ((uint32_t)-1) + +class RGWCORSRule +{ +protected: + uint32_t max_age; + uint8_t allowed_methods; + std::string id; + std::set allowed_hdrs; /* If you change this, you need to discard lowercase_allowed_hdrs */ + std::set lowercase_allowed_hdrs; /* Not built until needed in RGWCORSRule::is_header_allowed */ + std::set allowed_origins; + std::list exposable_hdrs; + +public: + RGWCORSRule() : max_age(CORS_MAX_AGE_INVALID),allowed_methods(0) {} + RGWCORSRule(std::set& o, std::set& h, + std::list& e, uint8_t f, uint32_t a) + :max_age(a), + allowed_methods(f), + allowed_hdrs(h), + allowed_origins(o), + exposable_hdrs(e) {} + virtual ~RGWCORSRule() {} + + std::string& get_id() { return id; } + uint32_t get_max_age() { return max_age; } + uint8_t get_allowed_methods() { return allowed_methods; } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(max_age, bl); + encode(allowed_methods, bl); + encode(id, bl); + encode(allowed_hdrs, bl); + encode(allowed_origins, bl); + encode(exposable_hdrs, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(max_age, bl); + decode(allowed_methods, bl); + decode(id, bl); + decode(allowed_hdrs, bl); + decode(allowed_origins, bl); + decode(exposable_hdrs, bl); + DECODE_FINISH(bl); + } + bool has_wildcard_origin(); + bool is_origin_present(const char *o); + void 
format_exp_headers(std::string& s); + void erase_origin_if_present(std::string& origin, bool *rule_empty); + void dump_origins(); + void dump(Formatter *f) const; + bool is_header_allowed(const char *hdr, size_t len); +}; +WRITE_CLASS_ENCODER(RGWCORSRule) + +class RGWCORSConfiguration +{ + protected: + std::list rules; + public: + RGWCORSConfiguration() {} + ~RGWCORSConfiguration() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rules, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(rules, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + std::list& get_rules() { + return rules; + } + bool is_empty() { + return rules.empty(); + } + void get_origins_list(const char *origin, std::list& origins); + RGWCORSRule * host_name_rule(const char *origin); + void erase_host_name_rule(std::string& origin); + void dump(); + void stack_rule(RGWCORSRule& r) { + rules.push_front(r); + } +}; +WRITE_CLASS_ENCODER(RGWCORSConfiguration) + +static inline int validate_name_string(std::string_view o) { + if (o.length() == 0) + return -1; + if (o.find_first_of("*") != o.find_last_of("*")) + return -1; + return 0; +} + +static inline uint8_t get_cors_method_flags(const char *req_meth) { + uint8_t flags = 0; + + if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET; + else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST; + else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT; + else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE; + else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD; + + return flags; +} diff --git a/src/rgw/rgw_cors_s3.cc b/src/rgw/rgw_cors_s3.cc new file mode 100644 index 000000000..ba68487e2 --- /dev/null +++ b/src/rgw/rgw_cors_s3.cc @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 
2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_cors_s3.h" +#include "rgw_user.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWCORSRule_S3::to_xml(XMLFormatter& f) { + + f.open_object_section("CORSRule"); + /*ID if present*/ + if (id.length() > 0) { + f.dump_string("ID", id); + } + /*AllowedMethods*/ + if (allowed_methods & RGW_CORS_GET) + f.dump_string("AllowedMethod", "GET"); + if (allowed_methods & RGW_CORS_PUT) + f.dump_string("AllowedMethod", "PUT"); + if (allowed_methods & RGW_CORS_DELETE) + f.dump_string("AllowedMethod", "DELETE"); + if (allowed_methods & RGW_CORS_HEAD) + f.dump_string("AllowedMethod", "HEAD"); + if (allowed_methods & RGW_CORS_POST) + f.dump_string("AllowedMethod", "POST"); + if (allowed_methods & RGW_CORS_COPY) + f.dump_string("AllowedMethod", "COPY"); + /*AllowedOrigins*/ + for(set::iterator it = allowed_origins.begin(); + it != allowed_origins.end(); + ++it) { + string host = *it; + f.dump_string("AllowedOrigin", host); + } + /*AllowedHeader*/ + for(set::iterator it = allowed_hdrs.begin(); + it != allowed_hdrs.end(); ++it) { + f.dump_string("AllowedHeader", *it); + } + /*MaxAgeSeconds*/ + if (max_age != CORS_MAX_AGE_INVALID) { + f.dump_unsigned("MaxAgeSeconds", max_age); + } + /*ExposeHeader*/ + for(list::iterator it = exposable_hdrs.begin(); + it != exposable_hdrs.end(); ++it) { + f.dump_string("ExposeHeader", *it); + } + f.close_section(); +} + +bool RGWCORSRule_S3::xml_end(const char *el) { + XMLObjIter iter = find("AllowedMethod"); + XMLObj *obj; + /*Check all the allowedmethods*/ + obj = iter.get_next(); + if (obj) { + for( ; obj; obj = iter.get_next()) { + const char *s = 
obj->get_data().c_str(); + ldpp_dout(dpp, 10) << "RGWCORSRule::xml_end, el : " << el << ", data : " << s << dendl; + if (strcasecmp(s, "GET") == 0) { + allowed_methods |= RGW_CORS_GET; + } else if (strcasecmp(s, "POST") == 0) { + allowed_methods |= RGW_CORS_POST; + } else if (strcasecmp(s, "DELETE") == 0) { + allowed_methods |= RGW_CORS_DELETE; + } else if (strcasecmp(s, "HEAD") == 0) { + allowed_methods |= RGW_CORS_HEAD; + } else if (strcasecmp(s, "PUT") == 0) { + allowed_methods |= RGW_CORS_PUT; + } else if (strcasecmp(s, "COPY") == 0) { + allowed_methods |= RGW_CORS_COPY; + } else { + return false; + } + } + } + /*Check the id's len, it should be less than 255*/ + XMLObj *xml_id = find_first("ID"); + if (xml_id != NULL) { + string data = xml_id->get_data(); + if (data.length() > 255) { + ldpp_dout(dpp, 0) << "RGWCORSRule has id of length greater than 255" << dendl; + return false; + } + ldpp_dout(dpp, 10) << "RGWCORRule id : " << data << dendl; + id = data; + } + /*Check if there is atleast one AllowedOrigin*/ + iter = find("AllowedOrigin"); + if (!(obj = iter.get_next())) { + ldpp_dout(dpp, 0) << "RGWCORSRule does not have even one AllowedOrigin" << dendl; + return false; + } + for( ; obj; obj = iter.get_next()) { + ldpp_dout(dpp, 10) << "RGWCORSRule - origin : " << obj->get_data() << dendl; + /*Just take the hostname*/ + string host = obj->get_data(); + if (validate_name_string(host) != 0) + return false; + allowed_origins.insert(allowed_origins.end(), host); + } + /*Check of max_age*/ + iter = find("MaxAgeSeconds"); + if ((obj = iter.get_next())) { + char *end = NULL; + + unsigned long long ull = strtoull(obj->get_data().c_str(), &end, 10); + if (*end != '\0') { + ldpp_dout(dpp, 0) << "RGWCORSRule's MaxAgeSeconds " << obj->get_data() << " is an invalid integer" << dendl; + return false; + } + if (ull >= 0x100000000ull) { + max_age = CORS_MAX_AGE_INVALID; + } else { + max_age = (uint32_t)ull; + } + ldpp_dout(dpp, 10) << "RGWCORSRule : max_age : " << max_age << 
dendl; + } + /*Check and update ExposeHeader*/ + iter = find("ExposeHeader"); + if ((obj = iter.get_next())) { + for(; obj; obj = iter.get_next()) { + ldpp_dout(dpp, 10) << "RGWCORSRule - exp_hdr : " << obj->get_data() << dendl; + exposable_hdrs.push_back(obj->get_data()); + } + } + /*Check and update AllowedHeader*/ + iter = find("AllowedHeader"); + if ((obj = iter.get_next())) { + for(; obj; obj = iter.get_next()) { + ldpp_dout(dpp, 10) << "RGWCORSRule - allowed_hdr : " << obj->get_data() << dendl; + string s = obj->get_data(); + if (validate_name_string(s) != 0) + return false; + allowed_hdrs.insert(allowed_hdrs.end(), s); + } + } + return true; +} + +void RGWCORSConfiguration_S3::to_xml(ostream& out) { + XMLFormatter f; + f.open_object_section_in_ns("CORSConfiguration", XMLNS_AWS_S3); + for(list::iterator it = rules.begin(); + it != rules.end(); ++it) { + (static_cast(*it)).to_xml(f); + } + f.close_section(); + f.flush(out); +} + +bool RGWCORSConfiguration_S3::xml_end(const char *el) { + XMLObjIter iter = find("CORSRule"); + RGWCORSRule_S3 *obj; + if (!(obj = static_cast(iter.get_next()))) { + ldpp_dout(dpp, 0) << "CORSConfiguration should have atleast one CORSRule" << dendl; + return false; + } + for(; obj; obj = static_cast(iter.get_next())) { + rules.push_back(*obj); + } + return true; +} + +class CORSRuleID_S3 : public XMLObj { + public: + CORSRuleID_S3() {} + ~CORSRuleID_S3() override {} +}; + +class CORSRuleAllowedOrigin_S3 : public XMLObj { + public: + CORSRuleAllowedOrigin_S3() {} + ~CORSRuleAllowedOrigin_S3() override {} +}; + +class CORSRuleAllowedMethod_S3 : public XMLObj { + public: + CORSRuleAllowedMethod_S3() {} + ~CORSRuleAllowedMethod_S3() override {} +}; + +class CORSRuleAllowedHeader_S3 : public XMLObj { + public: + CORSRuleAllowedHeader_S3() {} + ~CORSRuleAllowedHeader_S3() override {} +}; + +class CORSRuleMaxAgeSeconds_S3 : public XMLObj { + public: + CORSRuleMaxAgeSeconds_S3() {} + ~CORSRuleMaxAgeSeconds_S3() override {} +}; + +class 
CORSRuleExposeHeader_S3 : public XMLObj { + public: + CORSRuleExposeHeader_S3() {} + ~CORSRuleExposeHeader_S3() override {} +}; + +XMLObj *RGWCORSXMLParser_S3::alloc_obj(const char *el) { + if (strcmp(el, "CORSConfiguration") == 0) { + return new RGWCORSConfiguration_S3(dpp); + } else if (strcmp(el, "CORSRule") == 0) { + return new RGWCORSRule_S3(dpp); + } else if (strcmp(el, "ID") == 0) { + return new CORSRuleID_S3; + } else if (strcmp(el, "AllowedOrigin") == 0) { + return new CORSRuleAllowedOrigin_S3; + } else if (strcmp(el, "AllowedMethod") == 0) { + return new CORSRuleAllowedMethod_S3; + } else if (strcmp(el, "AllowedHeader") == 0) { + return new CORSRuleAllowedHeader_S3; + } else if (strcmp(el, "MaxAgeSeconds") == 0) { + return new CORSRuleMaxAgeSeconds_S3; + } else if (strcmp(el, "ExposeHeader") == 0) { + return new CORSRuleExposeHeader_S3; + } + return NULL; +} + diff --git a/src/rgw/rgw_cors_s3.h b/src/rgw/rgw_cors_s3.h new file mode 100644 index 000000000..8d92a3c5f --- /dev/null +++ b/src/rgw/rgw_cors_s3.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include "rgw_xml.h" +#include "rgw_cors.h" + +class RGWCORSRule_S3 : public RGWCORSRule, public XMLObj +{ + const DoutPrefixProvider *dpp; + public: + RGWCORSRule_S3(const DoutPrefixProvider *dpp) : dpp(dpp) {} + ~RGWCORSRule_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(XMLFormatter& f); +}; + +class RGWCORSConfiguration_S3 : public RGWCORSConfiguration, public XMLObj +{ + const DoutPrefixProvider *dpp; + public: + RGWCORSConfiguration_S3(const DoutPrefixProvider *dpp) : dpp(dpp) {} + ~RGWCORSConfiguration_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(std::ostream& out); +}; + +class RGWCORSXMLParser_S3 : public RGWXMLParser +{ + const DoutPrefixProvider *dpp; + CephContext *cct; + + XMLObj *alloc_obj(const char *el) override; +public: + explicit RGWCORSXMLParser_S3(const DoutPrefixProvider *_dpp, CephContext *_cct) : dpp(_dpp), cct(_cct) {} +}; diff --git a/src/rgw/rgw_cors_swift.h b/src/rgw/rgw_cors_swift.h new file mode 100644 index 000000000..f5a1b14a0 --- /dev/null +++ b/src/rgw/rgw_cors_swift.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "rgw_cors.h" + +class RGWCORSConfiguration_SWIFT : public RGWCORSConfiguration +{ + public: + RGWCORSConfiguration_SWIFT() {} + ~RGWCORSConfiguration_SWIFT() {} + int create_update(const char *allow_origins, const char *allow_headers, + const char *expose_headers, const char *max_age) { + std::set o, h; + std::list e; + unsigned long a = CORS_MAX_AGE_INVALID; + uint8_t flags = RGW_CORS_ALL; + + int nr_invalid_names = 0; + auto add_host = [&nr_invalid_names, &o] (auto host) { + if (validate_name_string(host) == 0) { + o.emplace(std::string{host}); + } else { + nr_invalid_names++; + } + }; + for_each_substr(allow_origins, ";,= \t", add_host); + if (o.empty() || nr_invalid_names > 0) { + return -EINVAL; + } + + if (allow_headers) { + int nr_invalid_headers = 0; + auto add_header = [&nr_invalid_headers, &h] (auto allow_header) { + if (validate_name_string(allow_header) == 0) { + h.emplace(std::string{allow_header}); + } else { + nr_invalid_headers++; + } + }; + for_each_substr(allow_headers, ";,= \t", add_header); + if (h.empty() || nr_invalid_headers > 0) { + return -EINVAL; + } + } + + if (expose_headers) { + for_each_substr(expose_headers, ";,= \t", + [&e] (auto expose_header) { + e.emplace_back(std::string(expose_header)); + }); + } + if (max_age) { + char *end = NULL; + a = strtoul(max_age, &end, 10); + if (a == ULONG_MAX) + a = CORS_MAX_AGE_INVALID; + } + + RGWCORSRule rule(o, h, e, flags, a); + stack_rule(rule); + return 0; + } +}; diff --git a/src/rgw/rgw_cr_rest.cc b/src/rgw/rgw_cr_rest.cc new file mode 100644 index 000000000..04920a155 --- /dev/null +++ b/src/rgw/rgw_cr_rest.cc @@ -0,0 +1,351 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_cr_rest.h" + +#include "rgw_coroutine.h" + +// re-include our assert to clobber the system one; fix dout: +#include "include/ceph_assert.h" + +#include + 
+#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWCRHTTPGetDataCB::RGWCRHTTPGetDataCB(RGWCoroutinesEnv *_env, RGWCoroutine *_cr, RGWHTTPStreamRWRequest *_req) : env(_env), cr(_cr), req(_req) { + io_id = req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_READ |RGWHTTPClient::HTTPCLIENT_IO_CONTROL); + req->set_in_cb(this); +} + +#define GET_DATA_WINDOW_SIZE 2 * 1024 * 1024 + +int RGWCRHTTPGetDataCB::handle_data(bufferlist& bl, bool *pause) { + if (data.length() < GET_DATA_WINDOW_SIZE / 2) { + notified = false; + } + + { + uint64_t bl_len = bl.length(); + + std::lock_guard l{lock}; + + if (!got_all_extra_data) { + uint64_t max = extra_data_len - extra_data.length(); + if (max > bl_len) { + max = bl_len; + } + bl.splice(0, max, &extra_data); + bl_len -= max; + got_all_extra_data = extra_data.length() == extra_data_len; + } + + data.append(bl); + } + + uint64_t data_len = data.length(); + if (data_len >= GET_DATA_WINDOW_SIZE && !notified) { + notified = true; + env->manager->io_complete(cr, io_id); + } + if (data_len >= 2 * GET_DATA_WINDOW_SIZE) { + *pause = true; + paused = true; + } + return 0; +} + +void RGWCRHTTPGetDataCB::claim_data(bufferlist *dest, uint64_t max) { + bool need_to_unpause = false; + + { + std::lock_guard l{lock}; + + if (data.length() == 0) { + return; + } + + if (data.length() < max) { + max = data.length(); + } + + data.splice(0, max, dest); + need_to_unpause = (paused && data.length() <= GET_DATA_WINDOW_SIZE); + } + + if (need_to_unpause) { + req->unpause_receive(); + } +} + +RGWStreamReadHTTPResourceCRF::~RGWStreamReadHTTPResourceCRF() +{ + if (req) { + req->cancel(); + req->wait(null_yield); + delete req; + } +} + +int RGWStreamReadHTTPResourceCRF::init(const DoutPrefixProvider *dpp) +{ + env->stack->init_new_io(req); + + in_cb.emplace(env, caller, req); + + int r = req->send(http_manager); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWStreamWriteHTTPResourceCRF::send() +{ + 
env->stack->init_new_io(req); + + req->set_write_drain_cb(&write_drain_notify_cb); + + int r = req->send(http_manager); + if (r < 0) { + return r; + } + + return 0; +} + +bool RGWStreamReadHTTPResourceCRF::has_attrs() +{ + return got_attrs; +} + +void RGWStreamReadHTTPResourceCRF::get_attrs(std::map *attrs) +{ + req->get_out_headers(attrs); +} + +int RGWStreamReadHTTPResourceCRF::decode_rest_obj(const DoutPrefixProvider *dpp, map& headers, bufferlist& extra_data) { + /* basic generic implementation */ + for (auto header : headers) { + const string& val = header.second; + + rest_obj.attrs[header.first] = val; + } + + return 0; +} + +int RGWStreamReadHTTPResourceCRF::read(const DoutPrefixProvider *dpp, bufferlist *out, uint64_t max_size, bool *io_pending) +{ + reenter(&read_state) { + io_read_mask = req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_READ | RGWHTTPClient::HTTPCLIENT_IO_CONTROL); + while (!req->is_done() || + in_cb->has_data()) { + *io_pending = true; + if (!in_cb->has_data()) { + yield caller->io_block(0, io_read_mask); + } + got_attrs = true; + if (need_extra_data() && !got_extra_data) { + if (!in_cb->has_all_extra_data()) { + continue; + } + extra_data.claim_append(in_cb->get_extra_data()); + map attrs; + req->get_out_headers(&attrs); + int ret = decode_rest_obj(dpp, attrs, extra_data); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << " decode_rest_obj() returned ret=" << ret << dendl; + return ret; + } + got_extra_data = true; + } + *io_pending = false; + in_cb->claim_data(out, max_size); + if (out->length() == 0) { + /* this may happen if we just read the prepended extra_data and didn't have any data + * after. In that case, retry reading, so that caller doesn't assume it's EOF. 
+ */ + continue; + } + if (!req->is_done() || out->length() >= max_size) { + yield; + } + } + } + return 0; +} + +bool RGWStreamReadHTTPResourceCRF::is_done() +{ + return req->is_done(); +} + +RGWStreamWriteHTTPResourceCRF::~RGWStreamWriteHTTPResourceCRF() +{ + if (req) { + req->cancel(); + req->wait(null_yield); + delete req; + } +} + +void RGWStreamWriteHTTPResourceCRF::send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) +{ + req->set_send_length(rest_obj.content_len); + for (auto h : rest_obj.attrs) { + req->append_header(h.first, h.second); + } +} + +#define PENDING_WRITES_WINDOW (1 * 1024 * 1024) + +void RGWStreamWriteHTTPResourceCRF::write_drain_notify(uint64_t pending_size) +{ + lock_guard l(blocked_lock); + if (is_blocked && (pending_size < PENDING_WRITES_WINDOW / 2)) { + env->manager->io_complete(caller, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_WRITE | RGWHTTPClient::HTTPCLIENT_IO_CONTROL)); + is_blocked = false; + } +} + +void RGWStreamWriteHTTPResourceCRF::WriteDrainNotify::notify(uint64_t pending_size) +{ + crf->write_drain_notify(pending_size); +} + +int RGWStreamWriteHTTPResourceCRF::write(bufferlist& data, bool *io_pending) +{ + reenter(&write_state) { + while (!req->is_done()) { + *io_pending = false; + if (req->get_pending_send_size() >= PENDING_WRITES_WINDOW) { + *io_pending = true; + { + lock_guard l(blocked_lock); + is_blocked = true; + + /* it's ok to unlock here, even if io_complete() arrives before io_block(), it'll wakeup + * correctly */ + } + yield caller->io_block(0, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_WRITE | RGWHTTPClient::HTTPCLIENT_IO_CONTROL)); + } + yield req->add_send_data(data); + } + return req->get_status(); + } + return 0; +} + +int RGWStreamWriteHTTPResourceCRF::drain_writes(bool *need_retry) +{ + reenter(&drain_state) { + *need_retry = true; + yield req->finish_write(); + *need_retry = !req->is_done(); + while (!req->is_done()) { + yield caller->io_block(0, 
req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_CONTROL)); + *need_retry = !req->is_done(); + } + + map headers; + req->get_out_headers(&headers); + handle_headers(headers); + + return req->get_req_retcode(); + } + return 0; +} + +RGWStreamSpliceCR::RGWStreamSpliceCR(CephContext *_cct, RGWHTTPManager *_mgr, + shared_ptr& _in_crf, + shared_ptr& _out_crf) : RGWCoroutine(_cct), cct(_cct), http_manager(_mgr), + in_crf(_in_crf), out_crf(_out_crf) {} +RGWStreamSpliceCR::~RGWStreamSpliceCR() { } + +int RGWStreamSpliceCR::operate(const DoutPrefixProvider *dpp) { + reenter(this) { + { + int ret = in_crf->init(dpp); + if (ret < 0) { + return set_cr_error(ret); + } + } + + do { + + bl.clear(); + + do { + yield { + ret = in_crf->read(dpp, &bl, 4 * 1024 * 1024, &need_retry); + if (ret < 0) { + return set_cr_error(ret); + } + } + + if (retcode < 0) { + ldout(cct, 20) << __func__ << ": in_crf->read() retcode=" << retcode << dendl; + return set_cr_error(ret); + } + } while (need_retry); + + ldout(cct, 20) << "read " << bl.length() << " bytes" << dendl; + + if (!in_crf->has_attrs()) { + assert (bl.length() == 0); + continue; + } + + if (!sent_attrs) { + int ret = out_crf->init(); + if (ret < 0) { + return set_cr_error(ret); + } + out_crf->send_ready(dpp, in_crf->get_rest_obj()); + ret = out_crf->send(); + if (ret < 0) { + return set_cr_error(ret); + } + sent_attrs = true; + } + + if (bl.length() == 0 && in_crf->is_done()) { + break; + } + + total_read += bl.length(); + + do { + yield { + ldout(cct, 20) << "writing " << bl.length() << " bytes" << dendl; + ret = out_crf->write(bl, &need_retry); + if (ret < 0) { + return set_cr_error(ret); + } + } + + if (retcode < 0) { + ldout(cct, 20) << __func__ << ": out_crf->write() retcode=" << retcode << dendl; + return set_cr_error(ret); + } + } while (need_retry); + } while (true); + + do { + yield { + int ret = out_crf->drain_writes(&need_retry); + if (ret < 0) { + return set_cr_error(ret); + } + } + } while (need_retry); + + return 
set_cr_done(); + } + return 0; +} + diff --git a/src/rgw/rgw_cr_rest.h b/src/rgw/rgw_cr_rest.h new file mode 100644 index 000000000..ba47c3dd6 --- /dev/null +++ b/src/rgw/rgw_cr_rest.h @@ -0,0 +1,590 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include "include/ceph_assert.h" // boost header clobbers our assert.h + +#include "rgw_coroutine.h" +#include "rgw_rest_conn.h" + + +struct rgw_rest_obj { + rgw_obj_key key; + uint64_t content_len; + std::map attrs; + std::map custom_attrs; + RGWAccessControlPolicy acls; + + void init(const rgw_obj_key& _key) { + key = _key; + } +}; + +class RGWReadRawRESTResourceCR : public RGWSimpleCoroutine { + bufferlist *result; + protected: + RGWRESTConn *conn; + RGWHTTPManager *http_manager; + std::string path; + param_vec_t params; + param_vec_t extra_headers; +public: + boost::intrusive_ptr http_op; + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const std::string& _path, + rgw_http_param_pair *params, bufferlist *_result) + : RGWSimpleCoroutine(_cct), result(_result), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)) + {} + + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const std::string& _path, + rgw_http_param_pair *params) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)) + {} + + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const std::string& _path, + rgw_http_param_pair *params, param_vec_t &hdrs) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)), + extra_headers(hdrs) + {} + + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const std::string& _path, 
+ rgw_http_param_pair *params, + std::map *hdrs) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)), + extra_headers(make_param_list(hdrs)) + {} + + + ~RGWReadRawRESTResourceCR() override { + request_cleanup(); + } + + int send_request(const DoutPrefixProvider *dpp) override { + auto op = boost::intrusive_ptr( + new RGWRESTReadResource(conn, path, params, &extra_headers, http_manager)); + + init_new_io(op.get()); + + int ret = op->aio_read(dpp); + if (ret < 0) { + log_error() << "failed to send http operation: " << op->to_str() + << " ret=" << ret << std::endl; + op->put(); + return ret; + } + std::swap(http_op, op); // store reference in http_op on success + return 0; + } + + + + virtual int wait_result() { + return http_op->wait(result, null_yield); + } + + int request_complete() override { + int ret; + + ret = wait_result(); + + auto op = std::move(http_op); // release ref on return + if (ret < 0) { + error_stream << "http operation failed: " << op->to_str() + << " status=" << op->get_http_status() << std::endl; + op->put(); + return ret; + } + op->put(); + return 0; + } + + void request_cleanup() override { + if (http_op) { + http_op->put(); + http_op = NULL; + } + } + +}; + + +template +class RGWReadRESTResourceCR : public RGWReadRawRESTResourceCR { + T *result; + public: + RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const std::string& _path, + rgw_http_param_pair *params, T *_result) + : RGWReadRawRESTResourceCR(_cct, _conn, _http_manager, _path, params), result(_result) + {} + + RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const std::string& _path, + rgw_http_param_pair *params, + std::map *hdrs, + T *_result) + : RGWReadRawRESTResourceCR(_cct, _conn, _http_manager, _path, params, hdrs), result(_result) + {} + + int wait_result() override { + return http_op->wait(result, null_yield); + } + +}; 
+ +template +class RGWSendRawRESTResourceCR: public RGWSimpleCoroutine { + protected: + RGWRESTConn *conn; + RGWHTTPManager *http_manager; + std::string method; + std::string path; + param_vec_t params; + param_vec_t headers; + std::map *attrs; + T *result; + E *err_result; + bufferlist input_bl; + bool send_content_length=false; + boost::intrusive_ptr http_op; + + public: + RGWSendRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _method, const std::string& _path, + rgw_http_param_pair *_params, + std::map *_attrs, + bufferlist& _input, T *_result, + bool _send_content_length, + E *_err_result = nullptr) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + method(_method), path(_path), params(make_param_list(_params)), + headers(make_param_list(_attrs)), attrs(_attrs), + result(_result), err_result(_err_result), + input_bl(_input), send_content_length(_send_content_length) {} + + RGWSendRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _method, const std::string& _path, + rgw_http_param_pair *_params, std::map *_attrs, + T *_result, E *_err_result = nullptr) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + method(_method), path(_path), params(make_param_list(_params)), headers(make_param_list(_attrs)), attrs(_attrs), result(_result), + err_result(_err_result) {} + + ~RGWSendRawRESTResourceCR() override { + request_cleanup(); + } + + int send_request(const DoutPrefixProvider *dpp) override { + auto op = boost::intrusive_ptr( + new RGWRESTSendResource(conn, method, path, params, &headers, http_manager)); + + init_new_io(op.get()); + + int ret = op->aio_send(dpp, input_bl); + if (ret < 0) { + ldpp_subdout(dpp, rgw, 0) << "ERROR: failed to send request" << dendl; + op->put(); + return ret; + } + std::swap(http_op, op); // store reference in http_op on success + return 0; + } + + int request_complete() 
override { + int ret; + if (result || err_result) { + ret = http_op->wait(result, null_yield, err_result); + } else { + bufferlist bl; + ret = http_op->wait(&bl, null_yield); + } + auto op = std::move(http_op); // release ref on return + if (ret < 0) { + error_stream << "http operation failed: " << op->to_str() + << " status=" << op->get_http_status() << std::endl; + lsubdout(cct, rgw, 5) << "failed to wait for op, ret=" << ret + << ": " << op->to_str() << dendl; + op->put(); + return ret; + } + op->put(); + return 0; + } + + void request_cleanup() override { + if (http_op) { + http_op->put(); + http_op = NULL; + } + } +}; + +template +class RGWSendRESTResourceCR : public RGWSendRawRESTResourceCR { + public: + RGWSendRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _method, const std::string& _path, + rgw_http_param_pair *_params, std::map *_attrs, + S& _input, T *_result, E *_err_result = nullptr) + : RGWSendRawRESTResourceCR(_cct, _conn, _http_manager, _method, _path, _params, _attrs, _result, _err_result) { + + JSONFormatter jf; + encode_json("data", _input, &jf); + std::stringstream ss; + jf.flush(ss); + //bufferlist bl; + this->input_bl.append(ss.str()); + } + +}; + +template +class RGWPostRESTResourceCR : public RGWSendRESTResourceCR { +public: + RGWPostRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _path, + rgw_http_param_pair *_params, S& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "POST", _path, + _params, nullptr, _input, + _result, _err_result) {} +}; + +template +class RGWPutRawRESTResourceCR: public RGWSendRawRESTResourceCR { + public: + RGWPutRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _path, + rgw_http_param_pair *_params, bufferlist& _input, + T *_result, E *_err_result = nullptr) + : 
RGWSendRawRESTResourceCR(_cct, _conn, _http_manager, "PUT", _path, + _params, nullptr, _input, _result, true, _err_result) {} + +}; + +template +class RGWPostRawRESTResourceCR: public RGWSendRawRESTResourceCR { + public: + RGWPostRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _path, + rgw_http_param_pair *_params, + std::map * _attrs, + bufferlist& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRawRESTResourceCR(_cct, _conn, _http_manager, "POST", _path, + _params, _attrs, _input, _result, true, _err_result) {} + +}; + + +template +class RGWPutRESTResourceCR : public RGWSendRESTResourceCR { +public: + RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _path, + rgw_http_param_pair *_params, S& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "PUT", _path, + _params, nullptr, _input, + _result, _err_result) {} + + RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _path, + rgw_http_param_pair *_params, + std::map *_attrs, + S& _input, T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "PUT", _path, + _params, _attrs, _input, + _result, _err_result) {} + +}; + +class RGWDeleteRESTResourceCR : public RGWSimpleCoroutine { + RGWRESTConn *conn; + RGWHTTPManager *http_manager; + std::string path; + param_vec_t params; + + boost::intrusive_ptr http_op; + +public: + RGWDeleteRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const std::string& _path, + rgw_http_param_pair *_params) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(_params)) + {} + + ~RGWDeleteRESTResourceCR() override { + request_cleanup(); + } + + int send_request(const DoutPrefixProvider *dpp) override { + auto op = 
boost::intrusive_ptr( + new RGWRESTDeleteResource(conn, path, params, nullptr, http_manager)); + + init_new_io(op.get()); + + bufferlist bl; + + int ret = op->aio_send(dpp, bl); + if (ret < 0) { + ldpp_subdout(dpp, rgw, 0) << "ERROR: failed to send DELETE request" << dendl; + op->put(); + return ret; + } + std::swap(http_op, op); // store reference in http_op on success + return 0; + } + + int request_complete() override { + int ret; + bufferlist bl; + ret = http_op->wait(&bl, null_yield); + auto op = std::move(http_op); // release ref on return + if (ret < 0) { + error_stream << "http operation failed: " << op->to_str() + << " status=" << op->get_http_status() << std::endl; + lsubdout(cct, rgw, 5) << "failed to wait for op, ret=" << ret + << ": " << op->to_str() << dendl; + op->put(); + return ret; + } + op->put(); + return 0; + } + + void request_cleanup() override { + if (http_op) { + http_op->put(); + http_op = NULL; + } + } +}; + +class RGWCRHTTPGetDataCB : public RGWHTTPStreamRWRequest::ReceiveCB { + ceph::mutex lock = ceph::make_mutex("RGWCRHTTPGetDataCB"); + RGWCoroutinesEnv *env; + RGWCoroutine *cr; + RGWHTTPStreamRWRequest *req; + rgw_io_id io_id; + bufferlist data; + bufferlist extra_data; + bool got_all_extra_data{false}; + bool paused{false}; + bool notified{false}; +public: + RGWCRHTTPGetDataCB(RGWCoroutinesEnv *_env, RGWCoroutine *_cr, RGWHTTPStreamRWRequest *_req); + + int handle_data(bufferlist& bl, bool *pause) override; + + void claim_data(bufferlist *dest, uint64_t max); + + bufferlist& get_extra_data() { + return extra_data; + } + + bool has_data() { + return (data.length() > 0); + } + + bool has_all_extra_data() { + return got_all_extra_data; + } +}; + + +class RGWStreamReadResourceCRF { +protected: + boost::asio::coroutine read_state; + +public: + virtual int init(const DoutPrefixProvider *dpp) = 0; + virtual int read(const DoutPrefixProvider *dpp, bufferlist *data, uint64_t max, bool *need_retry) = 0; /* reentrant */ + virtual int 
decode_rest_obj(const DoutPrefixProvider *dpp, std::map& headers, bufferlist& extra_data) = 0; + virtual bool has_attrs() = 0; + virtual void get_attrs(std::map *attrs) = 0; + virtual ~RGWStreamReadResourceCRF() = default; +}; + +class RGWStreamWriteResourceCRF { +protected: + boost::asio::coroutine write_state; + boost::asio::coroutine drain_state; + +public: + virtual int init() = 0; + virtual void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) = 0; + virtual int send() = 0; + virtual int write(bufferlist& data, bool *need_retry) = 0; /* reentrant */ + virtual int drain_writes(bool *need_retry) = 0; /* reentrant */ + + virtual ~RGWStreamWriteResourceCRF() = default; +}; + +class RGWStreamReadHTTPResourceCRF : public RGWStreamReadResourceCRF { + CephContext *cct; + RGWCoroutinesEnv *env; + RGWCoroutine *caller; + RGWHTTPManager *http_manager; + + RGWHTTPStreamRWRequest *req{nullptr}; + + std::optional in_cb; + + bufferlist extra_data; + + bool got_attrs{false}; + bool got_extra_data{false}; + + rgw_io_id io_read_mask; + +protected: + rgw_rest_obj rest_obj; + + struct range_info { + bool is_set{false}; + uint64_t ofs; + uint64_t size; + } range; + + ceph::real_time mtime; + std::string etag; + +public: + RGWStreamReadHTTPResourceCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWHTTPManager *_http_manager, + const rgw_obj_key& _src_key) : cct(_cct), + env(_env), + caller(_caller), + http_manager(_http_manager) { + rest_obj.init(_src_key); + } + ~RGWStreamReadHTTPResourceCRF(); + + int init(const DoutPrefixProvider *dpp) override; + int read(const DoutPrefixProvider *dpp, bufferlist *data, uint64_t max, bool *need_retry) override; /* reentrant */ + int decode_rest_obj(const DoutPrefixProvider *dpp, std::map& headers, bufferlist& extra_data) override; + bool has_attrs() override; + void get_attrs(std::map *attrs) override; + bool is_done(); + virtual bool need_extra_data() { return false; } + + void 
set_req(RGWHTTPStreamRWRequest *r) { + req = r; + } + + rgw_rest_obj& get_rest_obj() { + return rest_obj; + } + + void set_range(uint64_t ofs, uint64_t size) { + range.is_set = true; + range.ofs = ofs; + range.size = size; + } +}; + +class RGWStreamWriteHTTPResourceCRF : public RGWStreamWriteResourceCRF { +protected: + RGWCoroutinesEnv *env; + RGWCoroutine *caller; + RGWHTTPManager *http_manager; + + using lock_guard = std::lock_guard; + + std::mutex blocked_lock; + bool is_blocked; + + RGWHTTPStreamRWRequest *req{nullptr}; + + struct multipart_info { + bool is_multipart{false}; + std::string upload_id; + int part_num{0}; + uint64_t part_size; + } multipart; + + class WriteDrainNotify : public RGWWriteDrainCB { + RGWStreamWriteHTTPResourceCRF *crf; + public: + explicit WriteDrainNotify(RGWStreamWriteHTTPResourceCRF *_crf) : crf(_crf) {} + void notify(uint64_t pending_size) override; + } write_drain_notify_cb; + +public: + RGWStreamWriteHTTPResourceCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWHTTPManager *_http_manager) : env(_env), + caller(_caller), + http_manager(_http_manager), + write_drain_notify_cb(this) {} + virtual ~RGWStreamWriteHTTPResourceCRF(); + + int init() override { + return 0; + } + void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) override; + int send() override; + int write(bufferlist& data, bool *need_retry) override; /* reentrant */ + void write_drain_notify(uint64_t pending_size); + int drain_writes(bool *need_retry) override; /* reentrant */ + + virtual void handle_headers(const std::map& headers) {} + + void set_req(RGWHTTPStreamRWRequest *r) { + req = r; + } + + void set_multipart(const std::string& upload_id, int part_num, uint64_t part_size) { + multipart.is_multipart = true; + multipart.upload_id = upload_id; + multipart.part_num = part_num; + multipart.part_size = part_size; + } +}; + +class RGWStreamSpliceCR : public RGWCoroutine { + CephContext *cct; + RGWHTTPManager 
*http_manager; + std::string url; + std::shared_ptr in_crf; + std::shared_ptr out_crf; + bufferlist bl; + bool need_retry{false}; + bool sent_attrs{false}; + uint64_t total_read{0}; + int ret{0}; +public: + RGWStreamSpliceCR(CephContext *_cct, RGWHTTPManager *_mgr, + std::shared_ptr& _in_crf, + std::shared_ptr& _out_crf); + ~RGWStreamSpliceCR(); + + int operate(const DoutPrefixProvider *dpp) override; +}; diff --git a/src/rgw/rgw_crypt.cc b/src/rgw/rgw_crypt.cc new file mode 100644 index 000000000..69b1b8bc6 --- /dev/null +++ b/src/rgw/rgw_crypt.cc @@ -0,0 +1,1537 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/** + * Crypto filters for Put/Post/Get operations. + */ + +#include + +#include +#include +#include +#include +#include +#include "include/ceph_assert.h" +#include "crypto/crypto_accel.h" +#include "crypto/crypto_plugin.h" +#include "rgw/rgw_kms.h" +#include "rapidjson/document.h" +#include "rapidjson/writer.h" +#include "rapidjson/error/error.h" +#include "rapidjson/error/en.h" +#include // libicu + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw; + +template +class canonical_char_sorter { +private: + const DoutPrefixProvider *dpp; + const icu::Normalizer2* normalizer; + CephContext *cct; +public: + canonical_char_sorter(const DoutPrefixProvider *dpp, CephContext *cct) : dpp(dpp), cct(cct) { + UErrorCode status = U_ZERO_ERROR; + normalizer = icu::Normalizer2::getNFCInstance(status); + if (U_FAILURE(status)) { + ldpp_dout(this->dpp, -1) << "ERROR: can't get nfc instance, error = " << status << dendl; + normalizer = 0; + } + } + bool compare_helper (const M *, const M *); + bool make_string_canonical(rapidjson::Value &, + rapidjson::Document::AllocatorType&); +}; + +template +bool +canonical_char_sorter::compare_helper (const M*a, const M*b) +{ + UErrorCode status = U_ZERO_ERROR; + const std::string 
as{a->name.GetString(), a->name.GetStringLength()}, + bs{b->name.GetString(), b->name.GetStringLength()}; + icu::UnicodeString aw{icu::UnicodeString::fromUTF8(as)}, bw{icu::UnicodeString::fromUTF8(bs)}; + int32_t afl{ aw.countChar32()}, bfl{bw.countChar32()}; + std::u32string af, bf; + af.resize(afl); bf.resize(bfl); + auto *astr{af.c_str()}, *bstr{bf.c_str()}; + aw.toUTF32((int32_t*)astr, afl, status); + bw.toUTF32((int32_t*)bstr, bfl, status); + bool r{af < bf}; + return r; +} + +template +bool +canonical_char_sorter::make_string_canonical (rapidjson::Value &v, rapidjson::Document::AllocatorType&a) +{ + UErrorCode status = U_ZERO_ERROR; + const std::string as{v.GetString(), v.GetStringLength()}; + + if (!normalizer) + return false; + const icu::UnicodeString aw{icu::UnicodeString::fromUTF8(as)}; + icu::UnicodeString an{normalizer->normalize(aw, status)}; + if (U_FAILURE(status)) { + ldpp_dout(this->dpp, 5) << "conversion error; code=" << status << + " on string " << as << dendl; + return false; + } + std::string ans; + an.toUTF8String(ans); + v.SetString(ans.c_str(), ans.length(), a); + return true; +} + +typedef +rapidjson::GenericMember, rapidjson::MemoryPoolAllocator<> > +MyMember; + +template +bool +sort_and_write(rapidjson::Value &d, H &writer, canonical_char_sorter& ccs) +{ + bool r; + switch(d.GetType()) { + case rapidjson::kObjectType: { + struct comparer { + canonical_char_sorter &r; + comparer(canonical_char_sorter &r) : r(r) {}; + bool operator()(const MyMember*a, const MyMember*b) { + return r.compare_helper(a,b); + } + } cmp_functor{ccs}; + if (!(r = writer.StartObject())) + break; + std::vector q; + for (auto &m: d.GetObject()) + q.push_back(&m); + std::sort(q.begin(), q.end(), cmp_functor); + for (auto m: q) { + assert(m->name.IsString()); + if (!(r = writer.Key(m->name.GetString(), m->name.GetStringLength()))) + goto Done; + if (!(r = sort_and_write(m->value, writer, ccs))) + goto Done; + } + r = writer.EndObject(); + break; } + case 
rapidjson::kArrayType: + if (!(r = writer.StartArray())) + break; + for (auto &v: d.GetArray()) { + if (!(r = sort_and_write(v, writer, ccs))) + goto Done; + } + r = writer.EndArray(); + break; + default: + r = d.Accept(writer); + break; + } +Done: + return r; +} + +enum struct mec_option { +empty = 0, number_ok = 1 +}; + +enum struct mec_error { +success = 0, conversion, number +}; + +mec_error +make_everything_canonical(rapidjson::Value &d, rapidjson::Document::AllocatorType&a, canonical_char_sorter& ccs, mec_option f = mec_option::empty ) +{ + mec_error r; + switch(d.GetType()) { + case rapidjson::kObjectType: + for (auto &m: d.GetObject()) { + assert(m.name.IsString()); + if (!ccs.make_string_canonical(m.name, a)) { + r = mec_error::conversion; + goto Error; + } + if ((r = make_everything_canonical(m.value, a, ccs, f)) != mec_error::success) + goto Error; + } + break; + case rapidjson::kArrayType: + for (auto &v: d.GetArray()) { + if ((r = make_everything_canonical(v, a, ccs, f)) != mec_error::success) + goto Error; + } + break; + case rapidjson::kStringType: + if (!ccs.make_string_canonical(d, a)) { + r = mec_error::conversion; + goto Error; + } + break; + case rapidjson::kNumberType: + if (static_cast(f) & static_cast(mec_option::number_ok)) + break; + r = mec_error::number; + goto Error; + default: + break; + } + r = mec_error::success; +Error: + return r; +} + +bool +add_object_to_context(rgw_obj &obj, rapidjson::Document &d) +{ + ARN a{obj}; + const char aws_s3_arn[] { "aws:s3:arn" }; + std::string as{a.to_string()}; + rapidjson::Document::AllocatorType &allocator { d.GetAllocator() }; + rapidjson::Value name, val; + + if (!d.IsObject()) + return false; + if (d.HasMember(aws_s3_arn)) + return true; + val.SetString(as.c_str(), as.length(), allocator); + name.SetString(aws_s3_arn, sizeof aws_s3_arn - 1, allocator); + d.AddMember(name, val, allocator); + return true; +} + +static inline const std::string & +get_tenant_or_id(req_state *s) +{ + const 
std::string &tenant{ s->user->get_tenant() }; + if (!tenant.empty()) return tenant; + return s->user->get_id().id; +} + +int +make_canonical_context(req_state *s, + std::string_view &context, + std::string &cooked_context) +{ + rapidjson::Document d; + bool b = false; +mec_option options { +//mec_option::number_ok : SEE BOTTOM OF FILE +mec_option::empty }; + rgw_obj obj; + std::ostringstream oss; + canonical_char_sorter ccs{s, s->cct}; + + obj.bucket.tenant = get_tenant_or_id(s); + obj.bucket.name = s->bucket->get_name(); + obj.key.name = s->object->get_name(); + std::string iline; + rapidjson::Document::AllocatorType &allocator { d.GetAllocator() }; + + try { + iline = rgw::from_base64(context); + } catch (const std::exception& e) { + oss << "bad context: " << e.what(); + s->err.message = oss.str(); + return -ERR_INVALID_REQUEST; + } + rapidjson::StringStream isw(iline.c_str()); + if (!iline.length()) + d.SetObject(); +// else if (qflag) SEE BOTTOM OF FILE +// d.ParseStream(isw); + else + d.ParseStream(isw); + if (isw.Tell() != iline.length()) { + oss << "bad context: did not consume all of input: @ " + << isw.Tell(); + s->err.message = oss.str(); + return -ERR_INVALID_REQUEST; + } + if (d.HasParseError()) { + oss << "bad context: parse error: @ " << d.GetErrorOffset() + << " " << rapidjson::GetParseError_En(d.GetParseError()); + s->err.message = oss.str(); + return -ERR_INVALID_REQUEST; + } + rapidjson::StringBuffer buf; + rapidjson::Writer writer(buf); + if (!add_object_to_context(obj, d)) { + ldpp_dout(s, -1) << "ERROR: can't add default value to context" << dendl; + s->err.message = "context: internal error adding defaults"; + return -ERR_INVALID_REQUEST; + } + b = make_everything_canonical(d, allocator, ccs, options) == mec_error::success; + if (!b) { + ldpp_dout(s, -1) << "ERROR: can't make canonical json <" + << context << ">" << dendl; + s->err.message = "context: can't make canonical"; + return -ERR_INVALID_REQUEST; + } + b = sort_and_write(d, writer, 
ccs); + if (!b) { + ldpp_dout(s, 5) << "format error <" << context + << ">: partial.results=" << buf.GetString() << dendl; + s->err.message = "unable to reformat json"; + return -ERR_INVALID_REQUEST; + } + cooked_context = rgw::to_base64(buf.GetString()); + return 0; +} + + +CryptoAccelRef get_crypto_accel(const DoutPrefixProvider* dpp, CephContext *cct) +{ + CryptoAccelRef ca_impl = nullptr; + stringstream ss; + PluginRegistry *reg = cct->get_plugin_registry(); + string crypto_accel_type = cct->_conf->plugin_crypto_accelerator; + + CryptoPlugin *factory = dynamic_cast(reg->get_with_load("crypto", crypto_accel_type)); + if (factory == nullptr) { + ldpp_dout(dpp, -1) << __func__ << " cannot load crypto accelerator of type " << crypto_accel_type << dendl; + return nullptr; + } + int err = factory->factory(&ca_impl, &ss); + if (err) { + ldpp_dout(dpp, -1) << __func__ << " factory return error " << err << + " with description: " << ss.str() << dendl; + } + return ca_impl; +} + + +template +static inline +bool evp_sym_transform(const DoutPrefixProvider* dpp, + CephContext* const cct, + const EVP_CIPHER* const type, + unsigned char* const out, + const unsigned char* const in, + const size_t size, + const unsigned char* const iv, + const unsigned char* const key, + const bool encrypt) +{ + using pctx_t = \ + std::unique_ptr; + pctx_t pctx{ EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free }; + + if (!pctx) { + return false; + } + + if (1 != EVP_CipherInit_ex(pctx.get(), type, nullptr, + nullptr, nullptr, encrypt)) { + ldpp_dout(dpp, 5) << "EVP: failed to 1st initialization stage" << dendl; + return false; + } + + // we want to support ciphers that don't use IV at all like AES-256-ECB + if constexpr (static_cast(IvSizeV)) { + ceph_assert(EVP_CIPHER_CTX_iv_length(pctx.get()) == IvSizeV); + ceph_assert(EVP_CIPHER_CTX_block_size(pctx.get()) == IvSizeV); + } + ceph_assert(EVP_CIPHER_CTX_key_length(pctx.get()) == KeySizeV); + + if (1 != EVP_CipherInit_ex(pctx.get(), nullptr, nullptr, 
key, iv, encrypt)) { + ldpp_dout(dpp, 5) << "EVP: failed to 2nd initialization stage" << dendl; + return false; + } + + // disable padding + if (1 != EVP_CIPHER_CTX_set_padding(pctx.get(), 0)) { + ldpp_dout(dpp, 5) << "EVP: cannot disable PKCS padding" << dendl; + return false; + } + + // operate! + int written = 0; + ceph_assert(size <= static_cast(std::numeric_limits::max())); + if (1 != EVP_CipherUpdate(pctx.get(), out, &written, in, size)) { + ldpp_dout(dpp, 5) << "EVP: EVP_CipherUpdate failed" << dendl; + return false; + } + + int finally_written = 0; + static_assert(sizeof(*out) == 1); + if (1 != EVP_CipherFinal_ex(pctx.get(), out + written, &finally_written)) { + ldpp_dout(dpp, 5) << "EVP: EVP_CipherFinal_ex failed" << dendl; + return false; + } + + // padding is disabled so EVP_CipherFinal_ex should not append anything + ceph_assert(finally_written == 0); + return (written + finally_written) == static_cast(size); +} + + +/** + * Encryption in CBC mode. Chunked to 4K blocks. Offset is used as IV for each 4K block. + * + * + * + * A. Encryption + * 1. Input is split to 4K chunks + remainder in one, smaller chunk + * 2. Each full chunk is encrypted separately with CBC chained mode, with initial IV derived from offset + * 3. Last chunk is 16*m + n. + * 4. 16*m bytes are encrypted with CBC chained mode, with initial IV derived from offset + * 5. Last n bytes are xor-ed with pattern obtained by CBC encryption of + * last encrypted 16 byte block <16m-16, 16m-15) with IV = {0}. + * 6. (Special case) If m == 0 then last n bytes are xor-ed with pattern + * obtained by CBC encryption of {0} with IV derived from offset + * + * B. Decryption + * 1. Input is split to 4K chunks + remainder in one, smaller chunk + * 2. Each full chunk is decrypted separately with CBC chained mode, with initial IV derived from offset + * 3. Last chunk is 16*m + n. + * 4. 16*m bytes are decrypted with CBC chained mode, with initial IV derived from offset + * 5. 
Last n bytes are xor-ed with pattern obtained by CBC ENCRYPTION of + * last (still encrypted) 16 byte block <16m-16,16m-15) with IV = {0} + * 6. (Special case) If m == 0 then last n bytes are xor-ed with pattern + * obtained by CBC ENCRYPTION of {0} with IV derived from offset + */ +class AES_256_CBC : public BlockCrypt { +public: + static const size_t AES_256_KEYSIZE = 256 / 8; + static const size_t AES_256_IVSIZE = 128 / 8; + static const size_t CHUNK_SIZE = 4096; + const DoutPrefixProvider* dpp; +private: + static const uint8_t IV[AES_256_IVSIZE]; + CephContext* cct; + uint8_t key[AES_256_KEYSIZE]; +public: + explicit AES_256_CBC(const DoutPrefixProvider* dpp, CephContext* cct): dpp(dpp), cct(cct) { + } + ~AES_256_CBC() { + ::ceph::crypto::zeroize_for_security(key, AES_256_KEYSIZE); + } + bool set_key(const uint8_t* _key, size_t key_size) { + if (key_size != AES_256_KEYSIZE) { + return false; + } + memcpy(key, _key, AES_256_KEYSIZE); + return true; + } + size_t get_block_size() { + return CHUNK_SIZE; + } + + bool cbc_transform(unsigned char* out, + const unsigned char* in, + const size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE], + bool encrypt) + { + return evp_sym_transform( + dpp, cct, EVP_aes_256_cbc(), out, in, size, iv, key, encrypt); + } + + bool cbc_transform(unsigned char* out, + const unsigned char* in, + size_t size, + off_t stream_offset, + const unsigned char (&key)[AES_256_KEYSIZE], + bool encrypt) + { + static std::atomic failed_to_get_crypto(false); + CryptoAccelRef crypto_accel; + if (! failed_to_get_crypto.load()) + { + crypto_accel = get_crypto_accel(this->dpp, cct); + if (!crypto_accel) + failed_to_get_crypto = true; + } + bool result = true; + unsigned char iv[AES_256_IVSIZE]; + for (size_t offset = 0; result && (offset < size); offset += CHUNK_SIZE) { + size_t process_size = offset + CHUNK_SIZE <= size ? 
CHUNK_SIZE : size - offset; + prepare_iv(iv, stream_offset + offset); + if (crypto_accel != nullptr) { + if (encrypt) { + result = crypto_accel->cbc_encrypt(out + offset, in + offset, + process_size, iv, key); + } else { + result = crypto_accel->cbc_decrypt(out + offset, in + offset, + process_size, iv, key); + } + } else { + result = cbc_transform( + out + offset, in + offset, process_size, + iv, key, encrypt); + } + } + return result; + } + + + bool encrypt(bufferlist& input, + off_t in_ofs, + size_t size, + bufferlist& output, + off_t stream_offset) + { + bool result = false; + size_t aligned_size = size / AES_256_IVSIZE * AES_256_IVSIZE; + size_t unaligned_rest_size = size - aligned_size; + output.clear(); + buffer::ptr buf(aligned_size + AES_256_IVSIZE); + unsigned char* buf_raw = reinterpret_cast(buf.c_str()); + const unsigned char* input_raw = reinterpret_cast(input.c_str()); + + /* encrypt main bulk of data */ + result = cbc_transform(buf_raw, + input_raw + in_ofs, + aligned_size, + stream_offset, key, true); + if (result && (unaligned_rest_size > 0)) { + /* remainder to encrypt */ + if (aligned_size % CHUNK_SIZE > 0) { + /* use last chunk for unaligned part */ + unsigned char iv[AES_256_IVSIZE] = {0}; + result = cbc_transform(buf_raw + aligned_size, + buf_raw + aligned_size - AES_256_IVSIZE, + AES_256_IVSIZE, + iv, key, true); + } else { + /* 0 full blocks in current chunk, use IV as base for unaligned part */ + unsigned char iv[AES_256_IVSIZE] = {0}; + unsigned char data[AES_256_IVSIZE]; + prepare_iv(data, stream_offset + aligned_size); + result = cbc_transform(buf_raw + aligned_size, + data, + AES_256_IVSIZE, + iv, key, true); + } + if (result) { + for(size_t i = aligned_size; i < size; i++) { + *(buf_raw + i) ^= *(input_raw + in_ofs + i); + } + } + } + if (result) { + ldpp_dout(this->dpp, 25) << "Encrypted " << size << " bytes"<< dendl; + buf.set_length(size); + output.append(buf); + } else { + ldpp_dout(this->dpp, 5) << "Failed to encrypt" << dendl; + 
} + return result; + } + + + bool decrypt(bufferlist& input, + off_t in_ofs, + size_t size, + bufferlist& output, + off_t stream_offset) + { + bool result = false; + size_t aligned_size = size / AES_256_IVSIZE * AES_256_IVSIZE; + size_t unaligned_rest_size = size - aligned_size; + output.clear(); + buffer::ptr buf(aligned_size + AES_256_IVSIZE); + unsigned char* buf_raw = reinterpret_cast(buf.c_str()); + unsigned char* input_raw = reinterpret_cast(input.c_str()); + + /* decrypt main bulk of data */ + result = cbc_transform(buf_raw, + input_raw + in_ofs, + aligned_size, + stream_offset, key, false); + if (result && unaligned_rest_size > 0) { + /* remainder to decrypt */ + if (aligned_size % CHUNK_SIZE > 0) { + /*use last chunk for unaligned part*/ + unsigned char iv[AES_256_IVSIZE] = {0}; + result = cbc_transform(buf_raw + aligned_size, + input_raw + in_ofs + aligned_size - AES_256_IVSIZE, + AES_256_IVSIZE, + iv, key, true); + } else { + /* 0 full blocks in current chunk, use IV as base for unaligned part */ + unsigned char iv[AES_256_IVSIZE] = {0}; + unsigned char data[AES_256_IVSIZE]; + prepare_iv(data, stream_offset + aligned_size); + result = cbc_transform(buf_raw + aligned_size, + data, + AES_256_IVSIZE, + iv, key, true); + } + if (result) { + for(size_t i = aligned_size; i < size; i++) { + *(buf_raw + i) ^= *(input_raw + in_ofs + i); + } + } + } + if (result) { + ldpp_dout(this->dpp, 25) << "Decrypted " << size << " bytes"<< dendl; + buf.set_length(size); + output.append(buf); + } else { + ldpp_dout(this->dpp, 5) << "Failed to decrypt" << dendl; + } + return result; + } + + + void prepare_iv(unsigned char (&iv)[AES_256_IVSIZE], off_t offset) { + off_t index = offset / AES_256_IVSIZE; + off_t i = AES_256_IVSIZE - 1; + unsigned int val; + unsigned int carry = 0; + while (i>=0) { + val = (index & 0xff) + IV[i] + carry; + iv[i] = val; + carry = val >> 8; + index = index >> 8; + i--; + } + } +}; + + +std::unique_ptr AES_256_CBC_create(const DoutPrefixProvider* dpp, 
CephContext* cct, const uint8_t* key, size_t len) +{ + auto cbc = std::unique_ptr(new AES_256_CBC(dpp, cct)); + cbc->set_key(key, AES_256_KEYSIZE); + return cbc; +} + + +const uint8_t AES_256_CBC::IV[AES_256_CBC::AES_256_IVSIZE] = + { 'a', 'e', 's', '2', '5', '6', 'i', 'v', '_', 'c', 't', 'r', '1', '3', '3', '7' }; + + +bool AES_256_ECB_encrypt(const DoutPrefixProvider* dpp, + CephContext* cct, + const uint8_t* key, + size_t key_size, + const uint8_t* data_in, + uint8_t* data_out, + size_t data_size) +{ + if (key_size == AES_256_KEYSIZE) { + return evp_sym_transform( + dpp, cct, EVP_aes_256_ecb(), data_out, data_in, data_size, + nullptr /* no IV in ECB */, key, true /* encrypt */); + } else { + ldpp_dout(dpp, 5) << "Key size must be 256 bits long" << dendl; + return false; + } +} + + +RGWGetObj_BlockDecrypt::RGWGetObj_BlockDecrypt(const DoutPrefixProvider *dpp, + CephContext* cct, + RGWGetObj_Filter* next, + std::unique_ptr crypt, + std::vector parts_len) + : + RGWGetObj_Filter(next), + dpp(dpp), + cct(cct), + crypt(std::move(crypt)), + enc_begin_skip(0), + ofs(0), + end(0), + cache(), + parts_len(std::move(parts_len)) +{ + block_size = this->crypt->get_block_size(); +} + +RGWGetObj_BlockDecrypt::~RGWGetObj_BlockDecrypt() { +} + +int RGWGetObj_BlockDecrypt::read_manifest_parts(const DoutPrefixProvider *dpp, + const bufferlist& manifest_bl, + std::vector& parts_len) +{ + RGWObjManifest manifest; + if (manifest_bl.length()) { + auto miter = manifest_bl.cbegin(); + try { + decode(manifest, miter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl; + return -EIO; + } + RGWObjManifest::obj_iterator mi; + for (mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) { + if (mi.get_cur_stripe() == 0) { + parts_len.push_back(0); + } + parts_len.back() += mi.get_stripe_size(); + } + for (size_t i = 0; i [" << bl_ofs << "," << bl_end << "]" << dendl; + return 0; +} + +int RGWGetObj_BlockDecrypt::process(bufferlist& in, 
size_t part_ofs, size_t size) +{ + bufferlist data; + if (!crypt->decrypt(in, 0, size, data, part_ofs)) { + return -ERR_INTERNAL_ERROR; + } + off_t send_size = size - enc_begin_skip; + if (ofs + enc_begin_skip + send_size > end + 1) { + send_size = end + 1 - ofs - enc_begin_skip; + } + int res = next->handle_data(data, enc_begin_skip, send_size); + enc_begin_skip = 0; + ofs += size; + in.splice(0, size); + return res; +} + +int RGWGetObj_BlockDecrypt::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + ldpp_dout(this->dpp, 25) << "Decrypt " << bl_len << " bytes" << dendl; + bl.begin(bl_ofs).copy(bl_len, cache); + + int res = 0; + size_t part_ofs = ofs; + for (size_t part : parts_len) { + if (part_ofs >= part) { + part_ofs -= part; + } else if (part_ofs + cache.length() >= part) { + // flush data up to part boundaries, aligned or not + res = process(cache, part_ofs, part - part_ofs); + if (res < 0) { + return res; + } + part_ofs = 0; + } else { + break; + } + } + // write up to block boundaries, aligned only + off_t aligned_size = cache.length() & ~(block_size - 1); + if (aligned_size > 0) { + res = process(cache, part_ofs, aligned_size); + } + return res; +} + +/** + * flush remainder of data to output + */ +int RGWGetObj_BlockDecrypt::flush() { + ldpp_dout(this->dpp, 25) << "Decrypt flushing " << cache.length() << " bytes" << dendl; + int res = 0; + size_t part_ofs = ofs; + for (size_t part : parts_len) { + if (part_ofs >= part) { + part_ofs -= part; + } else if (part_ofs + cache.length() >= part) { + // flush data up to part boundaries, aligned or not + res = process(cache, part_ofs, part - part_ofs); + if (res < 0) { + return res; + } + part_ofs = 0; + } else { + break; + } + } + // flush up to block boundaries, aligned or not + if (cache.length() > 0) { + res = process(cache, part_ofs, cache.length()); + } + return res; +} + +RGWPutObj_BlockEncrypt::RGWPutObj_BlockEncrypt(const DoutPrefixProvider *dpp, + CephContext* cct, + rgw::sal::DataProcessor 
*next, + std::unique_ptr crypt) + : Pipe(next), + dpp(dpp), + cct(cct), + crypt(std::move(crypt)), + block_size(this->crypt->get_block_size()) +{ +} + +int RGWPutObj_BlockEncrypt::process(bufferlist&& data, uint64_t logical_offset) +{ + ldpp_dout(this->dpp, 25) << "Encrypt " << data.length() << " bytes" << dendl; + + // adjust logical offset to beginning of cached data + ceph_assert(logical_offset >= cache.length()); + logical_offset -= cache.length(); + + const bool flush = (data.length() == 0); + cache.claim_append(data); + + uint64_t proc_size = cache.length() & ~(block_size - 1); + if (flush) { + proc_size = cache.length(); + } + if (proc_size > 0) { + bufferlist in, out; + cache.splice(0, proc_size, &in); + if (!crypt->encrypt(in, 0, proc_size, out, logical_offset)) { + return -ERR_INTERNAL_ERROR; + } + int r = Pipe::process(std::move(out), logical_offset); + logical_offset += proc_size; + if (r < 0) + return r; + } + + if (flush) { + /*replicate 0-sized handle_data*/ + return Pipe::process({}, logical_offset); + } + return 0; +} + + +std::string create_random_key_selector(CephContext * const cct) { + char random[AES_256_KEYSIZE]; + cct->random()->get_bytes(&random[0], sizeof(random)); + return std::string(random, sizeof(random)); +} + +typedef enum { + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM=0, + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + X_AMZ_SERVER_SIDE_ENCRYPTION, + X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID, + X_AMZ_SERVER_SIDE_ENCRYPTION_CONTEXT, + X_AMZ_SERVER_SIDE_ENCRYPTION_LAST +} crypt_option_e; +struct crypt_option_names { + const std::string post_part_name; +}; + +static const crypt_option_names crypt_options[] = { + { "x-amz-server-side-encryption-customer-algorithm"}, + { "x-amz-server-side-encryption-customer-key"}, + { "x-amz-server-side-encryption-customer-key-md5"}, + { "x-amz-server-side-encryption"}, + { "x-amz-server-side-encryption-aws-kms-key-id"}, + { 
"x-amz-server-side-encryption-context"},
+};
+
+/**
+ * Lookup helper for the SSE request headers of one request.
+ *
+ * Binds to s->info.crypt_attribute_map and resolves headers by the names
+ * listed in crypt_options[], indexed by crypt_option_e.
+ */
+struct CryptAttributes {
+  meta_map_t &x_meta_map;
+
+  CryptAttributes(req_state *s)
+    : x_meta_map(s->info.crypt_attribute_map) {
+  }
+
+  /**
+   * Returns the value stored for \a option, or an empty view when the
+   * header is not present in the map.
+   */
+  std::string_view get(crypt_option_e option)
+  {
+    // crypt_options[] must carry exactly one name per crypt_option_e value
+    static_assert(
+        X_AMZ_SERVER_SIDE_ENCRYPTION_LAST == sizeof(crypt_options)/sizeof(*crypt_options),
+        "Missing items in crypt_options");
+    auto hdr { x_meta_map.find(crypt_options[option].post_part_name) };
+    if (hdr != x_meta_map.end()) {
+      return std::string_view(hdr->second);
+    } else {
+      return std::string_view();
+    }
+  }
+};
+
+/**
+ * Returns the bucket attribute RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID as a string,
+ * with one trailing NUL removed if present; empty when the attribute is unset.
+ */
+std::string fetch_bucket_key_id(req_state *s)
+{
+  auto kek_iter = s->bucket_attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID);
+  if (kek_iter == s->bucket_attrs.end())
+    return std::string();
+  std::string a_key { kek_iter->second.to_str() };
+  // early code appends a nul; pretend that didn't happen
+  auto l { a_key.length() };
+  if (l > 0 && a_key[l-1] == '\0') {
+    a_key.resize(--l);
+  }
+  return a_key;
+}
+
+/** Sentinel returned by expand_key_name() when the template cannot be expanded. */
+const std::string cant_expand_key{ "\uFFFD" };
+
+/**
+ * Expands the %-escapes of key template \a t.
+ *
+ *   %%          -> literal '%'
+ *   %bucket_id  -> s->bucket->get_marker()
+ *   %owner_id   -> s->bucket->get_info().owner.id
+ *
+ * Any other sequence introduced by '%' (including a '%' that ends the
+ * template) makes the whole expansion fail.
+ *
+ * @return the expanded name, or cant_expand_key on failure.
+ */
+std::string expand_key_name(req_state *s, const std::string_view&t)
+{
+  std::string r;
+  size_t i, j;
+  for (i = 0;;) {
+    i = t.find('%', (j = i));
+    // copy the literal run between the previous escape and this one
+    if (i != j) {
+      if (i == std::string_view::npos)
+        r.append( t.substr(j) );
+      else
+        r.append( t.substr(j, i-j) );
+    }
+    if (i == std::string_view::npos) {
+      break;
+    }
+    // bounds-check before peeking: a template ending in a lone '%' must not
+    // index one past the end of the view (it falls through to the failure
+    // return below, as before)
+    if (i + 1 < t.size() && t[i+1] == '%') {
+      r.append("%");
+      i += 2;
+      continue;
+    }
+    if (t.compare(i+1, 9, "bucket_id") == 0) {
+      r.append(s->bucket->get_marker());
+      i += 10;
+      continue;
+    }
+    if (t.compare(i+1, 8, "owner_id") == 0) {
+      r.append(s->bucket->get_info().owner.id);
+      i += 9;
+      continue;
+    }
+    return cant_expand_key;
+  }
+  return r;
+}
+
+/**
+ * Determines the SSE-S3 key id for the request's bucket and makes sure the
+ * bucket records it.
+ *
+ * The id is expanded from rgw_crypt_sse_s3_key_template. When it differs from
+ * the id already stored on the bucket, a new bucket key is created and the id
+ * is written to the bucket attrs, retrying up to 15 times when the write
+ * returns -ECANCELED.
+ *
+ * @param s       request state; s->err.message is set on failure
+ * @param key_id  [out] expanded key id
+ * @return 0 on success, negative error code otherwise
+ */
+static int get_sse_s3_bucket_key(req_state *s,
+                                 std::string &key_id)
+{
+  int res;
+  std::string saved_key;
+
+  key_id = expand_key_name(s, s->cct->_conf->rgw_crypt_sse_s3_key_template);
+
+  if (key_id == cant_expand_key) {
+    ldpp_dout(s, 5) << "ERROR: unable to expand key_id " <<
+      s->cct->_conf->rgw_crypt_sse_s3_key_template << " on bucket" << dendl;
+    s->err.message = "Server side error - unable to expand key_id";
+    return -EINVAL;
+  }
+
+  saved_key = fetch_bucket_key_id(s);
+  if (saved_key != "") {
+    // report the id that was actually found on the bucket
+    ldpp_dout(s, 5) << "Found KEK ID: " << saved_key << dendl;
+  }
+  if (saved_key != key_id) {
+    res = create_sse_s3_bucket_key(s, s->cct, key_id);
+    if (res != 0) {
+      return res;
+    }
+    bufferlist key_id_bl;
+    key_id_bl.append(key_id.c_str(), key_id.length());
+    for (int count = 0; count < 15; ++count) {
+      rgw::sal::Attrs attrs = s->bucket->get_attrs();
+      attrs[RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID] = key_id_bl;
+      res = s->bucket->merge_and_store_attrs(s, attrs, s->yield);
+      if (res != -ECANCELED) {
+        break;
+      }
+      // keep the refresh status separate so that running out of retries
+      // leaves res == -ECANCELED instead of a misleading 0
+      const int refresh_res = s->bucket->try_refresh_info(s, nullptr);
+      if (refresh_res != 0) {
+        res = refresh_res;
+        break;
+      }
+    }
+    if (res != 0) {
+      ldpp_dout(s, 5) << "ERROR: unable to save new key_id on bucket" << dendl;
+      s->err.message = "Server side error - unable to save key_id";
+      return res;
+    }
+  }
+  return 0;
+}
+
+int rgw_s3_prepare_encrypt(req_state* s,
+                           std::map& attrs,
+                           std::unique_ptr* block_crypt,
+                           std::map& crypt_http_responses)
+{
+  int res = 0;
+  CryptAttributes crypt_attributes { s };
+  crypt_http_responses.clear();
+
+  {
+    std::string_view req_sse_ca =
+      crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM);
+    if (!
req_sse_ca.empty()) { + if (req_sse_ca != "AES256") { + ldpp_dout(s, 5) << "ERROR: Invalid value for header " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "The requested encryption algorithm is not valid, must be AES256."; + return -ERR_INVALID_ENCRYPTION_ALGORITHM; + } + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + + std::string key_bin; + try { + key_bin = from_base64( + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY) ); + } catch (...) { + ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption " + << "key which contains character that is not base64 encoded." + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) { + ldpp_dout(s, 5) << "ERROR: invalid encryption key size" << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + std::string_view keymd5 = + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5); + + std::string keymd5_bin; + try { + keymd5_bin = from_base64(keymd5); + } catch (...) { + ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption key " + << "md5 which contains character that is not base64 encoded." 
+ << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key md5."; + return -EINVAL; + } + + if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) { + ldpp_dout(s, 5) << "ERROR: Invalid key md5 size" << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key md5."; + return -EINVAL; + } + + MD5 key_hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + key_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + unsigned char key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE]; + key_hash.Update(reinterpret_cast(key_bin.c_str()), key_bin.size()); + key_hash.Final(key_hash_res); + + if (memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) { + ldpp_dout(s, 5) << "ERROR: Invalid key md5 hash" << dendl; + s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided."; + return -EINVAL; + } + + set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-C-AES256"); + set_attr(attrs, RGW_ATTR_CRYPT_KEYMD5, keymd5_bin); + + if (block_crypt) { + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(key_bin.c_str()), AES_256_KEYSIZE); + *block_crypt = std::move(aes); + } + + crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256"; + crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = std::string(keymd5); + return 0; + } else { + std::string_view customer_key = + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY); + if (!customer_key.empty()) { + ldpp_dout(s, 5) << "ERROR: SSE-C encryption request is missing the header " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide a valid encryption algorithm."; + return -EINVAL; + } + + std::string_view 
customer_key_md5 = + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5); + if (!customer_key_md5.empty()) { + ldpp_dout(s, 5) << "ERROR: SSE-C encryption request is missing the header " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide a valid encryption algorithm."; + return -EINVAL; + } + } + + /* AMAZON server side encryption with KMS (key management service) */ + std::string_view req_sse = + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION); + if (! req_sse.empty()) { + + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldpp_dout(s, 5) << "ERROR: insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + + if (req_sse == "aws:kms") { + std::string_view context = + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CONTEXT); + std::string cooked_context; + if ((res = make_canonical_context(s, context, cooked_context))) + return res; + std::string_view key_id = + crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID); + if (key_id.empty()) { + ldpp_dout(s, 5) << "ERROR: not provide a valid key id" << dendl; + s->err.message = "Server Side Encryption with KMS managed key requires " + "HTTP header x-amz-server-side-encryption-aws-kms-key-id"; + return -EINVAL; + } + /* try to retrieve actual key */ + std::string key_selector = create_random_key_selector(s->cct); + set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-KMS"); + set_attr(attrs, RGW_ATTR_CRYPT_KEYID, key_id); + set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector); + set_attr(attrs, RGW_ATTR_CRYPT_CONTEXT, cooked_context); + std::string actual_key; + res = make_actual_key_from_kms(s, s->cct, attrs, actual_key); + if (res != 0) { + ldpp_dout(s, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl; + s->err.message = "Failed to retrieve the actual key, 
kms-keyid: " + std::string(key_id); + return res; + } + if (actual_key.size() != AES_256_KEYSIZE) { + ldpp_dout(s, 5) << "ERROR: key obtained from key_id:" << + key_id << " is not 256 bit size" << dendl; + s->err.message = "KMS provided an invalid key for the given kms-keyid."; + return -EINVAL; + } + + if (block_crypt) { + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(actual_key.c_str()), AES_256_KEYSIZE); + *block_crypt = std::move(aes); + } + ::ceph::crypto::zeroize_for_security(actual_key.data(), actual_key.length()); + + crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms"; + crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = std::string(key_id); + crypt_http_responses["x-amz-server-side-encryption-context"] = std::move(cooked_context); + return 0; + } else if (req_sse != "AES256") { + ldpp_dout(s, 5) << "ERROR: Invalid value for header x-amz-server-side-encryption" + << dendl; + s->err.message = "Server Side Encryption with KMS managed key requires " + "HTTP header x-amz-server-side-encryption : aws:kms or AES256"; + return -EINVAL; + } + + if (s->cct->_conf->rgw_crypt_sse_s3_backend != "vault") { + s->err.message = "Request specifies Server Side Encryption " + "but server configuration does not support this."; + return -EINVAL; + } + + ldpp_dout(s, 5) << "RGW_ATTR_BUCKET_ENCRYPTION ALGO: " + << req_sse << dendl; + std::string_view context = ""; + std::string cooked_context; + if ((res = make_canonical_context(s, context, cooked_context))) + return res; + + std::string key_id; + res = get_sse_s3_bucket_key(s, key_id); + if (res != 0) { + return res; + } + std::string key_selector = create_random_key_selector(s->cct); + + set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector); + set_attr(attrs, RGW_ATTR_CRYPT_CONTEXT, cooked_context); + set_attr(attrs, RGW_ATTR_CRYPT_MODE, "AES256"); + set_attr(attrs, RGW_ATTR_CRYPT_KEYID, key_id); + std::string actual_key; + res = 
make_actual_key_from_sse_s3(s, s->cct, attrs, actual_key); + if (res != 0) { + ldpp_dout(s, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl; + s->err.message = "Failed to retrieve the actual key"; + return res; + } + if (actual_key.size() != AES_256_KEYSIZE) { + ldpp_dout(s, 5) << "ERROR: key obtained from key_id:" << + key_id << " is not 256 bit size" << dendl; + s->err.message = "SSE-S3 provided an invalid key for the given keyid."; + return -EINVAL; + } + + if (block_crypt) { + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(actual_key.c_str()), AES_256_KEYSIZE); + *block_crypt = std::move(aes); + } + ::ceph::crypto::zeroize_for_security(actual_key.data(), actual_key.length()); + + crypt_http_responses["x-amz-server-side-encryption"] = "AES256"; + + return 0; + } else if (s->cct->_conf->rgw_crypt_default_encryption_key != "") { + std::string master_encryption_key; + try { + master_encryption_key = from_base64(s->cct->_conf->rgw_crypt_default_encryption_key); + } catch (...) { + ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_encrypt invalid default encryption key " + << "which contains character that is not base64 encoded." 
+ << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + if (master_encryption_key.size() != 256 / 8) { + ldpp_dout(s, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl; + /* not an error to return; missing encryption does not inhibit processing */ + return 0; + } + + set_attr(attrs, RGW_ATTR_CRYPT_MODE, "RGW-AUTO"); + std::string key_selector = create_random_key_selector(s->cct); + set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector); + + uint8_t actual_key[AES_256_KEYSIZE]; + if (AES_256_ECB_encrypt(s, s->cct, + reinterpret_cast(master_encryption_key.c_str()), AES_256_KEYSIZE, + reinterpret_cast(key_selector.c_str()), + actual_key, AES_256_KEYSIZE) != true) { + ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key)); + return -EIO; + } + if (block_crypt) { + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(actual_key), AES_256_KEYSIZE); + *block_crypt = std::move(aes); + } + ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key)); + return 0; + } + } + return 0; +} + + +int rgw_s3_prepare_decrypt(req_state* s, + map& attrs, + std::unique_ptr* block_crypt, + std::map& crypt_http_responses) +{ + int res = 0; + std::string stored_mode = get_str_attribute(attrs, RGW_ATTR_CRYPT_MODE); + ldpp_dout(s, 15) << "Encryption mode: " << stored_mode << dendl; + + const char *req_sse = s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION", NULL); + if (nullptr != req_sse && (s->op == OP_GET || s->op == OP_HEAD)) { + return -ERR_INVALID_REQUEST; + } + + if (stored_mode == "SSE-C-AES256") { + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + const char *req_cust_alg = + 
s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM", NULL); + + if (nullptr == req_cust_alg) { + ldpp_dout(s, 5) << "ERROR: Request for SSE-C encrypted object missing " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide a valid encryption algorithm."; + return -EINVAL; + } else if (strcmp(req_cust_alg, "AES256") != 0) { + ldpp_dout(s, 5) << "ERROR: The requested encryption algorithm is not valid, must be AES256." << dendl; + s->err.message = "The requested encryption algorithm is not valid, must be AES256."; + return -ERR_INVALID_ENCRYPTION_ALGORITHM; + } + + std::string key_bin; + try { + key_bin = from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", "")); + } catch (...) { + ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key " + << "which contains character that is not base64 encoded." + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) { + ldpp_dout(s, 5) << "ERROR: Invalid encryption key size" << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + std::string keymd5 = + s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5", ""); + std::string keymd5_bin; + try { + keymd5_bin = from_base64(keymd5); + } catch (...) { + ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key md5 " + << "which contains character that is not base64 encoded." 
+ << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key md5."; + return -EINVAL; + } + + + if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) { + ldpp_dout(s, 5) << "ERROR: Invalid key md5 size " << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key md5."; + return -EINVAL; + } + + MD5 key_hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + key_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + uint8_t key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE]; + key_hash.Update(reinterpret_cast(key_bin.c_str()), key_bin.size()); + key_hash.Final(key_hash_res); + + if ((memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) || + (get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYMD5) != keymd5_bin)) { + s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided."; + return -EINVAL; + } + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(key_bin.c_str()), AES_256_CBC::AES_256_KEYSIZE); + if (block_crypt) *block_crypt = std::move(aes); + + crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256"; + crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = keymd5; + return 0; + } + + if (stored_mode == "SSE-KMS") { + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + /* try to retrieve actual key */ + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + std::string actual_key; + res = reconstitute_actual_key_from_kms(s, s->cct, attrs, actual_key); + if (res != 0) { + ldpp_dout(s, 10) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl; + 
s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id; + return res; + } + if (actual_key.size() != AES_256_KEYSIZE) { + ldpp_dout(s, 0) << "ERROR: key obtained from key_id:" << + key_id << " is not 256 bit size" << dendl; + s->err.message = "KMS provided an invalid key for the given kms-keyid."; + return -EINVAL; + } + + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(actual_key.c_str()), AES_256_KEYSIZE); + actual_key.replace(0, actual_key.length(), actual_key.length(), '\000'); + if (block_crypt) *block_crypt = std::move(aes); + + crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms"; + crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = key_id; + return 0; + } + + if (stored_mode == "RGW-AUTO") { + std::string master_encryption_key; + try { + master_encryption_key = from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key)); + } catch (...) { + ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_decrypt invalid default encryption key " + << "which contains character that is not base64 encoded." 
+ << dendl; + s->err.message = "The default encryption key is not valid base64."; + return -EINVAL; + } + + if (master_encryption_key.size() != 256 / 8) { + ldpp_dout(s, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl; + return -EIO; + } + std::string attr_key_selector = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYSEL); + if (attr_key_selector.size() != AES_256_CBC::AES_256_KEYSIZE) { + ldpp_dout(s, 0) << "ERROR: missing or invalid " RGW_ATTR_CRYPT_KEYSEL << dendl; + return -EIO; + } + uint8_t actual_key[AES_256_KEYSIZE]; + if (AES_256_ECB_encrypt(s, s->cct, + reinterpret_cast(master_encryption_key.c_str()), + AES_256_KEYSIZE, + reinterpret_cast(attr_key_selector.c_str()), + actual_key, AES_256_KEYSIZE) != true) { + ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key)); + return -EIO; + } + auto aes = std::unique_ptr(new AES_256_CBC(s, s->cct)); + aes->set_key(actual_key, AES_256_KEYSIZE); + ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key)); + if (block_crypt) *block_crypt = std::move(aes); + return 0; + } + + /* SSE-S3 */ + if (stored_mode == "AES256") { + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + /* try to retrieve actual key */ + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + std::string actual_key; + res = reconstitute_actual_key_from_sse_s3(s, s->cct, attrs, actual_key); + if (res != 0) { + ldpp_dout(s, 10) << "ERROR: failed to retrieve actual key" << dendl; + s->err.message = "Failed to retrieve the actual key"; + return res; + } + if (actual_key.size() != AES_256_KEYSIZE) { + ldpp_dout(s, 0) << "ERROR: key obtained " << + "is not 256 bit size" << dendl; + s->err.message = "SSE-S3 provided an invalid key for the given keyid."; + return -EINVAL; + } + + auto aes = std::unique_ptr(new 
AES_256_CBC(s, s->cct)); + aes->set_key(reinterpret_cast(actual_key.c_str()), AES_256_KEYSIZE); + actual_key.replace(0, actual_key.length(), actual_key.length(), '\000'); + if (block_crypt) *block_crypt = std::move(aes); + + crypt_http_responses["x-amz-server-side-encryption"] = "AES256"; + return 0; + } + + + /*no decryption*/ + return 0; +} + +int rgw_remove_sse_s3_bucket_key(req_state *s) +{ + int res; + auto key_id { expand_key_name(s, s->cct->_conf->rgw_crypt_sse_s3_key_template) }; + auto saved_key { fetch_bucket_key_id(s) }; + size_t i; + + if (key_id == cant_expand_key) { + ldpp_dout(s, 5) << "ERROR: unable to expand key_id " << + s->cct->_conf->rgw_crypt_sse_s3_key_template << " on bucket" << dendl; + s->err.message = "Server side error - unable to expand key_id"; + return -EINVAL; + } + + if (saved_key == "") { + return 0; + } else if (saved_key != key_id) { + ldpp_dout(s, 5) << "Found but will not delete strange KEK ID: " << saved_key << dendl; + return 0; + } + i = s->cct->_conf->rgw_crypt_sse_s3_key_template.find("%bucket_id"); + if (i == std::string_view::npos) { + ldpp_dout(s, 5) << "Kept valid KEK ID: " << saved_key << dendl; + return 0; + } + ldpp_dout(s, 5) << "Removing valid KEK ID: " << saved_key << dendl; + res = remove_sse_s3_bucket_key(s, s->cct, saved_key); + if (res != 0) { + ldpp_dout(s, 0) << "ERROR: Unable to remove KEK ID: " << saved_key << " got " << res << dendl; + } + return res; +} + +/********************************************************************* +* "BOTTOM OF FILE" +* I've left some commented out lines above. They are there for +* a reason, which I will explain. The "canonical" json constructed +* by the code above as a crypto context must take a json object and +* turn it into a unique deterministic fixed form. For most json +* types this is easy. The hardest problem that is handled above is +* dealing with unicode strings; they must be turned into +* NFC form and sorted in a fixed order.
Numbers, however, +* are another story. Json makes no distinction between integers +* and floating point, and both types have their problems. +* Integers can overflow, so very large numbers are a problem. +* Floating point is even worse; not all floating point numbers +* can be represented accurately in c++ data types, and there +* are many quirks regarding how overflow, underflow, and loss +* of significance are handled. +* +* In this version of the code, I took the simplest answer, I +* reject all numbers altogether. This is not ideal, but it's +* the only choice that is guaranteed to be future compatible. +* AWS S3 does not guarantee to support numbers at all; but it +* actually converts all numbers into strings right off. +* This has the interesting property that 7 and 007 are different, +* but that 007 and "007" are the same. I would rather +* treat numbers as a string of digits and have logic +* to produce the "most compact" equivalent form. This can +* fix all the overflow/underflow problems, but it requires +* fixing the json parser part, and I put that problem off. +* +* The commented code above indicates places in this code that +* will need to be revised depending on future work in this area. +* Removing those comments makes that work harder. +* February 25, 2021 +*********************************************************************/ diff --git a/src/rgw/rgw_crypt.h b/src/rgw/rgw_crypt.h new file mode 100644 index 000000000..d8f561eca --- /dev/null +++ b/src/rgw/rgw_crypt.h @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/** + * Crypto filters for Put/Post/Get operations. + */ + +#pragma once + +#include + +#include +#include +#include +#include "rgw_putobj.h" + +/** + * \brief Interface for block encryption methods + * + * Encrypts and decrypts data. + * Operations are performed in context of larger stream being divided into blocks. 
+ * Each block can be processed independently, but only as a whole. + * A partial block cannot be properly processed. + * Each request must start on block-aligned offset. + * Each request should have length that is a multiple of block size. + * Request with unaligned length is only acceptable for last part of stream. + */ +class BlockCrypt { +public: + BlockCrypt(){}; + virtual ~BlockCrypt(){}; + + /** + * Determines size of encryption block. + * This is usually a multiple of key size. + * It determines size of chunks that should be passed to \ref encrypt and \ref decrypt. + */ + virtual size_t get_block_size() = 0; + + /** + * Encrypts data. + * Argument \ref stream_offset shows where in generalized stream chunk is located. + * Input for encryption is \ref input buffer, with relevant data in range crypt; /**< already configured stateless BlockCrypt + for operations when enough data is accumulated */ + off_t enc_begin_skip; /**< amount of data to skip from beginning of received data */ + off_t ofs; /**< stream offset of data we expect to show up next through \ref handle_data */ + off_t end; /**< stream offset of last byte that is requested */ + bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */ + size_t block_size; /**< snapshot of \ref BlockCrypt.get_block_size() */ + std::vector parts_len; /**< size of parts of multipart object, parsed from manifest */ + + int process(bufferlist& cipher, size_t part_ofs, size_t size); + +public: + RGWGetObj_BlockDecrypt(const DoutPrefixProvider *dpp, + CephContext* cct, + RGWGetObj_Filter* next, + std::unique_ptr crypt, + std::vector parts_len); + virtual ~RGWGetObj_BlockDecrypt(); + + virtual int fixup_range(off_t& bl_ofs, + off_t& bl_end) override; + virtual int handle_data(bufferlist& bl, + off_t bl_ofs, + off_t bl_len) override; + virtual int flush() override; + + static int read_manifest_parts(const DoutPrefixProvider *dpp, + const bufferlist& manifest_bl, + std::vector& parts_len); +}; /*
RGWGetObj_BlockDecrypt */ + + +class RGWPutObj_BlockEncrypt : public rgw::putobj::Pipe +{ + const DoutPrefixProvider *dpp; + CephContext* cct; + std::unique_ptr crypt; /**< already configured stateless BlockCrypt + for operations when enough data is accumulated */ + bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */ + const size_t block_size; /**< snapshot of \ref BlockCrypt.get_block_size() */ +public: + RGWPutObj_BlockEncrypt(const DoutPrefixProvider *dpp, + CephContext* cct, + rgw::sal::DataProcessor *next, + std::unique_ptr crypt); + + int process(bufferlist&& data, uint64_t logical_offset) override; +}; /* RGWPutObj_BlockEncrypt */ + + +int rgw_s3_prepare_encrypt(req_state* s, + std::map& attrs, + std::unique_ptr* block_crypt, + std::map& crypt_http_responses); + +int rgw_s3_prepare_decrypt(req_state* s, + std::map& attrs, + std::unique_ptr* block_crypt, + std::map& crypt_http_responses); + +static inline void set_attr(std::map& attrs, + const char* key, + std::string_view value) +{ + bufferlist bl; + bl.append(value.data(), value.size()); + attrs[key] = std::move(bl); +} + +static inline std::string get_str_attribute(std::map& attrs, + const char *name) +{ + auto iter = attrs.find(name); + if (iter == attrs.end()) { + return {}; + } + return iter->second.to_str(); +} + +int rgw_remove_sse_s3_bucket_key(req_state *s); diff --git a/src/rgw/rgw_crypt_sanitize.cc b/src/rgw/rgw_crypt_sanitize.cc new file mode 100644 index 000000000..05aec6d3b --- /dev/null +++ b/src/rgw/rgw_crypt_sanitize.cc @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * rgw_crypt_sanitize.cc + * + * Created on: Mar 3, 2017 + * Author: adam + */ + +#include "rgw_common.h" +#include "rgw_crypt_sanitize.h" +#include "boost/algorithm/string/predicate.hpp" + +namespace rgw { +namespace crypt_sanitize { +const char* HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY = 
"HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY"; +const char* x_amz_server_side_encryption_customer_key = "x-amz-server-side-encryption-customer-key"; +const char* dollar_x_amz_server_side_encryption_customer_key = "$x-amz-server-side-encryption-customer-key"; +const char* suppression_message = "=suppressed due to key presence="; + +std::ostream& operator<<(std::ostream& out, const env& e) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs) { + if (boost::algorithm::iequals( + e.name, + HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY)) + { + out << suppression_message; + return out; + } + if (boost::algorithm::iequals(e.name, "QUERY_STRING") && + boost::algorithm::ifind_first( + e.value, + x_amz_server_side_encryption_customer_key)) + { + out << suppression_message; + return out; + } + } + out << e.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const x_meta_map& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + boost::algorithm::iequals(x.name, x_amz_server_side_encryption_customer_key)) + { + out << suppression_message; + return out; + } + out << x.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const s3_policy& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + boost::algorithm::iequals(x.name, dollar_x_amz_server_side_encryption_customer_key)) + { + out << suppression_message; + return out; + } + out << x.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const auth& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + x.s->info.env->get(HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, nullptr) != nullptr) + { + out << suppression_message; + return out; + } + out << x.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const log_content& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + boost::algorithm::ifind_first(x.buf, x_amz_server_side_encryption_customer_key)) { + out << suppression_message; + return out; + } + out << 
x.buf; + return out; +} + +} +} diff --git a/src/rgw/rgw_crypt_sanitize.h b/src/rgw/rgw_crypt_sanitize.h new file mode 100644 index 000000000..aa0261fc2 --- /dev/null +++ b/src/rgw/rgw_crypt_sanitize.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_common.h" + +namespace rgw { +namespace crypt_sanitize { + +/* + * Temporary container for suppressing printing if variable contains secret key. + */ +struct env { + std::string_view name; + std::string_view value; + + env(std::string_view name, std::string_view value) + : name(name), value(value) {} +}; + +/* + * Temporary container for suppressing printing if aws meta attributes contains secret key. + */ +struct x_meta_map { + std::string_view name; + std::string_view value; + x_meta_map(std::string_view name, std::string_view value) + : name(name), value(value) {} +}; + +/* + * Temporary container for suppressing printing if s3_policy calculation variable contains secret key. + */ +struct s3_policy { + std::string_view name; + std::string_view value; + s3_policy(std::string_view name, std::string_view value) + : name(name), value(value) {} +}; + +/* + * Temporary container for suppressing printing if auth string contains secret key. + */ +struct auth { + const req_state* const s; + std::string_view value; + auth(const req_state* const s, std::string_view value) + : s(s), value(value) {} +}; + +/* + * Temporary container for suppressing printing if log made from civetweb may contain secret key. 
+ */ +struct log_content { + const std::string_view buf; + explicit log_content(const std::string_view buf) + : buf(buf) {} +}; + +std::ostream& operator<<(std::ostream& out, const env& e); +std::ostream& operator<<(std::ostream& out, const x_meta_map& x); +std::ostream& operator<<(std::ostream& out, const s3_policy& x); +std::ostream& operator<<(std::ostream& out, const auth& x); +std::ostream& operator<<(std::ostream& out, const log_content& x); +} +} diff --git a/src/rgw/rgw_d3n_cacherequest.h b/src/rgw/rgw_d3n_cacherequest.h new file mode 100644 index 000000000..edc70247f --- /dev/null +++ b/src/rgw/rgw_d3n_cacherequest.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "include/rados/librados.hpp" +#include "include/Context.h" +#include "common/async/completion.h" + +#include +#include "common/error_code.h" +#include "common/errno.h" + +#include "rgw_aio.h" +#include "rgw_cache.h" + + +struct D3nGetObjData { + std::mutex d3n_lock; +}; + +struct D3nL1CacheRequest { + ~D3nL1CacheRequest() { + lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Read From Cache, complete" << dendl; + } + + // unique_ptr with custom deleter for struct aiocb + struct libaio_aiocb_deleter { + void operator()(struct aiocb* c) { + if(c->aio_fildes > 0) { + if( ::close(c->aio_fildes) != 0) { + lsubdout(g_ceph_context, rgw_datacache, 2) << "D3nDataCache: " << __func__ << "(): Error - can't close file, errno=" << -errno << dendl; + } + } + delete c; + } + }; + + using unique_aio_cb_ptr = std::unique_ptr; + + struct AsyncFileReadOp { + bufferlist result; + unique_aio_cb_ptr aio_cb; + using Signature = void(boost::system::error_code, bufferlist); + using Completion = ceph::async::Completion; + + int init_async_read(const DoutPrefixProvider *dpp, const std::string& location, off_t read_ofs, off_t read_len, void* arg) { + 
ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl; + aio_cb.reset(new struct aiocb); + memset(aio_cb.get(), 0, sizeof(struct aiocb)); + aio_cb->aio_fildes = TEMP_FAILURE_RETRY(::open(location.c_str(), O_RDONLY|O_CLOEXEC|O_BINARY)); + if(aio_cb->aio_fildes < 0) { + int err = errno; + ldpp_dout(dpp, 1) << "ERROR: D3nDataCache: " << __func__ << "(): can't open " << location << " : " << cpp_strerror(err) << dendl; + return -err; + } + if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL) + posix_fadvise(aio_cb->aio_fildes, 0, 0, g_conf()->rgw_d3n_l1_fadvise); + + bufferptr bp(read_len); + aio_cb->aio_buf = bp.c_str(); + result.append(std::move(bp)); + + aio_cb->aio_nbytes = read_len; + aio_cb->aio_offset = read_ofs; + aio_cb->aio_sigevent.sigev_notify = SIGEV_THREAD; + aio_cb->aio_sigevent.sigev_notify_function = libaio_cb_aio_dispatch; + aio_cb->aio_sigevent.sigev_notify_attributes = nullptr; + aio_cb->aio_sigevent.sigev_value.sival_ptr = arg; + + return 0; + } + + static void libaio_cb_aio_dispatch(sigval sigval) { + lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl; + auto p = std::unique_ptr{static_cast(sigval.sival_ptr)}; + auto op = std::move(p->user_data); + const int ret = -aio_error(op.aio_cb.get()); + boost::system::error_code ec; + if (ret < 0) { + ec.assign(-ret, boost::system::system_category()); + } + + ceph::async::dispatch(std::move(p), ec, std::move(op.result)); + } + + template + static auto create(const Executor1& ex1, CompletionHandler&& handler) { + auto p = Completion::create(ex1, std::move(handler)); + return p; + } + }; + + template + auto async_read(const DoutPrefixProvider *dpp, ExecutionContext& ctx, const std::string& location, + off_t read_ofs, off_t read_len, CompletionToken&& token) { + using Op = AsyncFileReadOp; + using Signature = typename Op::Signature; + boost::asio::async_completion init(token); + auto p = Op::create(ctx.get_executor(), 
init.completion_handler); + auto& op = p->user_data; + + ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl; + int ret = op.init_async_read(dpp, location, read_ofs, read_len, p.get()); + if(0 == ret) { + ret = ::aio_read(op.aio_cb.get()); + } + ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): ::aio_read(), ret=" << ret << dendl; + if(ret < 0) { + auto ec = boost::system::error_code{-ret, boost::system::system_category()}; + ceph::async::post(std::move(p), ec, bufferlist{}); + } else { + (void)p.release(); + } + return init.result.get(); + } + + struct d3n_libaio_handler { + rgw::Aio* throttle = nullptr; + rgw::AioResult& r; + // read callback + void operator()(boost::system::error_code ec, bufferlist bl) const { + r.result = -ec.value(); + r.data = std::move(bl); + throttle->put(r); + } + }; + + void file_aio_read_abstract(const DoutPrefixProvider *dpp, boost::asio::io_context& context, yield_context yield, + std::string& cache_location, off_t read_ofs, off_t read_len, + rgw::Aio* aio, rgw::AioResult& r) { + using namespace boost::asio; + async_completion init(yield); + auto ex = get_associated_executor(init.completion_handler); + + auto& ref = r.obj.get_ref(); + ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): oid=" << ref.obj.oid << dendl; + async_read(dpp, context, cache_location+"/"+url_encode(ref.obj.oid, true), read_ofs, read_len, bind_executor(ex, d3n_libaio_handler{aio, r})); + } + +}; diff --git a/src/rgw/rgw_dencoder.cc b/src/rgw/rgw_dencoder.cc new file mode 100644 index 000000000..2475b45ed --- /dev/null +++ b/src/rgw/rgw_dencoder.cc @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_log.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_cache.h" +#include "rgw_meta_sync_status.h" +#include "rgw_data_sync.h" +#include 
"rgw_multi.h" +#include "rgw_bucket_encryption.h" + +#include "common/Formatter.h" + +using namespace std; + +static string shadow_ns = RGW_OBJ_NS_SHADOW; + +void obj_version::generate_test_instances(list& o) +{ + obj_version *v = new obj_version; + v->ver = 5; + v->tag = "tag"; + + o.push_back(v); + o.push_back(new obj_version); +} + +void RGWBucketEncryptionConfig::generate_test_instances(std::list& o) +{ + auto *bc = new RGWBucketEncryptionConfig("aws:kms", "some:key", true); + o.push_back(bc); + + bc = new RGWBucketEncryptionConfig("AES256"); + o.push_back(bc); + + o.push_back(new RGWBucketEncryptionConfig); +} diff --git a/src/rgw/rgw_dmclock.h b/src/rgw/rgw_dmclock.h new file mode 100644 index 000000000..6fad9cc18 --- /dev/null +++ b/src/rgw/rgw_dmclock.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * Copyright (C) 2019 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "dmclock/src/dmclock_server.h" + +namespace rgw::dmclock { +// TODO: implement read vs write +enum class client_id { + admin, //< /admin apis + auth, //< swift auth, sts + data, //< PutObj, GetObj + metadata, //< bucket operations, object metadata + count +}; + +// TODO move these to dmclock/types or so in submodule +using crimson::dmclock::Cost; +using crimson::dmclock::ClientInfo; + +enum class scheduler_t { + none, + throttler, + dmclock +}; + +inline scheduler_t get_scheduler_t(CephContext* const cct) +{ + const auto scheduler_type = cct->_conf.get_val("rgw_scheduler_type"); + if (scheduler_type == "dmclock") + return scheduler_t::dmclock; + else if (scheduler_type == "throttler") + return scheduler_t::throttler; + else + return scheduler_t::none; +} + +} // namespace rgw::dmclock diff --git a/src/rgw/rgw_dmclock_async_scheduler.cc b/src/rgw/rgw_dmclock_async_scheduler.cc new file mode 100644 index 000000000..28738e9f3 --- /dev/null +++ b/src/rgw/rgw_dmclock_async_scheduler.cc @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/async/completion.h" +#include "rgw_dmclock_async_scheduler.h" +#include "rgw_dmclock_scheduler.h" + +namespace rgw::dmclock { + +AsyncScheduler::~AsyncScheduler() +{ + cancel(); + if (observer) { + cct->_conf.remove_observer(this); + } +} + +const char** AsyncScheduler::get_tracked_conf_keys() const +{ + if (observer) { + return observer->get_tracked_conf_keys(); + } + static const char* keys[] = { "rgw_max_concurrent_requests", nullptr }; + return keys; +} + +void AsyncScheduler::handle_conf_change(const ConfigProxy& conf, + const std::set& changed) +{ + if (observer) { + observer->handle_conf_change(conf, changed); + } + if (changed.count("rgw_max_concurrent_requests")) { + auto new_max = conf.get_val("rgw_max_concurrent_requests"); + max_requests = new_max > 0 ? 
new_max : std::numeric_limits::max(); + } + queue.update_client_infos(); + schedule(crimson::dmclock::TimeZero); +} + +int AsyncScheduler::schedule_request_impl(const client_id& client, + const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield yield_ctx) +{ + ceph_assert(yield_ctx); + + auto &yield = yield_ctx.get_yield_context(); + boost::system::error_code ec; + async_request(client, params, time, cost, yield[ec]); + + if (ec){ + if (ec == boost::system::errc::resource_unavailable_try_again) + return -EAGAIN; + else + return -ec.value(); + } + + return 0; +} + +void AsyncScheduler::request_complete() +{ + --outstanding_requests; + if(auto c = counters(client_id::count)){ + c->inc(throttle_counters::l_outstanding, -1); + } + schedule(crimson::dmclock::TimeZero); +} + +void AsyncScheduler::cancel() +{ + ClientSums sums; + + queue.remove_by_req_filter([&] (RequestRef&& request) { + inc(sums, request->client, request->cost); + auto c = static_cast(request.release()); + Completion::dispatch(std::unique_ptr{c}, + boost::asio::error::operation_aborted, + PhaseType::priority); + return true; + }); + timer.cancel(); + + for (size_t i = 0; i < client_count; i++) { + if (auto c = counters(static_cast(i))) { + on_cancel(c, sums[i]); + } + } +} + +void AsyncScheduler::cancel(const client_id& client) +{ + ClientSum sum; + + queue.remove_by_client(client, false, [&] (RequestRef&& request) { + sum.count++; + sum.cost += request->cost; + auto c = static_cast(request.release()); + Completion::dispatch(std::unique_ptr{c}, + boost::asio::error::operation_aborted, + PhaseType::priority); + }); + if (auto c = counters(client)) { + on_cancel(c, sum); + } + schedule(crimson::dmclock::TimeZero); +} + +void AsyncScheduler::schedule(const Time& time) +{ + timer.expires_at(Clock::from_double(time)); + timer.async_wait([this] (boost::system::error_code ec) { + // process requests unless the wait was canceled. 
// Pulls ready requests from the dmclock queue and completes them until
// either the queue has nothing ready or the number of outstanding requests
// reaches max_requests. Per-client totals are accumulated separately for the
// reservation and priority phases and flushed to the perf counters once at
// the end via on_process().
void AsyncScheduler::process(const Time& now)
{
  // must run in the executor. we should only invoke completion handlers if the
  // executor is running
  assert(get_executor().running_in_this_thread());

  // rsums: requests completed in the reservation phase; psums: all others
  ClientSums rsums, psums;

  while (outstanding_requests < max_requests) {
    auto pull = queue.pull_request(now);

    if (pull.is_none()) {
      // no pending requests, cancel the timer
      timer.cancel();
      break;
    }
    if (pull.is_future()) {
      // update the timer based on the future time
      schedule(pull.getTime());
      break;
    }
    // a request is being admitted: account for it before completing it.
    // counters(client_id::count) is the slot used for scheduler-wide
    // throttle statistics rather than a real client.
    ++outstanding_requests;
    if(auto c = counters(client_id::count)){
      c->inc(throttle_counters::l_outstanding);
    }

    // complete the request
    auto& r = pull.get_retn();
    // copy everything needed for accounting before ownership of the request
    // is released below
    auto client = r.client;
    auto phase = r.phase;
    auto started = r.request->started;
    auto cost = r.request->cost;
    // NOTE(review): the cast target and the unique_ptr element type are not
    // visible in this view; presumably both name the Completion type, since
    // the pointer is handed to Completion::post() -- confirm.
    auto c = static_cast(r.request.release());
    Completion::post(std::unique_ptr{c},
                     boost::system::error_code{}, phase);

    // this `c` shadows the completion pointer above and refers to the
    // client's perf counters; clients without counters are not tallied
    if (auto c = counters(client)) {
      // queueing latency: time from the request's `started` stamp to `now`
      auto lat = Clock::from_double(now) - Clock::from_double(started);
      if (phase == PhaseType::reservation) {
        inc(rsums, client, cost);
        c->tinc(queue_counters::l_res_latency, lat);
      } else {
        inc(psums, client, cost);
        c->tinc(queue_counters::l_prio_latency, lat);
      }
    }
  }

  // record that this pass ended with the concurrency limit reached
  if (outstanding_requests >= max_requests) {
    if(auto c = counters(client_id::count)){
      c->inc(throttle_counters::l_throttle);
    }
  }

  // flush the per-client totals gathered above
  for (size_t i = 0; i < client_count; i++) {
    if (auto c = counters(static_cast(i))) {
      on_process(c, rsums[i], psums[i]);
    }
  }
}
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "common/async/completion.h" + +#include +#include "rgw_dmclock_scheduler.h" +#include "rgw_dmclock_scheduler_ctx.h" + +namespace rgw::dmclock { + namespace async = ceph::async; + +/* + * A dmclock request scheduling service for use with boost::asio. + * + * An asynchronous dmclock priority queue, where scheduled requests complete + * on a boost::asio executor. + */ +class AsyncScheduler : public md_config_obs_t, public Scheduler { + public: + template // args forwarded to PullPriorityQueue ctor + AsyncScheduler(CephContext *cct, boost::asio::io_context& context, + GetClientCounters&& counters, md_config_obs_t *observer, + Args&& ...args); + ~AsyncScheduler(); + + using executor_type = boost::asio::io_context::executor_type; + + /// return the default executor for async_request() callbacks + executor_type get_executor() noexcept { + return timer.get_executor(); + } + + /// submit an async request for dmclock scheduling. the given completion + /// handler will be invoked with (error_code, PhaseType) when the request + /// is ready or canceled. 
on success, this grants a throttle unit that must + /// be returned with a call to request_complete() + template + auto async_request(const client_id& client, const ReqParams& params, + const Time& time, Cost cost, CompletionToken&& token); + + /// returns a throttle unit granted by async_request() + void request_complete() override; + + /// cancel all queued requests, invoking their completion handlers with an + /// operation_aborted error and default-constructed result + void cancel(); + + /// cancel all queued requests for a given client, invoking their completion + /// handler with an operation_aborted error and default-constructed result + void cancel(const client_id& client); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set& changed) override; + + private: + int schedule_request_impl(const client_id& client, const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield yield_ctx) override; + + static constexpr bool IsDelayed = false; + using Queue = crimson::dmclock::PullPriorityQueue; + using RequestRef = typename Queue::RequestRef; + Queue queue; //< dmclock priority queue + + using Signature = void(boost::system::error_code, PhaseType); + using Completion = async::Completion>; + + using Clock = ceph::coarse_real_clock; + using Timer = boost::asio::basic_waitable_timer, executor_type>; + Timer timer; //< timer for the next scheduled request + + CephContext *const cct; + md_config_obs_t *const observer; //< observer to update ClientInfoFunc + GetClientCounters counters; //< provides per-client perf counters + + /// max request throttle + std::atomic max_requests; + std::atomic outstanding_requests = 0; + + /// set a timer to process the next request + void schedule(const Time& time); + + /// process ready requests, then schedule the next pending request + void process(const Time& now); +}; + + +template +AsyncScheduler::AsyncScheduler(CephContext *cct, 
boost::asio::io_context& context, + GetClientCounters&& counters, + md_config_obs_t *observer, Args&& ...args) + : queue(std::forward(args)...), + timer(context), cct(cct), observer(observer), + counters(std::move(counters)), + max_requests(cct->_conf.get_val("rgw_max_concurrent_requests")) +{ + if (max_requests <= 0) { + max_requests = std::numeric_limits::max(); + } + if (observer) { + cct->_conf.add_observer(this); + } +} + +template +auto AsyncScheduler::async_request(const client_id& client, + const ReqParams& params, + const Time& time, Cost cost, + CompletionToken&& token) +{ + boost::asio::async_completion init(token); + + auto ex1 = get_executor(); + auto& handler = init.completion_handler; + + // allocate the Request and add it to the queue + auto completion = Completion::create(ex1, std::move(handler), + Request{client, time, cost}); + // cast to unique_ptr + auto req = RequestRef{std::move(completion)}; + int r = queue.add_request(std::move(req), client, params, time, cost); + if (r == 0) { + // schedule an immediate call to process() on the executor + schedule(crimson::dmclock::TimeZero); + if (auto c = counters(client)) { + c->inc(queue_counters::l_qlen); + c->inc(queue_counters::l_cost, cost); + } + } else { + // post the error code + boost::system::error_code ec(r, boost::system::system_category()); + // cast back to Completion + auto completion = static_cast(req.release()); + async::post(std::unique_ptr{completion}, + ec, PhaseType::priority); + if (auto c = counters(client)) { + c->inc(queue_counters::l_limit); + c->inc(queue_counters::l_limit_cost, cost); + } + } + + return init.result.get(); +} + +class SimpleThrottler : public md_config_obs_t, public dmclock::Scheduler { +public: + SimpleThrottler(CephContext *cct) : + max_requests(cct->_conf.get_val("rgw_max_concurrent_requests")), + counters(cct, "simple-throttler") + { + if (max_requests <= 0) { + max_requests = std::numeric_limits::max(); + } + cct->_conf.add_observer(this); + } + + const 
char** get_tracked_conf_keys() const override { + static const char* keys[] = { "rgw_max_concurrent_requests", nullptr }; + return keys; + } + + void handle_conf_change(const ConfigProxy& conf, + const std::set& changed) override + { + if (changed.count("rgw_max_concurrent_requests")) { + auto new_max = conf.get_val("rgw_max_concurrent_requests"); + max_requests = new_max > 0 ? new_max : std::numeric_limits::max(); + } + } + + void request_complete() override { + --outstanding_requests; + if (auto c = counters(); + c != nullptr) { + c->inc(throttle_counters::l_outstanding, -1); + } + + } + +private: + int schedule_request_impl(const client_id&, const ReqParams&, + const Time&, const Cost&, + optional_yield) override { + if (outstanding_requests++ >= max_requests) { + if (auto c = counters(); + c != nullptr) { + c->inc(throttle_counters::l_outstanding); + c->inc(throttle_counters::l_throttle); + } + return -EAGAIN; + } + + return 0 ; + } + + std::atomic max_requests; + std::atomic outstanding_requests = 0; + ThrottleCounters counters; +}; + +} // namespace rgw::dmclock diff --git a/src/rgw/rgw_dmclock_scheduler.h b/src/rgw/rgw_dmclock_scheduler.h new file mode 100644 index 000000000..655e12bef --- /dev/null +++ b/src/rgw/rgw_dmclock_scheduler.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * (C) 2019 SUSE LLC + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
/*
 * Scope guard that invokes a callable once, when the guard is destroyed.
 *
 * F must be default-constructible to an "empty" state that converts to
 * false in a boolean context (std::function satisfies this). The guard is
 * move-only; moving transfers the pending callable and leaves the source
 * empty so the callable can never run twice.
 */
template <typename F>
class Completer {
public:
  Completer(F &&f): f(std::move(f)) {}
  // Default constructor is needed as we need to create an empty completer
  // that'll be move assigned later in process request
  Completer() = default;
  ~Completer() {
    if (f) {
      f();
    }
  }
  Completer(const Completer&) = delete;
  Completer& operator=(const Completer&) = delete;
  // A moved-from std::function is only guaranteed to be in a valid but
  // unspecified state, so the defaulted move operations could leave a live
  // callable behind in `other` and fire it a second time from other's
  // destructor. Reset the source explicitly instead.
  Completer(Completer&& other) : f(std::move(other.f)) {
    other.f = F{};
  }
  // As with the defaulted version, a callable already held by *this is
  // replaced without being invoked.
  Completer& operator=(Completer&& other) {
    if (this != &other) {
      f = std::move(other.f);
      other.f = F{};
    }
    return *this;
  }
private:
  F f;
};
// Returns the null-terminated list of configuration options this observer
// wants change notifications for: the three per-client dmclock settings
// (_res, _wgt, _lim -- presumably reservation, weight and limit; they are
// passed to ClientInfo in that order by update()) for each of the admin,
// auth, data and metadata clients, plus rgw_max_concurrent_requests.
const char** ClientConfig::get_tracked_conf_keys() const
{
  // static storage: the returned pointer stays valid after the call
  static const char* keys[] = {
    "rgw_dmclock_admin_res",
    "rgw_dmclock_admin_wgt",
    "rgw_dmclock_admin_lim",
    "rgw_dmclock_auth_res",
    "rgw_dmclock_auth_wgt",
    "rgw_dmclock_auth_lim",
    "rgw_dmclock_data_res",
    "rgw_dmclock_data_wgt",
    "rgw_dmclock_data_lim",
    "rgw_dmclock_metadata_res",
    "rgw_dmclock_metadata_wgt",
    "rgw_dmclock_metadata_lim",
    // not read by update(); AsyncScheduler forwards this list when it wraps
    // an observer and itself reacts to this option
    "rgw_max_concurrent_requests",
    nullptr
  };
  return keys;
}
ClientConfig::handle_conf_change(const ConfigProxy& conf, + const std::set& changed) +{ + update(conf); +} + +ClientCounters::ClientCounters(CephContext *cct) +{ + clients[static_cast(client_id::admin)] = + queue_counters::build(cct, "dmclock-admin"); + clients[static_cast(client_id::auth)] = + queue_counters::build(cct, "dmclock-auth"); + clients[static_cast(client_id::data)] = + queue_counters::build(cct, "dmclock-data"); + clients[static_cast(client_id::metadata)] = + queue_counters::build(cct, "dmclock-metadata"); + clients[static_cast(client_id::count)] = + throttle_counters::build(cct, "dmclock-scheduler"); +} + +void inc(ClientSums& sums, client_id client, Cost cost) +{ + auto& sum = sums[static_cast(client)]; + sum.count++; + sum.cost += cost; +} + +void on_cancel(PerfCounters *c, const ClientSum& sum) +{ + if (sum.count) { + c->dec(queue_counters::l_qlen, sum.count); + c->inc(queue_counters::l_cancel, sum.count); + } + if (sum.cost) { + c->dec(queue_counters::l_cost, sum.cost); + c->inc(queue_counters::l_cancel_cost, sum.cost); + } +} + +void on_process(PerfCounters* c, const ClientSum& rsum, const ClientSum& psum) +{ + if (rsum.count) { + c->inc(queue_counters::l_res, rsum.count); + } + if (rsum.cost) { + c->inc(queue_counters::l_res_cost, rsum.cost); + } + if (psum.count) { + c->inc(queue_counters::l_prio, psum.count); + } + if (psum.cost) { + c->inc(queue_counters::l_prio_cost, psum.cost); + } + if (rsum.count + psum.count) { + c->dec(queue_counters::l_qlen, rsum.count + psum.count); + } + if (rsum.cost + psum.cost) { + c->dec(queue_counters::l_cost, rsum.cost + psum.cost); + } +} +} // namespace rgw::dmclock + +namespace queue_counters { + +PerfCountersRef build(CephContext *cct, const std::string& name) +{ + if (!cct->_conf->throttler_perf_counter) { + return {}; + } + + PerfCountersBuilder b(cct, name, l_first, l_last); + b.add_u64(l_qlen, "qlen", "Queue size"); + b.add_u64(l_cost, "cost", "Cost of queued requests"); + b.add_u64_counter(l_res, "res", 
"Requests satisfied by reservation"); + b.add_u64_counter(l_res_cost, "res_cost", "Cost satisfied by reservation"); + b.add_u64_counter(l_prio, "prio", "Requests satisfied by priority"); + b.add_u64_counter(l_prio_cost, "prio_cost", "Cost satisfied by priority"); + b.add_u64_counter(l_limit, "limit", "Requests rejected by limit"); + b.add_u64_counter(l_limit_cost, "limit_cost", "Cost rejected by limit"); + b.add_u64_counter(l_cancel, "cancel", "Cancels"); + b.add_u64_counter(l_cancel_cost, "cancel_cost", "Canceled cost"); + b.add_time_avg(l_res_latency, "res latency", "Reservation latency"); + b.add_time_avg(l_prio_latency, "prio latency", "Priority latency"); + + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; + cct->get_perfcounters_collection()->add(logger.get()); + return logger; +} + +} // namespace queue_counters + +namespace throttle_counters { + +PerfCountersRef build(CephContext *cct, const std::string& name) +{ + if (!cct->_conf->throttler_perf_counter) { + return {}; + } + + PerfCountersBuilder b(cct, name, l_first, l_last); + b.add_u64(l_throttle, "throttle", "Requests throttled"); + b.add_u64(l_outstanding, "outstanding", "Outstanding Requests"); + + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; + cct->get_perfcounters_collection()->add(logger.get()); + return logger; +} + +} // namespace throttle_counters diff --git a/src/rgw/rgw_dmclock_scheduler_ctx.h b/src/rgw/rgw_dmclock_scheduler_ctx.h new file mode 100644 index 000000000..f27b81c26 --- /dev/null +++ b/src/rgw/rgw_dmclock_scheduler_ctx.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/perf_counters.h" +#include "common/ceph_context.h" +#include "common/config.h" +#include "rgw_dmclock.h" + +namespace queue_counters { + + enum { + l_first = 427150, + l_qlen, + l_cost, + l_res, + l_res_cost, + l_prio, + l_prio_cost, + l_limit, + l_limit_cost, + l_cancel, 
+ l_cancel_cost, + l_res_latency, + l_prio_latency, + l_last, + }; + + PerfCountersRef build(CephContext *cct, const std::string& name); + +} // namespace queue_counters + +namespace throttle_counters { + enum { + l_first = 437219, + l_throttle, + l_outstanding, + l_last + }; + + PerfCountersRef build(CephContext *cct, const std::string& name); +} // namespace throttle + +namespace rgw::dmclock { + +// the last client counter would be for global scheduler stats +static constexpr auto counter_size = static_cast(client_id::count) + 1; +/// array of per-client counters to serve as GetClientCounters +class ClientCounters { + std::array clients; + public: + ClientCounters(CephContext *cct); + + PerfCounters* operator()(client_id client) const { + return clients[static_cast(client)].get(); + } +}; + +class ThrottleCounters { + PerfCountersRef counters; +public: + ThrottleCounters(CephContext* const cct,const std::string& name): + counters(throttle_counters::build(cct, name)) {} + + PerfCounters* operator()() const { + return counters.get(); + } +}; + + +struct ClientSum { + uint64_t count{0}; + Cost cost{0}; +}; + +constexpr auto client_count = static_cast(client_id::count); +using ClientSums = std::array; + +void inc(ClientSums& sums, client_id client, Cost cost); +void on_cancel(PerfCounters *c, const ClientSum& sum); +void on_process(PerfCounters* c, const ClientSum& rsum, const ClientSum& psum); + + +class ClientConfig : public md_config_obs_t { + std::vector clients; + + void update(const ConfigProxy &conf); + +public: + ClientConfig(CephContext *cct); + + ClientInfo* operator()(client_id client); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set& changed) override; +}; + +class SchedulerCtx { +public: + SchedulerCtx(CephContext* const cct) : sched_t(get_scheduler_t(cct)) + { + if(sched_t == scheduler_t::dmclock) { + dmc_client_config = std::make_shared(cct); + // we don't have a move only 
// Queues a request and blocks the calling thread until the request has been
// either granted or cancelled.
// Returns 0 when the request became Ready, -ECONNABORTED when it was
// cancelled, or the non-zero code returned by queue.add_request_time() when
// the request could not be queued.
int SyncScheduler::add_request(const client_id& client, const ReqParams& params,
                               const Time& time, Cost cost)
{
  // The synchronization state lives on this thread's stack; the SyncRequest
  // is constructed from it so that whoever completes or cancels the request
  // can change rstate and wake this thread.
  std::mutex req_mtx;
  std::condition_variable req_cv;
  ReqState rstate {ReqState::Wait};
  auto req = SyncRequest{client, time, cost, req_mtx, req_cv, rstate, counters};
  int r = queue.add_request_time(req, client, params, time, cost);
  if (r == 0) {
    // queued: reflect the new entry in the client's queue gauges
    if (auto c = counters(client)) {
      c->inc(queue_counters::l_qlen);
      c->inc(queue_counters::l_cost, cost);
    }
    // NOTE(review): presumably this prompts the queue to look for requests
    // it can run now -- confirm against the queue's documentation.
    queue.request_completed();
    // Perform a blocking wait until the request callback is called
    {
      std::unique_lock lock{req_mtx};
      // the predicate form returns immediately if the state already changed
      // before this thread started waiting
      req_cv.wait(lock, [&rstate] {return rstate != ReqState::Wait;});
    }
    if (rstate == ReqState::Cancelled) {
      //FIXME: decide on error code for cancelled request
      r = -ECONNABORTED;
    }
  } else {
    // post the error code
    if (auto c = counters(client)) {
      c->inc(queue_counters::l_limit);
      c->inc(queue_counters::l_limit_cost, cost);
    }
  }
  return r;
}
+void SyncScheduler::handle_request_cb(const client_id &c, + std::unique_ptr req, + PhaseType phase, Cost cost) +{ + { std::lock_guard lg(req->req_mtx); + req->req_state = ReqState::Ready; + req->req_cv.notify_one(); + } + + if (auto ctr = req->counters(c)) { + auto lat = Clock::from_double(get_time()) - Clock::from_double(req->started); + if (phase == PhaseType::reservation){ + ctr->tinc(queue_counters::l_res_latency, lat); + ctr->inc(queue_counters::l_res); + if (cost) ctr->inc(queue_counters::l_res_cost); + } else if (phase == PhaseType::priority){ + ctr->tinc(queue_counters::l_prio_latency, lat); + ctr->inc(queue_counters::l_prio); + if (cost) ctr->inc(queue_counters::l_prio_cost); + } + ctr->dec(queue_counters::l_qlen); + if (cost) ctr->dec(queue_counters::l_cost); + } +} + + +void SyncScheduler::cancel(const client_id& client) +{ + ClientSum sum; + + queue.remove_by_client(client, false, [&](RequestRef&& request) + { + sum.count++; + sum.cost += request->cost; + { + std::lock_guard lg(request->req_mtx); + request->req_state = ReqState::Cancelled; + request->req_cv.notify_one(); + } + }); + if (auto c = counters(client)) { + on_cancel(c, sum); + } + + queue.request_completed(); +} + +void SyncScheduler::cancel() +{ + ClientSums sums; + + queue.remove_by_req_filter([&](RequestRef&& request) -> bool + { + inc(sums, request->client, request->cost); + { + std::lock_guard lg(request->req_mtx); + request->req_state = ReqState::Cancelled; + request->req_cv.notify_one(); + } + return true; + }); + + for (size_t i = 0; i < client_count; i++) { + if (auto c = counters(static_cast(i))) { + on_cancel(c, sums[i]); + } + } +} + +} // namespace rgw::dmclock diff --git a/src/rgw/rgw_dmclock_sync_scheduler.h b/src/rgw/rgw_dmclock_sync_scheduler.h new file mode 100644 index 000000000..740234965 --- /dev/null +++ b/src/rgw/rgw_dmclock_sync_scheduler.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +/* + 
* Ceph - scalable distributed file system + * + * Copyright (C) 2018 SUSE Linux Gmbh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "rgw_dmclock_scheduler.h" +#include "rgw_dmclock_scheduler_ctx.h" + +namespace rgw::dmclock { +// For a blocking SyncRequest we hold a reference to a cv and the caller must +// ensure the lifetime +struct SyncRequest : public Request { + std::mutex& req_mtx; + std::condition_variable& req_cv; + ReqState& req_state; + GetClientCounters& counters; + explicit SyncRequest(client_id _id, Time started, Cost cost, + std::mutex& mtx, std::condition_variable& _cv, + ReqState& _state, GetClientCounters& counters): + Request{_id, started, cost}, req_mtx(mtx), req_cv(_cv), req_state(_state), counters(counters) {}; +}; + +class SyncScheduler: public Scheduler { +public: + template + SyncScheduler(CephContext *cct, GetClientCounters&& counters, + Args&& ...args); + ~SyncScheduler(); + + // submit a blocking request for dmclock scheduling, this function waits until + // the request is ready. 
+ int add_request(const client_id& client, const ReqParams& params, + const Time& time, Cost cost); + + + void cancel(); + + void cancel(const client_id& client); + + static void handle_request_cb(const client_id& c, std::unique_ptr req, + PhaseType phase, Cost cost); +private: + int schedule_request_impl(const client_id& client, const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield _y [[maybe_unused]]) override + { + return add_request(client, params, time, cost); + } + + static constexpr bool IsDelayed = false; + using Queue = crimson::dmclock::PushPriorityQueue; + using RequestRef = typename Queue::RequestRef; + using Clock = ceph::coarse_real_clock; + + Queue queue; + CephContext const *cct; + GetClientCounters counters; //< provides per-client perf counters +}; + +template +SyncScheduler::SyncScheduler(CephContext *cct, GetClientCounters&& counters, + Args&& ...args): + queue(std::forward(args)...), cct(cct), counters(std::move(counters)) +{} + +} // namespace rgw::dmclock diff --git a/src/rgw/rgw_env.cc b/src/rgw/rgw_env.cc new file mode 100644 index 000000000..d528f0e6d --- /dev/null +++ b/src/rgw/rgw_env.cc @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_common.h" +#include "rgw_log.h" + +#include +#include +#include "include/ceph_assert.h" +#include "rgw_crypt_sanitize.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWEnv::init(CephContext *cct) +{ + conf.init(cct); +} + +void RGWEnv::set(std::string name, std::string val) +{ + env_map[std::move(name)] = std::move(val); +} + +void RGWEnv::init(CephContext *cct, char **envp) +{ + const char *p; + + env_map.clear(); + + for (int i=0; (p = envp[i]); ++i) { + string s(p); + int pos = s.find('='); + if (pos <= 0) // should never be 0 + continue; + string name = s.substr(0, pos); + string val = s.substr(pos + 1); + env_map[name] = 
val; + } + + init(cct); +} + +const char *rgw_conf_get(const map& conf_map, const char *name, const char *def_val) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return def_val; + + return iter->second.c_str(); +} + +boost::optional rgw_conf_get_optional(const map& conf_map, const std::string& name) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return boost::none; + + return boost::optional(iter->second); +} + +const char *RGWEnv::get(const char *name, const char *def_val) const +{ + return rgw_conf_get(env_map, name, def_val); +} + +boost::optional +RGWEnv::get_optional(const std::string& name) const +{ + return rgw_conf_get_optional(env_map, name); +} + +int rgw_conf_get_int(const map& conf_map, const char *name, int def_val) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return def_val; + + const char *s = iter->second.c_str(); + return atoi(s); +} + +int RGWEnv::get_int(const char *name, int def_val) const +{ + return rgw_conf_get_int(env_map, name, def_val); +} + +bool rgw_conf_get_bool(const map& conf_map, const char *name, bool def_val) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return def_val; + + const char *s = iter->second.c_str(); + return rgw_str_to_bool(s, def_val); +} + +bool RGWEnv::get_bool(const char *name, bool def_val) +{ + return rgw_conf_get_bool(env_map, name, def_val); +} + +size_t RGWEnv::get_size(const char *name, size_t def_val) const +{ + const auto iter = env_map.find(name); + if (iter == env_map.end()) + return def_val; + + size_t sz; + try{ + sz = stoull(iter->second); + } catch(...){ + /* it is very unlikely that we'll ever encounter out_of_range, but let's + return the default eitherway */ + sz = def_val; + } + + return sz; +} + +bool RGWEnv::exists(const char *name) const +{ + return env_map.find(name)!= env_map.end(); +} + +bool RGWEnv::exists_prefix(const char *prefix) const +{ + if (env_map.empty() || prefix == NULL) + return false; 
+ + const auto iter = env_map.lower_bound(prefix); + if (iter == env_map.end()) + return false; + + return (strncmp(iter->first.c_str(), prefix, strlen(prefix)) == 0); +} + +void RGWEnv::remove(const char *name) +{ + map::iterator iter = env_map.find(name); + if (iter != env_map.end()) + env_map.erase(iter); +} + +void RGWConf::init(CephContext *cct) +{ + enable_ops_log = cct->_conf->rgw_enable_ops_log; + enable_usage_log = cct->_conf->rgw_enable_usage_log; + + defer_to_bucket_acls = 0; // default + if (cct->_conf->rgw_defer_to_bucket_acls == "recurse") { + defer_to_bucket_acls = RGW_DEFER_TO_BUCKET_ACLS_RECURSE; + } else if (cct->_conf->rgw_defer_to_bucket_acls == "full_control") { + defer_to_bucket_acls = RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL; + } +} diff --git a/src/rgw/rgw_es_main.cc b/src/rgw/rgw_es_main.cc new file mode 100644 index 000000000..6cfbc9352 --- /dev/null +++ b/src/rgw/rgw_es_main.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include "global/global_init.h" +#include "global/global_context.h" + +#include "common/ceph_argparse.h" +#include "common/ceph_json.h" +#include "rgw_es_query.h" + +using namespace std; + +int main(int argc, char *argv[]) +{ + auto args = argv_to_vec(argc, argv); + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + + common_init_finish(g_ceph_context); + + string expr; + + if (argc > 1) { + expr = argv[1]; + } else { + expr = "age >= 30"; + } + + ESQueryCompiler es_query(expr, nullptr, "x-amz-meta-"); + + map aliases = { { "key", "name" }, + { "etag", "meta.etag" }, + { "size", "meta.size" }, + { "mtime", "meta.mtime" }, + { "lastmodified", "meta.mtime" }, + { "contenttype", "meta.contenttype" }, + }; + es_query.set_field_aliases(&aliases); + + map generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR}, + {"name", ESEntityTypeMap::ES_ENTITY_STR}, + {"instance", 
ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.contenttype", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE}, + {"meta.size", ESEntityTypeMap::ES_ENTITY_INT} }; + ESEntityTypeMap gm(generic_map); + es_query.set_generic_type_map(&gm); + + map custom_map = { {"str", ESEntityTypeMap::ES_ENTITY_STR}, + {"int", ESEntityTypeMap::ES_ENTITY_INT}, + {"date", ESEntityTypeMap::ES_ENTITY_DATE} }; + ESEntityTypeMap em(custom_map); + es_query.set_custom_type_map(&em); + + string err; + + bool valid = es_query.compile(&err); + if (!valid) { + cout << "failed to compile query: " << err << std::endl; + return EINVAL; + } + + JSONFormatter f; + encode_json("root", es_query, &f); + + f.flush(cout); + + return 0; +} + diff --git a/src/rgw/rgw_es_query.cc b/src/rgw/rgw_es_query.cc new file mode 100644 index 000000000..16105d599 --- /dev/null +++ b/src/rgw/rgw_es_query.cc @@ -0,0 +1,696 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include +#include + +#include "common/ceph_json.h" +#include "rgw_common.h" +#include "rgw_es_query.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +bool pop_front(list& l, string *s) +{ + if (l.empty()) { + return false; + } + *s = l.front(); + l.pop_front(); + return true; +} + +map operator_map = { + { "or", 1 }, + { "and", 2 }, + { "<", 3 }, + { "<=", 3 }, + { "==", 3 }, + { "!=", 3 }, + { ">=", 3 }, + { ">", 3 }, +}; + +bool is_operator(const string& s) +{ + return (operator_map.find(s) != operator_map.end()); +} + +int operand_value(const string& op) +{ + auto i = operator_map.find(op); + if (i == operator_map.end()) { + return 0; + } + + return i->second; +} + +int check_precedence(const string& op1, const string& op2) +{ + return operand_value(op1) - operand_value(op2); +} + +static bool infix_to_prefix(list& source, 
list *out) +{ + list operator_stack; + list operand_stack; + + operator_stack.push_front("("); + source.push_back(")"); + + for (string& entity : source) { + if (entity == "(") { + operator_stack.push_front(entity); + } else if (entity == ")") { + string popped_operator; + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + + while (popped_operator != "(") { + operand_stack.push_front(popped_operator); + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + } + + } else if (is_operator(entity)) { + string popped_operator; + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + + int precedence = check_precedence(popped_operator, entity); + + while (precedence >= 0) { + operand_stack.push_front(popped_operator); + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + precedence = check_precedence(popped_operator, entity); + } + + operator_stack.push_front(popped_operator); + operator_stack.push_front(entity); + } else { + operand_stack.push_front(entity); + } + + } + + if (!operator_stack.empty()) { + return false; + } + + out->swap(operand_stack); + return true; +} + +class ESQueryNode { +protected: + ESQueryCompiler *compiler; +public: + ESQueryNode(ESQueryCompiler *_compiler) : compiler(_compiler) {} + virtual ~ESQueryNode() {} + + virtual bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) = 0; + + virtual void dump(Formatter *f) const = 0; +}; + +static bool alloc_node(ESQueryCompiler *compiler, ESQueryStack *s, ESQueryNode **pnode, string *perr); + +class ESQueryNode_Bool : public ESQueryNode { + string op; + ESQueryNode *first{nullptr}; + ESQueryNode *second{nullptr}; +public: + explicit ESQueryNode_Bool(ESQueryCompiler *compiler) : ESQueryNode(compiler) {} + ESQueryNode_Bool(ESQueryCompiler *compiler, const string& _op, ESQueryNode *_first, ESQueryNode *_second) :ESQueryNode(compiler), op(_op), first(_first), second(_second) {} + bool init(ESQueryStack *s, ESQueryNode 
**pnode, string *perr) override { + bool valid = s->pop(&op); + if (!valid) { + *perr = "incorrect expression"; + return false; + } + valid = alloc_node(compiler, s, &first, perr) && + alloc_node(compiler, s, &second, perr); + if (!valid) { + return false; + } + *pnode = this; + return true; + } + virtual ~ESQueryNode_Bool() { + delete first; + delete second; + } + + void dump(Formatter *f) const override { + f->open_object_section("bool"); + const char *section = (op == "and" ? "must" : "should"); + f->open_array_section(section); + encode_json("entry", *first, f); + encode_json("entry", *second, f); + f->close_section(); + f->close_section(); + } + +}; + +class ESQueryNodeLeafVal { +public: + ESQueryNodeLeafVal() = default; + virtual ~ESQueryNodeLeafVal() {} + + virtual bool init(const string& str_val, string *perr) = 0; + virtual void encode_json(const string& field, Formatter *f) const = 0; +}; + +class ESQueryNodeLeafVal_Str : public ESQueryNodeLeafVal { + string val; +public: + ESQueryNodeLeafVal_Str() {} + bool init(const string& str_val, string *perr) override { + val = str_val; + return true; + } + void encode_json(const string& field, Formatter *f) const override { + ::encode_json(field.c_str(), val.c_str(), f); + } +}; + +class ESQueryNodeLeafVal_Int : public ESQueryNodeLeafVal { + int64_t val{0}; +public: + ESQueryNodeLeafVal_Int() {} + bool init(const string& str_val, string *perr) override { + string err; + val = strict_strtoll(str_val.c_str(), 10, &err); + if (!err.empty()) { + *perr = string("failed to parse integer: ") + err; + return false; + } + return true; + } + void encode_json(const string& field, Formatter *f) const override { + ::encode_json(field.c_str(), val, f); + } +}; + +class ESQueryNodeLeafVal_Date : public ESQueryNodeLeafVal { + ceph::real_time val; +public: + ESQueryNodeLeafVal_Date() {} + bool init(const string& str_val, string *perr) override { + if (parse_time(str_val.c_str(), &val) < 0) { + *perr = string("failed to parse date: 
") + str_val; + return false; + } + return true; + } + void encode_json(const string& field, Formatter *f) const override { + string s; + rgw_to_iso8601(val, &s); + ::encode_json(field.c_str(), s, f); + } +}; + +class ESQueryNode_Op : public ESQueryNode { +protected: + string op; + string field; + string str_val; + ESQueryNodeLeafVal *val{nullptr}; + ESEntityTypeMap::EntityType entity_type{ESEntityTypeMap::ES_ENTITY_NONE}; + bool allow_restricted{false}; + + bool val_from_str(string *perr) { + switch (entity_type) { + case ESEntityTypeMap::ES_ENTITY_DATE: + val = new ESQueryNodeLeafVal_Date; + break; + case ESEntityTypeMap::ES_ENTITY_INT: + val = new ESQueryNodeLeafVal_Int; + break; + default: + val = new ESQueryNodeLeafVal_Str; + } + return val->init(str_val, perr); + } + bool do_init(ESQueryNode **pnode, string *perr) { + field = compiler->unalias_field(field); + ESQueryNode *effective_node; + if (!handle_nested(&effective_node, perr)) { + return false; + } + if (!val_from_str(perr)) { + return false; + } + *pnode = effective_node; + return true; + } + +public: + ESQueryNode_Op(ESQueryCompiler *compiler) : ESQueryNode(compiler) {} + ~ESQueryNode_Op() { + delete val; + } + virtual bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + bool valid = s->pop(&op) && + s->pop(&str_val) && + s->pop(&field); + if (!valid) { + *perr = "invalid expression"; + return false; + } + return do_init(pnode, perr); + } + bool handle_nested(ESQueryNode **pnode, string *perr); + + void set_allow_restricted(bool allow) { + allow_restricted = allow; + } + + virtual void dump(Formatter *f) const override = 0; +}; + +class ESQueryNode_Op_Equal : public ESQueryNode_Op { +public: + explicit ESQueryNode_Op_Equal(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + ESQueryNode_Op_Equal(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) { + op = "=="; + field = f; + str_val = v; + } + + bool init(ESQueryStack *s, ESQueryNode 
**pnode, string *perr) override { + if (op.empty()) { + return ESQueryNode_Op::init(s, pnode, perr); + } + return do_init(pnode, perr); + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("term"); + val->encode_json(field, f); + f->close_section(); + } +}; + +class ESQueryNode_Op_NotEqual : public ESQueryNode_Op { +public: + explicit ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) { + op = "!="; + field = f; + str_val = v; + } + + bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + if (op.empty()) { + return ESQueryNode_Op::init(s, pnode, perr); + } + return do_init(pnode, perr); + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("bool"); + f->open_object_section("must_not"); + f->open_object_section("term"); + val->encode_json(field, f); + f->close_section(); + f->close_section(); + f->close_section(); + } +}; + +class ESQueryNode_Op_Range : public ESQueryNode_Op { + string range_str; +public: + ESQueryNode_Op_Range(ESQueryCompiler *compiler, const string& rs) : ESQueryNode_Op(compiler), range_str(rs) {} + + virtual void dump(Formatter *f) const override { + f->open_object_section("range"); + f->open_object_section(field.c_str()); + val->encode_json(range_str, f); + f->close_section(); + f->close_section(); + } +}; + +class ESQueryNode_Op_Nested_Parent : public ESQueryNode_Op { +public: + ESQueryNode_Op_Nested_Parent(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + + virtual string get_custom_leaf_field_name() = 0; +}; + +template +class ESQueryNode_Op_Nested : public ESQueryNode_Op_Nested_Parent { + string name; + ESQueryNode *next; +public: + ESQueryNode_Op_Nested(ESQueryCompiler *compiler, const string& _name, ESQueryNode *_next) : ESQueryNode_Op_Nested_Parent(compiler), + name(_name), next(_next) {} + 
~ESQueryNode_Op_Nested() { + delete next; + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("nested"); + string s = string("meta.custom-") + type_str(); + encode_json("path", s.c_str(), f); + f->open_object_section("query"); + f->open_object_section("bool"); + f->open_array_section("must"); + f->open_object_section("entry"); + f->open_object_section("match"); + string n = s + ".name"; + encode_json(n.c_str(), name.c_str(), f); + f->close_section(); + f->close_section(); + encode_json("entry", *next, f); + f->close_section(); + f->close_section(); + f->close_section(); + f->close_section(); + } + + string type_str() const; + string get_custom_leaf_field_name() override { + return string("meta.custom-") + type_str() + ".value"; + } +}; + +template<> +string ESQueryNode_Op_Nested::type_str() const { + return "string"; +} + +template<> +string ESQueryNode_Op_Nested::type_str() const { + return "int"; +} + +template<> +string ESQueryNode_Op_Nested::type_str() const { + return "date"; +} + +bool ESQueryNode_Op::handle_nested(ESQueryNode **pnode, string *perr) +{ + string field_name = field; + const string& custom_prefix = compiler->get_custom_prefix(); + if (!boost::algorithm::starts_with(field_name, custom_prefix)) { + *pnode = this; + auto m = compiler->get_generic_type_map(); + if (m) { + bool found = m->find(field_name, &entity_type) && + (allow_restricted || !compiler->is_restricted(field_name)); + if (!found) { + *perr = string("unexpected generic field '") + field_name + "'"; + } + return found; + } + *perr = "query parser does not support generic types"; + return false; + } + + field_name = field_name.substr(custom_prefix.size()); + auto m = compiler->get_custom_type_map(); + if (m) { + m->find(field_name, &entity_type); + /* ignoring returned bool, for now just treat it as string */ + } + + ESQueryNode_Op_Nested_Parent *new_node; + switch (entity_type) { + case ESEntityTypeMap::ES_ENTITY_INT: + new_node = new 
ESQueryNode_Op_Nested(compiler, field_name, this); + break; + case ESEntityTypeMap::ES_ENTITY_DATE: + new_node = new ESQueryNode_Op_Nested(compiler, field_name, this); + break; + default: + new_node = new ESQueryNode_Op_Nested(compiler, field_name, this); + } + + field = new_node->get_custom_leaf_field_name(); + *pnode = new_node; + + return true; +} + +static bool is_bool_op(const string& str) +{ + return (str == "or" || str == "and"); +} + +static bool alloc_node(ESQueryCompiler *compiler, ESQueryStack *s, ESQueryNode **pnode, string *perr) +{ + string op; + bool valid = s->peek(&op); + if (!valid) { + *perr = "incorrect expression"; + return false; + } + + ESQueryNode *node; + + if (is_bool_op(op)) { + node = new ESQueryNode_Bool(compiler); + } else if (op == "==") { + node = new ESQueryNode_Op_Equal(compiler); + } else if (op == "!=") { + node = new ESQueryNode_Op_NotEqual(compiler); + } else { + static map range_op_map = { + { "<", "lt"}, + { "<=", "lte"}, + { ">=", "gte"}, + { ">", "gt"}, + }; + + auto iter = range_op_map.find(op); + if (iter == range_op_map.end()) { + *perr = string("invalid operator: ") + op; + return false; + } + + node = new ESQueryNode_Op_Range(compiler, iter->second); + } + + if (!node->init(s, pnode, perr)) { + delete node; + return false; + } + return true; +} + + +bool is_key_char(char c) +{ + switch (c) { + case '(': + case ')': + case '<': + case '>': + case '!': + case '@': + case ',': + case ';': + case ':': + case '\\': + case '"': + case '/': + case '[': + case ']': + case '?': + case '=': + case '{': + case '}': + case ' ': + case '\t': + return false; + }; + return (isascii(c) > 0); +} + +static bool is_op_char(char c) +{ + switch (c) { + case '!': + case '<': + case '=': + case '>': + return true; + }; + return false; +} + +static bool is_val_char(char c) +{ + if (isspace(c)) { + return false; + } + return (c != ')'); +} + +void ESInfixQueryParser::skip_whitespace(const char *str, int size, int& pos) { + while (pos < size && 
isspace(str[pos])) { + ++pos; + } +} + +bool ESInfixQueryParser::get_next_token(bool (*filter)(char)) { + skip_whitespace(str, size, pos); + int token_start = pos; + while (pos < size && filter(str[pos])) { + ++pos; + } + if (pos == token_start) { + return false; + } + string token = string(str + token_start, pos - token_start); + args.push_back(token); + return true; +} + +bool ESInfixQueryParser::parse_condition() { + /* + * condition: + * + * whereas key: needs to conform to http header field restrictions + * operator: one of the following: < <= == != >= > + * val: ascii, terminated by either space or ')' (or end of string) + */ + + /* parse key */ + bool valid = get_next_token(is_key_char) && + get_next_token(is_op_char) && + get_next_token(is_val_char); + + if (!valid) { + return false; + } + + return true; +} + +bool ESInfixQueryParser::parse_and_or() { + skip_whitespace(str, size, pos); + if (pos + 3 <= size && strncmp(str + pos, "and", 3) == 0) { + pos += 3; + args.push_back("and"); + return true; + } + + if (pos + 2 <= size && strncmp(str + pos, "or", 2) == 0) { + pos += 2; + args.push_back("or"); + return true; + } + + return false; +} + +bool ESInfixQueryParser::parse_specific_char(const char *pchar) { + skip_whitespace(str, size, pos); + if (pos >= size) { + return false; + } + if (str[pos] != *pchar) { + return false; + } + + args.push_back(pchar); + ++pos; + return true; +} + +bool ESInfixQueryParser::parse_open_bracket() { + return parse_specific_char("("); +} + +bool ESInfixQueryParser::parse_close_bracket() { + return parse_specific_char(")"); +} + +bool ESInfixQueryParser::parse(list *result) { + /* + * expression: [(][[and/or]][)][and/or]... 
+ */ + + while (pos < size) { + parse_open_bracket(); + if (!parse_condition()) { + return false; + } + parse_close_bracket(); + parse_and_or(); + } + + result->swap(args); + + return true; +} + +bool ESQueryCompiler::convert(list& infix, string *perr) { + list prefix; + if (!infix_to_prefix(infix, &prefix)) { + *perr = "invalid query"; + return false; + } + stack.assign(prefix); + if (!alloc_node(this, &stack, &query_root, perr)) { + return false; + } + if (!stack.done()) { + *perr = "invalid query"; + return false; + } + return true; +} + +ESQueryCompiler::~ESQueryCompiler() { + delete query_root; +} + +bool ESQueryCompiler::compile(string *perr) { + list infix; + if (!parser.parse(&infix)) { + *perr = "failed to parse query"; + return false; + } + + if (!convert(infix, perr)) { + return false; + } + + for (auto& c : eq_conds) { + ESQueryNode_Op_Equal *eq_node = new ESQueryNode_Op_Equal(this, c.first, c.second); + eq_node->set_allow_restricted(true); /* can access restricted fields */ + ESQueryNode *effective_node; + if (!eq_node->init(nullptr, &effective_node, perr)) { + delete eq_node; + return false; + } + query_root = new ESQueryNode_Bool(this, "and", effective_node, query_root); + } + + return true; +} + +void ESQueryCompiler::dump(Formatter *f) const { + encode_json("query", *query_root, f); +} + diff --git a/src/rgw/rgw_es_query.h b/src/rgw/rgw_es_query.h new file mode 100644 index 000000000..f96e06f75 --- /dev/null +++ b/src/rgw/rgw_es_query.h @@ -0,0 +1,164 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_string.h" + +class ESQueryStack { + std::list l; + std::list::iterator iter; + +public: + explicit ESQueryStack(std::list& src) { + assign(src); + } + + ESQueryStack() {} + + void assign(std::list& src) { + l.swap(src); + iter = l.begin(); + } + + bool peek(std::string *dest) { + if (done()) { + return false; + } + *dest = *iter; + return true; + } + + bool 
pop(std::string *dest) { + bool valid = peek(dest); + if (!valid) { + return false; + } + ++iter; + return true; + } + + bool done() { + return (iter == l.end()); + } +}; + +class ESInfixQueryParser { + std::string query; + int size; + const char *str; + int pos{0}; + std::list args; + + void skip_whitespace(const char *str, int size, int& pos); + bool get_next_token(bool (*filter)(char)); + + bool parse_condition(); + bool parse_and_or(); + bool parse_specific_char(const char *pchar); + bool parse_open_bracket(); + bool parse_close_bracket(); + +public: + explicit ESInfixQueryParser(const std::string& _query) : query(_query), size(query.size()), str(query.c_str()) {} + bool parse(std::list *result); +}; + +class ESQueryNode; + +struct ESEntityTypeMap { + enum EntityType { + ES_ENTITY_NONE = 0, + ES_ENTITY_STR = 1, + ES_ENTITY_INT = 2, + ES_ENTITY_DATE = 3, + }; + + std::map m; + + explicit ESEntityTypeMap(std::map& _m) : m(_m) {} + + bool find(const std::string& entity, EntityType *ptype) { + auto i = m.find(entity); + if (i != m.end()) { + *ptype = i->second; + return true; + } + + *ptype = ES_ENTITY_NONE; + return false; + } +}; + +class ESQueryCompiler { + ESInfixQueryParser parser; + ESQueryStack stack; + ESQueryNode *query_root{nullptr}; + + std::string custom_prefix; + + bool convert(std::list& infix, std::string *perr); + + std::list > eq_conds; + + ESEntityTypeMap *generic_type_map{nullptr}; + ESEntityTypeMap *custom_type_map{nullptr}; + + std::map *field_aliases = nullptr; + std::set *restricted_fields = nullptr; + +public: + ESQueryCompiler(const std::string& query, + std::list > *prepend_eq_conds, + const std::string& _custom_prefix) + : parser(query), custom_prefix(_custom_prefix) { + if (prepend_eq_conds) { + eq_conds = std::move(*prepend_eq_conds); + } + } + ~ESQueryCompiler(); + + bool compile(std::string *perr); + void dump(Formatter *f) const; + + void set_generic_type_map(ESEntityTypeMap *entity_map) { + generic_type_map = entity_map; + } + + 
ESEntityTypeMap *get_generic_type_map() { + return generic_type_map; + } + const std::string& get_custom_prefix() { return custom_prefix; } + + void set_custom_type_map(ESEntityTypeMap *entity_map) { + custom_type_map = entity_map; + } + + ESEntityTypeMap *get_custom_type_map() { + return custom_type_map; + } + + void set_field_aliases(std::map *fa) { + field_aliases = fa; + } + + std::string unalias_field(const std::string& field) { + if (!field_aliases) { + return field; + } + auto i = field_aliases->find(field); + if (i == field_aliases->end()) { + return field; + } + + return i->second; + } + + void set_restricted_fields(std::set *rf) { + restricted_fields = rf; + } + + bool is_restricted(const std::string& f) { + return (restricted_fields && restricted_fields->find(f) != restricted_fields->end()); + } +}; diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc new file mode 100644 index 000000000..ee32170a1 --- /dev/null +++ b/src/rgw/rgw_file.cc @@ -0,0 +1,2787 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/compat.h" +#include "include/rados/rgw_file.h" + +#include +#include + +#include "rgw_lib.h" +#include "rgw_resolve.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_rest_user.h" +#include "rgw_rest_s3.h" +#include "rgw_os_lib.h" +#include "rgw_auth_s3.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_zone.h" +#include "rgw_file.h" +#include "rgw_lib_frontend.h" +#include "rgw_perf_counters.h" +#include "common/errno.h" + +#include "services/svc_zone.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw; + +namespace rgw { + + const string RGWFileHandle::root_name = "/"; + + std::atomic RGWLibFS::fs_inst_counter; + + uint32_t RGWLibFS::write_completion_interval_s = 10; + + 
ceph::timer RGWLibFS::write_timer{ + ceph::construct_suspended}; + + inline int valid_fs_bucket_name(const string& name) { + int rc = valid_s3_bucket_name(name, false /* relaxed */); + if (rc != 0) { + if (name.size() > 255) + return -ENAMETOOLONG; + return -EINVAL; + } + return 0; + } + + inline int valid_fs_object_name(const string& name) { + int rc = valid_s3_object_name(name); + if (rc != 0) { + if (name.size() > 1024) + return -ENAMETOOLONG; + return -EINVAL; + } + return 0; + } + + class XattrHash + { + public: + std::size_t operator()(const rgw_xattrstr& att) const noexcept { + return XXH64(att.val, att.len, 5882300); + } + }; + + class XattrEqual + { + public: + bool operator()(const rgw_xattrstr& lhs, const rgw_xattrstr& rhs) const { + return ((lhs.len == rhs.len) && + (strncmp(lhs.val, rhs.val, lhs.len) == 0)); + } + }; + + /* well-known attributes */ + static const std::unordered_set< + rgw_xattrstr, XattrHash, XattrEqual> rgw_exposed_attrs = { + rgw_xattrstr{const_cast(RGW_ATTR_ETAG), sizeof(RGW_ATTR_ETAG)-1} + }; + + static inline bool is_exposed_attr(const rgw_xattrstr& k) { + return (rgw_exposed_attrs.find(k) != rgw_exposed_attrs.end()); + } + + LookupFHResult RGWLibFS::stat_bucket(RGWFileHandle* parent, const char *path, + RGWLibFS::BucketStats& bs, + uint32_t flags) + { + LookupFHResult fhr{nullptr, 0}; + std::string bucket_name{path}; + RGWStatBucketRequest req(cct, user->clone(), bucket_name, bs); + + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0) && + (req.matched())) { + fhr = lookup_fh(parent, path, + (flags & RGWFileHandle::FLAG_LOCKED)| + RGWFileHandle::FLAG_CREATE| + RGWFileHandle::FLAG_BUCKET); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + if (! 
(flags & RGWFileHandle::FLAG_LOCKED)) { + rgw_fh->mtx.lock(); + } + rgw_fh->set_times(req.get_ctime()); + /* restore attributes */ + auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1); + auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1); + if (ux_key && ux_attrs) { + DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs); + if (get<0>(dar) || get<1>(dar)) { + update_fh(rgw_fh); + } + } + if (! (flags & RGWFileHandle::FLAG_LOCKED)) { + rgw_fh->mtx.unlock(); + } + } + } + return fhr; + } + + LookupFHResult RGWLibFS::fake_leaf(RGWFileHandle* parent, + const char *path, + enum rgw_fh_type type, + struct stat *st, uint32_t st_mask, + uint32_t flags) + { + /* synthesize a minimal handle from parent, path, type, and st */ + using std::get; + + flags |= RGWFileHandle::FLAG_CREATE; + + switch (type) { + case RGW_FS_TYPE_DIRECTORY: + flags |= RGWFileHandle::FLAG_DIRECTORY; + break; + default: + /* file */ + break; + }; + + LookupFHResult fhr = lookup_fh(parent, path, flags); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + if (st) { + lock_guard guard(rgw_fh->mtx); + if (st_mask & RGW_SETATTR_SIZE) { + rgw_fh->set_size(st->st_size); + } + if (st_mask & RGW_SETATTR_MTIME) { + rgw_fh->set_times(st->st_mtim); + } + } /* st */ + } /* rgw_fh */ + return fhr; + } /* RGWLibFS::fake_leaf */ + + LookupFHResult RGWLibFS::stat_leaf(RGWFileHandle* parent, + const char *path, + enum rgw_fh_type type, + uint32_t flags) + { + /* find either-of , , only one of + * which should exist; atomicity? 
*/ + using std::get; + + LookupFHResult fhr{nullptr, 0}; + + /* XXX the need for two round-trip operations to identify file or + * directory leaf objects is unecessary--the current proposed + * mechanism to avoid this is to store leaf object names with an + * object locator w/o trailing slash */ + + std::string obj_path = parent->format_child_name(path, false); + + for (auto ix : { 0, 1, 2 }) { + switch (ix) { + case 0: + { + /* type hint */ + if (type == RGW_FS_TYPE_DIRECTORY) + continue; + + RGWStatObjRequest req(cct, user->clone(), + parent->bucket_name(), obj_path, + RGWStatObjRequest::FLAG_NONE); + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + fhr = lookup_fh(parent, path, RGWFileHandle::FLAG_CREATE); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_size(req.get_size()); + rgw_fh->set_times(req.get_mtime()); + /* restore attributes */ + auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1); + auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1); + rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG))); + rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL))); + if (!(flags & RGWFileHandle::FLAG_IN_CB) && + ux_key && ux_attrs) { + DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs); + if (get<0>(dar) || get<1>(dar)) { + update_fh(rgw_fh); + } + } + } + goto done; + } + } + break; + case 1: + { + /* try dir form */ + /* type hint */ + if (type == RGW_FS_TYPE_FILE) + continue; + + obj_path += "/"; + RGWStatObjRequest req(cct, user->clone(), + parent->bucket_name(), obj_path, + RGWStatObjRequest::FLAG_NONE); + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + fhr = lookup_fh(parent, path, RGWFileHandle::FLAG_DIRECTORY); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_size(req.get_size()); + rgw_fh->set_times(req.get_mtime()); + /* restore attributes */ + auto ux_key = 
req.get_attr(RGW_ATTR_UNIX_KEY1); + auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1); + rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG))); + rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL))); + if (!(flags & RGWFileHandle::FLAG_IN_CB) && + ux_key && ux_attrs) { + DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs); + if (get<0>(dar) || get<1>(dar)) { + update_fh(rgw_fh); + } + } + } + goto done; + } + } + break; + case 2: + { + std::string object_name{path}; + RGWStatLeafRequest req(cct, user->clone(), + parent, object_name); + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + if (req.matched) { + /* we need rgw object's key name equal to file name, if + * not return NULL */ + if ((flags & RGWFileHandle::FLAG_EXACT_MATCH) && + !req.exact_matched) { + lsubdout(get_context(), rgw, 15) + << __func__ + << ": stat leaf not exact match file name = " + << path << dendl; + goto done; + } + fhr = lookup_fh(parent, path, + RGWFileHandle::FLAG_CREATE| + ((req.is_dir) ? + RGWFileHandle::FLAG_DIRECTORY : + RGWFileHandle::FLAG_NONE)); + /* XXX we don't have an object--in general, there need not + * be one (just a path segment in some other object). In + * actual leaf an object exists, but we'd need another round + * trip to get attrs */ + if (get<0>(fhr)) { + /* for now use the parent object's mtime */ + RGWFileHandle* rgw_fh = get<0>(fhr); + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_mtime(parent->get_mtime()); + } + } + } + } + break; + default: + /* not reached */ + break; + } + } + done: + return fhr; + } /* RGWLibFS::stat_leaf */ + + int RGWLibFS::read(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags) + { + if (! 
rgw_fh->is_file()) + return -EINVAL; + + if (rgw_fh->deleted()) + return -ESTALE; + + RGWReadRequest req(get_context(), user->clone(), rgw_fh, offset, length, buffer); + + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc == 0) && + ((rc = req.get_ret()) == 0)) { + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_atime(real_clock::to_timespec(real_clock::now())); + *bytes_read = req.nread; + } + + return rc; + } + + int RGWLibFS::readlink(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags) + { /* read the link object via RGWReadRequest; -EINVAL if rgw_fh is not a link, -ESTALE if it is deleted, otherwise the request's result; *bytes_read is written only on success */ + if (! rgw_fh->is_link()) + return -EINVAL; + + if (rgw_fh->deleted()) + return -ESTALE; + + RGWReadRequest req(get_context(), user->clone(), rgw_fh, offset, length, buffer); + + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc == 0) && + ((rc = req.get_ret()) == 0)) { + lock_guard guard(rgw_fh->mtx); /* must be a named guard: an unnamed lock_guard(...) temporary unlocks at the end of its own statement */ + rgw_fh->set_atime(real_clock::to_timespec(real_clock::now())); + *bytes_read = req.nread; + } + + return rc; + } + + int RGWLibFS::unlink(RGWFileHandle* rgw_fh, const char* name, uint32_t flags) + { + int rc = 0; + BucketStats bs; + RGWFileHandle* parent = nullptr; + RGWFileHandle* bkt_fh = nullptr; + + if (unlikely(flags & RGWFileHandle::FLAG_UNLINK_THIS)) { + /* LOCKED */ + parent = rgw_fh->get_parent(); + } else { + /* atomicity */ + parent = rgw_fh; + LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_LOCK); + rgw_fh = get<0>(fhr); + /* LOCKED */ + } + + if (parent->is_root()) { + /* a bucket may have an object storing Unix attributes, check + * for and delete it */ + LookupFHResult fhr; + fhr = stat_bucket(parent, name, bs, (rgw_fh) ? + RGWFileHandle::FLAG_LOCKED : + RGWFileHandle::FLAG_NONE); + bkt_fh = get<0>(fhr); + if (unlikely(! bkt_fh)) { + /* implies !rgw_fh, so also !LOCKED */ + return -ENOENT; + } + + if (bs.num_entries > 1) { + unref(bkt_fh); /* return stat_bucket ref */ + if (likely(!! 
rgw_fh)) { /* return lock and ref from + * lookup_fh (or caller in the + * special case of + * RGWFileHandle::FLAG_UNLINK_THIS) */ + rgw_fh->mtx.unlock(); + unref(rgw_fh); + } + return -ENOTEMPTY; + } else { + /* delete object w/key "/" (uxattrs), if any */ + string oname{"/"}; + RGWDeleteObjRequest req(cct, user->clone(), bkt_fh->bucket_name(), oname); + rc = g_rgwlib->get_fe()->execute_req(&req); + /* don't care if ENOENT */ + unref(bkt_fh); + } + + string bname{name}; + RGWDeleteBucketRequest req(cct, user->clone(), bname); + rc = g_rgwlib->get_fe()->execute_req(&req); + if (! rc) { + rc = req.get_ret(); + } + } else { + /* + * leaf object + */ + if (! rgw_fh) { + /* XXX for now, peform a hard lookup to deduce the type of + * object to be deleted ("foo" vs. "foo/")--also, ensures + * atomicity at this endpoint */ + struct rgw_file_handle *fh; + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &fh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (!! rc) + return rc; + + /* rgw_fh ref+ */ + rgw_fh = get_rgwfh(fh); + rgw_fh->mtx.lock(); /* LOCKED */ + } + + std::string oname = rgw_fh->relative_object_name(); + if (rgw_fh->is_dir()) { + /* for the duration of our cache timer, trust positive + * child cache */ + if (rgw_fh->has_children()) { + rgw_fh->mtx.unlock(); + unref(rgw_fh); + return(-ENOTEMPTY); + } + oname += "/"; + } + RGWDeleteObjRequest req(cct, user->clone(), parent->bucket_name(), oname); + rc = g_rgwlib->get_fe()->execute_req(&req); + if (! rc) { + rc = req.get_ret(); + } + } + + /* ENOENT when raced with other s3 gateway */ + if (! rc || rc == -ENOENT) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + } + + if (! 
rc) { + real_time t = real_clock::now(); + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + } + + rgw_fh->mtx.unlock(); + unref(rgw_fh); + + return rc; + } /* RGWLibFS::unlink */ + + int RGWLibFS::rename(RGWFileHandle* src_fh, RGWFileHandle* dst_fh, + const char *_src_name, const char *_dst_name) + + { + /* XXX initial implementation: try-copy, and delete if copy + * succeeds */ + int rc = -EINVAL; + real_time t; + + std::string src_name{_src_name}; + std::string dst_name{_dst_name}; + + /* atomicity */ + LookupFHResult fhr = lookup_fh(src_fh, _src_name, RGWFileHandle::FLAG_LOCK); + RGWFileHandle* rgw_fh = get<0>(fhr); + + /* should not happen */ + if (! rgw_fh) { + ldout(get_context(), 0) << __func__ + << " BUG no such src renaming path=" + << src_name + << dendl; + goto out; + } + + /* forbid renaming of directories (unreasonable at scale) */ + if (rgw_fh->is_dir()) { + ldout(get_context(), 12) << __func__ + << " rejecting attempt to rename directory path=" + << rgw_fh->full_object_name() + << dendl; + rc = -EPERM; + goto unlock; + } + + /* forbid renaming open files (violates intent, for now) */ + if (rgw_fh->is_open()) { + ldout(get_context(), 12) << __func__ + << " rejecting attempt to rename open file path=" + << rgw_fh->full_object_name() + << dendl; + rc = -EPERM; + goto unlock; + } + + t = real_clock::now(); + + for (int ix : {0, 1}) { + switch (ix) { + case 0: + { + RGWCopyObjRequest req(cct, user->clone(), src_fh, dst_fh, src_name, dst_name); + int rc = g_rgwlib->get_fe()->execute_req(&req); + if ((rc != 0) || + ((rc = req.get_ret()) != 0)) { + ldout(get_context(), 1) + << __func__ + << " rename step 0 failed src=" + << src_fh->full_object_name() << " " << src_name + << " dst=" << dst_fh->full_object_name() + << " " << dst_name + << "rc " << rc + << dendl; + goto unlock; + } + ldout(get_context(), 12) + << __func__ + << " rename step 0 success src=" + << src_fh->full_object_name() << " " << src_name + << " 
dst=" << dst_fh->full_object_name() + << " " << dst_name + << " rc " << rc + << dendl; + /* update dst change id */ + dst_fh->set_times(t); + } + break; + case 1: + { + rc = this->unlink(rgw_fh /* LOCKED */, _src_name, + RGWFileHandle::FLAG_UNLINK_THIS); + /* !LOCKED, -ref */ + if (! rc) { + ldout(get_context(), 12) + << __func__ + << " rename step 1 success src=" + << src_fh->full_object_name() << " " << src_name + << " dst=" << dst_fh->full_object_name() + << " " << dst_name + << " rc " << rc + << dendl; + /* update src change id */ + src_fh->set_times(t); + } else { + ldout(get_context(), 1) + << __func__ + << " rename step 1 failed src=" + << src_fh->full_object_name() << " " << src_name + << " dst=" << dst_fh->full_object_name() + << " " << dst_name + << " rc " << rc + << dendl; + } + } + goto out; + default: + ceph_abort(); + } /* switch */ + } /* ix */ + unlock: + rgw_fh->mtx.unlock(); /* !LOCKED */ + unref(rgw_fh); /* -ref */ + + out: + return rc; + } /* RGWLibFS::rename */ + + MkObjResult RGWLibFS::mkdir(RGWFileHandle* parent, const char *name, + struct stat *st, uint32_t mask, uint32_t flags) + { + int rc, rc2; + rgw_file_handle *lfh; + + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (! rc) { + /* conflict! 
*/ + rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE); + // ignore return code + return MkObjResult{nullptr, -EEXIST}; + } + + MkObjResult mkr{nullptr, -EINVAL}; + LookupFHResult fhr; + RGWFileHandle* rgw_fh = nullptr; + buffer::list ux_key, ux_attrs; + + fhr = lookup_fh(parent, name, + RGWFileHandle::FLAG_CREATE| + RGWFileHandle::FLAG_DIRECTORY| + RGWFileHandle::FLAG_LOCK); + rgw_fh = get<0>(fhr); + if (rgw_fh) { + rgw_fh->create_stat(st, mask); + rgw_fh->set_times(real_clock::now()); + /* save attrs */ + rgw_fh->encode_attrs(ux_key, ux_attrs); + if (st) + rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED); + get<0>(mkr) = rgw_fh; + } else { + get<1>(mkr) = -EIO; + return mkr; + } + + if (parent->is_root()) { + /* bucket */ + string bname{name}; + /* enforce S3 name restrictions */ + rc = valid_fs_bucket_name(bname); + if (rc != 0) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + rgw_fh->mtx.unlock(); + unref(rgw_fh); + get<0>(mkr) = nullptr; + get<1>(mkr) = rc; + return mkr; + } + + RGWCreateBucketRequest req(get_context(), user->clone(), bname); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + } else { + /* create an object representing the directory */ + buffer::list bl; + string dir_name = parent->format_child_name(name, true); + + /* need valid S3 name (characters, length <= 1024, etc) */ + rc = valid_fs_object_name(dir_name); + if (rc != 0) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + rgw_fh->mtx.unlock(); + unref(rgw_fh); + get<0>(mkr) = nullptr; + get<1>(mkr) = rc; + return mkr; + } + + RGWPutObjRequest req(get_context(), user->clone(), parent->bucket_name(), dir_name, bl); + + /* save attrs */ + 
req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + } + + if (! ((rc == 0) && + (rc2 == 0))) { + /* op failed */ + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + rgw_fh->mtx.unlock(); /* !LOCKED */ + unref(rgw_fh); + get<0>(mkr) = nullptr; + /* fixup rc */ + if (!rc) + rc = rc2; + } else { + real_time t = real_clock::now(); + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + rgw_fh->mtx.unlock(); /* !LOCKED */ + } + + get<1>(mkr) = rc; + + return mkr; + } /* RGWLibFS::mkdir */ + + MkObjResult RGWLibFS::create(RGWFileHandle* parent, const char *name, + struct stat *st, uint32_t mask, uint32_t flags) + { + int rc, rc2; + + using std::get; + + rgw_file_handle *lfh; + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (! rc) { + /* conflict! */ + rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE); + // ignore return code + return MkObjResult{nullptr, -EEXIST}; + } + + /* expand and check name */ + std::string obj_name = parent->format_child_name(name, false); + rc = valid_fs_object_name(obj_name); + if (rc != 0) { + return MkObjResult{nullptr, rc}; + } + + /* create it */ + buffer::list bl; + RGWPutObjRequest req(cct, user->clone(), parent->bucket_name(), obj_name, bl); + MkObjResult mkr{nullptr, -EINVAL}; + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + if ((rc == 0) && + (rc2 == 0)) { + /* XXX atomicity */ + LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_CREATE | + RGWFileHandle::FLAG_LOCK); + RGWFileHandle* rgw_fh = get<0>(fhr); + if (rgw_fh) { + if (get<1>(fhr) & RGWFileHandle::FLAG_CREATE) { + /* fill in stat data */ + real_time t = real_clock::now(); + rgw_fh->create_stat(st, mask); + rgw_fh->set_times(t); + + parent->set_mtime(real_clock::to_timespec(t)); + 
parent->set_ctime(real_clock::to_timespec(t)); + } + if (st) + (void) rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED); + + rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG))); + rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL))); + + get<0>(mkr) = rgw_fh; + rgw_fh->file_ondisk_version = 0; // inital version + rgw_fh->mtx.unlock(); + } else + rc = -EIO; + } + + get<1>(mkr) = rc; + + /* case like : quota exceed will be considered as fail too*/ + if(rc2 < 0) + get<1>(mkr) = rc2; + + return mkr; + } /* RGWLibFS::create */ + + MkObjResult RGWLibFS::symlink(RGWFileHandle* parent, const char *name, + const char* link_path, struct stat *st, uint32_t mask, uint32_t flags) + { + int rc, rc2; + + using std::get; + + rgw_file_handle *lfh; + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (! rc) { + /* conflict! */ + rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE); + // ignore return code + return MkObjResult{nullptr, -EEXIST}; + } + + MkObjResult mkr{nullptr, -EINVAL}; + LookupFHResult fhr; + RGWFileHandle* rgw_fh = nullptr; + buffer::list ux_key, ux_attrs; + + fhr = lookup_fh(parent, name, + RGWFileHandle::FLAG_CREATE| + RGWFileHandle::FLAG_SYMBOLIC_LINK| + RGWFileHandle::FLAG_LOCK); + rgw_fh = get<0>(fhr); + if (rgw_fh) { + rgw_fh->create_stat(st, mask); + rgw_fh->set_times(real_clock::now()); + /* save attrs */ + rgw_fh->encode_attrs(ux_key, ux_attrs); + if (st) + rgw_fh->stat(st); + get<0>(mkr) = rgw_fh; + } else { + get<1>(mkr) = -EIO; + return mkr; + } + + /* need valid S3 name (characters, length <= 1024, etc) */ + rc = valid_fs_object_name(name); + if (rc != 0) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + rgw_fh->mtx.unlock(); + unref(rgw_fh); + get<0>(mkr) = nullptr; + get<1>(mkr) = rc; + return mkr; + } + + string obj_name = std::string(name); + /* create an object representing the directory */ + 
buffer::list bl; + + /* XXXX */ +#if 0 + bl.push_back( + buffer::create_static(len, static_cast(buffer))); +#else + + bl.push_back( + buffer::copy(link_path, strlen(link_path))); +#endif + + RGWPutObjRequest req(get_context(), user->clone(), parent->bucket_name(), obj_name, bl); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + if (! ((rc == 0) && + (rc2 == 0))) { + /* op failed */ + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + rgw_fh->mtx.unlock(); /* !LOCKED */ + unref(rgw_fh); + get<0>(mkr) = nullptr; + /* fixup rc */ + if (!rc) + rc = rc2; + } else { + real_time t = real_clock::now(); + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + rgw_fh->mtx.unlock(); /* !LOCKED */ + } + + get<1>(mkr) = rc; + + return mkr; + } /* RGWLibFS::symlink */ + + int RGWLibFS::getattr(RGWFileHandle* rgw_fh, struct stat* st) + { + switch(rgw_fh->fh.fh_type) { + case RGW_FS_TYPE_FILE: + { + if (rgw_fh->deleted()) + return -ESTALE; + } + break; + default: + break; + }; + /* if rgw_fh is a directory, mtime will be advanced */ + return rgw_fh->stat(st); + } /* RGWLibFS::getattr */ + + int RGWLibFS::setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask, + uint32_t flags) + { + int rc, rc2; + buffer::list ux_key, ux_attrs; + buffer::list etag = rgw_fh->get_etag(); + buffer::list acls = rgw_fh->get_acls(); + + lock_guard guard(rgw_fh->mtx); + + switch(rgw_fh->fh.fh_type) { + case RGW_FS_TYPE_FILE: + { + if (rgw_fh->deleted()) + return -ESTALE; + } + break; + default: + break; + }; + + string obj_name{rgw_fh->relative_object_name()}; + + if (rgw_fh->is_dir() && + (likely(! 
rgw_fh->is_bucket()))) { + obj_name += "/"; + } + + RGWSetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name); + + rgw_fh->create_stat(st, mask); + rgw_fh->encode_attrs(ux_key, ux_attrs); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + req.emplace_attr(RGW_ATTR_ETAG, std::move(etag)); + req.emplace_attr(RGW_ATTR_ACL, std::move(acls)); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + if (rc == -ENOENT) { + /* special case: materialize placeholder dir */ + buffer::list bl; + RGWPutObjRequest req(get_context(), user->clone(), rgw_fh->bucket_name(), obj_name, bl); + + rgw_fh->encode_attrs(ux_key, ux_attrs); /* because std::moved */ + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + } + + if ((rc != 0) || (rc2 != 0)) { + return -EIO; + } + + rgw_fh->set_ctime(real_clock::to_timespec(real_clock::now())); + + return 0; + } /* RGWLibFS::setattr */ + + static inline std::string prefix_xattr_keystr(const rgw_xattrstr& key) { /* returns RGW_ATTR_META_PREFIX followed by the key.len bytes at key.val */ + std::string keystr; + keystr.reserve(sizeof(RGW_ATTR_META_PREFIX) + key.len); + keystr += string{RGW_ATTR_META_PREFIX}; + keystr += string{key.val, key.len}; + return keystr; + } + + static inline std::string_view unprefix_xattr_keystr(const std::string& key) + { /* strips a leading RGW_ATTR_META_PREFIX; returns an empty view when key does not start with it; the result views key's storage */ + std::string_view svk{key}; + const bool has_prefix = (svk.rfind(RGW_ATTR_META_PREFIX, 0) == 0); /* anchored at offset 0: a prefix occurring later in the key is not a match */ + if (! has_prefix) { + return std::string_view{""}; + } else { + svk.remove_prefix(sizeof(RGW_ATTR_META_PREFIX)-1); + } + return svk; + } + + int RGWLibFS::getxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist *attrs, + rgw_getxattr_cb cb, void *cb_arg, + uint32_t flags) + { + /* cannot store on fs_root, should not on buckets? 
*/ + if ((rgw_fh->is_bucket()) || + (rgw_fh->is_root())) { + return -EINVAL; + } + + int rc, rc2, rc3; + string obj_name{rgw_fh->relative_object_name2()}; + + RGWGetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name); + + for (uint32_t ix = 0; ix < attrs->xattr_cnt; ++ix) { + auto& xattr = attrs->xattrs[ix]; + + /* pass exposed attr keys as given, else prefix */ + std::string k = is_exposed_attr(xattr.key) + ? std::string{xattr.key.val, xattr.key.len} + : prefix_xattr_keystr(xattr.key); + + req.emplace_key(std::move(k)); + } + + if (ldlog_p1(get_context(), ceph_subsys_rgw, 15)) { + lsubdout(get_context(), rgw, 15) + << __func__ + << " get keys for: " + << rgw_fh->object_name() + << " keys:" + << dendl; + for (const auto& attr: req.get_attrs()) { + lsubdout(get_context(), rgw, 15) + << "\tkey: " << attr.first << dendl; + } + } + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + rc3 = ((rc == 0) && (rc2 == 0)) ? 0 : -EIO; + + /* call back w/xattr data */ + if (rc3 == 0) { + const auto& attrs = req.get_attrs(); + for (const auto& attr : attrs) { + + if (!attr.second.has_value()) + continue; + + const auto& k = attr.first; + const auto& v = attr.second.value(); + + /* return exposed attr keys as given, else unprefix -- + * yes, we could have memoized the exposed check, but + * to be efficient it would need to be saved with + * RGWGetAttrs::attrs, I think */ + std::string_view svk = + is_exposed_attr(rgw_xattrstr{const_cast(k.c_str()), + uint32_t(k.length())}) + ? 
k + : unprefix_xattr_keystr(k); + + /* skip entries not matching prefix */ + if (svk.empty()) + continue; + + rgw_xattrstr xattr_k = { const_cast(svk.data()), + uint32_t(svk.length())}; + rgw_xattrstr xattr_v = + {const_cast(const_cast(v).c_str()), + uint32_t(v.length())}; + rgw_xattr xattr = { xattr_k, xattr_v }; + rgw_xattrlist xattrlist = { &xattr, 1 }; + + cb(&xattrlist, cb_arg, RGW_GETXATTR_FLAG_NONE); + } + } + + return rc3; + } /* RGWLibFS::getxattrs */ + + int RGWLibFS::lsxattrs( + RGWFileHandle* rgw_fh, rgw_xattrstr *filter_prefix, rgw_getxattr_cb cb, + void *cb_arg, uint32_t flags) + { + /* cannot store on fs_root, should not on buckets? */ + if ((rgw_fh->is_bucket()) || + (rgw_fh->is_root())) { + return -EINVAL; + } + + int rc, rc2, rc3; + string obj_name{rgw_fh->relative_object_name2()}; + + RGWGetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + rc3 = ((rc == 0) && (rc2 == 0)) ? 0 : -EIO; + + /* call back w/xattr data--check for eof */ + if (rc3 == 0) { + const auto& keys = req.get_attrs(); + for (const auto& k : keys) { + + /* return exposed attr keys as given, else unprefix */ + std::string_view svk = + is_exposed_attr(rgw_xattrstr{const_cast(k.first.c_str()), + uint32_t(k.first.length())}) + ? k.first + : unprefix_xattr_keystr(k.first); + + /* skip entries not matching prefix */ + if (svk.empty()) + continue; + + rgw_xattrstr xattr_k = { const_cast(svk.data()), + uint32_t(svk.length())}; + rgw_xattrstr xattr_v = { nullptr, 0 }; + rgw_xattr xattr = { xattr_k, xattr_v }; + rgw_xattrlist xattrlist = { &xattr, 1 }; + + auto cbr = cb(&xattrlist, cb_arg, RGW_LSXATTR_FLAG_NONE); + if (cbr & RGW_LSXATTR_FLAG_STOP) + break; + } + } + + return rc3; + } /* RGWLibFS::lsxattrs */ + + int RGWLibFS::setxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist *attrs, + uint32_t flags) + { + /* cannot store on fs_root, should not on buckets? 
*/ + if ((rgw_fh->is_bucket()) || + (rgw_fh->is_root())) { + return -EINVAL; + } + + int rc, rc2; + string obj_name{rgw_fh->relative_object_name2()}; + + RGWSetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name); + + for (uint32_t ix = 0; ix < attrs->xattr_cnt; ++ix) { + auto& xattr = attrs->xattrs[ix]; + buffer::list attr_bl; + /* don't allow storing at RGW_ATTR_META_PREFIX */ + if (! (xattr.key.len > 0)) + continue; + + /* reject lexical match with any exposed attr */ + if (is_exposed_attr(xattr.key)) + continue; + + string k = prefix_xattr_keystr(xattr.key); + attr_bl.append(xattr.val.val, xattr.val.len); + req.emplace_attr(k.c_str(), std::move(attr_bl)); + } + + /* don't send null requests */ + if (! (req.get_attrs().size() > 0)) { + return -EINVAL; + } + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + return (((rc == 0) && (rc2 == 0)) ? 0 : -EIO); + + } /* RGWLibFS::setxattrs */ + + int RGWLibFS::rmxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs, + uint32_t flags) + { + /* cannot store on fs_root, should not on buckets? */ + if ((rgw_fh->is_bucket()) || + (rgw_fh->is_root())) { + return -EINVAL; + } + + int rc, rc2; + string obj_name{rgw_fh->relative_object_name2()}; + + RGWRMAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name); + + for (uint32_t ix = 0; ix < attrs->xattr_cnt; ++ix) { + auto& xattr = attrs->xattrs[ix]; + /* don't allow storing at RGW_ATTR_META_PREFIX */ + if (! (xattr.key.len > 0)) { + continue; + } + string k = prefix_xattr_keystr(xattr.key); + req.emplace_key(std::move(k)); + } + + /* don't send null requests */ + if (! (req.get_attrs().size() > 0)) { + return -EINVAL; + } + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + return (((rc == 0) && (rc2 == 0)) ? 
0 : -EIO); + + } /* RGWLibFS::rmxattrs */ + + /* called with rgw_fh->mtx held */ + void RGWLibFS::update_fh(RGWFileHandle *rgw_fh) + { + int rc, rc2; + string obj_name{rgw_fh->relative_object_name()}; + buffer::list ux_key, ux_attrs; + + if (rgw_fh->is_dir() && + (likely(! rgw_fh->is_bucket()))) { + obj_name += "/"; + } + + lsubdout(get_context(), rgw, 17) + << __func__ + << " update old versioned fh : " << obj_name + << dendl; + + RGWSetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name); + + rgw_fh->encode_attrs(ux_key, ux_attrs, false); + + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = g_rgwlib->get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + if ((rc != 0) || (rc2 != 0)) { + lsubdout(get_context(), rgw, 17) + << __func__ + << " update fh failed : " << obj_name + << dendl; + } + } /* RGWLibFS::update_fh */ + + void RGWLibFS::close() + { + state.flags |= FLAG_CLOSED; + + class ObjUnref + { + RGWLibFS* fs; + public: + explicit ObjUnref(RGWLibFS* _fs) : fs(_fs) {} + void operator()(RGWFileHandle* fh) const { + lsubdout(fs->get_context(), rgw, 5) + << __PRETTY_FUNCTION__ + << fh->name + << " before ObjUnref refs=" << fh->get_refcnt() + << dendl; + fs->unref(fh); + } + }; + + /* force cache drain, forces objects to evict */ + fh_cache.drain(ObjUnref(this), + RGWFileHandle::FHCache::FLAG_LOCK); + g_rgwlib->get_fe()->get_process()->unregister_fs(this); + rele(); + } /* RGWLibFS::close */ + + inline std::ostream& operator<<(std::ostream &os, fh_key const &fhk) { + os << ""; + return os; + } + + inline std::ostream& operator<<(std::ostream &os, struct timespec const &ts) { + os << ""; + return os; + } + + std::ostream& operator<<(std::ostream &os, RGWLibFS::event const &ev) { + os << ""; + return os; + } + + void RGWLibFS::gc() + { + using std::get; + using directory = RGWFileHandle::directory; + + /* dirent invalidate timeout--basically, the upper-bound on + * 
inconsistency with the S3 namespace */ + auto expire_s + = get_context()->_conf->rgw_nfs_namespace_expire_secs; + + /* max events to gc in one cycle */ + uint32_t max_ev = get_context()->_conf->rgw_nfs_max_gc; + + struct timespec now, expire_ts; + event_vector ve; + bool stop = false; + std::deque &events = state.events; + + do { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + lsubdout(get_context(), rgw, 15) + << "GC: top of expire loop" + << " now=" << now + << " expire_s=" << expire_s + << dendl; + { + lock_guard guard(state.mtx); /* LOCKED */ + lsubdout(get_context(), rgw, 15) + << "GC: processing" + << " count=" << events.size() + << " events" + << dendl; + /* just return if no events */ + if (events.empty()) { + return; + } + uint32_t _max_ev = + (events.size() < 500) ? max_ev : (events.size() / 4); + for (uint32_t ix = 0; (ix < _max_ev) && (events.size() > 0); ++ix) { + event& ev = events.front(); + expire_ts = ev.ts; + expire_ts.tv_sec += expire_s; + if (expire_ts > now) { + stop = true; + break; + } + ve.push_back(ev); + events.pop_front(); + } + } /* anon */ + /* !LOCKED */ + for (auto& ev : ve) { + lsubdout(get_context(), rgw, 15) + << "try-expire ev: " << ev << dendl; + if (likely(ev.t == event::type::READDIR)) { + RGWFileHandle* rgw_fh = lookup_handle(ev.fhk.fh_hk); + lsubdout(get_context(), rgw, 15) + << "ev rgw_fh: " << rgw_fh << dendl; + if (rgw_fh) { + RGWFileHandle::directory* d; + if (unlikely(! 
rgw_fh->is_dir())) { + lsubdout(get_context(), rgw, 0) + << __func__ + << " BUG non-directory found with READDIR event " + << "(" << rgw_fh->bucket_name() << "," + << rgw_fh->object_name() << ")" + << dendl; + goto rele; + } + /* maybe clear state */ + d = get(&rgw_fh->variant_type); + if (d) { + struct timespec ev_ts = ev.ts; + lock_guard guard(rgw_fh->mtx); + struct timespec d_last_readdir = d->last_readdir; + if (unlikely(ev_ts < d_last_readdir)) { + /* readdir cycle in progress, don't invalidate */ + lsubdout(get_context(), rgw, 15) + << "GC: delay expiration for " + << rgw_fh->object_name() + << " ev.ts=" << ev_ts + << " last_readdir=" << d_last_readdir + << dendl; + continue; + } else { + lsubdout(get_context(), rgw, 15) + << "GC: expiring " + << rgw_fh->object_name() + << dendl; + rgw_fh->clear_state(); + rgw_fh->invalidate(); + } + } + rele: + unref(rgw_fh); + } /* rgw_fh */ + } /* event::type::READDIR */ + } /* ev */ + ve.clear(); + } while (! (stop || shutdown)); + } /* RGWLibFS::gc */ + + std::ostream& operator<<(std::ostream &os, + RGWFileHandle const &rgw_fh) + { + const auto& fhk = rgw_fh.get_key(); + const auto& fh = const_cast(rgw_fh).get_fh(); + os << "fh_type) { + case RGW_FS_TYPE_DIRECTORY: + os << "type=DIRECTORY;"; + break; + case RGW_FS_TYPE_FILE: + os << "type=FILE;"; + break; + default: + os << "type=UNKNOWN;"; + break; + }; + os << "fid=" << fhk.fh_hk.bucket << ":" << fhk.fh_hk.object << ";"; + os << "name=" << rgw_fh.object_name() << ";"; + os << "refcnt=" << rgw_fh.get_refcnt() << ";"; + os << ">"; + return os; + } + + RGWFileHandle::~RGWFileHandle() { + /* !recycle case, handle may STILL be in handle table, BUT + * the partition lock is not held in this path */ + if (fh_hook.is_linked()) { + fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK); + } + /* cond-unref parent */ + if (parent && (! 
parent->is_mount())) { + /* safe because if parent->unref causes its deletion, + * there are a) by refcnt, no other objects/paths pointing + * to it and b) by the semantics of valid iteration of + * fh_lru (observed, e.g., by cohort_lru::drain()) + * no unsafe iterators reaching it either--n.b., this constraint + * is binding on code which may in future attempt to e.g., + * cause the eviction of objects in LRU order */ + (void) get_fs()->unref(parent); + } + } + /* Build the handle key for child name: at depth 0 the name is used as both the bucket and the object component; deeper, the bucket of this handle is paired with make_key_name(name). The tenant component is the id string of the mounting user. */ + fh_key RGWFileHandle::make_fhk(const std::string& name) + { + std::string tenant = get_fs()->get_user()->user_id.to_str(); + if (depth == 0) { + /* S3 bucket -- assert mount-at-bucket case reaches here */ + return fh_key(name, name, tenant); + } else { + std::string key_name = make_key_name(name.c_str()); + return fh_key(fhk.fh_hk.bucket, key_name.c_str(), tenant); + } + } + /* Encode the key of this handle into ux_key1 and the handle itself into ux_attrs1. For files and symbolic links a negative file_ondisk_version is first reset to 0, and the version is incremented after encoding when inc_ov is true. */ + void RGWFileHandle::encode_attrs(ceph::buffer::list& ux_key1, + ceph::buffer::list& ux_attrs1, + bool inc_ov) + { + using ceph::encode; + fh_key fhk(this->fh.fh_hk); + encode(fhk, ux_key1); + bool need_ondisk_version = + (fh.fh_type == RGW_FS_TYPE_FILE || + fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK); + if (need_ondisk_version && + file_ondisk_version < 0) { + file_ondisk_version = 0; + } + encode(*this, ux_attrs1); + if (need_ondisk_version && inc_ov) { + file_ondisk_version++; + } + } /* RGWFileHandle::encode_attrs */ + /* Decode a stored key and unix attrs. The attrs are decoded into a temporary handle and copied into this one unless this is a file or symlink whose file_ondisk_version is already at least the decoded one. Result element 0 is set once the key has been decoded; element 1 is set when state.version ends up below 2. */ + DecodeAttrsResult RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1, + const ceph::buffer::list* ux_attrs1) + { + using ceph::decode; + DecodeAttrsResult dar { false, false }; + fh_key fhk; + auto bl_iter_key1 = ux_key1->cbegin(); + decode(fhk, bl_iter_key1); + get<0>(dar) = true; + + // decode to a temporary file handle which may not be + // copied to the current file handle if its file_ondisk_version + // is not newer + RGWFileHandle tmp_fh(fs); + tmp_fh.fh.fh_type = fh.fh_type; + auto bl_iter_unix1 = ux_attrs1->cbegin(); + decode(tmp_fh, bl_iter_unix1); + + fh.fh_type = tmp_fh.fh.fh_type; + // for file 
handles that represent files and whose file_ondisk_version + // is newer, no updates are needed, otherwise, go updating the current + // file handle + if (!((fh.fh_type == RGW_FS_TYPE_FILE || + fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK) && + file_ondisk_version >= tmp_fh.file_ondisk_version)) { + // make sure the following "encode" always encode a greater version + file_ondisk_version = tmp_fh.file_ondisk_version + 1; + state.dev = tmp_fh.state.dev; + state.size = tmp_fh.state.size; + state.nlink = tmp_fh.state.nlink; + state.owner_uid = tmp_fh.state.owner_uid; + state.owner_gid = tmp_fh.state.owner_gid; + state.unix_mode = tmp_fh.state.unix_mode; + state.ctime = tmp_fh.state.ctime; + state.mtime = tmp_fh.state.mtime; + state.atime = tmp_fh.state.atime; + state.version = tmp_fh.state.version; + } + + if (this->state.version < 2) { + get<1>(dar) = true; + } + + return dar; + } /* RGWFileHandle::decode_attrs */ + /* Reclaim hook: returns false when newobj_fac is not the expected factory type (dynamic_cast yields null) or when its key maps to a different fh_cache partition than this handle; otherwise unlinks this handle from fh_cache if it is still linked and returns true. */ + bool RGWFileHandle::reclaim(const cohort::lru::ObjectFactory* newobj_fac) { + lsubdout(fs->get_context(), rgw, 17) + << __func__ << " " << *this + << dendl; + auto factory = dynamic_cast(newobj_fac); + if (factory == nullptr) { + return false; + } + /* make sure the reclaiming object is the same partition with newobject factory, + * then we can recycle the object, and replace with newobject */ + if (!fs->fh_cache.is_same_partition(factory->fhk.fh_hk.object, fh.fh_hk.object)) { + return false; + } + /* in the non-delete case, handle may still be in handle table */ + if (fh_hook.is_linked()) { + /* in this case, we are being called from a context which holds + * the partition lock */ + fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_NONE); + } + return true; + } /* RGWFileHandle::reclaim */ + /* Returns true only for a directory whose RGWRMdirCheck request executes successfully and reports both valid and has_children; returns false for non-directories and when the request fails. */ + bool RGWFileHandle::has_children() const + { + if (unlikely(! is_dir())) + return false; + + RGWRMdirCheck req(fs->get_context(), + g_rgwlib->get_driver()->get_user(fs->get_user()->user_id), + this); + int rc = g_rgwlib->get_fe()->execute_req(&req); + if (! 
rc) { + return req.valid && req.has_children; + } + + return false; + } + /* Stream a readdir_offset: when the variant holds the uint64_t pointer alternative the pointed-to value is printed, otherwise the other alternative is streamed as-is. */ + std::ostream& operator<<(std::ostream &os, + RGWFileHandle::readdir_offset const &offset) + { + using boost::get; + if (unlikely(!! get(&offset))) { + uint64_t* ioff = get(offset); + os << *ioff; + } + else + os << get(offset); + return os; + } + /* Enumerate this directory through rcb. For directories last_readdir is stamped first. The root handle issues an RGWListBucketsRequest, any other handle an RGWReaddirRequest; on success atime is updated, nlink is reset to 2 when starting from the initial offset, req.d_count is added to nlink and *eof is set. A READDIR event is queued on fs->state regardless of the result. Returns the rc of the executed request. */ + int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg, + readdir_offset offset, + bool *eof, uint32_t flags) + { + using event = RGWLibFS::event; + using boost::get; + int rc = 0; + struct timespec now; + CephContext* cct = fs->get_context(); + + lsubdout(cct, rgw, 10) + << __func__ << " readdir called on " + << object_name() + << dendl; + + directory* d = get(&variant_type); + if (d) { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ + lock_guard guard(mtx); + d->last_readdir = now; + } + + bool initial_off; + char* mk{nullptr}; + + if (likely(!! get(&offset))) { + mk = const_cast(get(offset)); + initial_off = !mk; + } else { + initial_off = (*get(offset) == 0); + } + + if (is_root()) { + RGWListBucketsRequest req(cct, g_rgwlib->get_driver()->get_user(fs->get_user()->user_id), + this, rcb, cb_arg, offset); + rc = g_rgwlib->get_fe()->execute_req(&req); + if (! rc) { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ + lock_guard guard(mtx); + state.atime = now; + if (initial_off) + set_nlink(2); + inc_nlink(req.d_count); + *eof = req.eof(); + } + } else { + RGWReaddirRequest req(cct, g_rgwlib->get_driver()->get_user(fs->get_user()->user_id), + this, rcb, cb_arg, offset); + rc = g_rgwlib->get_fe()->execute_req(&req); + if (! 
rc) { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ + lock_guard guard(mtx); + state.atime = now; + if (initial_off) + set_nlink(2); + inc_nlink(req.d_count); + *eof = req.eof(); + } + } + + event ev(event::type::READDIR, get_key(), state.atime); + lock_guard sguard(fs->state.mtx); + fs->state.push_event(ev); + + lsubdout(fs->get_context(), rgw, 15) + << __func__ + << " final link count=" << state.nlink + << dendl; + + return rc; + } /* RGWFileHandle::readdir */ + /* Feed len bytes from buffer at offset off into the write transaction of this handle, under mtx. Returns -EISDIR when the handle is not a file, -ESTALE when it is deleted (any pending write request is discarded), and -EIO when the first write is not at offset 0 or when starting or continuing the request fails. The first write creates and starts the RGWWriteRequest. *bytes_written is set to the accepted length plus any skipped overlap on success, else 0. */ + int RGWFileHandle::write(uint64_t off, size_t len, size_t *bytes_written, + void *buffer) + { + using std::get; + using WriteCompletion = RGWLibFS::WriteCompletion; + + lock_guard guard(mtx); + + int rc = 0; + + file* f = get(&variant_type); + if (! f) + return -EISDIR; + + if (deleted()) { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << " write attempted on deleted object " + << this->object_name() + << dendl; + /* zap write transaction, if any */ + if (f->write_req) { + delete f->write_req; + f->write_req = nullptr; + } + return -ESTALE; + } + + if (! 
f->write_req) { + /* guard--we do not support (e.g., COW-backed) partial writes */ + if (off != 0) { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << " " << object_name() + << " non-0 initial write position " << off + << " (mounting with -o sync required)" + << dendl; + return -EIO; + } + + const RGWProcessEnv& penv = g_rgwlib->get_fe()->get_process()->get_env(); + + /* start */ + std::string object_name = relative_object_name(); + f->write_req = + new RGWWriteRequest(g_rgwlib->get_driver(), penv, + g_rgwlib->get_driver()->get_user(fs->get_user()->user_id), + this, bucket_name(), object_name); + rc = g_rgwlib->get_fe()->start_req(f->write_req); + if (rc < 0) { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << this->object_name() + << " write start failed " << off + << " (" << rc << ")" + << dendl; + /* zap failed write transaction */ + delete f->write_req; + f->write_req = nullptr; + return -EIO; + } else { + if (stateless_open()) { + /* start write timer */ + f->write_req->timer_id = + RGWLibFS::write_timer.add_event( + std::chrono::seconds(RGWLibFS::write_completion_interval_s), + WriteCompletion(*this)); + } + } + } + /* if this buffer starts below write_req->real_ofs (and reaches it), skip the leading overlap bytes so the data continues at real_ofs */ + int overlap = 0; + if ((static_cast(off) < f->write_req->real_ofs) && + ((f->write_req->real_ofs - off) <= len)) { + overlap = f->write_req->real_ofs - off; + off = f->write_req->real_ofs; + buffer = static_cast(buffer) + overlap; + len -= overlap; + } + + buffer::list bl; + /* XXXX */ +#if 0 + bl.push_back( + buffer::create_static(len, static_cast(buffer))); +#else + bl.push_back( + buffer::copy(static_cast(buffer), len)); +#endif + + f->write_req->put_data(off, bl); + rc = f->write_req->exec_continue(); + + if (rc == 0) { + size_t min_size = off + len; + if (min_size > get_size()) + set_size(min_size); + if (stateless_open()) { + /* bump write timer; NOTE(review): the interval here is a literal 10 seconds while the timer above is armed with RGWLibFS::write_completion_interval_s -- confirm the two are meant to agree */ + RGWLibFS::write_timer.adjust_event( + f->write_req->timer_id, std::chrono::seconds(10)); + } + } else { + /* continuation failed (e.g., non-contiguous write position) */ + 
lsubdout(fs->get_context(), rgw, 5) + << __func__ + << object_name() + << " failed write at position " << off + << " (fails write transaction) " + << dendl; + /* zap failed write transaction */ + delete f->write_req; + f->write_req = nullptr; + rc = -EIO; + } + + *bytes_written = (rc == 0) ? (len + overlap) : 0; + return rc; + } /* RGWFileHandle::write */ + /* Finish the pending write transaction, if any: runs finish_req() and, when that returns 0, reports the result of the request itself; the request is deleted in either case. mtx is taken here unless FLAG_LOCKED is passed. Returns 0 when there is nothing to finish. */ + int RGWFileHandle::write_finish(uint32_t flags) + { + unique_lock guard{mtx, std::defer_lock}; + int rc = 0; + + if (! (flags & FLAG_LOCKED)) { + guard.lock(); + } + + file* f = get(&variant_type); + if (f && (f->write_req)) { + lsubdout(fs->get_context(), rgw, 10) + << __func__ + << " finishing write trans on " << object_name() + << dendl; + rc = g_rgwlib->get_fe()->finish_req(f->write_req); + if (! rc) { + rc = f->write_req->get_ret(); + } + delete f->write_req; + f->write_req = nullptr; + } + + return rc; + } /* RGWFileHandle::write_finish */ + /* Close the handle under mtx: finish any pending write, then clear FLAG_OPEN and FLAG_STATELESS_OPEN. Returns the write_finish() result. */ + int RGWFileHandle::close() + { + lock_guard guard(mtx); + + int rc = write_finish(FLAG_LOCKED); + + flags &= ~FLAG_OPEN; + flags &= ~FLAG_STATELESS_OPEN; + + return rc; + } /* RGWFileHandle::close */ + /* Deletes any write request still attached to the file. */ + RGWFileHandle::file::~file() + { + delete write_req; + } + /* For directories only: reset nlink to 2 and clear last_marker; no effect on other handle types. */ + void RGWFileHandle::clear_state() + { + directory* d = get(&variant_type); + if (d) { + state.nlink = 2; + d->last_marker = rgw_obj_key{}; + } + } + + void RGWFileHandle::advance_mtime(uint32_t flags) { + /* intended for use on directories, fast-forward mtime so as to + * ensure a new, higher value for the change attribute */ + unique_lock uniq(mtx, std::defer_lock); + if (likely(! 
(flags & RGWFileHandle::FLAG_LOCKED))) { + uniq.lock(); + } + + /* advance mtime only if stored mtime is older than the + * configured namespace expiration */ + auto now = real_clock::now(); + auto cmptime = state.mtime; + cmptime.tv_sec += + fs->get_context()->_conf->rgw_nfs_namespace_expire_secs; + if (cmptime < real_clock::to_timespec(now)) { + /* sets ctime as well as mtime, to avoid masking updates should + * ctime inexplicably hold a higher value */ + set_times(now); + } + } + /* Invoke the invalidate callback registered on the filesystem, if any, passing invalidate_arg and the hash key of this handle. */ + void RGWFileHandle::invalidate() { + RGWLibFS *fs = get_fs(); + if (fs->invalidate_cb) { + fs->invalidate_cb(fs->invalidate_arg, get_key().fh_hk); + } + } + /* Begin a streaming write: bind the object to its bucket, read request and versioning params, create and prepare the atomic writer, and chain a compression filter when the placement rule selects one. Returns op_ret (negative on failure; -EINVAL for an empty object). */ + int RGWWriteRequest::exec_start() { + req_state* state = get_state(); + + /* Object needs a bucket from this point */ + state->object->set_bucket(state->bucket.get()); + + auto compression_type = + get_driver()->get_compression_type(state->bucket->get_placement_rule()); + + /* not obviously supportable */ + ceph_assert(! dlo_manifest); + ceph_assert(! 
slo_info); + + perfcounter->inc(l_rgw_put); + op_ret = -EINVAL; + + if (state->object->empty()) { + ldout(state->cct, 0) << __func__ << " called on empty object" << dendl; + goto done; + } + + op_ret = get_params(null_yield); + if (op_ret < 0) + goto done; + + op_ret = get_system_versioning_params(state, &olh_epoch, &version_id); + if (op_ret < 0) { + goto done; + } + + /* user-supplied MD5 check skipped (not supplied) */ + /* early quota check skipped--we don't have size yet */ + /* skipping user-supplied etag--we might have one in future, but + * like data it and other attrs would arrive after open */ + + aio.emplace(state->cct->_conf->rgw_put_obj_min_window_size); + /* versioned bucket: honor a supplied version_id, otherwise generate a random instance name and record it in version_id */ + if (state->bucket->versioning_enabled()) { + if (!version_id.empty()) { + state->object->set_instance(version_id); + } else { + state->object->gen_rand_obj_instance_name(); + version_id = state->object->get_instance(); + } + } + processor = get_driver()->get_atomic_writer(this, state->yield, state->object.get(), + state->bucket_owner.get_id(), + &state->dest_placement, 0, state->req_id); + + op_ret = processor->prepare(state->yield); + if (op_ret < 0) { + ldout(state->cct, 20) << "processor->prepare() returned ret=" << op_ret + << dendl; + goto done; + } + /* data flows through filter; it starts as the processor and is replaced by the compressor below when a plugin loads */ filter = &*processor; + if (compression_type != "none") { + plugin = Compressor::create(state->cct, compression_type); + if (! 
plugin) { + ldout(state->cct, 1) << "Cannot load plugin for rgw_compression_type " + << compression_type << dendl; + } else { + compressor.emplace(state->cct, plugin, filter); + filter = &*compressor; + } + } + + done: + return op_ret; + } /* exec_start */ + /* Process the currently buffered chunk: returns -EIO when chunks arrived out of order (eio) or the quota check fails, 0 for an empty chunk; otherwise the data is hashed, pushed through the filter chain at ofs and bytes_written is advanced. Returns op_ret from the filter on failure. */ + int RGWWriteRequest::exec_continue() + { + req_state* state = get_state(); + op_ret = 0; + + /* check guards (e.g., contig write) */ + if (eio) { + ldout(state->cct, 5) + << " chunks arrived in wrong order" + << " (mounting with -o sync required)" + << dendl; + return -EIO; + } + + op_ret = state->bucket->check_quota(this, quota, real_ofs, null_yield, true); + /* max_size exceed */ + if (op_ret < 0) + return -EIO; + + size_t len = data.length(); + if (! len) + return 0; + + hash.Update((const unsigned char *)data.c_str(), data.length()); + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + goto done; + } + bytes_written += len; + + done: + return op_ret; + } /* exec_continue */ + /* Complete the write: flush the filters, re-check quota against the final size, finalize the MD5 etag, attach compression, etag, ACL and unix attrs, then complete the processor. The previous mtime, ctime and size of the handle are saved up front and restored if complete() fails. Returns op_ret. */ + int RGWWriteRequest::exec_finish() + { + buffer::list bl, aclbl, ux_key, ux_attrs; + map::iterator iter; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + req_state* state = get_state(); + + size_t osize = rgw_fh->get_size(); + struct timespec octime = rgw_fh->get_ctime(); + struct timespec omtime = rgw_fh->get_mtime(); + real_time appx_t = real_clock::now(); + + state->obj_size = bytes_written; + perfcounter->inc(l_rgw_put_b, state->obj_size); + + // flush data in filters + op_ret = filter->process({}, state->obj_size); + if (op_ret < 0) { + goto done; + } + + op_ret = state->bucket->check_quota(this, quota, state->obj_size, null_yield, true); + /* max_size exceed */ + if (op_ret < 0) { + goto done; + } + + hash.Final(m); + + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = state->obj_size; + cs_info.blocks = 
std::move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + ldpp_dout(this, 20) << "storing " << RGW_ATTR_COMPRESSION + << " with type=" << cs_info.compression_type + << ", orig_size=" << cs_info.orig_size + << ", blocks=" << cs_info.blocks.size() << dendl; + } + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + etag = calc_md5; + /* the etag attr is stored including its terminating NUL (size() + 1) */ + bl.append(etag.c_str(), etag.size() + 1); + emplace_attr(RGW_ATTR_ETAG, std::move(bl)); + + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + /* unix attrs -- the handle is updated here, before complete() runs */ + rgw_fh->set_mtime(real_clock::to_timespec(appx_t)); + rgw_fh->set_ctime(real_clock::to_timespec(appx_t)); + rgw_fh->set_size(bytes_written); + rgw_fh->encode_attrs(ux_key, ux_attrs); + + emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + /* generic attrs are likewise stored with a trailing NUL */ + for (iter = state->generic_attrs.begin(); iter != state->generic_attrs.end(); + ++iter) { + buffer::list& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + op_ret = rgw_get_request_metadata(this, state->cct, state->info, attrs); + if (op_ret < 0) { + /* NOTE(review): mtime, ctime and size of the handle were already updated above and are not restored on this early exit -- confirm this is intended */ goto done; + } + encode_delete_at_attr(delete_at, attrs); + + /* Add a custom metadata to expose the information whether an object + * is an SLO or not. Appending the attribute must be performed AFTER + * processing any input from user in order to prohibit overwriting. */ + if (unlikely(!! slo_info)) { + buffer::list slo_userindicator_bl; + using ceph::encode; + encode("True", slo_userindicator_bl); + emplace_attr(RGW_ATTR_SLO_UINDICATOR, std::move(slo_userindicator_bl)); + } + + op_ret = processor->complete(state->obj_size, etag, &mtime, real_time(), attrs, + (delete_at ? 
*delete_at : real_time()), + if_match, if_nomatch, nullptr, nullptr, nullptr, + state->yield); + if (op_ret != 0) { + /* revert attr updates */ + rgw_fh->set_mtime(omtime); + rgw_fh->set_ctime(octime); + rgw_fh->set_size(osize); + } + + done: + perfcounter->tinc(l_rgw_put_lat, state->time_elapsed()); + return op_ret; + } /* exec_finish */ + +} /* namespace rgw */ + +/* librgw */ +extern "C" { + /* Report the library version; each output pointer is optional and is skipped when null. */ +void rgwfile_version(int *major, int *minor, int *extra) +{ + if (major) + *major = LIBRGW_FILE_VER_MAJOR; + if (minor) + *minor = LIBRGW_FILE_VER_MINOR; + if (extra) + *extra = LIBRGW_FILE_VER_EXTRA; +} + +/* + attach rgw namespace: creates an RGWLibFS rooted at the slash path, authorizes it, registers it for shared gc and returns its rgw_fs through *rgw_fs; any authorize failure deletes the new fs and is reported as -EINVAL; the flags argument is not used here +*/ + int rgw_mount(librgw_t rgw, const char *uid, const char *acc_key, + const char *sec_key, struct rgw_fs **rgw_fs, + uint32_t flags) +{ + int rc = 0; + + /* stash access data for "mount" */ + RGWLibFS* new_fs = new RGWLibFS(static_cast(rgw), uid, acc_key, + sec_key, "/"); + ceph_assert(new_fs); + + const DoutPrefix dp(g_rgwlib->get_driver()->ctx(), dout_subsys, "rgw mount: "); + rc = new_fs->authorize(&dp, g_rgwlib->get_driver()); + if (rc != 0) { + delete new_fs; + return -EINVAL; + } + + /* register fs for shared gc */ + g_rgwlib->get_fe()->get_process()->register_fs(new_fs); + + struct rgw_fs *fs = new_fs->get_fs(); + fs->rgw = rgw; + + /* XXX we no longer assume "/" is unique, but we aren't tracking the + * roots atm */ + + *rgw_fs = fs; + + return 0; +} + /* Same as rgw_mount but with a caller-supplied root: a non-null empty root string is replaced by the slash path, any other value (including null) is passed to RGWLibFS unchanged. */ +int rgw_mount2(librgw_t rgw, const char *uid, const char *acc_key, + const char *sec_key, const char *root, struct rgw_fs **rgw_fs, + uint32_t flags) +{ + int rc = 0; + + /* if the config has no value for path/root, choose "/" */ + RGWLibFS* new_fs{nullptr}; + if(root && + (!strcmp(root, ""))) { + /* stash access data for "mount" */ + new_fs = new RGWLibFS( + static_cast(rgw), uid, acc_key, sec_key, "/"); + } + else { + /* stash access data for "mount" */ + new_fs = new RGWLibFS( + static_cast(rgw), uid, acc_key, sec_key, root); + } + + ceph_assert(new_fs); /* should we be using 
ceph_assert? */ + + const DoutPrefix dp(g_rgwlib->get_driver()->ctx(), dout_subsys, "rgw mount2: "); + rc = new_fs->authorize(&dp, g_rgwlib->get_driver()); + if (rc != 0) { + delete new_fs; + return -EINVAL; + } + + /* register fs for shared gc */ + g_rgwlib->get_fe()->get_process()->register_fs(new_fs); + + struct rgw_fs *fs = new_fs->get_fs(); + fs->rgw = rgw; + + /* XXX we no longer assume "/" is unique, but we aren't tracking the + * roots atm */ + + *rgw_fs = fs; + + return 0; +} + +/* + register invalidate callbacks: forwards cb, arg and flags to RGWLibFS::register_invalidate() and returns its result +*/ +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags) + +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + return fs->register_invalidate(cb, arg, flags); +} + +/* + detach rgw namespace: calls RGWLibFS::close() on the mount and always returns 0; the flags argument is not used here +*/ +int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + fs->close(); + return 0; +} + +/* + get filesystem attributes: fills vfs_st from cluster statistics fetched with RGWGetClusterStatReq; returns the negative request error on failure, else 0; parent_fh and flags are not used here +*/ +int rgw_statfs(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + struct rgw_statvfs *vfs_st, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + struct rados_cluster_stat_t stats; + + RGWGetClusterStatReq req(fs->get_context(), + g_rgwlib->get_driver()->get_user(fs->get_user()->user_id), + stats); + int rc = g_rgwlib->get_fe()->execute_req(&req); + if (rc < 0) { + lderr(fs->get_context()) << "ERROR: getting total cluster usage" + << cpp_strerror(-rc) << dendl; + return rc; + } + + //Set block size to 1M. 
+ /* stats.kb and stats.kb_avail are converted to 1 MiB blocks by shifting right by CEPH_BLOCK_SHIFT - 10 */ constexpr uint32_t CEPH_BLOCK_SHIFT = 20; + vfs_st->f_bsize = 1 << CEPH_BLOCK_SHIFT; + vfs_st->f_frsize = 1 << CEPH_BLOCK_SHIFT; + vfs_st->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10); + vfs_st->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + vfs_st->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + vfs_st->f_files = stats.num_objects; + vfs_st->f_ffree = -1; + vfs_st->f_fsid[0] = fs->get_fsid(); + vfs_st->f_fsid[1] = fs->get_fsid(); + vfs_st->f_flag = 0; + vfs_st->f_namemax = 4096; + return 0; +} + +/* + generic create -- create an empty regular file; the parent must be non-null and neither the root nor a file, else -EINVAL; *fh is written only when a handle was produced; the return value is the status element of the RGWLibFS::create() result; posix_flags is not used here +*/ +int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags) +{ + using std::get; + + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + if ((! parent) || + (parent->is_root()) || + (parent->is_file())) { + /* bad parent */ + return -EINVAL; + } + + MkObjResult fhr = fs->create(parent, name, st, mask, flags); + RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success + + if (nfh) + *fh = nfh->get_fh(); + + return get<1>(fhr); +} /* rgw_create */ + +/* + create a symbolic link; same parent checks and result handling as rgw_create, delegating to RGWLibFS::symlink() + */ +int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, const char *link_path, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags) +{ + using std::get; + + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + if ((! 
parent) || + (parent->is_root()) || + (parent->is_file())) { + /* bad parent */ + return -EINVAL; + } + + MkObjResult fhr = fs->symlink(parent, name, link_path, st, mask, flags); + RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success + + if (nfh) + *fh = nfh->get_fh(); + + return get<1>(fhr); +} /* rgw_symlink */ + +/* + create a new directory; only a null parent is rejected here (-EINVAL); *fh is written only when a handle was produced; returns the status element of the RGWLibFS::mkdir() result +*/ +int rgw_mkdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t flags) +{ + using std::get; + + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + if (! parent) { + /* bad parent */ + return -EINVAL; + } + + MkObjResult fhr = fs->mkdir(parent, name, st, mask, flags); + RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success + + if (nfh) + *fh = nfh->get_fh(); + + return get<1>(fhr); +} /* rgw_mkdir */ + +/* + rename object: forwards both handles and names to RGWLibFS::rename(); the flags argument is not used here +*/ +int rgw_rename(struct rgw_fs *rgw_fs, + struct rgw_file_handle *src, const char* src_name, + struct rgw_file_handle *dst, const char* dst_name, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + + RGWFileHandle* src_fh = get_rgwfh(src); + RGWFileHandle* dst_fh = get_rgwfh(dst); + + return fs->rename(src_fh, dst_fh, src_name, dst_name); +} + +/* + remove file or directory: forwards to RGWLibFS::unlink(); the flags argument is not used here +*/ +int rgw_unlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + return fs->unlink(parent, name); +} + +/* + lookup object by name (POSIX style); the parent must be a directory, else -EINVAL; returns -ENOENT when no handle is found (below the root, unless RGW_LOOKUP_FLAG_CREATE is set); on success stores the handle in *fh and returns 0 +*/ +int rgw_lookup(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char* path, + struct rgw_file_handle **fh, + struct stat *st, uint32_t mask, uint32_t flags) +{ + //CephContext* cct = static_cast(rgw_fs->rgw); + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + + RGWFileHandle* parent = get_rgwfh(parent_fh); + if ((! parent) || + (! 
parent->is_dir())) { + /* bad parent */ + return -EINVAL; + } + + RGWFileHandle* rgw_fh; + LookupFHResult fhr; + /* at the root, names are resolved as buckets via stat_bucket(); below it, as leaf objects */ + if (parent->is_root()) { + /* special: parent lookup--note lack of ref()! */ + if (unlikely((strcmp(path, "..") == 0) || + (strcmp(path, "/") == 0))) { + rgw_fh = parent; + } else { + RGWLibFS::BucketStats bstat; + fhr = fs->stat_bucket(parent, path, bstat, RGWFileHandle::FLAG_NONE); + rgw_fh = get<0>(fhr); + if (! rgw_fh) + return -ENOENT; + } + } else { + /* special: after readdir--note extra ref()! */ + if (unlikely((strcmp(path, "..") == 0))) { + rgw_fh = parent; + lsubdout(fs->get_context(), rgw, 17) + << __func__ << " BANG"<< *rgw_fh + << dendl; + fs->ref(rgw_fh); + } else { + enum rgw_fh_type fh_type = fh_type_of(flags); + /* lookups issued from a readdir callback use FLAG_IN_CB; all others use FLAG_EXACT_MATCH */ + uint32_t sl_flags = (flags & RGW_LOOKUP_FLAG_RCB) + ? RGWFileHandle::FLAG_IN_CB + : RGWFileHandle::FLAG_EXACT_MATCH; + + bool fast_attrs= fs->get_context()->_conf->rgw_nfs_s3_fast_attrs; + + if ((flags & RGW_LOOKUP_FLAG_RCB) && fast_attrs) { + /* FAKE STAT--this should mean, interpolate special + * owner, group, and perms masks */ + fhr = fs->fake_leaf(parent, path, fh_type, st, mask, sl_flags); + } else { + if ((fh_type == RGW_FS_TYPE_DIRECTORY) && fast_attrs) { + /* trust cached dir, if present */ + fhr = fs->lookup_fh(parent, path, RGWFileHandle::FLAG_DIRECTORY); + if (get<0>(fhr)) { + rgw_fh = get<0>(fhr); + goto done; + } + } + fhr = fs->stat_leaf(parent, path, fh_type, sl_flags); + } + if (! get<0>(fhr)) { + if (! 
(flags & RGW_LOOKUP_FLAG_CREATE)) + return -ENOENT; + else + fhr = fs->lookup_fh(parent, path, RGWFileHandle::FLAG_CREATE); + } + rgw_fh = get<0>(fhr); + } + } /* !root */ + /* NOTE(review): rgw_fh is dereferenced below without a null check -- confirm lookup_fh with FLAG_CREATE cannot yield a null handle */ +done: + struct rgw_file_handle *rfh = rgw_fh->get_fh(); + *fh = rfh; + + return 0; +} /* rgw_lookup */ + +/* + lookup object by handle (NFS style): returns -ENOENT when RGWLibFS::lookup_handle() finds nothing, otherwise stores the handle in *fh and returns 0; the flags argument is not used here +*/ +int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk, + struct rgw_file_handle **fh, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + + RGWFileHandle* rgw_fh = fs->lookup_handle(*fh_hk); + if (! rgw_fh) { + /* not found */ + return -ENOENT; + } + + struct rgw_file_handle *rfh = rgw_fh->get_fh(); + *fh = rfh; + + return 0; +} + +/* + * release file handle: logs the handle and drops one reference through RGWLibFS::unref(); always returns 0 + */ +int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + lsubdout(fs->get_context(), rgw, 17) + << __func__ << " " << *rgw_fh + << dendl; + + fs->unref(rgw_fh); + return 0; +} + +/* + get unix attributes for object: forwards to RGWLibFS::getattr(); the flags argument is not used here +*/ +int rgw_getattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->getattr(rgw_fh, st); +} + +/* + set unix attributes for object: forwards to RGWLibFS::setattr() +*/ +int rgw_setattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t mask, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->setattr(rgw_fh, st, mask, flags); +} + +/* + truncate file: currently a no-op that always returns 0 +*/ +int rgw_truncate(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t size, uint32_t flags) +{ + return 0; +} + +/* + open file: returns -EISDIR unless the handle is a file, otherwise the result of RGWFileHandle::open(flags); posix_flags is not used here +*/ +int rgw_open(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint32_t posix_flags, uint32_t flags) +{ + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + /* XXX + * need to track specific opens--at least read opens 
and + * a write open; we need to know when a write open is returned, + * that closes a write transaction + * + * for now, we will support single-open only, it's preferable to + * anything we can otherwise do without access to the NFS state + */ + if (! rgw_fh->is_file()) + return -EISDIR; + + return rgw_fh->open(flags); +} + +/* + close file: calls RGWFileHandle::close() and, when RGW_CLOSE_FLAG_RELE is set, drops one reference through RGWLibFS::unref(); returns the close() result +*/ +int rgw_close(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + int rc = rgw_fh->close(/* XXX */); + + if (flags & RGW_CLOSE_FLAG_RELE) + fs->unref(rgw_fh); + + return rc; +} + /* enumeration continuing from a numeric offset; a null parent yields -EINVAL; when *offset is 0 and RGW_READDIR_FLAG_DOTDOT is set, the dot and dot-dot entries are sent first with offsets 1 and 2; returns the RGWFileHandle::readdir() result */ +int rgw_readdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, uint64_t *offset, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags) +{ + RGWFileHandle* parent = get_rgwfh(parent_fh); + if (! parent) { + /* bad parent */ + return -EINVAL; + } + + lsubdout(parent->get_fs()->get_context(), rgw, 15) + << __func__ + << " offset=" << *offset + << dendl; + + if ((*offset == 0) && + (flags & RGW_READDIR_FLAG_DOTDOT)) { + /* send '.' and '..' with their NFS-defined offsets */ + rcb(".", cb_arg, 1, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + rcb("..", cb_arg, 2, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + } + + int rc = parent->readdir(rcb, cb_arg, offset, eof, flags); + return rc; +} /* rgw_readdir */ + +/* enumeration continuing from name; a null parent yields -EINVAL; a null name together with RGW_READDIR_FLAG_DOTDOT sends the dot and dot-dot entries first */ +int rgw_readdir2(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *name, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags) +{ + RGWFileHandle* parent = get_rgwfh(parent_fh); + if (! parent) { + /* bad parent */ + return -EINVAL; + } + + lsubdout(parent->get_fs()->get_context(), rgw, 15) + << __func__ + << " offset=" << ((name) ? name : "(nil)") + << dendl; + + if ((! name) && + (flags & RGW_READDIR_FLAG_DOTDOT)) { + /* send '.' and '..' 
with their NFS-defined offsets */ + rcb(".", cb_arg, 1, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + rcb("..", cb_arg, 2, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + } + + int rc = parent->readdir(rcb, cb_arg, name, eof, flags); + return rc; +} /* rgw_readdir2 */ + +/* project offset of dirent name: a null parent yields -EINVAL, otherwise returns RGWFileHandle::offset_of(); NOTE(review): name is used to construct a std::string without a null check -- confirm callers never pass null */ +int rgw_dirent_offset(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, int64_t *offset, + uint32_t flags) +{ + RGWFileHandle* parent = get_rgwfh(parent_fh); + if ((! parent)) { + /* bad parent */ + return -EINVAL; + } + std::string sname{name}; + int rc = parent->offset_of(sname, offset, flags); + return rc; +} + +/* + read data from file: forwards to RGWLibFS::read() +*/ +int rgw_read(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->read(rgw_fh, offset, length, bytes_read, buffer, flags); +} + +/* + read symbolic link: forwards to RGWLibFS::readlink() +*/ +int rgw_readlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->readlink(rgw_fh, offset, length, bytes_read, buffer, flags); +} + +/* + write data to file: clears *bytes_written first; returns -EISDIR unless the handle is a file; a handle that is not open is opened on demand only when RGW_OPEN_FLAG_V3 is set, otherwise -EPERM; then returns RGWFileHandle::write() +*/ +int rgw_write(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_written, void *buffer, + uint32_t flags) +{ + RGWFileHandle* rgw_fh = get_rgwfh(fh); + int rc; + + *bytes_written = 0; + + if (! rgw_fh->is_file()) + return -EISDIR; + + if (! rgw_fh->is_open()) { + if (flags & RGW_OPEN_FLAG_V3) { + rc = rgw_fh->open(flags); + if (!! 
rc) + return rc; + } else + return -EPERM; + } + + rc = rgw_fh->write(offset, length, bytes_written, buffer); + + return rc; +} + +/* + read data from file (vector); RGWReadV holds the buffer list moved out of the list passed to its constructor together with the rgw_vio array pointer handed back to the caller +*/ +class RGWReadV +{ + buffer::list bl; + struct rgw_vio* vio; + +public: + RGWReadV(buffer::list& _bl, rgw_vio* _vio) : vio(_vio) { + bl = std::move(_bl); + } + + struct rgw_vio* get_vio() { return vio; } + + const auto& buffers() { return bl.buffers(); } + + unsigned /* XXX */ length() { return bl.length(); } + +}; + /* Release callback for a vectored read: runs the RGWReadV destructor on uio->uio_p1 and frees its storage with operator delete; the flags argument is not used here. */ +void rgw_readv_rele(struct rgw_uio *uio, uint32_t flags) +{ + RGWReadV* rdv = static_cast(uio->uio_p1); + rdv->~RGWReadV(); + ::operator delete(rdv); +} + /* Vectored read entry point; the implementation sits behind a disabled preprocessor branch, so the function currently just returns 0. */ +int rgw_readv(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags) +{ +#if 0 /* XXX */ + CephContext* cct = static_cast(rgw_fs->rgw); + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + if (! rgw_fh->is_file()) + return -EINVAL; + + int rc = 0; + + buffer::list bl; + RGWGetObjRequest req(cct, fs->get_user(), rgw_fh->bucket_name(), + rgw_fh->object_name(), uio->uio_offset, uio->uio_resid, + bl); + req.do_hexdump = false; + + rc = g_rgwlib->get_fe()->execute_req(&req); + + if (! 
rc) { + RGWReadV* rdv = static_cast( + ::operator new(sizeof(RGWReadV) + + (bl.buffers().size() * sizeof(struct rgw_vio)))); + /* NOTE(review): rdv is an RGWReadV pointer, so rdv+sizeof(RGWReadV) advances by that many objects rather than bytes -- the trailing rgw_vio array presumably starts at rdv+1; this branch is compiled out */ + (void) new (rdv) + RGWReadV(bl, reinterpret_cast(rdv+sizeof(RGWReadV))); + + uio->uio_p1 = rdv; + uio->uio_cnt = rdv->buffers().size(); + uio->uio_resid = rdv->length(); + uio->uio_vio = rdv->get_vio(); + uio->uio_rele = rgw_readv_rele; + + int ix = 0; + auto& buffers = rdv->buffers(); + for (auto& bp : buffers) { + rgw_vio *vio = &(uio->uio_vio[ix]); + vio->vio_base = const_cast(bp.c_str()); + vio->vio_len = bp.length(); + vio->vio_u1 = nullptr; + vio->vio_p1 = nullptr; + ++ix; + } + } + + return rc; +#else + return 0; +#endif +} + +/* + write data to file (vector): currently unsupported, returns -ENOTSUP immediately; the statements after that return are unreachable +*/ +int rgw_writev(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_uio *uio, uint32_t flags) +{ + + // not supported - rest of function is ignored + return -ENOTSUP; + + CephContext* cct = static_cast(rgw_fs->rgw); + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + if (! 
rgw_fh->is_file()) + return -EINVAL; + + buffer::list bl; + for (unsigned int ix = 0; ix < uio->uio_cnt; ++ix) { + rgw_vio *vio = &(uio->uio_vio[ix]); + bl.push_back( + buffer::create_static(vio->vio_len, + static_cast(vio->vio_base))); + } + + std::string oname = rgw_fh->relative_object_name(); + RGWPutObjRequest req(cct, g_rgwlib->get_driver()->get_user(fs->get_user()->user_id), + rgw_fh->bucket_name(), oname, bl); + + int rc = g_rgwlib->get_fe()->execute_req(&req); + + /* XXX update size (in request) */ + + return rc; +} + +/* + sync written data +*/ +int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *handle, + uint32_t flags) +{ + return 0; +} + +int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint64_t offset, uint64_t length, uint32_t flags) +{ + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return rgw_fh->commit(offset, length, RGWFileHandle::FLAG_NONE); +} + +/* + extended attributes + */ + +int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->getxattrs(rgw_fh, attrs, cb, cb_arg, flags); +} + +int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrstr *filter_prefix /* ignored */, + rgw_getxattr_cb cb, void *cb_arg, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->lsxattrs(rgw_fh, filter_prefix, cb, cb_arg, flags); +} + +int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->setxattrs(rgw_fh, attrs, flags); +} + +int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + 
RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->rmxattrs(rgw_fh, attrs, flags); +} + +} /* extern "C" */ diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h new file mode 100644 index 000000000..65ec3dd15 --- /dev/null +++ b/src/rgw/rgw_file.h @@ -0,0 +1,2857 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "include/rados/rgw_file.h" + +/* internal header */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xxhash.h" +#include "include/buffer.h" +#include "common/cohort_lru.h" +#include "common/ceph_timer.h" +#include "rgw_common.h" +#include "rgw_user.h" +#include "rgw_lib.h" +#include "rgw_ldap.h" +#include "rgw_token.h" +#include "rgw_putobj_processor.h" +#include "rgw_aio_throttle.h" +#include "rgw_compression.h" + + +/* XXX + * ASSERT_H somehow not defined after all the above (which bring + * in common/debug.h [e.g., dout]) + */ +#include "include/ceph_assert.h" + + +#define RGW_RWXMODE (S_IRWXU | S_IRWXG | S_IRWXO) + +#define RGW_RWMODE (RGW_RWXMODE & \ + ~(S_IXUSR | S_IXGRP | S_IXOTH)) + + +namespace rgw { + + template + static inline void ignore(T &&) {} + + + namespace bi = boost::intrusive; + + class RGWLibFS; + class RGWFileHandle; + class RGWWriteRequest; + + inline bool operator <(const struct timespec& lhs, + const struct timespec& rhs) { + if (lhs.tv_sec == rhs.tv_sec) + return lhs.tv_nsec < rhs.tv_nsec; + else + return lhs.tv_sec < rhs.tv_sec; + } + + inline bool operator ==(const struct timespec& lhs, + const struct timespec& rhs) { + return ((lhs.tv_sec == rhs.tv_sec) && + (lhs.tv_nsec == rhs.tv_nsec)); + } + + /* + * XXX + * The current 64-bit, non-cryptographic hash used here is intended + * for prototyping only. 
+ * + * However, the invariant being prototyped is that objects be + * identifiable by their hash components alone. We believe this can + * be legitimately implemented using 128-hash values for bucket and + * object components, together with a cluster-resident cryptographic + * key. Since an MD5 or SHA-1 key is 128 bits and the (fast), + * non-cryptographic CityHash128 hash algorithm takes a 128-bit seed, + * speculatively we could use that for the final hash computations. + */ + struct fh_key + { + rgw_fh_hk fh_hk {}; + uint32_t version; + + static constexpr uint64_t seed = 8675309; + + fh_key() : version(0) {} + + fh_key(const rgw_fh_hk& _hk) + : fh_hk(_hk), version(0) { + // nothing + } + + fh_key(const uint64_t bk, const uint64_t ok) + : version(0) { + fh_hk.bucket = bk; + fh_hk.object = ok; + } + + fh_key(const uint64_t bk, const char *_o, const std::string& _t) + : version(0) { + fh_hk.bucket = bk; + std::string to = _t + ":" + _o; + fh_hk.object = XXH64(to.c_str(), to.length(), seed); + } + + fh_key(const std::string& _b, const std::string& _o, + const std::string& _t /* tenant */) + : version(0) { + std::string tb = _t + ":" + _b; + std::string to = _t + ":" + _o; + fh_hk.bucket = XXH64(tb.c_str(), tb.length(), seed); + fh_hk.object = XXH64(to.c_str(), to.length(), seed); + } + + void encode(buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(fh_hk.bucket, bl); + encode(fh_hk.object, bl); + encode((uint32_t)2, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(fh_hk.bucket, bl); + decode(fh_hk.object, bl); + if (struct_v >= 2) { + decode(version, bl); + } + DECODE_FINISH(bl); + } + + friend std::ostream& operator<<(std::ostream &os, fh_key const &fhk); + + }; /* fh_key */ + + WRITE_CLASS_ENCODER(fh_key); + + inline bool operator<(const fh_key& lhs, const fh_key& rhs) + { + return ((lhs.fh_hk.bucket < rhs.fh_hk.bucket) || + ((lhs.fh_hk.bucket == rhs.fh_hk.bucket) && + (lhs.fh_hk.object < 
rhs.fh_hk.object))); + } + + inline bool operator>(const fh_key& lhs, const fh_key& rhs) + { + return (rhs < lhs); + } + + inline bool operator==(const fh_key& lhs, const fh_key& rhs) + { + return ((lhs.fh_hk.bucket == rhs.fh_hk.bucket) && + (lhs.fh_hk.object == rhs.fh_hk.object)); + } + + inline bool operator!=(const fh_key& lhs, const fh_key& rhs) + { + return !(lhs == rhs); + } + + inline bool operator<=(const fh_key& lhs, const fh_key& rhs) + { + return (lhs < rhs) || (lhs == rhs); + } + + using boost::variant; + using boost::container::flat_map; + + typedef std::tuple DecodeAttrsResult; + + class RGWFileHandle : public cohort::lru::Object + { + struct rgw_file_handle fh; + std::mutex mtx; + + RGWLibFS* fs; + RGWFileHandle* bucket; + RGWFileHandle* parent; + std::atomic_int64_t file_ondisk_version; // version of unix attrs, file only + /* const */ std::string name; /* XXX file or bucket name */ + /* const */ fh_key fhk; + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + /* TODO: keeping just the last marker is sufficient for + * nfs-ganesha 2.4.5; in the near future, nfs-ganesha will + * be able to hint the name of the next dirent required, + * from which we can directly synthesize a RADOS marker. 
+ * using marker_cache_t = flat_map; + */ + + struct State { + uint64_t dev; + uint64_t size; + uint64_t nlink; + uint32_t owner_uid; /* XXX need Unix attr */ + uint32_t owner_gid; /* XXX need Unix attr */ + mode_t unix_mode; + struct timespec ctime; + struct timespec mtime; + struct timespec atime; + uint32_t version; + State() : dev(0), size(0), nlink(1), owner_uid(0), owner_gid(0), unix_mode(0), + ctime{0,0}, mtime{0,0}, atime{0,0}, version(0) {} + } state; + + struct file { + RGWWriteRequest* write_req; + file() : write_req(nullptr) {} + ~file(); + }; + + struct directory { + + static constexpr uint32_t FLAG_NONE = 0x0000; + + uint32_t flags; + rgw_obj_key last_marker; + struct timespec last_readdir; + + directory() : flags(FLAG_NONE), last_readdir{0,0} {} + }; + + void clear_state(); + void advance_mtime(uint32_t flags = FLAG_NONE); + + boost::variant variant_type; + + uint16_t depth; + uint32_t flags; + + ceph::buffer::list etag; + ceph::buffer::list acls; + + public: + const static std::string root_name; + + static constexpr uint16_t MAX_DEPTH = 256; + + static constexpr uint32_t FLAG_NONE = 0x0000; + static constexpr uint32_t FLAG_OPEN = 0x0001; + static constexpr uint32_t FLAG_ROOT = 0x0002; + static constexpr uint32_t FLAG_CREATE = 0x0004; + static constexpr uint32_t FLAG_CREATING = 0x0008; + static constexpr uint32_t FLAG_SYMBOLIC_LINK = 0x0009; + static constexpr uint32_t FLAG_DIRECTORY = 0x0010; + static constexpr uint32_t FLAG_BUCKET = 0x0020; + static constexpr uint32_t FLAG_LOCK = 0x0040; + static constexpr uint32_t FLAG_DELETED = 0x0080; + static constexpr uint32_t FLAG_UNLINK_THIS = 0x0100; + static constexpr uint32_t FLAG_LOCKED = 0x0200; + static constexpr uint32_t FLAG_STATELESS_OPEN = 0x0400; + static constexpr uint32_t FLAG_EXACT_MATCH = 0x0800; + static constexpr uint32_t FLAG_MOUNT = 0x1000; + static constexpr uint32_t FLAG_IN_CB = 0x2000; + +#define CREATE_FLAGS(x) \ + ((x) & ~(RGWFileHandle::FLAG_CREATE|RGWFileHandle::FLAG_LOCK)) + + 
static constexpr uint32_t RCB_MASK = \ + RGW_SETATTR_MTIME|RGW_SETATTR_CTIME|RGW_SETATTR_ATIME|RGW_SETATTR_SIZE; + + friend class RGWLibFS; + + private: + explicit RGWFileHandle(RGWLibFS* _fs) + : fs(_fs), bucket(nullptr), parent(nullptr), file_ondisk_version(-1), + variant_type{directory()}, depth(0), flags(FLAG_NONE) + { + fh.fh_hk.bucket = 0; + fh.fh_hk.object = 0; + /* root */ + fh.fh_type = RGW_FS_TYPE_DIRECTORY; + variant_type = directory(); + /* stat */ + state.unix_mode = RGW_RWXMODE|S_IFDIR; + /* pointer to self */ + fh.fh_private = this; + } + + uint64_t init_fsid(std::string& uid) { + return XXH64(uid.c_str(), uid.length(), fh_key::seed); + } + + void init_rootfs(std::string& fsid, const std::string& object_name, + bool is_bucket) { + /* fh_key */ + fh.fh_hk.bucket = XXH64(fsid.c_str(), fsid.length(), fh_key::seed); + fh.fh_hk.object = XXH64(object_name.c_str(), object_name.length(), + fh_key::seed); + fhk = fh.fh_hk; + name = object_name; + + state.dev = init_fsid(fsid); + + if (is_bucket) { + flags |= RGWFileHandle::FLAG_BUCKET | RGWFileHandle::FLAG_MOUNT; + bucket = this; + depth = 1; + } else { + flags |= RGWFileHandle::FLAG_ROOT | RGWFileHandle::FLAG_MOUNT; + } + } + + void encode(buffer::list& bl) const { + ENCODE_START(3, 1, bl); + encode(uint32_t(fh.fh_type), bl); + encode(state.dev, bl); + encode(state.size, bl); + encode(state.nlink, bl); + encode(state.owner_uid, bl); + encode(state.owner_gid, bl); + encode(state.unix_mode, bl); + for (const auto& t : { state.ctime, state.mtime, state.atime }) { + encode(real_clock::from_timespec(t), bl); + } + encode((uint32_t)2, bl); + encode(file_ondisk_version.load(), bl); + ENCODE_FINISH(bl); + } + + //XXX: RGWFileHandle::decode method can only be called from + // RGWFileHandle::decode_attrs, otherwise the file_ondisk_version + // fied would be contaminated + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + uint32_t fh_type; + decode(fh_type, bl); + if ((fh.fh_type != fh_type) && + 
(fh_type == RGW_FS_TYPE_SYMBOLIC_LINK)) + fh.fh_type = RGW_FS_TYPE_SYMBOLIC_LINK; + decode(state.dev, bl); + decode(state.size, bl); + decode(state.nlink, bl); + decode(state.owner_uid, bl); + decode(state.owner_gid, bl); + decode(state.unix_mode, bl); + ceph::real_time enc_time; + for (auto t : { &(state.ctime), &(state.mtime), &(state.atime) }) { + decode(enc_time, bl); + *t = real_clock::to_timespec(enc_time); + } + if (struct_v >= 2) { + decode(state.version, bl); + } + if (struct_v >= 3) { + int64_t fov; + decode(fov, bl); + file_ondisk_version = fov; + } + DECODE_FINISH(bl); + } + + friend void encode(const RGWFileHandle& c, ::ceph::buffer::list &bl, uint64_t features); + friend void decode(RGWFileHandle &c, ::ceph::bufferlist::const_iterator &p); + public: + RGWFileHandle(RGWLibFS* _fs, RGWFileHandle* _parent, + const fh_key& _fhk, std::string& _name, uint32_t _flags) + : fs(_fs), bucket(nullptr), parent(_parent), file_ondisk_version(-1), + name(std::move(_name)), fhk(_fhk), flags(_flags) { + + if (parent->is_root()) { + fh.fh_type = RGW_FS_TYPE_DIRECTORY; + variant_type = directory(); + flags |= FLAG_BUCKET; + } else { + bucket = parent->is_bucket() ? 
parent + : parent->bucket; + if (flags & FLAG_DIRECTORY) { + fh.fh_type = RGW_FS_TYPE_DIRECTORY; + variant_type = directory(); + } else if(flags & FLAG_SYMBOLIC_LINK) { + fh.fh_type = RGW_FS_TYPE_SYMBOLIC_LINK; + variant_type = file(); + } else { + fh.fh_type = RGW_FS_TYPE_FILE; + variant_type = file(); + } + } + + depth = parent->depth + 1; + + /* save constant fhk */ + fh.fh_hk = fhk.fh_hk; /* XXX redundant in fh_hk */ + + /* inherits parent's fsid */ + state.dev = parent->state.dev; + + switch (fh.fh_type) { + case RGW_FS_TYPE_DIRECTORY: + state.unix_mode = RGW_RWXMODE|S_IFDIR; + /* virtual directories are always invalid */ + advance_mtime(); + break; + case RGW_FS_TYPE_FILE: + state.unix_mode = RGW_RWMODE|S_IFREG; + break; + case RGW_FS_TYPE_SYMBOLIC_LINK: + state.unix_mode = RGW_RWMODE|S_IFLNK; + break; + default: + break; + } + + /* pointer to self */ + fh.fh_private = this; + } + + const std::string& get_name() const { + return name; + } + + const fh_key& get_key() const { + return fhk; + } + + directory* get_directory() { + return boost::get(&variant_type); + } + + size_t get_size() const { return state.size; } + + const char* stype() { + return is_dir() ? 
"DIR" : "FILE"; + } + + uint16_t get_depth() const { return depth; } + + struct rgw_file_handle* get_fh() { return &fh; } + + RGWLibFS* get_fs() { return fs; } + + RGWFileHandle* get_parent() { return parent; } + + uint32_t get_owner_uid() const { return state.owner_uid; } + uint32_t get_owner_gid() const { return state.owner_gid; } + + struct timespec get_ctime() const { return state.ctime; } + struct timespec get_mtime() const { return state.mtime; } + + const ceph::buffer::list& get_etag() const { return etag; } + const ceph::buffer::list& get_acls() const { return acls; } + + void create_stat(struct stat* st, uint32_t mask) { + if (mask & RGW_SETATTR_UID) + state.owner_uid = st->st_uid; + + if (mask & RGW_SETATTR_GID) + state.owner_gid = st->st_gid; + + if (mask & RGW_SETATTR_MODE) { + switch (fh.fh_type) { + case RGW_FS_TYPE_DIRECTORY: + state.unix_mode = st->st_mode|S_IFDIR; + break; + case RGW_FS_TYPE_FILE: + state.unix_mode = st->st_mode|S_IFREG; + break; + case RGW_FS_TYPE_SYMBOLIC_LINK: + state.unix_mode = st->st_mode|S_IFLNK; + break; + default: + break; + } + } + + if (mask & RGW_SETATTR_ATIME) + state.atime = st->st_atim; + + if (mask & RGW_SETATTR_MTIME) { + if (fh.fh_type != RGW_FS_TYPE_DIRECTORY) + state.mtime = st->st_mtim; + } + + if (mask & RGW_SETATTR_CTIME) + state.ctime = st->st_ctim; + } + + int stat(struct stat* st, uint32_t flags = FLAG_NONE) { + /* partial Unix attrs */ + /* FIPS zeroization audit 20191115: this memset is not security + * related. 
*/ + memset(st, 0, sizeof(struct stat)); + st->st_dev = state.dev; + st->st_ino = fh.fh_hk.object; // XXX + + st->st_uid = state.owner_uid; + st->st_gid = state.owner_gid; + + st->st_mode = state.unix_mode; + + switch (fh.fh_type) { + case RGW_FS_TYPE_DIRECTORY: + /* virtual directories are always invalid */ + advance_mtime(flags); + st->st_nlink = state.nlink; + break; + case RGW_FS_TYPE_FILE: + st->st_nlink = 1; + st->st_blksize = 4096; + st->st_size = state.size; + st->st_blocks = (state.size) / 512; + break; + case RGW_FS_TYPE_SYMBOLIC_LINK: + st->st_nlink = 1; + st->st_blksize = 4096; + st->st_size = state.size; + st->st_blocks = (state.size) / 512; + break; + default: + break; + } + +#ifdef HAVE_STAT_ST_MTIMESPEC_TV_NSEC + st->st_atimespec = state.atime; + st->st_mtimespec = state.mtime; + st->st_ctimespec = state.ctime; +#else + st->st_atim = state.atime; + st->st_mtim = state.mtime; + st->st_ctim = state.ctime; +#endif + + return 0; + } + + const std::string& bucket_name() const { + if (is_root()) + return root_name; + if (is_bucket()) + return name; + return bucket->object_name(); + } + + const std::string& object_name() const { return name; } + + std::string full_object_name(bool omit_bucket = false) const { + std::string path; + std::vector segments; + int reserve = 0; + const RGWFileHandle* tfh = this; + while (tfh && !tfh->is_root() && !(tfh->is_bucket() && omit_bucket)) { + segments.push_back(&tfh->object_name()); + reserve += (1 + tfh->object_name().length()); + tfh = tfh->parent; + } + int pos = 1; + path.reserve(reserve); + for (auto& s : boost::adaptors::reverse(segments)) { + if (pos > 1) { + path += "/"; + } else { + if (!omit_bucket && + ((path.length() == 0) || (path.front() != '/'))) + path += "/"; + } + path += *s; + ++pos; + } + return path; + } + + inline std::string relative_object_name() const { + return full_object_name(true /* omit_bucket */); + } + + inline std::string relative_object_name2() { + std::string rname = 
full_object_name(true /* omit_bucket */); + if (is_dir()) { + rname += "/"; + } + return rname; + } + + inline std::string format_child_name(const std::string& cbasename, + bool is_dir) const { + std::string child_name{relative_object_name()}; + if ((child_name.size() > 0) && + (child_name.back() != '/')) + child_name += "/"; + child_name += cbasename; + if (is_dir) + child_name += "/"; + return child_name; + } + + inline std::string make_key_name(const char *name) const { + std::string key_name{full_object_name()}; + if (key_name.length() > 0) + key_name += "/"; + key_name += name; + return key_name; + } + + fh_key make_fhk(const std::string& name); + + void add_marker(uint64_t off, const rgw_obj_key& marker, + uint8_t obj_type) { + using std::get; + directory* d = get(&variant_type); + if (d) { + unique_lock guard(mtx); + d->last_marker = marker; + } + } + + const rgw_obj_key* find_marker(uint64_t off) const { + using std::get; + if (off > 0) { + const directory* d = get(&variant_type); + if (d ) { + return &d->last_marker; + } + } + return nullptr; + } + + int offset_of(const std::string& name, int64_t *offset, uint32_t flags) { + if (unlikely(! 
is_dir())) { + return -EINVAL; + } + *offset = XXH64(name.c_str(), name.length(), fh_key::seed); + return 0; + } + + bool is_open() const { return flags & FLAG_OPEN; } + bool is_root() const { return flags & FLAG_ROOT; } + bool is_mount() const { return flags & FLAG_MOUNT; } + bool is_bucket() const { return flags & FLAG_BUCKET; } + bool is_object() const { return !is_bucket(); } + bool is_file() const { return (fh.fh_type == RGW_FS_TYPE_FILE); } + bool is_dir() const { return (fh.fh_type == RGW_FS_TYPE_DIRECTORY); } + bool is_link() const { return (fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK); } + bool creating() const { return flags & FLAG_CREATING; } + bool deleted() const { return flags & FLAG_DELETED; } + bool stateless_open() const { return flags & FLAG_STATELESS_OPEN; } + bool has_children() const; + + int open(uint32_t gsh_flags) { + lock_guard guard(mtx); + if (! is_open()) { + if (gsh_flags & RGW_OPEN_FLAG_V3) { + flags |= FLAG_STATELESS_OPEN; + } + flags |= FLAG_OPEN; + return 0; + } + return -EPERM; + } + + typedef boost::variant readdir_offset; + + int readdir(rgw_readdir_cb rcb, void *cb_arg, readdir_offset offset, + bool *eof, uint32_t flags); + + int write(uint64_t off, size_t len, size_t *nbytes, void *buffer); + + int commit(uint64_t offset, uint64_t length, uint32_t flags) { + /* NFS3 and NFSv4 COMMIT implementation + * the current atomic update strategy doesn't actually permit + * clients to read-stable until either CLOSE (NFSv4+) or the + * expiration of the active write timer (NFS3). 
In the + * interim, the client may send an arbitrary number of COMMIT + * operations which must return a success result */ + return 0; + } + + int write_finish(uint32_t flags = FLAG_NONE); + int close(); + + void open_for_create() { + lock_guard guard(mtx); + flags |= FLAG_CREATING; + } + + void clear_creating() { + lock_guard guard(mtx); + flags &= ~FLAG_CREATING; + } + + void inc_nlink(const uint64_t n) { + state.nlink += n; + } + + void set_nlink(const uint64_t n) { + state.nlink = n; + } + + void set_size(const size_t size) { + state.size = size; + } + + void set_times(const struct timespec &ts) { + state.ctime = ts; + state.mtime = state.ctime; + state.atime = state.ctime; + } + + void set_times(real_time t) { + set_times(real_clock::to_timespec(t)); + } + + void set_ctime(const struct timespec &ts) { + state.ctime = ts; + } + + void set_mtime(const struct timespec &ts) { + state.mtime = ts; + } + + void set_atime(const struct timespec &ts) { + state.atime = ts; + } + + void set_etag(const ceph::buffer::list& _etag ) { + etag = _etag; + } + + void set_acls(const ceph::buffer::list& _acls ) { + acls = _acls; + } + + void encode_attrs(ceph::buffer::list& ux_key1, + ceph::buffer::list& ux_attrs1, + bool inc_ov = true); + + DecodeAttrsResult decode_attrs(const ceph::buffer::list* ux_key1, + const ceph::buffer::list* ux_attrs1); + + void invalidate(); + + bool reclaim(const cohort::lru::ObjectFactory* newobj_fac) override; + + typedef cohort::lru::LRU FhLRU; + + struct FhLT + { + // for internal ordering + bool operator()(const RGWFileHandle& lhs, const RGWFileHandle& rhs) const + { return (lhs.get_key() < rhs.get_key()); } + + // for external search by fh_key + bool operator()(const fh_key& k, const RGWFileHandle& fh) const + { return k < fh.get_key(); } + + bool operator()(const RGWFileHandle& fh, const fh_key& k) const + { return fh.get_key() < k; } + }; + + struct FhEQ + { + bool operator()(const RGWFileHandle& lhs, const RGWFileHandle& rhs) const + { return 
(lhs.get_key() == rhs.get_key()); } + + bool operator()(const fh_key& k, const RGWFileHandle& fh) const + { return k == fh.get_key(); } + + bool operator()(const RGWFileHandle& fh, const fh_key& k) const + { return fh.get_key() == k; } + }; + + typedef bi::link_mode link_mode; /* XXX normal */ +#if defined(FHCACHE_AVL) + typedef bi::avl_set_member_hook tree_hook_type; +#else + /* RBT */ + typedef bi::set_member_hook tree_hook_type; +#endif + tree_hook_type fh_hook; + + typedef bi::member_hook< + RGWFileHandle, tree_hook_type, &RGWFileHandle::fh_hook> FhHook; + +#if defined(FHCACHE_AVL) + typedef bi::avltree, FhHook> FHTree; +#else + typedef bi::rbtree, FhHook> FhTree; +#endif + typedef cohort::lru::TreeX FHCache; + + ~RGWFileHandle() override; + + friend std::ostream& operator<<(std::ostream &os, + RGWFileHandle const &rgw_fh); + + class Factory : public cohort::lru::ObjectFactory + { + public: + RGWLibFS* fs; + RGWFileHandle* parent; + const fh_key& fhk; + std::string& name; + uint32_t flags; + + Factory() = delete; + + Factory(RGWLibFS* _fs, RGWFileHandle* _parent, + const fh_key& _fhk, std::string& _name, uint32_t _flags) + : fs(_fs), parent(_parent), fhk(_fhk), name(_name), + flags(_flags) {} + + void recycle (cohort::lru::Object* o) override { + /* re-use an existing object */ + o->~Object(); // call lru::Object virtual dtor + // placement new! 
+ new (o) RGWFileHandle(fs, parent, fhk, name, flags); + } + + cohort::lru::Object* alloc() override { + return new RGWFileHandle(fs, parent, fhk, name, flags); + } + }; /* Factory */ + + }; /* RGWFileHandle */ + + WRITE_CLASS_ENCODER(RGWFileHandle); + + inline RGWFileHandle* get_rgwfh(struct rgw_file_handle* fh) { + return static_cast(fh->fh_private); + } + + inline enum rgw_fh_type fh_type_of(uint32_t flags) { + enum rgw_fh_type fh_type; + switch(flags & RGW_LOOKUP_TYPE_FLAGS) + { + case RGW_LOOKUP_FLAG_DIR: + fh_type = RGW_FS_TYPE_DIRECTORY; + break; + case RGW_LOOKUP_FLAG_FILE: + fh_type = RGW_FS_TYPE_FILE; + break; + default: + fh_type = RGW_FS_TYPE_NIL; + }; + return fh_type; + } + + typedef std::tuple LookupFHResult; + typedef std::tuple MkObjResult; + + class RGWLibFS + { + CephContext* cct; + struct rgw_fs fs{}; + RGWFileHandle root_fh; + rgw_fh_callback_t invalidate_cb; + void *invalidate_arg; + bool shutdown; + + mutable std::atomic refcnt; + + RGWFileHandle::FHCache fh_cache; + RGWFileHandle::FhLRU fh_lru; + + std::string uid; // should match user.user_id, iiuc + + std::unique_ptr user; + RGWAccessKey key; // XXXX acc_key + + static std::atomic fs_inst_counter; + + static uint32_t write_completion_interval_s; + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + struct event + { + enum class type : uint8_t { READDIR } ; + type t; + const fh_key fhk; + struct timespec ts; + event(type t, const fh_key& k, const struct timespec& ts) + : t(t), fhk(k), ts(ts) {} + }; + + friend std::ostream& operator<<(std::ostream &os, + RGWLibFS::event const &ev); + + using event_vector = /* boost::small_vector */ + std::vector; + + struct WriteCompletion + { + RGWFileHandle& rgw_fh; + + explicit WriteCompletion(RGWFileHandle& _fh) : rgw_fh(_fh) { + rgw_fh.get_fs()->ref(&rgw_fh); + } + + void operator()() { + rgw_fh.close(); /* will finish in-progress write */ + rgw_fh.get_fs()->unref(&rgw_fh); + } + }; + + static ceph::timer write_timer; + + 
struct State { + std::mutex mtx; + std::atomic flags; + std::deque events; + + State() : flags(0) {} + + void push_event(const event& ev) { + events.push_back(ev); + } + } state; + + uint32_t new_inst() { + return ++fs_inst_counter; + } + + friend class RGWFileHandle; + friend class RGWLibProcess; + + public: + + static constexpr uint32_t FLAG_NONE = 0x0000; + static constexpr uint32_t FLAG_CLOSED = 0x0001; + + struct BucketStats { + size_t size; + size_t size_rounded; + real_time creation_time; + uint64_t num_entries; + }; + + RGWLibFS(CephContext* _cct, const char *_uid, const char *_user_id, + const char* _key, const char *root) + : cct(_cct), root_fh(this), invalidate_cb(nullptr), + invalidate_arg(nullptr), shutdown(false), refcnt(1), + fh_cache(cct->_conf->rgw_nfs_fhcache_partitions, + cct->_conf->rgw_nfs_fhcache_size), + fh_lru(cct->_conf->rgw_nfs_lru_lanes, + cct->_conf->rgw_nfs_lru_lane_hiwat), + uid(_uid), key(_user_id, _key) { + + if (!root || !strcmp(root, "/")) { + root_fh.init_rootfs(uid, RGWFileHandle::root_name, false); + } else { + root_fh.init_rootfs(uid, root, true); + } + + /* pointer to self */ + fs.fs_private = this; + + /* expose public root fh */ + fs.root_fh = root_fh.get_fh(); + + new_inst(); + } + + friend void intrusive_ptr_add_ref(const RGWLibFS* fs) { + fs->refcnt.fetch_add(1, std::memory_order_relaxed); + } + + friend void intrusive_ptr_release(const RGWLibFS* fs) { + if (fs->refcnt.fetch_sub(1, std::memory_order_release) == 0) { + std::atomic_thread_fence(std::memory_order_acquire); + delete fs; + } + } + + RGWLibFS* ref() { + intrusive_ptr_add_ref(this); + return this; + } + + inline void rele() { + intrusive_ptr_release(this); + } + + void stop() { shutdown = true; } + + void release_evict(RGWFileHandle* fh) { + /* remove from cache, releases sentinel ref */ + fh_cache.remove(fh->fh.fh_hk.object, fh, + RGWFileHandle::FHCache::FLAG_LOCK); + /* release call-path ref */ + (void) fh_lru.unref(fh, cohort::lru::FLAG_NONE); + } + + int 
authorize(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver) { + int ret = driver->get_user_by_access_key(dpp, key.id, null_yield, &user); + if (ret == 0) { + RGWAccessKey* k = user->get_info().get_key(key.id); + if (!k || (k->key != key.key)) + return -EINVAL; + if (user->get_info().suspended) + return -ERR_USER_SUSPENDED; + } else { + /* try external authenticators (ldap for now) */ + rgw::LDAPHelper* ldh = g_rgwlib->get_ldh(); /* !nullptr */ + RGWToken token; + /* boost filters and/or string_ref may throw on invalid input */ + try { + token = rgw::from_base64(key.id); + } catch(...) { + token = std::string(""); + } + if (token.valid() && (ldh->auth(token.id, token.key) == 0)) { + /* try to driver user if it doesn't already exist */ + if (user->load_user(dpp, null_yield) < 0) { + int ret = user->store_user(dpp, null_yield, true); + if (ret < 0) { + lsubdout(get_context(), rgw, 10) + << "NOTICE: failed to driver new user's info: ret=" << ret + << dendl; + } + } + } /* auth success */ + } + return ret; + } /* authorize */ + + int register_invalidate(rgw_fh_callback_t cb, void *arg, uint32_t flags) { + invalidate_cb = cb; + invalidate_arg = arg; + return 0; + } + + /* find RGWFileHandle by id */ + LookupFHResult lookup_fh(const fh_key& fhk, + const uint32_t flags = RGWFileHandle::FLAG_NONE) { + using std::get; + + // cast int32_t(RGWFileHandle::FLAG_NONE) due to strictness of Clang + // the cast transfers a lvalue into a rvalue in the ctor + // check the commit message for the full details + LookupFHResult fhr { nullptr, uint32_t(RGWFileHandle::FLAG_NONE) }; + + RGWFileHandle::FHCache::Latch lat; + bool fh_locked = flags & RGWFileHandle::FLAG_LOCKED; + + retry: + RGWFileHandle* fh = + fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/, + fhk /* key */, lat /* serializer */, + RGWFileHandle::FHCache::FLAG_LOCK); + /* LATCHED */ + if (fh) { + if (likely(! 
fh_locked)) + fh->mtx.lock(); // XXX !RAII because may-return-LOCKED + /* need initial ref from LRU (fast path) */ + if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) { + lat.lock->unlock(); + if (likely(! fh_locked)) + fh->mtx.unlock(); + goto retry; /* !LATCHED */ + } + /* LATCHED, LOCKED */ + if (! (flags & RGWFileHandle::FLAG_LOCK)) + fh->mtx.unlock(); /* ! LOCKED */ + } + lat.lock->unlock(); /* !LATCHED */ + get<0>(fhr) = fh; + if (fh) { + lsubdout(get_context(), rgw, 17) + << __func__ << " 1 " << *fh + << dendl; + } + return fhr; + } /* lookup_fh(const fh_key&) */ + + /* find or create an RGWFileHandle */ + LookupFHResult lookup_fh(RGWFileHandle* parent, const char *name, + const uint32_t flags = RGWFileHandle::FLAG_NONE) { + using std::get; + + // cast int32_t(RGWFileHandle::FLAG_NONE) due to strictness of Clang + // the cast transfers a lvalue into a rvalue in the ctor + // check the commit message for the full details + LookupFHResult fhr { nullptr, uint32_t(RGWFileHandle::FLAG_NONE) }; + + /* mount is stale? */ + if (state.flags & FLAG_CLOSED) + return fhr; + + RGWFileHandle::FHCache::Latch lat; + bool fh_locked = flags & RGWFileHandle::FLAG_LOCKED; + + std::string obj_name{name}; + std::string key_name{parent->make_key_name(name)}; + fh_key fhk = parent->make_fhk(obj_name); + + lsubdout(get_context(), rgw, 10) + << __func__ << " called on " + << parent->object_name() << " for " << key_name + << " (" << obj_name << ")" + << " -> " << fhk + << dendl; + + retry: + RGWFileHandle* fh = + fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/, + fhk /* key */, lat /* serializer */, + RGWFileHandle::FHCache::FLAG_LOCK); + /* LATCHED */ + if (fh) { + if (likely(! fh_locked)) + fh->mtx.lock(); // XXX !RAII because may-return-LOCKED + if (fh->flags & RGWFileHandle::FLAG_DELETED) { + /* for now, delay briefly and retry */ + lat.lock->unlock(); + if (likely(! 
fh_locked)) + fh->mtx.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + goto retry; /* !LATCHED */ + } + /* need initial ref from LRU (fast path) */ + if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) { + lat.lock->unlock(); + if (likely(! fh_locked)) + fh->mtx.unlock(); + goto retry; /* !LATCHED */ + } + /* LATCHED, LOCKED */ + if (! (flags & RGWFileHandle::FLAG_LOCK)) + if (likely(! fh_locked)) + fh->mtx.unlock(); /* ! LOCKED */ + } else { + /* make or re-use handle */ + RGWFileHandle::Factory prototype(this, parent, fhk, + obj_name, CREATE_FLAGS(flags)); + uint32_t iflags{cohort::lru::FLAG_INITIAL}; + fh = static_cast( + fh_lru.insert(&prototype, + cohort::lru::Edge::MRU, + iflags)); + if (fh) { + /* lock fh (LATCHED) */ + if (flags & RGWFileHandle::FLAG_LOCK) + fh->mtx.lock(); + if (likely(! (iflags & cohort::lru::FLAG_RECYCLE))) { + /* inserts at cached insert iterator, releasing latch */ + fh_cache.insert_latched( + fh, lat, RGWFileHandle::FHCache::FLAG_UNLOCK); + } else { + /* recycle step invalidates Latch */ + fh_cache.insert( + fhk.fh_hk.object, fh, RGWFileHandle::FHCache::FLAG_NONE); + lat.lock->unlock(); /* !LATCHED */ + } + get<1>(fhr) |= RGWFileHandle::FLAG_CREATE; + /* ref parent (non-initial ref cannot fail on valid object) */ + if (! parent->is_mount()) { + (void) fh_lru.ref(parent, cohort::lru::FLAG_NONE); + } + goto out; /* !LATCHED */ + } else { + lat.lock->unlock(); + goto retry; /* !LATCHED */ + } + } + lat.lock->unlock(); /* !LATCHED */ + out: + get<0>(fhr) = fh; + if (fh) { + lsubdout(get_context(), rgw, 17) + << __func__ << " 2 " << *fh + << dendl; + } + return fhr; + } /* lookup_fh(RGWFileHandle*, const char *, const uint32_t) */ + + inline void unref(RGWFileHandle* fh) { + if (likely(! fh->is_mount())) { + (void) fh_lru.unref(fh, cohort::lru::FLAG_NONE); + } + } + + inline RGWFileHandle* ref(RGWFileHandle* fh) { + if (likely(! 
fh->is_mount())) { + fh_lru.ref(fh, cohort::lru::FLAG_NONE); + } + return fh; + } + + int getattr(RGWFileHandle* rgw_fh, struct stat* st); + + int setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask, + uint32_t flags); + + int getxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs, + rgw_getxattr_cb cb, void *cb_arg, uint32_t flags); + + int lsxattrs(RGWFileHandle* rgw_fh, rgw_xattrstr *filter_prefix, + rgw_getxattr_cb cb, void *cb_arg, uint32_t flags); + + int setxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs, uint32_t flags); + + int rmxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs, uint32_t flags); + + void update_fh(RGWFileHandle *rgw_fh); + + LookupFHResult stat_bucket(RGWFileHandle* parent, const char *path, + RGWLibFS::BucketStats& bs, + uint32_t flags); + + LookupFHResult fake_leaf(RGWFileHandle* parent, const char *path, + enum rgw_fh_type type = RGW_FS_TYPE_NIL, + struct stat *st = nullptr, uint32_t mask = 0, + uint32_t flags = RGWFileHandle::FLAG_NONE); + + LookupFHResult stat_leaf(RGWFileHandle* parent, const char *path, + enum rgw_fh_type type = RGW_FS_TYPE_NIL, + uint32_t flags = RGWFileHandle::FLAG_NONE); + + int read(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags); + + int readlink(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags); + + int rename(RGWFileHandle* old_fh, RGWFileHandle* new_fh, + const char *old_name, const char *new_name); + + MkObjResult create(RGWFileHandle* parent, const char *name, struct stat *st, + uint32_t mask, uint32_t flags); + + MkObjResult symlink(RGWFileHandle* parent, const char *name, + const char *link_path, struct stat *st, uint32_t mask, uint32_t flags); + + MkObjResult mkdir(RGWFileHandle* parent, const char *name, struct stat *st, + uint32_t mask, uint32_t flags); + + int unlink(RGWFileHandle* rgw_fh, const char *name, + uint32_t flags = FLAG_NONE); + + /* find existing RGWFileHandle 
*/ + RGWFileHandle* lookup_handle(struct rgw_fh_hk fh_hk) { + + if (state.flags & FLAG_CLOSED) + return nullptr; + + RGWFileHandle::FHCache::Latch lat; + fh_key fhk(fh_hk); + + retry: + RGWFileHandle* fh = + fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/, + fhk /* key */, lat /* serializer */, + RGWFileHandle::FHCache::FLAG_LOCK); + /* LATCHED */ + if (! fh) { + if (unlikely(fhk == root_fh.fh.fh_hk)) { + /* lookup for root of this fs */ + fh = &root_fh; + goto out; + } + lsubdout(get_context(), rgw, 0) + << __func__ << " handle lookup failed " << fhk + << dendl; + goto out; + } + fh->mtx.lock(); + if (fh->flags & RGWFileHandle::FLAG_DELETED) { + /* for now, delay briefly and retry */ + lat.lock->unlock(); + fh->mtx.unlock(); /* !LOCKED */ + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + goto retry; /* !LATCHED */ + } + if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) { + lat.lock->unlock(); + fh->mtx.unlock(); + goto retry; /* !LATCHED */ + } + /* LATCHED */ + fh->mtx.unlock(); /* !LOCKED */ + out: + lat.lock->unlock(); /* !LATCHED */ + + /* special case: lookup root_fh */ + if (! 
fh) { + if (unlikely(fh_hk == root_fh.fh.fh_hk)) { + fh = &root_fh; + } + } + + return fh; + } + + CephContext* get_context() { + return cct; + } + + struct rgw_fs* get_fs() { return &fs; } + + RGWFileHandle& get_fh() { return root_fh; } + + uint64_t get_fsid() { return root_fh.state.dev; } + + RGWUserInfo* get_user() { return &user->get_info(); } + + void update_user(const DoutPrefixProvider *dpp) { + (void) g_rgwlib->get_driver()->get_user_by_access_key(dpp, key.id, null_yield, &user); + } + + void close(); + void gc(); + }; /* RGWLibFS */ + +static inline std::string make_uri(const std::string& bucket_name, + const std::string& object_name) { + std::string uri("/"); + uri.reserve(bucket_name.length() + object_name.length() + 2); + uri += bucket_name; + uri += "/"; + uri += object_name; + return uri; +} + +/* + read directory content (buckets) +*/ + +class RGWListBucketsRequest : public RGWLibRequest, + public RGWListBuckets /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + RGWFileHandle::readdir_offset offset; + void* cb_arg; + rgw_readdir_cb rcb; + uint64_t* ioff; + size_t ix; + uint32_t d_count; + bool rcb_eof; // caller forced early stop in readdir cycle + + RGWListBucketsRequest(CephContext* _cct, std::unique_ptr _user, + RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb, + void* _cb_arg, RGWFileHandle::readdir_offset& _offset) + : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), offset(_offset), + cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0), + rcb_eof(false) { + + using boost::get; + + if (unlikely(!! 
get(&offset))) { + ioff = get(offset); + const auto& mk = rgw_fh->find_marker(*ioff); + if (mk) { + marker = mk->name; + } + } else { + const char* mk = get(offset); + if (mk) { + marker = mk; + } + } + op = this; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + /* XXX derp derp derp */ + state->relative_uri = "/"; + state->info.request_uri = "/"; // XXX + state->info.effective_uri = "/"; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + return 0; + } + + int get_params(optional_yield) override { + limit = -1; /* no limit */ + return 0; + } + + void send_response_begin(bool has_buckets) override { + sent_data = true; + } + + void send_response_data(rgw::sal::BucketList& buckets) override { + if (!sent_data) + return; + auto& m = buckets.get_buckets(); + for (const auto& iter : m) { + std::string_view marker{iter.first}; + auto& ent = iter.second; + if (! this->operator()(ent->get_name(), marker)) { + /* caller cannot accept more */ + lsubdout(cct, rgw, 5) << "ListBuckets rcb failed" + << " dirent=" << ent->get_name() + << " call count=" << ix + << dendl; + rcb_eof = true; + return; + } + ++ix; + } + } /* send_response_data */ + + void send_response_end() override { + // do nothing + } + + int operator()(const std::string_view& name, + const std::string_view& marker) { + uint64_t off = XXH64(name.data(), name.length(), fh_key::seed); + if (!! 
ioff) { + *ioff = off; + } + /* update traversal cache */ + rgw_fh->add_marker(off, rgw_obj_key{marker.data(), ""}, + RGW_FS_TYPE_DIRECTORY); + ++d_count; + return rcb(name.data(), cb_arg, off, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + } + + bool eof() { + using boost::get; + + if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) { + bool is_offset = + unlikely(! get(&offset)) || + !! get(offset); + lsubdout(cct, rgw, 15) << "READDIR offset: " << + ((is_offset) ? offset : "(nil)") + << " is_truncated: " << is_truncated + << dendl; + } + return !is_truncated && !rcb_eof; + } + +}; /* RGWListBucketsRequest */ + +/* + read directory content (bucket objects) +*/ + +class RGWReaddirRequest : public RGWLibRequest, + public RGWListBucket /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + RGWFileHandle::readdir_offset offset; + void* cb_arg; + rgw_readdir_cb rcb; + uint64_t* ioff; + size_t ix; + uint32_t d_count; + bool rcb_eof; // caller forced early stop in readdir cycle + + RGWReaddirRequest(CephContext* _cct, std::unique_ptr _user, + RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb, + void* _cb_arg, RGWFileHandle::readdir_offset& _offset) + : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), offset(_offset), + cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0), + rcb_eof(false) { + + using boost::get; + + if (unlikely(!! 
get(&offset))) { + ioff = get(offset); + const auto& mk = rgw_fh->find_marker(*ioff); + if (mk) { + marker = *mk; + } + } else { + const char* mk = get(offset); + if (mk) { + std::string tmark{rgw_fh->relative_object_name()}; + if (tmark.length() > 0) + tmark += "/"; + tmark += mk; + marker = rgw_obj_key{std::move(tmark), "", ""}; + } + } + + default_max = 1000; // XXX was being omitted + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + /* XXX derp derp derp */ + std::string uri = "/" + rgw_fh->bucket_name() + "/"; + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + prefix = rgw_fh->relative_object_name(); + if (prefix.length() > 0) + prefix += "/"; + delimiter = '/'; + + return 0; + } + + int operator()(const std::string_view name, const rgw_obj_key& marker, + const ceph::real_time& t, const uint64_t fsz, uint8_t type) { + + assert(name.length() > 0); // all cases handled in callers + + /* hash offset of name in parent (short name) for NFS readdir cookie */ + uint64_t off = XXH64(name.data(), name.length(), fh_key::seed); + if (unlikely(!! 
ioff)) { + *ioff = off; + } + + /* update traversal cache */ + rgw_fh->add_marker(off, marker, type); + ++d_count; + + /* set c/mtime and size from bucket index entry */ + struct stat st = {}; +#ifdef HAVE_STAT_ST_MTIMESPEC_TV_NSEC + st.st_atimespec = ceph::real_clock::to_timespec(t); + st.st_mtimespec = st.st_atimespec; + st.st_ctimespec = st.st_atimespec; +#else + st.st_atim = ceph::real_clock::to_timespec(t); + st.st_mtim = st.st_atim; + st.st_ctim = st.st_atim; +#endif + st.st_size = fsz; + + return rcb(name.data(), cb_arg, off, &st, RGWFileHandle::RCB_MASK, + (type == RGW_FS_TYPE_DIRECTORY) ? + RGW_LOOKUP_FLAG_DIR : + RGW_LOOKUP_FLAG_FILE); + } + + int get_params(optional_yield) override { + max = default_max; + return 0; + } + + void send_response() override { + req_state* state = get_state(); + auto cnow = real_clock::now(); + + /* enumerate objs and common_prefixes in parallel, + * avoiding increment on and end iterator, which is + * undefined */ + + class DirIterator + { + std::vector& objs; + std::vector::iterator obj_iter; + + std::map& common_prefixes; + std::map::iterator cp_iter; + + boost::optional obj_sref; + boost::optional cp_sref; + bool _skip_cp; + + public: + + DirIterator(std::vector& objs, + std::map& common_prefixes) + : objs(objs), common_prefixes(common_prefixes), _skip_cp(false) + { + obj_iter = objs.begin(); + parse_obj(); + cp_iter = common_prefixes.begin(); + parse_cp(); + } + + bool is_obj() { + return (obj_iter != objs.end()); + } + + bool is_cp(){ + return (cp_iter != common_prefixes.end()); + } + + bool eof() { + return ((!is_obj()) && (!is_cp())); + } + + void parse_obj() { + if (is_obj()) { + std::string_view sref{obj_iter->key.name}; + size_t last_del = sref.find_last_of('/'); + if (last_del != string::npos) + sref.remove_prefix(last_del+1); + obj_sref = sref; + } + } /* parse_obj */ + + void next_obj() { + ++obj_iter; + parse_obj(); + } + + void parse_cp() { + if (is_cp()) { + /* leading-/ skip case */ + if (cp_iter->first == 
"/") { + _skip_cp = true; + return; + } else + _skip_cp = false; + + /* it's safest to modify the element in place--a suffix-modifying + * string_ref operation is problematic since ULP rgw_file callers + * will ultimately need a c-string */ + if (cp_iter->first.back() == '/') + const_cast(cp_iter->first).pop_back(); + + std::string_view sref{cp_iter->first}; + size_t last_del = sref.find_last_of('/'); + if (last_del != string::npos) + sref.remove_prefix(last_del+1); + cp_sref = sref; + } /* is_cp */ + } /* parse_cp */ + + void next_cp() { + ++cp_iter; + parse_cp(); + } + + bool skip_cp() { + return _skip_cp; + } + + bool entry_is_obj() { + return (is_obj() && + ((! is_cp()) || + (obj_sref.get() < cp_sref.get()))); + } + + std::string_view get_obj_sref() { + return obj_sref.get(); + } + + std::string_view get_cp_sref() { + return cp_sref.get(); + } + + std::vector::iterator& get_obj_iter() { + return obj_iter; + } + + std::map::iterator& get_cp_iter() { + return cp_iter; + } + + }; /* DirIterator */ + + DirIterator di{objs, common_prefixes}; + + for (;;) { + + if (di.eof()) { + break; // done + } + + /* assert: one of is_obj() || is_cp() holds */ + if (di.entry_is_obj()) { + auto sref = di.get_obj_sref(); + if (sref.empty()) { + /* recursive list of a leaf dir (iirc), do nothing */ + } else { + /* send a file entry */ + auto obj_entry = *(di.get_obj_iter()); + + lsubdout(cct, rgw, 15) << "RGWReaddirRequest " + << __func__ << " " + << "list uri=" << state->relative_uri << " " + << " prefix=" << prefix << " " + << " obj path=" << obj_entry.key.name + << " (" << sref << ")" << "" + << " mtime=" + << real_clock::to_time_t(obj_entry.meta.mtime) + << " size=" << obj_entry.meta.accounted_size + << dendl; + + if (! 
this->operator()(sref, next_marker, obj_entry.meta.mtime, + obj_entry.meta.accounted_size, + RGW_FS_TYPE_FILE)) { + /* caller cannot accept more */ + lsubdout(cct, rgw, 5) << "readdir rcb caller signalled stop" + << " dirent=" << sref.data() + << " call count=" << ix + << dendl; + rcb_eof = true; + return; + } + } + di.next_obj(); // and advance object + } else { + /* send a dir entry */ + if (! di.skip_cp()) { + auto sref = di.get_cp_sref(); + + lsubdout(cct, rgw, 15) << "RGWReaddirRequest " + << __func__ << " " + << "list uri=" << state->relative_uri << " " + << " prefix=" << prefix << " " + << " cpref=" << sref + << dendl; + + if (sref.empty()) { + /* null path segment--could be created in S3 but has no NFS + * interpretation */ + } else { + if (! this->operator()(sref, next_marker, cnow, 0, + RGW_FS_TYPE_DIRECTORY)) { + /* caller cannot accept more */ + lsubdout(cct, rgw, 5) << "readdir rcb caller signalled stop" + << " dirent=" << sref.data() + << " call count=" << ix + << dendl; + rcb_eof = true; + return; + } + } + } + di.next_cp(); // and advance common_prefixes + } /* ! di.entry_is_obj() */ + } /* for (;;) */ + } + + virtual void send_versioned_response() { + send_response(); + } + + bool eof() { + using boost::get; + + if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) { + bool is_offset = + unlikely(! get(&offset)) || + !! get(offset); + lsubdout(cct, rgw, 15) << "READDIR offset: " << + ((is_offset) ? 
offset : "(nil)") + << " next marker: " << next_marker + << " is_truncated: " << is_truncated + << dendl; + } + return !is_truncated && !rcb_eof; + } + +}; /* RGWReaddirRequest */ + +/* + dir has-children predicate (bucket objects) +*/ + +class RGWRMdirCheck : public RGWLibRequest, + public RGWListBucket /* RGWOp */ +{ +public: + const RGWFileHandle* rgw_fh; + bool valid; + bool has_children; + + RGWRMdirCheck (CephContext* _cct, std::unique_ptr _user, + const RGWFileHandle* _rgw_fh) + : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), valid(false), + has_children(false) { + default_max = 2; + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + std::string uri = "/" + rgw_fh->bucket_name() + "/"; + state->relative_uri = uri; + state->info.request_uri = uri; + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + prefix = rgw_fh->relative_object_name(); + if (prefix.length() > 0) + prefix += "/"; + delimiter = '/'; + + return 0; + } + + int get_params(optional_yield) override { + max = default_max; + return 0; + } + + void send_response() override { + valid = true; + if ((objs.size() > 1) || + (! 
objs.empty() && + (objs.front().key.name != prefix))) { + has_children = true; + return; + } + for (auto& iter : common_prefixes) { + /* readdir never produces a name for this case */ + if (iter.first == "/") + continue; + has_children = true; + break; + } + } + + virtual void send_versioned_response() { + send_response(); + } + +}; /* RGWRMdirCheck */ + +/* + create bucket +*/ + +class RGWCreateBucketRequest : public RGWLibRequest, + public RGWCreateBucket /* RGWOp */ +{ +public: + const std::string& bucket_name; + + RGWCreateBucketRequest(CephContext* _cct, std::unique_ptr _user, + std::string& _bname) + : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname) { + op = this; + } + + bool only_bucket() override { return false; } + + int read_permissions(RGWOp* op_obj, optional_yield) override { + /* we ARE a 'create bucket' request (cf. rgw_rest.cc, ll. 1305-6) */ + return 0; + } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "PUT"; + state->op = OP_PUT; + + string uri = "/" + bucket_name; + /* XXX derp derp derp */ + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? 
*/ + + return 0; + } + + int get_params(optional_yield) override { + req_state* state = get_state(); + RGWAccessControlPolicy_S3 s3policy(state->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(state->owner, state->bucket_owner, state->canned_acl); + policy = s3policy; + return ret; + } + + void send_response() override { + /* TODO: something (maybe) */ + } +}; /* RGWCreateBucketRequest */ + +/* + delete bucket +*/ + +class RGWDeleteBucketRequest : public RGWLibRequest, + public RGWDeleteBucket /* RGWOp */ +{ +public: + const std::string& bucket_name; + + RGWDeleteBucketRequest(CephContext* _cct, std::unique_ptr _user, + std::string& _bname) + : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname) { + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "DELETE"; + state->op = OP_DELETE; + + string uri = "/" + bucket_name; + /* XXX derp derp derp */ + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? 
*/ + + return 0; + } + + void send_response() override {} + +}; /* RGWDeleteBucketRequest */ + +/* + put object +*/ +class RGWPutObjRequest : public RGWLibRequest, + public RGWPutObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + buffer::list& bl; /* XXX */ + size_t bytes_written; + + RGWPutObjRequest(CephContext* _cct, std::unique_ptr _user, + const std::string& _bname, const std::string& _oname, + buffer::list& _bl) + : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname), + bl(_bl), bytes_written(0) { + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + + int rc = valid_s3_object_name(obj_name); + if (rc != 0) + return rc; + + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "PUT"; + state->op = OP_PUT; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? 
*/ + + /* XXX required in RGWOp::execute() */ + state->content_length = bl.length(); + + return 0; + } + + int get_params(optional_yield) override { + req_state* state = get_state(); + RGWAccessControlPolicy_S3 s3policy(state->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(state->owner, state->bucket_owner, state->canned_acl); + policy = s3policy; + return ret; + } + + int get_data(buffer::list& _bl) override { + /* XXX for now, use sharing semantics */ + _bl = std::move(bl); + uint32_t len = _bl.length(); + bytes_written += len; + return len; + } + + void send_response() override {} + + int verify_params() override { + if (bl.length() > cct->_conf->rgw_max_put_size) + return -ERR_TOO_LARGE; + return 0; + } + + buffer::list* get_attr(const std::string& k) { + auto iter = attrs.find(k); + return (iter != attrs.end()) ? &(iter->second) : nullptr; + } + +}; /* RGWPutObjRequest */ + +/* + get object +*/ + +class RGWReadRequest : public RGWLibRequest, + public RGWGetObj /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + void *ulp_buffer; + size_t nread; + size_t read_resid; /* initialize to len, <= sizeof(ulp_buffer) */ + bool do_hexdump = false; + + RGWReadRequest(CephContext* _cct, std::unique_ptr _user, + RGWFileHandle* _rgw_fh, uint64_t off, uint64_t len, + void *_ulp_buffer) + : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), ulp_buffer(_ulp_buffer), + nread(0), read_resid(len) { + op = this; + + /* fixup RGWGetObj (already know range parameters) */ + RGWGetObj::range_parsed = true; + RGWGetObj::get_data = true; // XXX + RGWGetObj::partial_content = true; + RGWGetObj::ofs = off; + RGWGetObj::end = off + len; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } 
+ + int header_init() override { + + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + /* XXX derp derp derp */ + state->relative_uri = make_uri(rgw_fh->bucket_name(), + rgw_fh->relative_object_name()); + state->info.request_uri = state->relative_uri; // XXX + state->info.effective_uri = state->relative_uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + return 0; + } + + int get_params(optional_yield) override { + return 0; + } + + int send_response_data(ceph::buffer::list& bl, off_t bl_off, + off_t bl_len) override { + size_t bytes; + for (auto& bp : bl.buffers()) { + /* if for some reason bl_off indicates the start-of-data is not at + * the current buffer::ptr, skip it and account */ + if (bl_off > bp.length()) { + bl_off -= bp.length(); + continue; + } + /* read no more than read_resid */ + bytes = std::min(read_resid, size_t(bp.length()-bl_off)); + memcpy(static_cast(ulp_buffer)+nread, bp.c_str()+bl_off, bytes); + read_resid -= bytes; /* reduce read_resid by bytes read */ + nread += bytes; + bl_off = 0; + /* stop if we have no residual ulp_buffer */ + if (! 
read_resid) + break; + } + return 0; + } + + int send_response_data_error(optional_yield) override { + /* S3 implementation just sends nothing--there is no side effect + * to simulate here */ + return 0; + } + + bool prefetch_data() override { return false; } + +}; /* RGWReadRequest */ + +/* + delete object +*/ + +class RGWDeleteObjRequest : public RGWLibRequest, + public RGWDeleteObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + + RGWDeleteObjRequest(CephContext* _cct, std::unique_ptr _user, + const std::string& _bname, const std::string& _oname) + : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname) { + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "DELETE"; + state->op = OP_DELETE; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? 
*/ + + return 0; + } + + void send_response() override {} + +}; /* RGWDeleteObjRequest */ + +class RGWStatObjRequest : public RGWLibRequest, + public RGWGetObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + uint64_t _size; + uint32_t flags; + + static constexpr uint32_t FLAG_NONE = 0x000; + + RGWStatObjRequest(CephContext* _cct, std::unique_ptr _user, + const std::string& _bname, const std::string& _oname, + uint32_t _flags) + : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname), + _size(0), flags(_flags) { + op = this; + + /* fixup RGWGetObj (already know range parameters) */ + RGWGetObj::range_parsed = true; + RGWGetObj::get_data = false; // XXX + RGWGetObj::partial_content = true; + RGWGetObj::ofs = 0; + RGWGetObj::end = UINT64_MAX; + } + + const char* name() const override { return "stat_obj"; } + RGWOpType get_type() override { return RGW_OP_STAT_OBJ; } + + real_time get_mtime() const { + return lastmod; + } + + /* attributes */ + uint64_t get_size() { return _size; } + real_time ctime() { return mod_time; } // XXX + real_time mtime() { return mod_time; } + std::map& get_attrs() { return attrs; } + + buffer::list* get_attr(const std::string& k) { + auto iter = attrs.find(k); + return (iter != attrs.end()) ? 
&(iter->second) : nullptr; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + /* XXX derp derp derp */ + state->relative_uri = make_uri(bucket_name, obj_name); + state->info.request_uri = state->relative_uri; // XXX + state->info.effective_uri = state->relative_uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + return 0; + } + + int get_params(optional_yield) override { + return 0; + } + + int send_response_data(ceph::buffer::list& _bl, off_t s_off, + off_t e_off) override { + /* NOP */ + /* XXX save attrs? */ + return 0; + } + + int send_response_data_error(optional_yield) override { + /* NOP */ + return 0; + } + + void execute(optional_yield y) override { + RGWGetObj::execute(y); + _size = get_state()->obj_size; + } + +}; /* RGWStatObjRequest */ + +class RGWStatBucketRequest : public RGWLibRequest, + public RGWStatBucket /* RGWOp */ +{ +public: + std::string uri; + std::map attrs; + RGWLibFS::BucketStats& bs; + + RGWStatBucketRequest(CephContext* _cct, std::unique_ptr _user, + const std::string& _path, + RGWLibFS::BucketStats& _stats) + : RGWLibRequest(_cct, std::move(_user)), bs(_stats) { + uri = "/" + _path; + op = this; + } + + buffer::list* get_attr(const std::string& k) { + auto iter = attrs.find(k); + return (iter != attrs.end()) ? 
&(iter->second) : nullptr; + } + + real_time get_ctime() const { + return bucket->get_creation_time(); + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + /* XXX derp derp derp */ + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + return 0; + } + + virtual int get_params() { + return 0; + } + + void send_response() override { + bucket->get_creation_time() = get_state()->bucket->get_info().creation_time; + bs.size = bucket->get_size(); + bs.size_rounded = bucket->get_size_rounded(); + bs.creation_time = bucket->get_creation_time(); + bs.num_entries = bucket->get_count(); + std::swap(attrs, get_state()->bucket_attrs); + } + + bool matched() { + return (bucket->get_name().length() > 0); + } + +}; /* RGWStatBucketRequest */ + +class RGWStatLeafRequest : public RGWLibRequest, + public RGWListBucket /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + std::string path; + bool matched; + bool is_dir; + bool exact_matched; + + RGWStatLeafRequest(CephContext* _cct, std::unique_ptr _user, + RGWFileHandle* _rgw_fh, const std::string& _path) + : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), path(_path), + matched(false), is_dir(false), exact_matched(false) { + default_max = 1000; // logical max {"foo", "foo/"} + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // 
assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + + /* XXX derp derp derp */ + std::string uri = "/" + rgw_fh->bucket_name() + "/"; + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + prefix = rgw_fh->relative_object_name(); + if (prefix.length() > 0) + prefix += "/"; + prefix += path; + delimiter = '/'; + + return 0; + } + + int get_params(optional_yield) override { + max = default_max; + return 0; + } + + void send_response() override { + req_state* state = get_state(); + // try objects + for (const auto& iter : objs) { + auto& name = iter.key.name; + lsubdout(cct, rgw, 15) << "RGWStatLeafRequest " + << __func__ << " " + << "list uri=" << state->relative_uri << " " + << " prefix=" << prefix << " " + << " obj path=" << name << "" + << " target = " << path << "" + << dendl; + /* XXX is there a missing match-dir case (trailing '/')? 
*/ + matched = true; + if (name == path) { + exact_matched = true; + return; + } + } + // try prefixes + for (auto& iter : common_prefixes) { + auto& name = iter.first; + lsubdout(cct, rgw, 15) << "RGWStatLeafRequest " + << __func__ << " " + << "list uri=" << state->relative_uri << " " + << " prefix=" << prefix << " " + << " pref path=" << name << " (not chomped)" + << " target = " << path << "" + << dendl; + matched = true; + /* match-dir case (trailing '/') */ + if (name == prefix + "/") { + exact_matched = true; + is_dir = true; + return; + } + } + } + + virtual void send_versioned_response() { + send_response(); + } +}; /* RGWStatLeafRequest */ + +/* + put object +*/ + +class RGWWriteRequest : public RGWLibContinuedReq, + public RGWPutObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + RGWFileHandle* rgw_fh; + std::optional aio; + std::unique_ptr processor; + rgw::sal::DataProcessor* filter; + boost::optional compressor; + CompressorRef plugin; + buffer::list data; + uint64_t timer_id; + MD5 hash; + off_t real_ofs; + size_t bytes_written; + bool eio; + + RGWWriteRequest(rgw::sal::Driver* driver, const RGWProcessEnv& penv, + std::unique_ptr _user, + RGWFileHandle* _fh, const std::string& _bname, + const std::string& _oname) + : RGWLibContinuedReq(driver->ctx(), penv, std::move(_user)), + bucket_name(_bname), obj_name(_oname), + rgw_fh(_fh), filter(nullptr), timer_id(0), real_ofs(0), + bytes_written(0), eio(false) { + + // in ctr this is not a virtual call + // invoking this classes's header_init() + (void) RGWWriteRequest::header_init(); + op = this; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // 
assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "PUT"; + state->op = OP_PUT; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? */ + + return 0; + } + + int get_params(optional_yield) override { + req_state* state = get_state(); + RGWAccessControlPolicy_S3 s3policy(state->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(state->owner, state->bucket_owner, state->canned_acl); + policy = s3policy; + return ret; + } + + int get_data(buffer::list& _bl) override { + /* XXX for now, use sharing semantics */ + uint32_t len = data.length(); + _bl = std::move(data); + bytes_written += len; + return len; + } + + void put_data(off_t off, buffer::list& _bl) { + if (off != real_ofs) { + eio = true; + } + data = std::move(_bl); + real_ofs += data.length(); + ofs = off; /* consumed in exec_continue() */ + } + + int exec_start() override; + int exec_continue() override; + int exec_finish() override; + + void send_response() override {} + + int verify_params() override { + return 0; + } +}; /* RGWWriteRequest */ + +/* + copy object +*/ +class RGWCopyObjRequest : public RGWLibRequest, + public RGWCopyObj /* RGWOp */ +{ +public: + RGWFileHandle* src_parent; + RGWFileHandle* dst_parent; + const std::string& src_name; + const std::string& dst_name; + + RGWCopyObjRequest(CephContext* _cct, std::unique_ptr _user, + RGWFileHandle* _src_parent, RGWFileHandle* _dst_parent, + const std::string& _src_name, const std::string& _dst_name) + : RGWLibRequest(_cct, std::move(_user)), src_parent(_src_parent), + dst_parent(_dst_parent), src_name(_src_name), dst_name(_dst_name) { + /* all requests have this */ + op = this; + + /* allow this request to replace 
selected attrs */ + attrs_mod = rgw::sal::ATTRSMOD_MERGE; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "PUT"; // XXX check + state->op = OP_PUT; + + state->src_bucket_name = src_parent->bucket_name(); + state->bucket_name = dst_parent->bucket_name(); + + std::string dest_obj_name = dst_parent->format_child_name(dst_name, false); + + int rc = valid_s3_object_name(dest_obj_name); + if (rc != 0) + return rc; + + state->object = RGWHandler::driver->get_object(rgw_obj_key(dest_obj_name)); + + /* XXX and fixup key attr (could optimize w/string ref and + * dest_obj_name) */ + buffer::list ux_key; + fh_key fhk = dst_parent->make_fhk(dst_name); + rgw::encode(fhk, ux_key); + emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + +#if 0 /* XXX needed? */ + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? 
*/ +#endif + + return 0; + } + + int get_params(optional_yield) override { + req_state* s = get_state(); + RGWAccessControlPolicy_S3 s3policy(s->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); + dest_policy = s3policy; + /* src_object required before RGWCopyObj::verify_permissions() */ + rgw_obj_key k = rgw_obj_key(src_name); + s->src_object = s->bucket->get_object(k); + s->object = s->src_object->clone(); // needed to avoid trap at rgw_op.cc:5150 + return ret; + } + + void send_response() override {} + void send_partial_response(off_t ofs) override {} + +}; /* RGWCopyObjRequest */ + +class RGWGetAttrsRequest : public RGWLibRequest, + public RGWGetAttrs /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + + RGWGetAttrsRequest(CephContext* _cct, + std::unique_ptr _user, + const std::string& _bname, const std::string& _oname) + : RGWLibRequest(_cct, std::move(_user)), RGWGetAttrs(), + bucket_name(_bname), obj_name(_oname) { + op = this; + } + + const flat_map>& get_attrs() { + return attrs; + } + + virtual bool only_bucket() { return false; } + + virtual int op_init() { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + virtual int header_init() { + + req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + std::string uri = make_uri(bucket_name, obj_name); + s->relative_uri = uri; + s->info.request_uri = uri; + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + return 0; + } + + virtual int get_params() { + return 0; + } + + virtual void send_response() {} + +}; /* RGWGetAttrsRequest */ + +class RGWSetAttrsRequest : public RGWLibRequest, + public RGWSetAttrs /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + + RGWSetAttrsRequest(CephContext* _cct, std::unique_ptr _user, + const std::string& _bname, const std::string& _oname) + : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname) { + op = this; + } + + const std::map& get_attrs() { + return attrs; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + req_state* state = get_state(); + state->info.method = "PUT"; + state->op = OP_PUT; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + state->relative_uri = uri; + state->info.request_uri = uri; // XXX + state->info.effective_uri = uri; + state->info.request_params = ""; + state->info.domain = ""; /* XXX ? 
*/ + + return 0; + } + + int get_params(optional_yield) override { + return 0; + } + + void send_response() override {} + +}; /* RGWSetAttrsRequest */ + +class RGWRMAttrsRequest : public RGWLibRequest, + public RGWRMAttrs /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + + RGWRMAttrsRequest(CephContext* _cct, + std::unique_ptr _user, + const std::string& _bname, const std::string& _oname) + : RGWLibRequest(_cct, std::move(_user)), RGWRMAttrs(), + bucket_name(_bname), obj_name(_oname) { + op = this; + } + + const rgw::sal::Attrs& get_attrs() { + return attrs; + } + + virtual bool only_bucket() { return false; } + + virtual int op_init() { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + virtual int header_init() { + + req_state* s = get_state(); + s->info.method = "DELETE"; + s->op = OP_PUT; + + std::string uri = make_uri(bucket_name, obj_name); + s->relative_uri = uri; + s->info.request_uri = uri; + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + return 0; + } + + virtual int get_params() { + return 0; + } + + virtual void send_response() {} + +}; /* RGWRMAttrsRequest */ + +/* + * Send request to get the rados cluster stats + */ +class RGWGetClusterStatReq : public RGWLibRequest, + public RGWGetClusterStat { +public: + struct rados_cluster_stat_t& stats_req; + RGWGetClusterStatReq(CephContext* _cct, std::unique_ptr _user, + rados_cluster_stat_t& _stats): + RGWLibRequest(_cct, std::move(_user)), stats_req(_stats){ + op = this; + } + + int op_init() override { + // assign driver, s, and dialect_handler + // framework promises to call op_init after parent init + RGWOp::init(RGWHandler::driver, get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + req_state* state = get_state(); + state->info.method = "GET"; + state->op = OP_GET; + return 0; + } + + int get_params(optional_yield) override { return 0; } + bool only_bucket() override { return false; } + void send_response() override { + stats_req.kb = stats_op.kb; + stats_req.kb_avail = stats_op.kb_avail; + stats_req.kb_used = stats_op.kb_used; + stats_req.num_objects = stats_op.num_objects; + } +}; /* RGWGetClusterStatReq */ + + +} /* namespace rgw */ diff --git a/src/rgw/rgw_flight.cc b/src/rgw/rgw_flight.cc new file mode 100644 index 000000000..f37d934b3 --- /dev/null +++ b/src/rgw/rgw_flight.cc @@ -0,0 +1,724 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright 2023 IBM + * + * See file COPYING for licensing information. 
+ */ + +#include +#include +#include +#include +#include + +#include "arrow/type.h" +#include "arrow/buffer.h" +#include "arrow/util/string_view.h" +#include "arrow/io/interfaces.h" +#include "arrow/ipc/reader.h" +#include "arrow/table.h" + +#include "arrow/flight/server.h" + +#include "parquet/arrow/reader.h" + +#include "common/dout.h" +#include "rgw_op.h" + +#include "rgw_flight.h" +#include "rgw_flight_frontend.h" + + +namespace rgw::flight { + +// Ticket and FlightKey + +std::atomic next_flight_key = null_flight_key; + +flt::Ticket FlightKeyToTicket(const FlightKey& key) { + flt::Ticket result; + result.ticket = std::to_string(key); + return result; +} + +arw::Result TicketToFlightKey(const flt::Ticket& t) { + try { + return (FlightKey) std::stoul(t.ticket); + } catch (std::invalid_argument const& ex) { + return arw::Status::Invalid( + "could not convert Ticket containing \"%s\" into a Flight Key", + t.ticket); + } catch (const std::out_of_range& ex) { + return arw::Status::Invalid( + "could not convert Ticket containing \"%s\" into a Flight Key due to range", + t.ticket); + } +} + +// FlightData + +FlightData::FlightData(const std::string& _uri, + const std::string& _tenant_name, + const std::string& _bucket_name, + const rgw_obj_key& _object_key, + uint64_t _num_records, + uint64_t _obj_size, + std::shared_ptr& _schema, + std::shared_ptr& _kv_metadata, + rgw_user _user_id) : + key(++next_flight_key), + /* expires(coarse_real_clock::now() + lifespan), */ + uri(_uri), + tenant_name(_tenant_name), + bucket_name(_bucket_name), + object_key(_object_key), + num_records(_num_records), + obj_size(_obj_size), + schema(_schema), + kv_metadata(_kv_metadata), + user_id(_user_id) +{ } + +/**** FlightStore ****/ + +FlightStore::FlightStore(const DoutPrefix& _dp) : + dp(_dp) +{ } + +FlightStore::~FlightStore() { } + +/**** MemoryFlightStore ****/ + +MemoryFlightStore::MemoryFlightStore(const DoutPrefix& _dp) : + FlightStore(_dp) +{ } + 
+MemoryFlightStore::~MemoryFlightStore() { } + +FlightKey MemoryFlightStore::add_flight(FlightData&& flight) { + std::pair result; + { + const std::lock_guard lock(mtx); + result = map.insert( {flight.key, std::move(flight)} ); + } + ceph_assertf(result.second, + "unable to add FlightData to MemoryFlightStore"); // temporary until error handling + + return result.first->second.key; +} + +arw::Result MemoryFlightStore::get_flight(const FlightKey& key) const { + const std::lock_guard lock(mtx); + auto i = map.find(key); + if (i == map.cend()) { + return arw::Status::KeyError("could not find Flight with Key %" PRIu32, + key); + } else { + return i->second; + } +} + +// returns either the next FilghtData or, if at end, empty optional +std::optional MemoryFlightStore::after_key(const FlightKey& key) const { + std::optional result; + { + const std::lock_guard lock(mtx); + auto i = map.upper_bound(key); + if (i != map.end()) { + result = i->second; + } + } + return result; +} + +int MemoryFlightStore::remove_flight(const FlightKey& key) { + return 0; +} + +int MemoryFlightStore::expire_flights() { + return 0; +} + +/**** FlightServer ****/ + +FlightServer::FlightServer(RGWProcessEnv& _env, + FlightStore* _flight_store, + const DoutPrefix& _dp) : + env(_env), + driver(env.driver), + dp(_dp), + flight_store(_flight_store) +{ } + +FlightServer::~FlightServer() +{ } + + +arw::Status FlightServer::ListFlights(const flt::ServerCallContext& context, + const flt::Criteria* criteria, + std::unique_ptr* listings) { + + // function local class to implement FlightListing interface + class RGWFlightListing : public flt::FlightListing { + + FlightStore* flight_store; + FlightKey previous_key; + + public: + + RGWFlightListing(FlightStore* flight_store) : + flight_store(flight_store), + previous_key(null_flight_key) + { } + + arw::Status Next(std::unique_ptr* info) { + std::optional fd = flight_store->after_key(previous_key); + if (fd) { + previous_key = fd->key; + auto descriptor = + 
flt::FlightDescriptor::Path( + { fd->tenant_name, fd->bucket_name, fd->object_key.name, fd->object_key.instance, fd->object_key.ns }); + flt::FlightEndpoint endpoint; + endpoint.ticket = FlightKeyToTicket(fd->key); + std::vector endpoints { endpoint }; + + ARROW_ASSIGN_OR_RAISE(flt::FlightInfo info_obj, + flt::FlightInfo::Make(*fd->schema, descriptor, endpoints, fd->num_records, fd->obj_size)); + *info = std::make_unique(std::move(info_obj)); + return arw::Status::OK(); + } else { + *info = nullptr; + return arw::Status::OK(); + } + } + }; // class RGWFlightListing + + *listings = std::make_unique(flight_store); + return arw::Status::OK(); +} // FlightServer::ListFlights + + +arw::Status FlightServer::GetFlightInfo(const flt::ServerCallContext &context, + const flt::FlightDescriptor &request, + std::unique_ptr *info) { + return arw::Status::OK(); +} // FlightServer::GetFlightInfo + + +arw::Status FlightServer::GetSchema(const flt::ServerCallContext &context, + const flt::FlightDescriptor &request, + std::unique_ptr *schema) { + return arw::Status::OK(); +} // FlightServer::GetSchema + + // A Buffer that owns its memory and frees it when the Buffer is + // destructed +class OwnedBuffer : public arw::Buffer { + + uint8_t* buffer; + +protected: + + OwnedBuffer(uint8_t* _buffer, int64_t _size) : + Buffer(_buffer, _size), + buffer(_buffer) + { } + +public: + + ~OwnedBuffer() override { + delete[] buffer; + } + + static arw::Result> make(int64_t size) { + uint8_t* buffer = new (std::nothrow) uint8_t[size]; + if (!buffer) { + return arw::Status::OutOfMemory("could not allocated buffer of size %" PRId64, size); + } + + OwnedBuffer* ptr = new OwnedBuffer(buffer, size); + std::shared_ptr result; + result.reset(ptr); + return result; + } + + // if what's read in is less than capacity + void set_size(int64_t size) { + size_ = size; + } + + // pointer that can be used to write into buffer + uint8_t* writeable_data() { + return buffer; + } +}; // class OwnedBuffer + +#if 0 // 
remove classes used for testing and incrementally building + +// make local to DoGet eventually +class LocalInputStream : public arw::io::InputStream { + + std::iostream::pos_type position; + std::fstream file; + std::shared_ptr kv_metadata; + const DoutPrefix dp; + +public: + + LocalInputStream(std::shared_ptr _kv_metadata, + const DoutPrefix _dp) : + kv_metadata(_kv_metadata), + dp(_dp) + {} + + arw::Status Open() { + file.open("/tmp/green_tripdata_2022-04.parquet", std::ios::in); + if (!file.good()) { + return arw::Status::IOError("unable to open file"); + } + + INFO << "file opened successfully" << dendl; + position = file.tellg(); + return arw::Status::OK(); + } + + arw::Status Close() override { + file.close(); + INFO << "file closed" << dendl; + return arw::Status::OK(); + } + + arw::Result Tell() const override { + if (position < 0) { + return arw::Status::IOError( + "could not query file implementaiton with tellg"); + } else { + return int64_t(position); + } + } + + bool closed() const override { + return file.is_open(); + } + + arw::Result Read(int64_t nbytes, void* out) override { + INFO << "entered: asking for " << nbytes << " bytes" << dendl; + if (file.read(reinterpret_cast(out), + reinterpret_cast(nbytes))) { + const std::streamsize bytes_read = file.gcount(); + INFO << "Point A: read bytes " << bytes_read << dendl; + position = file.tellg(); + return bytes_read; + } else { + ERROR << "unable to read from file" << dendl; + return arw::Status::IOError("unable to read from offset %" PRId64, + int64_t(position)); + } + } + + arw::Result> Read(int64_t nbytes) override { + INFO << "entered: " << ": asking for " << nbytes << " bytes" << dendl; + + std::shared_ptr buffer; + ARROW_ASSIGN_OR_RAISE(buffer, OwnedBuffer::make(nbytes)); + + if (file.read(reinterpret_cast(buffer->writeable_data()), + reinterpret_cast(nbytes))) { + const auto bytes_read = file.gcount(); + INFO << "Point B: read bytes " << bytes_read << dendl; + // buffer->set_size(bytes_read); + 
position = file.tellg(); + return buffer; + } else if (file.rdstate() & std::ifstream::failbit && + file.rdstate() & std::ifstream::eofbit) { + const auto bytes_read = file.gcount(); + INFO << "3 read bytes " << bytes_read << " and reached EOF" << dendl; + // buffer->set_size(bytes_read); + position = file.tellg(); + return buffer; + } else { + ERROR << "unable to read from file" << dendl; + return arw::Status::IOError("unable to read from offset %ld", position); + } + } + + arw::Result Peek(int64_t nbytes) override { + INFO << "called, not implemented" << dendl; + return arw::Status::NotImplemented("peek not currently allowed"); + } + + bool supports_zero_copy() const override { + return false; + } + + arw::Result> ReadMetadata() override { + INFO << "called" << dendl; + return kv_metadata; + } +}; // class LocalInputStream + +class LocalRandomAccessFile : public arw::io::RandomAccessFile { + + FlightData flight_data; + const DoutPrefix dp; + + std::iostream::pos_type position; + std::fstream file; + +public: + LocalRandomAccessFile(const FlightData& _flight_data, const DoutPrefix _dp) : + flight_data(_flight_data), + dp(_dp) + { } + + // implement InputStream + + arw::Status Open() { + file.open("/tmp/green_tripdata_2022-04.parquet", std::ios::in); + if (!file.good()) { + return arw::Status::IOError("unable to open file"); + } + + INFO << "file opened successfully" << dendl; + position = file.tellg(); + return arw::Status::OK(); + } + + arw::Status Close() override { + file.close(); + INFO << "file closed" << dendl; + return arw::Status::OK(); + } + + arw::Result Tell() const override { + if (position < 0) { + return arw::Status::IOError( + "could not query file implementaiton with tellg"); + } else { + return int64_t(position); + } + } + + bool closed() const override { + return file.is_open(); + } + + arw::Result Read(int64_t nbytes, void* out) override { + INFO << "entered: asking for " << nbytes << " bytes" << dendl; + if (file.read(reinterpret_cast(out), + 
reinterpret_cast(nbytes))) { + const std::streamsize bytes_read = file.gcount(); + INFO << "Point A: read bytes " << bytes_read << dendl; + position = file.tellg(); + return bytes_read; + } else { + ERROR << "unable to read from file" << dendl; + return arw::Status::IOError("unable to read from offset %" PRId64, + int64_t(position)); + } + } + + arw::Result> Read(int64_t nbytes) override { + INFO << "entered: asking for " << nbytes << " bytes" << dendl; + + std::shared_ptr buffer; + ARROW_ASSIGN_OR_RAISE(buffer, OwnedBuffer::make(nbytes)); + + if (file.read(reinterpret_cast(buffer->writeable_data()), + reinterpret_cast(nbytes))) { + const auto bytes_read = file.gcount(); + INFO << "Point B: read bytes " << bytes_read << dendl; + // buffer->set_size(bytes_read); + position = file.tellg(); + return buffer; + } else if (file.rdstate() & std::ifstream::failbit && + file.rdstate() & std::ifstream::eofbit) { + const auto bytes_read = file.gcount(); + INFO << "3 read bytes " << bytes_read << " and reached EOF" << dendl; + // buffer->set_size(bytes_read); + position = file.tellg(); + return buffer; + } else { + ERROR << "unable to read from file" << dendl; + return arw::Status::IOError("unable to read from offset %ld", position); + } + } + + bool supports_zero_copy() const override { + return false; + } + + // implement Seekable + + arw::Result GetSize() override { + return flight_data.obj_size; + } + + arw::Result Peek(int64_t nbytes) override { + std::iostream::pos_type here = file.tellg(); + if (here == -1) { + return arw::Status::IOError( + "unable to determine current position ahead of peek"); + } + + ARROW_ASSIGN_OR_RAISE(OwningStringView result, + OwningStringView::make(nbytes)); + + // read + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + Read(nbytes, (void*) result.writeable_data())); + (void) bytes_read; // silence unused variable warnings + + // return offset to original + ARROW_RETURN_NOT_OK(Seek(here)); + + return result; + } + + arw::Result> ReadMetadata() { + 
return flight_data.kv_metadata; + } + + arw::Future> ReadMetadataAsync( + const arw::io::IOContext& io_context) override { + return arw::Future>::MakeFinished(ReadMetadata()); + } + + // implement Seekable interface + + arw::Status Seek(int64_t position) { + file.seekg(position); + if (file.fail()) { + return arw::Status::IOError( + "error encountered during seek to %" PRId64, position); + } else { + return arw::Status::OK(); + } + } +}; // class LocalRandomAccessFile +#endif + +class RandomAccessObject : public arw::io::RandomAccessFile { + + FlightData flight_data; + const DoutPrefix dp; + + int64_t position; + bool is_closed; + std::unique_ptr op; + +public: + + RandomAccessObject(const FlightData& _flight_data, + std::unique_ptr& obj, + const DoutPrefix _dp) : + flight_data(_flight_data), + dp(_dp), + position(-1), + is_closed(false) + { + op = obj->get_read_op(); + } + + arw::Status Open() { + int ret = op->prepare(null_yield, &dp); + if (ret < 0) { + return arw::Status::IOError( + "unable to prepare object with error %d", ret); + } + INFO << "file opened successfully" << dendl; + position = 0; + return arw::Status::OK(); + } + + // implement InputStream + + arw::Status Close() override { + position = -1; + is_closed = true; + (void) op.reset(); + INFO << "object closed" << dendl; + return arw::Status::OK(); + } + + arw::Result Tell() const override { + if (position < 0) { + return arw::Status::IOError("could not determine position"); + } else { + return position; + } + } + + bool closed() const override { + return is_closed; + } + + arw::Result Read(int64_t nbytes, void* out) override { + INFO << "entered: asking for " << nbytes << " bytes" << dendl; + + if (position < 0) { + ERROR << "error, position indicated error" << dendl; + return arw::Status::IOError("object read op is in bad state"); + } + + // note: read function reads through end_position inclusive + int64_t end_position = position + nbytes - 1; + + bufferlist bl; + + const int64_t bytes_read = + 
op->read(position, end_position, bl, null_yield, &dp); + if (bytes_read < 0) { + const int64_t former_position = position; + position = -1; + ERROR << "read operation returned " << bytes_read << dendl; + return arw::Status::IOError( + "unable to read object at position %" PRId64 ", error code: %" PRId64, + former_position, + bytes_read); + } + + // TODO: see if there's a way to get rid of this copy, perhaps + // updating rgw::sal::read_op + bl.cbegin().copy(bytes_read, reinterpret_cast(out)); + + position += bytes_read; + + if (nbytes != bytes_read) { + INFO << "partial read: nbytes=" << nbytes << + ", bytes_read=" << bytes_read << dendl; + } + INFO << bytes_read << " bytes read" << dendl; + return bytes_read; + } + + arw::Result> Read(int64_t nbytes) override { + INFO << "entered: asking for " << nbytes << " bytes" << dendl; + + std::shared_ptr buffer; + ARROW_ASSIGN_OR_RAISE(buffer, OwnedBuffer::make(nbytes)); + + ARROW_ASSIGN_OR_RAISE(const int64_t bytes_read, + Read(nbytes, buffer->writeable_data())); + buffer->set_size(bytes_read); + + return buffer; + } + + bool supports_zero_copy() const override { + return false; + } + + // implement Seekable + + arw::Result GetSize() override { + INFO << "entered: " << flight_data.obj_size << " returned" << dendl; + return flight_data.obj_size; + } + + arw::Result Peek(int64_t nbytes) override { + INFO << "entered: " << nbytes << " bytes" << dendl; + + int64_t saved_position = position; + + ARROW_ASSIGN_OR_RAISE(OwningStringView buffer, + OwningStringView::make(nbytes)); + + ARROW_ASSIGN_OR_RAISE(const int64_t bytes_read, + Read(nbytes, (void*) buffer.writeable_data())); + + // restore position for a peek + position = saved_position; + + if (bytes_read < nbytes) { + // create new OwningStringView with moved buffer + return OwningStringView::shrink(std::move(buffer), bytes_read); + } else { + return buffer; + } + } + + arw::Result> ReadMetadata() { + return flight_data.kv_metadata; + } + + arw::Future> ReadMetadataAsync( + 
/**
 * Serve the data for a previously registered flight.
 *
 * Resolves the ticket to its FlightData, loads the user and bucket recorded
 * there, opens the object as a Parquet file, reads it fully into a table
 * and hands the record batches back as a stream.
 *
 * @param context  call context (unused)
 * @param request  ticket identifying the flight
 * @param stream   receives the resulting data stream on success
 * @return OK, or the first error encountered
 *
 * Failures to load the user or bucket are returned to the client instead of
 * only being logged: continuing would dereference an unset bucket.
 */
arw::Status FlightServer::DoGet(const flt::ServerCallContext &context,
                                const flt::Ticket &request,
                                std::unique_ptr<flt::FlightDataStream> *stream) {
  ARROW_ASSIGN_OR_RAISE(FlightKey key, TicketToFlightKey(request));
  ARROW_ASSIGN_OR_RAISE(FlightData fd, get_flight_store()->get_flight(key));

  auto user = driver->get_user(fd.user_id);
  if (user->empty()) {
    INFO << "user is empty" << dendl;
  } else {
    // TODO: test what happens if user is not loaded
    const int ret = user->load_user(&dp, null_yield);
    if (ret < 0) {
      ERROR << "load_user returned " << ret << dendl;
      return arw::Status::IOError(
        "unable to load user, error code: ", ret);
    }
    INFO << "user is " << user->get_display_name() << dendl;
  }

  std::unique_ptr<rgw::sal::Bucket> bucket;

  const int ret = driver->get_bucket(&dp, &(*user), fd.tenant_name,
                                     fd.bucket_name, &bucket, null_yield);
  if (ret < 0 || !bucket) {
    ERROR << "get_bucket returned " << ret << dendl;
    return arw::Status::IOError(
      "unable to get bucket ", fd.bucket_name, ", error code: ", ret);
  }

  auto object = bucket->get_object(fd.object_key);
  if (!object) {
    ERROR << "get_object returned no object" << dendl;
    return arw::Status::IOError(
      "unable to get object ", fd.object_key.name,
      " in bucket ", fd.bucket_name);
  }

  auto input = std::make_shared<RandomAccessObject>(fd, object, dp);
  ARROW_RETURN_NOT_OK(input->Open());

  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input,
                                               arw::default_memory_pool(),
                                               &reader));

  std::shared_ptr<arw::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));

  std::vector<std::shared_ptr<arw::RecordBatch>> batches;
  arw::TableBatchReader batch_reader(*table);
  ARROW_RETURN_NOT_OK(batch_reader.ReadAll(&batches));

  ARROW_ASSIGN_OR_RAISE(auto owning_reader,
                        arw::RecordBatchReader::Make(
                          std::move(batches), table->schema()));
  *stream = std::make_unique<flt::RecordBatchStream>(owning_reader);

  return arw::Status::OK();
} // flightServer::DoGet
+ uint64_t num_records; + uint64_t obj_size; + std::shared_ptr schema; + std::shared_ptr kv_metadata; + + rgw_user user_id; // TODO: this should be removed when we do + // proper flight authentication + + FlightData(const std::string& _uri, + const std::string& _tenant_name, + const std::string& _bucket_name, + const rgw_obj_key& _object_key, + uint64_t _num_records, + uint64_t _obj_size, + std::shared_ptr& _schema, + std::shared_ptr& _kv_metadata, + rgw_user _user_id); +}; + +// stores flights that have been created and helps expire them +class FlightStore { + +protected: + + const DoutPrefix& dp; + +public: + + FlightStore(const DoutPrefix& dp); + virtual ~FlightStore(); + virtual FlightKey add_flight(FlightData&& flight) = 0; + + // TODO consider returning const shared pointers to FlightData in + // the following two functions + virtual arw::Result get_flight(const FlightKey& key) const = 0; + virtual std::optional after_key(const FlightKey& key) const = 0; + + virtual int remove_flight(const FlightKey& key) = 0; + virtual int expire_flights() = 0; +}; + +class MemoryFlightStore : public FlightStore { + std::map map; + mutable std::mutex mtx; // for map + +public: + + MemoryFlightStore(const DoutPrefix& dp); + virtual ~MemoryFlightStore(); + FlightKey add_flight(FlightData&& flight) override; + arw::Result get_flight(const FlightKey& key) const override; + std::optional after_key(const FlightKey& key) const override; + int remove_flight(const FlightKey& key) override; + int expire_flights() override; +}; + +class FlightServer : public flt::FlightServerBase { + + using Data1 = std::vector>; + + RGWProcessEnv& env; + rgw::sal::Driver* driver; + const DoutPrefix& dp; + FlightStore* flight_store; + + std::map data; + +public: + + static constexpr int default_port = 8077; + + FlightServer(RGWProcessEnv& env, + FlightStore* flight_store, + const DoutPrefix& dp); + ~FlightServer() override; + + FlightStore* get_flight_store() { + return flight_store; + } + + 
arw::Status ListFlights(const flt::ServerCallContext& context, + const flt::Criteria* criteria, + std::unique_ptr* listings) override; + + arw::Status GetFlightInfo(const flt::ServerCallContext &context, + const flt::FlightDescriptor &request, + std::unique_ptr *info) override; + + arw::Status GetSchema(const flt::ServerCallContext &context, + const flt::FlightDescriptor &request, + std::unique_ptr *schema) override; + + arw::Status DoGet(const flt::ServerCallContext &context, + const flt::Ticket &request, + std::unique_ptr *stream) override; +}; // class FlightServer + +class OwningStringView : public arw::util::string_view { + + uint8_t* buffer; + int64_t capacity; + int64_t consumed; + + OwningStringView(uint8_t* _buffer, int64_t _size) : + arw::util::string_view((const char*) _buffer, _size), + buffer(_buffer), + capacity(_size), + consumed(_size) + { } + + OwningStringView(OwningStringView&& from, int64_t new_size) : + buffer(nullptr), + capacity(from.capacity), + consumed(new_size) + { + // should be impossible due to static function check + ceph_assertf(consumed <= capacity, "new size cannot exceed capacity"); + + std::swap(buffer, from.buffer); + from.capacity = 0; + from.consumed = 0; + } + +public: + + OwningStringView(OwningStringView&&) = default; + OwningStringView& operator=(OwningStringView&&) = default; + + uint8_t* writeable_data() { + return buffer; + } + + ~OwningStringView() { + if (buffer) { + delete[] buffer; + } + } + + static arw::Result make(int64_t size) { + uint8_t* buffer = new uint8_t[size]; + if (!buffer) { + return arw::Status::OutOfMemory("could not allocated buffer of size %" PRId64, size); + } + return OwningStringView(buffer, size); + } + + static arw::Result shrink(OwningStringView&& from, + int64_t new_size) { + if (new_size > from.capacity) { + return arw::Status::Invalid("new size cannot exceed capacity"); + } else { + return OwningStringView(std::move(from), new_size); + } + } + +}; + +// GLOBAL + +flt::Ticket 
FlightKeyToTicket(const FlightKey& key); +arw::Status TicketToFlightKey(const flt::Ticket& t, FlightKey& key); + +} // namespace rgw::flight diff --git a/src/rgw/rgw_flight_frontend.cc b/src/rgw/rgw_flight_frontend.cc new file mode 100644 index 000000000..c29703fe5 --- /dev/null +++ b/src/rgw/rgw_flight_frontend.cc @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright 2023 IBM + * + * See file COPYING for licensing information. + */ + +#include +#include +#include + +#include "arrow/type.h" +#include "arrow/flight/server.h" +#include "arrow/io/file.h" + +#include "parquet/arrow/reader.h" +#include "parquet/arrow/schema.h" +#include "parquet/stream_reader.h" + +#include "rgw_flight_frontend.h" +#include "rgw_flight.h" + + +// logging +constexpr unsigned dout_subsys = ceph_subsys_rgw_flight; +constexpr const char* dout_prefix_str = "rgw arrow_flight: "; + + +namespace rgw::flight { + +const FlightKey null_flight_key = 0; + +FlightFrontend::FlightFrontend(RGWProcessEnv& _env, + RGWFrontendConfig* _config, + int _port) : + env(_env), + config(_config), + port(_port), + dp(env.driver->ctx(), dout_subsys, dout_prefix_str) +{ + env.flight_store = new MemoryFlightStore(dp); + env.flight_server = new FlightServer(env, env.flight_store, dp); + INFO << "flight server started" << dendl; +} + +FlightFrontend::~FlightFrontend() { + delete env.flight_server; + env.flight_server = nullptr; + + delete env.flight_store; + env.flight_store = nullptr; + + INFO << "flight server shut down" << dendl; +} + +int FlightFrontend::init() { + if (port <= 0) { + port = FlightServer::default_port; + } + const std::string url = + std::string("grpc+tcp://localhost:") + std::to_string(port); + flt::Location location; + arw::Status s = flt::Location::Parse(url, &location); + if (!s.ok()) { + ERROR << "couldn't parse url=" << url << ", status=" << s << dendl; + 
return -EINVAL; + } + + flt::FlightServerOptions options(location); + options.verify_client = false; + s = env.flight_server->Init(options); + if (!s.ok()) { + ERROR << "couldn't init flight server; status=" << s << dendl; + return -EINVAL; + } + + INFO << "FlightServer inited; will use port " << port << dendl; + return 0; +} + +int FlightFrontend::run() { + try { + flight_thread = make_named_thread(server_thread_name, + &FlightServer::Serve, + env.flight_server); + + INFO << "FlightServer thread started, id=" << + flight_thread.get_id() << + ", joinable=" << flight_thread.joinable() << dendl; + return 0; + } catch (std::system_error& e) { + ERROR << "FlightServer thread failed to start" << dendl; + return -e.code().value(); + } +} + +void FlightFrontend::stop() { + env.flight_server->Shutdown(); + env.flight_server->Wait(); + INFO << "FlightServer shut down" << dendl; +} + +void FlightFrontend::join() { + flight_thread.join(); + INFO << "FlightServer thread joined" << dendl; +} + +void FlightFrontend::pause_for_new_config() { + // ignore since config changes won't alter flight_server +} + +void FlightFrontend::unpause_with_new_config() { + // ignore since config changes won't alter flight_server +} + +/* ************************************************************ */ + +FlightGetObj_Filter::FlightGetObj_Filter(const req_state* request, + RGWGetObj_Filter* next) : + RGWGetObj_Filter(next), + penv(request->penv), + dp(request->cct->get(), dout_subsys, dout_prefix_str), + current_offset(0), + expected_size(request->obj_size), + uri(request->decoded_uri), + tenant_name(request->bucket->get_tenant()), + bucket_name(request->bucket->get_name()), + object_key(request->object->get_key()), + // note: what about object namespace and instance? 
+ schema_status(arrow::StatusCode::Cancelled, + "schema determination incomplete"), + user_id(request->user->get_id()) +{ +#warning "TODO: fix use of tmpnam" + char name[L_tmpnam]; + const char* namep = std::tmpnam(name); + if (!namep) { + // + } + temp_file_name = namep; + + temp_file.open(temp_file_name); +} + +FlightGetObj_Filter::~FlightGetObj_Filter() { + if (temp_file.is_open()) { + temp_file.close(); + } + std::error_code error; + std::filesystem::remove(temp_file_name, error); + if (error) { + ERROR << "FlightGetObj_Filter got error when removing temp file; " + "error=" << error.value() << + ", temp_file_name=" << temp_file_name << dendl; + } else { + INFO << "parquet/arrow schema determination status: " << + schema_status << dendl; + } +} + +int FlightGetObj_Filter::handle_data(bufferlist& bl, + off_t bl_ofs, off_t bl_len) { + INFO << "flight handling data from offset " << + current_offset << " (" << bl_ofs << ") of size " << bl_len << dendl; + + current_offset += bl_len; + + if (temp_file.is_open()) { + bl.write_stream(temp_file); + + if (current_offset >= expected_size) { + INFO << "data read is completed, current_offset=" << + current_offset << ", expected_size=" << expected_size << dendl; + temp_file.close(); + + std::shared_ptr kv_metadata; + std::shared_ptr aw_schema; + int64_t num_rows = 0; + + auto process_metadata = [&aw_schema, &num_rows, &kv_metadata, this]() -> arrow::Status { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr file, + arrow::io::ReadableFile::Open(temp_file_name)); + const std::shared_ptr metadata = parquet::ReadMetaData(file); + + file->Close(); + + num_rows = metadata->num_rows(); + kv_metadata = metadata->key_value_metadata(); + const parquet::SchemaDescriptor* pq_schema = metadata->schema(); + ARROW_RETURN_NOT_OK(parquet::arrow::FromParquetSchema(pq_schema, &aw_schema)); + + return arrow::Status::OK(); + }; + + schema_status = process_metadata(); + if (!schema_status.ok()) { + ERROR << "reading metadata to access schema, error=" << 
schema_status << dendl; + } else { + // INFO << "arrow_schema=" << *aw_schema << dendl; + FlightStore* store = penv.flight_store; + auto key = + store->add_flight(FlightData(uri, tenant_name, bucket_name, + object_key, num_rows, + expected_size, aw_schema, + kv_metadata, user_id)); + (void) key; // suppress unused variable warning + } + } // if last block + } // if file opened + + // chain to next filter in stream + int ret = RGWGetObj_Filter::handle_data(bl, bl_ofs, bl_len); + + return ret; +} + +#if 0 +void code_snippets() { + INFO << "num_columns:" << md->num_columns() << + " num_schema_elements:" << md->num_schema_elements() << + " num_rows:" << md->num_rows() << + " num_row_groups:" << md->num_row_groups() << dendl; + + + INFO << "file schema: name=" << schema1->name() << ", ToString:" << schema1->ToString() << ", num_columns=" << schema1->num_columns() << dendl; + for (int c = 0; c < schema1->num_columns(); ++c) { + const parquet::ColumnDescriptor* cd = schema1->Column(c); + // const parquet::ConvertedType::type t = cd->converted_type; + const std::shared_ptr lt = cd->logical_type(); + INFO << "column " << c << ": name=" << cd->name() << ", ToString=" << cd->ToString() << ", logical_type=" << lt->ToString() << dendl; + } + + INFO << "There are " << md->num_rows() << " rows and " << md->num_row_groups() << " row groups" << dendl; + for (int rg = 0; rg < md->num_row_groups(); ++rg) { + INFO << "Row Group " << rg << dendl; + auto rg_md = md->RowGroup(rg); + auto schema2 = rg_md->schema(); + } +} +#endif + +} // namespace rgw::flight diff --git a/src/rgw/rgw_flight_frontend.h b/src/rgw/rgw_flight_frontend.h new file mode 100644 index 000000000..dfc470a3b --- /dev/null +++ b/src/rgw/rgw_flight_frontend.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright 2023 IBM + * + * See file COPYING for licensing information. 
+ */ + +#pragma once + +#include "include/common_fwd.h" +#include "common/Thread.h" +#include "rgw_frontend.h" +#include "rgw_op.h" + +#include "arrow/status.h" + + +namespace rgw::flight { + +using FlightKey = uint32_t; +extern const FlightKey null_flight_key; + +class FlightServer; + +class FlightFrontend : public RGWFrontend { + + static constexpr std::string_view server_thread_name = + "Arrow Flight Server thread"; + + RGWProcessEnv& env; + std::thread flight_thread; + RGWFrontendConfig* config; + int port; + + const DoutPrefix dp; + +public: + + // port <= 0 means let server decide; typically 8077 + FlightFrontend(RGWProcessEnv& env, + RGWFrontendConfig* config, + int port = -1); + ~FlightFrontend() override; + int init() override; + int run() override; + void stop() override; + void join() override; + + void pause_for_new_config() override; + void unpause_with_new_config() override; +}; // class FlightFrontend + +class FlightGetObj_Filter : public RGWGetObj_Filter { + + const RGWProcessEnv& penv; + const DoutPrefix dp; + FlightKey key; + uint64_t current_offset; + uint64_t expected_size; + std::string uri; + std::string tenant_name; + std::string bucket_name; + rgw_obj_key object_key; + std::string temp_file_name; + std::ofstream temp_file; + arrow::Status schema_status; + rgw_user user_id; // TODO: this should be removed when we do + // proper flight authentication + +public: + + FlightGetObj_Filter(const req_state* request, RGWGetObj_Filter* next); + ~FlightGetObj_Filter(); + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override; +#if 0 + // this would allow the range to be modified if necessary; + int fixup_range(off_t& ofs, off_t& end) override; +#endif +}; + +} // namespace rgw::flight diff --git a/src/rgw/rgw_formats.cc b/src/rgw/rgw_formats.cc new file mode 100644 index 000000000..7ff312802 --- /dev/null +++ b/src/rgw/rgw_formats.cc @@ -0,0 +1,381 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 
sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include + +#include "common/escape.h" +#include "common/Formatter.h" +#include "rgw/rgw_common.h" +#include "rgw/rgw_formats.h" +#include "rgw/rgw_rest.h" + +#define LARGE_SIZE 8192 + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWFormatter_Plain::RGWFormatter_Plain(const bool ukv) + : use_kv(ukv) +{ +} + +RGWFormatter_Plain::~RGWFormatter_Plain() +{ + free(buf); +} + +void RGWFormatter_Plain::flush(ostream& os) +{ + if (!buf) + return; + + if (len) { + os << buf; + os.flush(); + } + + reset_buf(); +} + +void RGWFormatter_Plain::reset_buf() +{ + free(buf); + buf = NULL; + len = 0; + max_len = 0; +} + +void RGWFormatter_Plain::reset() +{ + reset_buf(); + stack.clear(); + min_stack_level = 0; +} + +void RGWFormatter_Plain::open_array_section(std::string_view name) +{ + struct plain_stack_entry new_entry; + new_entry.is_array = true; + new_entry.size = 0; + + if (use_kv && min_stack_level > 0 && !stack.empty()) { + struct plain_stack_entry& entry = stack.back(); + + if (!entry.is_array) + dump_format(name, ""); + } + + stack.push_back(new_entry); +} + +void RGWFormatter_Plain::open_array_section_in_ns(std::string_view name, const char *ns) +{ + ostringstream oss; + oss << name << " " << ns; + open_array_section(oss.str().c_str()); +} + +void RGWFormatter_Plain::open_object_section(std::string_view name) +{ + struct plain_stack_entry new_entry; + new_entry.is_array = false; + new_entry.size = 0; + + if (use_kv && min_stack_level > 0) + dump_format(name, ""); + + stack.push_back(new_entry); +} + +void RGWFormatter_Plain::open_object_section_in_ns(std::string_view name, + const char *ns) +{ + ostringstream oss; 
+ oss << name << " " << ns; + open_object_section(oss.str().c_str()); +} + +void RGWFormatter_Plain::close_section() +{ + stack.pop_back(); +} + +void RGWFormatter_Plain::dump_null(std::string_view name) +{ + dump_value_int(name, "null"); /* I feel a little bad about this. */ +} + +void RGWFormatter_Plain::dump_unsigned(std::string_view name, uint64_t u) +{ + dump_value_int(name, "%" PRIu64, u); +} + +void RGWFormatter_Plain::dump_int(std::string_view name, int64_t u) +{ + dump_value_int(name, "%" PRId64, u); +} + +void RGWFormatter_Plain::dump_float(std::string_view name, double d) +{ + dump_value_int(name, "%f", d); +} + +void RGWFormatter_Plain::dump_string(std::string_view name, std::string_view s) +{ + dump_format(name, "%.*s", s.size(), s.data()); +} + +std::ostream& RGWFormatter_Plain::dump_stream(std::string_view name) +{ + // TODO: implement this! + ceph_abort(); +} + +void RGWFormatter_Plain::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) +{ + char buf[LARGE_SIZE]; + + struct plain_stack_entry& entry = stack.back(); + + if (!min_stack_level) + min_stack_level = stack.size(); + + bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv); + + entry.size++; + + if (!should_print) + return; + + vsnprintf(buf, LARGE_SIZE, fmt, ap); + + const char *eol; + if (wrote_something) { + if (use_kv && entry.is_array && entry.size > 1) + eol = ", "; + else + eol = "\n"; + } else + eol = ""; + wrote_something = true; + + if (use_kv && !entry.is_array) + write_data("%s%.*s: %s", eol, name.size(), name.data(), buf); + else + write_data("%s%s", eol, buf); +} + +int RGWFormatter_Plain::get_len() const +{ + // don't include null termination in length + return (len ? len - 1 : 0); +} + +void RGWFormatter_Plain::write_raw_data(const char *data) +{ + write_data("%s", data); +} + +void RGWFormatter_Plain::write_data(const char *fmt, ...) 
+{ +#define LARGE_ENOUGH_LEN 128 + int n, size = LARGE_ENOUGH_LEN; + char s[size + 8]; + char *p, *np; + bool p_on_stack; + va_list ap; + int pos; + + p = s; + p_on_stack = true; + + while (1) { + va_start(ap, fmt); + n = vsnprintf(p, size, fmt, ap); + va_end(ap); + + if (n > -1 && n < size) + goto done; + /* Else try again with more space. */ + if (n > -1) /* glibc 2.1 */ + size = n+1; /* precisely what is needed */ + else /* glibc 2.0 */ + size *= 2; /* twice the old size */ + if (p_on_stack) + np = (char *)malloc(size + 8); + else + np = (char *)realloc(p, size + 8); + if (!np) + goto done_free; + p = np; + p_on_stack = false; + } +done: +#define LARGE_ENOUGH_BUF 4096 + if (!buf) { + max_len = std::max(LARGE_ENOUGH_BUF, size); + buf = (char *)malloc(max_len); + if (!buf) { + cerr << "ERROR: RGWFormatter_Plain::write_data: failed allocating " << max_len << " bytes" << std::endl; + goto done_free; + } + } + + if (len + size > max_len) { + max_len = len + size + LARGE_ENOUGH_BUF; + void *_realloc = NULL; + if ((_realloc = realloc(buf, max_len)) == NULL) { + cerr << "ERROR: RGWFormatter_Plain::write_data: failed allocating " << max_len << " bytes" << std::endl; + goto done_free; + } else { + buf = (char *)_realloc; + } + } + + pos = len; + if (len) + pos--; // squash null termination + strcpy(buf + pos, p); + len = pos + strlen(p) + 1; +done_free: + if (!p_on_stack) + free(p); +} + +void RGWFormatter_Plain::dump_value_int(std::string_view name, const char *fmt, ...) 
+{ + char buf[LARGE_SIZE]; + va_list ap; + + if (!min_stack_level) + min_stack_level = stack.size(); + + struct plain_stack_entry& entry = stack.back(); + bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv); + + entry.size++; + + if (!should_print) + return; + + va_start(ap, fmt); + vsnprintf(buf, LARGE_SIZE, fmt, ap); + va_end(ap); + + const char *eol; + if (wrote_something) { + eol = "\n"; + } else + eol = ""; + wrote_something = true; + + if (use_kv && !entry.is_array) + write_data("%s%.*s: %s", eol, name.size(), name.data(), buf); + else + write_data("%s%s", eol, buf); + +} + + +/* An utility class that serves as a mean to access the protected static + * methods of XMLFormatter. */ +class HTMLHelper : public XMLFormatter { +public: + static std::string escape(const std::string& unescaped_str) { + int len = escape_xml_attr_len(unescaped_str.c_str()); + std::string escaped(len, 0); + escape_xml_attr(unescaped_str.c_str(), escaped.data()); + return escaped; + } +}; + +void RGWSwiftWebsiteListingFormatter::generate_header( + const std::string& dir_path, + const std::string& css_path) +{ + ss << R"()"; + + ss << "Listing of " << xml_stream_escaper(dir_path) + << ""; + + if (! css_path.empty()) { + ss << boost::format(R"()") + % url_encode(css_path); + } else { + ss << R"()"; + } + + ss << ""; + + ss << R"(

Listing of )" << xml_stream_escaper(dir_path) << "

" + << R"()" + << R"()" + << R"()" + << R"()" + << R"()" + << R"()"; + + if (! prefix.empty()) { + ss << R"()" + << R"()" + << R"()" + << R"()" + << R"()"; + } +} + +void RGWSwiftWebsiteListingFormatter::generate_footer() +{ + ss << R"(
NameSizeDate
../  
)"; +} + +std::string RGWSwiftWebsiteListingFormatter::format_name( + const std::string& item_name) const +{ + return item_name.substr(prefix.length()); +} + +void RGWSwiftWebsiteListingFormatter::dump_object(const rgw_bucket_dir_entry& objent) +{ + const auto name = format_name(objent.key.name); + ss << boost::format(R"()") + % "default" + << boost::format(R"(%s)") + % url_encode(name) + % HTMLHelper::escape(name) + << boost::format(R"(%lld)") % objent.meta.size + << boost::format(R"(%s)") + % dump_time_to_str(objent.meta.mtime) + << R"()"; +} + +void RGWSwiftWebsiteListingFormatter::dump_subdir(const std::string& name) +{ + const auto fname = format_name(name); + ss << R"()" + << boost::format(R"(%s)") + % url_encode(fname) + % HTMLHelper::escape(fname) + << R"( )" + << R"( )" + << R"()"; +} diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h new file mode 100644 index 000000000..e645d3ec2 --- /dev/null +++ b/src/rgw/rgw_formats.h @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/Formatter.h" + +#include +#include +#include +#include + +struct plain_stack_entry { + int size; + bool is_array; +}; + +/* FIXME: this class is mis-named. + * FIXME: This was a hack to send certain swift messages. + * There is a much better way to do this. 
+ */ +class RGWFormatter_Plain : public Formatter { + void reset_buf(); +public: + explicit RGWFormatter_Plain(bool use_kv = false); + ~RGWFormatter_Plain() override; + + void set_status(int status, const char* status_name) override {}; + void output_header() override {}; + void output_footer() override {}; + void enable_line_break() override {}; + void flush(std::ostream& os) override; + void reset() override; + + void open_array_section(std::string_view name) override; + void open_array_section_in_ns(std::string_view name, const char *ns) override; + void open_object_section(std::string_view name) override; + void open_object_section_in_ns(std::string_view name, const char *ns) override; + void close_section() override; + void dump_null(std::string_view name) override; + void dump_unsigned(std::string_view name, uint64_t u) override; + void dump_int(std::string_view name, int64_t u) override; + void dump_float(std::string_view name, double d) override; + void dump_string(std::string_view name, std::string_view s) override; + std::ostream& dump_stream(std::string_view name) override; + void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override; + int get_len() const override; + void write_raw_data(const char *data) override; + +private: + void write_data(const char *fmt, ...); + void dump_value_int(std::string_view name, const char *fmt, ...); + + char *buf = nullptr; + int len = 0; + int max_len = 0; + + std::list stack; + size_t min_stack_level = 0; + bool use_kv; + bool wrote_something = 0; +}; + + +/* This is a presentation layer. No logic inside, please. */ +class RGWSwiftWebsiteListingFormatter { + std::ostream& ss; + const std::string prefix; +protected: + std::string format_name(const std::string& item_name) const; +public: + RGWSwiftWebsiteListingFormatter(std::ostream& ss, + std::string prefix) + : ss(ss), + prefix(std::move(prefix)) { + } + + /* The supplied css_path can be empty. 
In such situation a default, + * embedded style sheet will be generated. */ + void generate_header(const std::string& dir_path, + const std::string& css_path); + void generate_footer(); + void dump_object(const rgw_bucket_dir_entry& objent); + void dump_subdir(const std::string& name); +}; + + +class RGWFormatterFlusher { +protected: + Formatter *formatter; + bool flushed; + bool started; + virtual void do_flush() = 0; + virtual void do_start(int ret) {} + void set_formatter(Formatter *f) { + formatter = f; + } +public: + explicit RGWFormatterFlusher(Formatter *f) : formatter(f), flushed(false), started(false) {} + virtual ~RGWFormatterFlusher() {} + + void flush() { + do_flush(); + flushed = true; + } + + virtual void start(int client_ret) { + if (!started) + do_start(client_ret); + started = true; + } + + Formatter *get_formatter() { return formatter; } + bool did_flush() { return flushed; } + bool did_start() { return started; } +}; + +class RGWStreamFlusher : public RGWFormatterFlusher { + std::ostream& os; +protected: + void do_flush() override { + formatter->flush(os); + } +public: + RGWStreamFlusher(Formatter *f, std::ostream& _os) : RGWFormatterFlusher(f), os(_os) {} +}; + +class RGWNullFlusher : public RGWFormatterFlusher { +protected: + void do_flush() override { + } +public: + RGWNullFlusher() : RGWFormatterFlusher(nullptr) {} +}; diff --git a/src/rgw/rgw_frontend.cc b/src/rgw/rgw_frontend.cc new file mode 100644 index 000000000..ea5cbbafe --- /dev/null +++ b/src/rgw/rgw_frontend.cc @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include "rgw_frontend.h" +#include "include/str_list.h" + +#include "include/ceph_assert.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int RGWFrontendConfig::parse_config(const string& config, + std::multimap& config_map) +{ + for (auto& entry : get_str_vec(config, " ")) { + 
string key; + string val; + + if (framework.empty()) { + framework = entry; + dout(0) << "framework: " << framework << dendl; + continue; + } + + ssize_t pos = entry.find('='); + if (pos < 0) { + dout(0) << "framework conf key: " << entry << dendl; + config_map.emplace(std::move(entry), ""); + continue; + } + + int ret = parse_key_value(entry, key, val); + if (ret < 0) { + cerr << "ERROR: can't parse " << entry << std::endl; + return ret; + } + + dout(0) << "framework conf key: " << key << ", val: " << val << dendl; + config_map.emplace(std::move(key), std::move(val)); + } + + return 0; +} + +void RGWFrontendConfig::set_default_config(RGWFrontendConfig& def_conf) +{ + const auto& def_conf_map = def_conf.get_config_map(); + + for (auto& entry : def_conf_map) { + if (config_map.find(entry.first) == config_map.end()) { + config_map.emplace(entry.first, entry.second); + } + } +} + +std::optional RGWFrontendConfig::get_val(const std::string& key) +{ + auto iter = config_map.find(key); + if (iter == config_map.end()) { + return std::nullopt; + } + + return iter->second; +} + +bool RGWFrontendConfig::get_val(const string& key, const string& def_val, + string *out) +{ + auto iter = config_map.find(key); + if (iter == config_map.end()) { + *out = def_val; + return false; + } + + *out = iter->second; + return true; +} + +bool RGWFrontendConfig::get_val(const string& key, int def_val, int *out) +{ + string str; + bool found = get_val(key, "", &str); + if (!found) { + *out = def_val; + return false; + } + string err; + *out = strict_strtol(str.c_str(), 10, &err); + if (!err.empty()) { + cerr << "error parsing int: " << str << ": " << err << std::endl; + return -EINVAL; + } + return 0; +} + +void RGWProcessFrontend::stop() +{ + pprocess->close_fd(); + thread->kill(SIGUSR1); +} diff --git a/src/rgw/rgw_frontend.h b/src/rgw/rgw_frontend.h new file mode 100644 index 000000000..ca1a8cba1 --- /dev/null +++ b/src/rgw/rgw_frontend.h @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "common/RWLock.h" + +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_process_env.h" +#include "rgw_realm_reloader.h" + +#include "rgw_auth_registry.h" +#include "rgw_sal_rados.h" + +#define dout_context g_ceph_context + +namespace rgw::dmclock { + class SyncScheduler; + class ClientConfig; + class SchedulerCtx; +} + +class RGWFrontendConfig { + std::string config; + std::multimap config_map; + std::string framework; + + int parse_config(const std::string& config, + std::multimap& config_map); + +public: + explicit RGWFrontendConfig(const std::string& config) + : config(config) { + } + + int init() { + const int ret = parse_config(config, config_map); + return ret < 0 ? ret : 0; + } + + void set_default_config(RGWFrontendConfig& def_conf); + + std::optional get_val(const std::string& key); + + bool get_val(const std::string& key, + const std::string& def_val, + std::string* out); + bool get_val(const std::string& key, int def_val, int *out); + + std::string get_val(const std::string& key, + const std::string& def_val) { + std::string out; + get_val(key, def_val, &out); + return out; + } + + const std::string& get_config() { + return config; + } + + std::multimap& get_config_map() { + return config_map; + } + + std::string get_framework() const { + return framework; + } +}; + +class RGWFrontend { +public: + virtual ~RGWFrontend() {} + + virtual int init() = 0; + + virtual int run() = 0; + virtual void stop() = 0; + virtual void join() = 0; + + virtual void pause_for_new_config() = 0; + virtual void unpause_with_new_config() = 0; +}; + + +class RGWProcessFrontend : public RGWFrontend { +protected: + RGWFrontendConfig* conf; + RGWProcess* pprocess; + RGWProcessEnv& env; + RGWProcessControlThread* thread; + +public: + RGWProcessFrontend(RGWProcessEnv& pe, RGWFrontendConfig* _conf) + : conf(_conf), pprocess(nullptr), 
env(pe), thread(nullptr) { + } + + ~RGWProcessFrontend() override { + delete thread; + delete pprocess; + } + + int run() override { + ceph_assert(pprocess); /* should have initialized by init() */ + thread = new RGWProcessControlThread(pprocess); + thread->create("rgw_frontend"); + return 0; + } + + void stop() override; + + void join() override { + thread->join(); + } + + void pause_for_new_config() override { + pprocess->pause(); + } + + void unpause_with_new_config() override { + pprocess->unpause_with_new_config(); + } +}; /* RGWProcessFrontend */ + +class RGWLoadGenFrontend : public RGWProcessFrontend, public DoutPrefixProvider { +public: + RGWLoadGenFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf) + : RGWProcessFrontend(pe, _conf) {} + + CephContext *get_cct() const { + return env.driver->ctx(); + } + + unsigned get_subsys() const + { + return ceph_subsys_rgw; + } + + std::ostream& gen_prefix(std::ostream& out) const + { + return out << "rgw loadgen frontend: "; + } + + int init() override { + int num_threads; + conf->get_val("num_threads", g_conf()->rgw_thread_pool_size, &num_threads); + std::string uri_prefix; + conf->get_val("prefix", "", &uri_prefix); + + RGWLoadGenProcess *pp = new RGWLoadGenProcess( + g_ceph_context, env, num_threads, std::move(uri_prefix), conf); + + pprocess = pp; + + std::string uid_str; + conf->get_val("uid", "", &uid_str); + if (uid_str.empty()) { + derr << "ERROR: uid param must be specified for loadgen frontend" + << dendl; + return -EINVAL; + } + + rgw_user uid(uid_str); + std::unique_ptr user = env.driver->get_user(uid); + + int ret = user->load_user(this, null_yield); + if (ret < 0) { + derr << "ERROR: failed reading user info: uid=" << uid << " ret=" + << ret << dendl; + return ret; + } + + auto aiter = user->get_info().access_keys.begin(); + if (aiter == user->get_info().access_keys.end()) { + derr << "ERROR: user has no S3 access keys set" << dendl; + return -EINVAL; + } + + pp->set_access_key(aiter->second); + + 
return 0; + } +}; /* RGWLoadGenFrontend */ + +// FrontendPauser implementation for RGWRealmReloader +class RGWFrontendPauser : public RGWRealmReloader::Pauser { + std::vector &frontends; + RGWRealmReloader::Pauser* pauser; + + public: + RGWFrontendPauser(std::vector &frontends, + RGWRealmReloader::Pauser* pauser = nullptr) + : frontends(frontends), pauser(pauser) {} + + void pause() override { + for (auto frontend : frontends) + frontend->pause_for_new_config(); + if (pauser) + pauser->pause(); + } + void resume(rgw::sal::Driver* driver) override { + for (auto frontend : frontends) + frontend->unpause_with_new_config(); + if (pauser) + pauser->resume(driver); + } +}; diff --git a/src/rgw/rgw_gc_log.h b/src/rgw/rgw_gc_log.h new file mode 100644 index 000000000..a37672617 --- /dev/null +++ b/src/rgw/rgw_gc_log.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "include/rados/librados.hpp" +#include "cls/rgw/cls_rgw_types.h" + + +// initialize the cls_rgw_gc queue +void gc_log_init2(librados::ObjectWriteOperation& op, + uint64_t max_size, uint64_t max_deferred); + +// enqueue a gc entry to omap with cls_rgw +void gc_log_enqueue1(librados::ObjectWriteOperation& op, + uint32_t expiration, cls_rgw_gc_obj_info& info); + +// enqueue a gc entry to the cls_rgw_gc queue +void gc_log_enqueue2(librados::ObjectWriteOperation& op, + uint32_t expiration, const cls_rgw_gc_obj_info& info); + +// defer a gc entry in omap with cls_rgw +void gc_log_defer1(librados::ObjectWriteOperation& op, + uint32_t expiration, const cls_rgw_gc_obj_info& info); + +// defer a gc entry in the cls_rgw_gc queue +void gc_log_defer2(librados::ObjectWriteOperation& op, + uint32_t expiration, const cls_rgw_gc_obj_info& info); diff --git a/src/rgw/rgw_http_client.cc b/src/rgw/rgw_http_client.cc new file mode 100644 index 000000000..255db71a5 --- /dev/null +++ b/src/rgw/rgw_http_client.cc @@ -0,0 +1,1223 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/compat.h" +#include "common/errno.h" + + +#include +#include +#include + +#include "rgw_common.h" +#include "rgw_http_client.h" +#include "rgw_http_errors.h" +#include "common/async/completion.h" +#include "common/RefCountedObj.h" + +#include "rgw_coroutine.h" +#include "rgw_tools.h" + +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWHTTPManager *rgw_http_manager; + +struct RGWCurlHandle; + +static void do_curl_easy_cleanup(RGWCurlHandle *curl_handle); + +struct rgw_http_req_data : public RefCountedObject { + RGWCurlHandle *curl_handle{nullptr}; + curl_slist *h{nullptr}; + uint64_t id; + int ret{0}; + std::atomic done = { false }; + RGWHTTPClient *client{nullptr}; + rgw_io_id control_io_id; + void *user_info{nullptr}; + bool registered{false}; + RGWHTTPManager *mgr{nullptr}; + char error_buf[CURL_ERROR_SIZE]; + bool write_paused{false}; + bool read_paused{false}; + + optional user_ret; + + ceph::mutex lock = ceph::make_mutex("rgw_http_req_data::lock"); + ceph::condition_variable cond; + + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + std::unique_ptr completion; + + rgw_http_req_data() : id(-1) { + // FIPS zeroization audit 20191115: this memset is not security related. 
+ memset(error_buf, 0, sizeof(error_buf)); + } + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + int wait(optional_yield y) { + if (done) { + return ret; + } + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + boost::system::error_code ec; + async_wait(context, yield[ec]); + return -ec.value(); + } + // work on asio threads should be asynchronous, so warn when they block + if (is_asio_thread) { + dout(20) << "WARNING: blocking http request" << dendl; + } + std::unique_lock l{lock}; + cond.wait(l, [this]{return done==true;}); + return ret; + } + + void set_state(int bitmask); + + void finish(int r, long http_status = -1) { + std::lock_guard l{lock}; + if (http_status != -1) { + if (client) { + client->set_http_status(http_status); + } + } + ret = r; + if (curl_handle) + do_curl_easy_cleanup(curl_handle); + + if (h) + curl_slist_free_all(h); + + curl_handle = NULL; + h = NULL; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + + bool is_done() { + return done; + } + + int get_retcode() { + std::lock_guard l{lock}; + return ret; + } + + RGWHTTPManager *get_manager() { + std::lock_guard l{lock}; + return mgr; + } + + CURL *get_easy_handle() const; +}; + +struct RGWCurlHandle { + int uses; + mono_time lastuse; + CURL* h; + + explicit RGWCurlHandle(CURL* h) : uses(0), h(h) {}; + CURL* operator*() { + return this->h; + } +}; + +void rgw_http_req_data::set_state(int bitmask) { + /* no need to lock here, moreover curl_easy_pause() might trigger + * the data receive callback :/ + */ + CURLcode rc = curl_easy_pause(**curl_handle, 
bitmask); + if (rc != CURLE_OK) { + dout(0) << "ERROR: curl_easy_pause() returned rc=" << rc << dendl; + } +} + +#define MAXIDLE 5 +class RGWCurlHandles : public Thread { +public: + ceph::mutex cleaner_lock = ceph::make_mutex("RGWCurlHandles::cleaner_lock"); + std::vector saved_curl; + int cleaner_shutdown; + ceph::condition_variable cleaner_cond; + + RGWCurlHandles() : + cleaner_shutdown{0} { + } + + RGWCurlHandle* get_curl_handle(); + void release_curl_handle_now(RGWCurlHandle* curl); + void release_curl_handle(RGWCurlHandle* curl); + void flush_curl_handles(); + void* entry(); + void stop(); +}; + +RGWCurlHandle* RGWCurlHandles::get_curl_handle() { + RGWCurlHandle* curl = 0; + CURL* h; + { + std::lock_guard lock{cleaner_lock}; + if (!saved_curl.empty()) { + curl = *saved_curl.begin(); + saved_curl.erase(saved_curl.begin()); + } + } + if (curl) { + } else if ((h = curl_easy_init())) { + curl = new RGWCurlHandle{h}; + } else { + // curl = 0; + } + return curl; +} + +void RGWCurlHandles::release_curl_handle_now(RGWCurlHandle* curl) +{ + curl_easy_cleanup(**curl); + delete curl; +} + +void RGWCurlHandles::release_curl_handle(RGWCurlHandle* curl) +{ + if (cleaner_shutdown) { + release_curl_handle_now(curl); + } else { + curl_easy_reset(**curl); + std::lock_guard lock{cleaner_lock}; + curl->lastuse = mono_clock::now(); + saved_curl.insert(saved_curl.begin(), 1, curl); + } +} + +void* RGWCurlHandles::entry() +{ + RGWCurlHandle* curl; + std::unique_lock lock{cleaner_lock}; + + for (;;) { + if (cleaner_shutdown) { + if (saved_curl.empty()) + break; + } else { + cleaner_cond.wait_for(lock, std::chrono::seconds(MAXIDLE)); + } + mono_time now = mono_clock::now(); + while (!saved_curl.empty()) { + auto cend = saved_curl.end(); + --cend; + curl = *cend; + if (!cleaner_shutdown && now - curl->lastuse < std::chrono::seconds(MAXIDLE)) + break; + saved_curl.erase(cend); + release_curl_handle_now(curl); + } + } + return nullptr; +} + +void RGWCurlHandles::stop() +{ + 
std::lock_guard lock{cleaner_lock}; + cleaner_shutdown = 1; + cleaner_cond.notify_all(); +} + +void RGWCurlHandles::flush_curl_handles() +{ + stop(); + join(); + if (!saved_curl.empty()) { + dout(0) << "ERROR: " << __func__ << " failed final cleanup" << dendl; + } + saved_curl.shrink_to_fit(); +} + +CURL *rgw_http_req_data::get_easy_handle() const +{ + return **curl_handle; +} + +static RGWCurlHandles *handles; + +static RGWCurlHandle *do_curl_easy_init() +{ + return handles->get_curl_handle(); +} + +static void do_curl_easy_cleanup(RGWCurlHandle *curl_handle) +{ + handles->release_curl_handle(curl_handle); +} + +// XXX make this part of the token cache? (but that's swift-only; +// and this especially needs to integrates with s3...) + +void rgw_setup_saved_curl_handles() +{ + handles = new RGWCurlHandles(); + handles->create("rgw_curl"); +} + +void rgw_release_all_curl_handles() +{ + handles->flush_curl_handles(); + delete handles; +} + +void RGWIOProvider::assign_io(RGWIOIDProvider& io_id_provider, int io_type) +{ + if (id == 0) { + id = io_id_provider.get_next(); + } +} + +RGWHTTPClient::RGWHTTPClient(CephContext *cct, + const string& _method, + const string& _url) + : NoDoutPrefix(cct, dout_subsys), + has_send_len(false), + http_status(HTTP_STATUS_NOSTATUS), + req_data(nullptr), + verify_ssl(cct->_conf->rgw_verify_ssl), + cct(cct), + method(_method), + url(_url) { + init(); +} + +std::ostream& RGWHTTPClient::gen_prefix(std::ostream& out) const +{ + out << "http_client[" << method << "/" << url << "]"; + return out; +} + +void RGWHTTPClient::init() +{ + auto pos = url.find("://"); + if (pos == string::npos) { + host = url; + return; + } + + protocol = url.substr(0, pos); + + pos += 3; + + auto host_end_pos = url.find("/", pos); + if (host_end_pos == string::npos) { + host = url.substr(pos); + return; + } + + host = url.substr(pos, host_end_pos - pos); + resource_prefix = url.substr(host_end_pos + 1); + if (resource_prefix.size() > 0 && 
resource_prefix[resource_prefix.size() - 1] != '/') { + resource_prefix.append("/"); + } +} + +/* + * the following set of callbacks will be called either on RGWHTTPManager::process(), + * or via the RGWHTTPManager async processing. + */ +size_t RGWHTTPClient::receive_http_header(void * const ptr, + const size_t size, + const size_t nmemb, + void * const _info) +{ + rgw_http_req_data *req_data = static_cast(_info); + size_t len = size * nmemb; + + std::lock_guard l{req_data->lock}; + + if (!req_data->registered) { + return len; + } + + int ret = req_data->client->receive_header(ptr, size * nmemb); + if (ret < 0) { + dout(5) << "WARNING: client->receive_header() returned ret=" << ret << dendl; + req_data->user_ret = ret; + return CURLE_WRITE_ERROR; + } + + return len; +} + +size_t RGWHTTPClient::receive_http_data(void * const ptr, + const size_t size, + const size_t nmemb, + void * const _info) +{ + rgw_http_req_data *req_data = static_cast(_info); + size_t len = size * nmemb; + + bool pause = false; + + RGWHTTPClient *client; + + { + std::lock_guard l{req_data->lock}; + if (!req_data->registered) { + return len; + } + + client = req_data->client; + } + + size_t& skip_bytes = client->receive_pause_skip; + + if (skip_bytes >= len) { + skip_bytes -= len; + return len; + } + + int ret = client->receive_data((char *)ptr + skip_bytes, len - skip_bytes, &pause); + if (ret < 0) { + dout(5) << "WARNING: client->receive_data() returned ret=" << ret << dendl; + req_data->user_ret = ret; + return CURLE_WRITE_ERROR; + } + + if (pause) { + dout(20) << "RGWHTTPClient::receive_http_data(): pause" << dendl; + skip_bytes = len; + std::lock_guard l{req_data->lock}; + req_data->read_paused = true; + return CURL_WRITEFUNC_PAUSE; + } + + skip_bytes = 0; + + return len; +} + +size_t RGWHTTPClient::send_http_data(void * const ptr, + const size_t size, + const size_t nmemb, + void * const _info) +{ + rgw_http_req_data *req_data = static_cast(_info); + + RGWHTTPClient *client; + + { + 
std::lock_guard l{req_data->lock}; + + if (!req_data->registered) { + return 0; + } + + client = req_data->client; + } + + bool pause = false; + + int ret = client->send_data(ptr, size * nmemb, &pause); + if (ret < 0) { + dout(5) << "WARNING: client->send_data() returned ret=" << ret << dendl; + req_data->user_ret = ret; + return CURLE_READ_ERROR; + } + + if (ret == 0 && + pause) { + std::lock_guard l{req_data->lock}; + req_data->write_paused = true; + return CURL_READFUNC_PAUSE; + } + + return ret; +} + +ceph::mutex& RGWHTTPClient::get_req_lock() +{ + return req_data->lock; +} + +void RGWHTTPClient::_set_write_paused(bool pause) +{ + ceph_assert(ceph_mutex_is_locked(req_data->lock)); + + RGWHTTPManager *mgr = req_data->mgr; + if (pause == req_data->write_paused) { + return; + } + if (pause) { + mgr->set_request_state(this, SET_WRITE_PAUSED); + } else { + mgr->set_request_state(this, SET_WRITE_RESUME); + } +} + +void RGWHTTPClient::_set_read_paused(bool pause) +{ + ceph_assert(ceph_mutex_is_locked(req_data->lock)); + + RGWHTTPManager *mgr = req_data->mgr; + if (pause == req_data->read_paused) { + return; + } + if (pause) { + mgr->set_request_state(this, SET_READ_PAUSED); + } else { + mgr->set_request_state(this, SET_READ_RESUME); + } +} + +static curl_slist *headers_to_slist(param_vec_t& headers) +{ + curl_slist *h = NULL; + + param_vec_t::iterator iter; + for (iter = headers.begin(); iter != headers.end(); ++iter) { + pair& p = *iter; + string val = p.first; + + if (strncmp(val.c_str(), "HTTP_", 5) == 0) { + val = val.substr(5); + } + + /* we need to convert all underscores into dashes as some web servers forbid them + * in the http header field names + */ + for (size_t i = 0; i < val.size(); i++) { + if (val[i] == '_') { + val[i] = '-'; + } + } + + val = camelcase_dash_http_attr(val); + + // curl won't send headers with empty values unless it ends with a ; instead + if (p.second.empty()) { + val.append(1, ';'); + } else { + val.append(": "); + 
val.append(p.second); + } + h = curl_slist_append(h, val.c_str()); + } + + return h; +} + +static bool is_upload_request(const string& method) +{ + return method == "POST" || method == "PUT"; +} + +/* + * process a single simple one off request + */ +int RGWHTTPClient::process(optional_yield y) +{ + return RGWHTTP::process(this, y); +} + +string RGWHTTPClient::to_str() +{ + string method_str = (method.empty() ? "" : method); + string url_str = (url.empty() ? "" : url); + return method_str + " " + url_str; +} + +int RGWHTTPClient::get_req_retcode() +{ + if (!req_data) { + return -EINVAL; + } + + return req_data->get_retcode(); +} + +/* + * init request, will be used later with RGWHTTPManager + */ +int RGWHTTPClient::init_request(rgw_http_req_data *_req_data) +{ + ceph_assert(!req_data); + _req_data->get(); + req_data = _req_data; + + req_data->curl_handle = do_curl_easy_init(); + + CURL *easy_handle = req_data->get_easy_handle(); + + dout(20) << "sending request to " << url << dendl; + + curl_slist *h = headers_to_slist(headers); + + req_data->h = h; + + curl_easy_setopt(easy_handle, CURLOPT_CUSTOMREQUEST, method.c_str()); + curl_easy_setopt(easy_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(easy_handle, CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(easy_handle, CURLOPT_NOSIGNAL, 1L); + curl_easy_setopt(easy_handle, CURLOPT_HEADERFUNCTION, receive_http_header); + curl_easy_setopt(easy_handle, CURLOPT_WRITEHEADER, (void *)req_data); + curl_easy_setopt(easy_handle, CURLOPT_WRITEFUNCTION, receive_http_data); + curl_easy_setopt(easy_handle, CURLOPT_WRITEDATA, (void *)req_data); + curl_easy_setopt(easy_handle, CURLOPT_ERRORBUFFER, (void *)req_data->error_buf); + curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_TIME, cct->_conf->rgw_curl_low_speed_time); + curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_LIMIT, cct->_conf->rgw_curl_low_speed_limit); + curl_easy_setopt(easy_handle, CURLOPT_TCP_KEEPALIVE, cct->_conf->rgw_curl_tcp_keepalive); + 
curl_easy_setopt(easy_handle, CURLOPT_READFUNCTION, send_http_data); + curl_easy_setopt(easy_handle, CURLOPT_READDATA, (void *)req_data); + curl_easy_setopt(easy_handle, CURLOPT_BUFFERSIZE, cct->_conf->rgw_curl_buffersize); + if (send_data_hint || is_upload_request(method)) { + curl_easy_setopt(easy_handle, CURLOPT_UPLOAD, 1L); + } + if (has_send_len) { + // TODO: prevent overflow by using curl_off_t + // and: CURLOPT_INFILESIZE_LARGE, CURLOPT_POSTFIELDSIZE_LARGE + const long size = send_len; + curl_easy_setopt(easy_handle, CURLOPT_INFILESIZE, size); + if (method == "POST") { + curl_easy_setopt(easy_handle, CURLOPT_POSTFIELDSIZE, size); + // TODO: set to size smaller than 1MB should prevent the "Expect" field + // from being sent. So explicit removal is not needed + h = curl_slist_append(h, "Expect:"); + } + } + + if (method == "HEAD") { + curl_easy_setopt(easy_handle, CURLOPT_NOBODY, 1L); + } + + if (h) { + curl_easy_setopt(easy_handle, CURLOPT_HTTPHEADER, (void *)h); + } + if (!verify_ssl) { + curl_easy_setopt(easy_handle, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(easy_handle, CURLOPT_SSL_VERIFYHOST, 0L); + dout(20) << "ssl verification is set to off" << dendl; + } else { + if (!ca_path.empty()) { + curl_easy_setopt(easy_handle, CURLOPT_CAINFO, ca_path.c_str()); + dout(20) << "using custom ca cert "<< ca_path.c_str() << " for ssl" << dendl; + } + if (!client_cert.empty()) { + if (!client_key.empty()) { + curl_easy_setopt(easy_handle, CURLOPT_SSLCERT, client_cert.c_str()); + curl_easy_setopt(easy_handle, CURLOPT_SSLKEY, client_key.c_str()); + dout(20) << "using custom client cert " << client_cert.c_str() + << " and private key " << client_key.c_str() << dendl; + } else { + dout(5) << "private key is missing for client certificate" << dendl; + } + } + } + curl_easy_setopt(easy_handle, CURLOPT_PRIVATE, (void *)req_data); + curl_easy_setopt(easy_handle, CURLOPT_TIMEOUT, req_timeout); + + return 0; +} + +bool RGWHTTPClient::is_done() +{ + return 
req_data->is_done(); +} + +/* + * wait for async request to complete + */ +int RGWHTTPClient::wait(optional_yield y) +{ + return req_data->wait(y); +} + +void RGWHTTPClient::cancel() +{ + if (req_data) { + RGWHTTPManager *http_manager = req_data->mgr; + if (http_manager) { + http_manager->remove_request(this); + } + } +} + +RGWHTTPClient::~RGWHTTPClient() +{ + cancel(); + if (req_data) { + req_data->put(); + } +} + + +int RGWHTTPHeadersCollector::receive_header(void * const ptr, const size_t len) +{ + const std::string_view header_line(static_cast(ptr), len); + + /* We're tokening the line that way due to backward compatibility. */ + const size_t sep_loc = header_line.find_first_of(" \t:"); + + if (std::string_view::npos == sep_loc) { + /* Wrongly formatted header? Just skip it. */ + return 0; + } + + header_name_t name(header_line.substr(0, sep_loc)); + if (0 == relevant_headers.count(name)) { + /* Not interested in this particular header. */ + return 0; + } + + const auto value_part = header_line.substr(sep_loc + 1); + + /* Skip spaces and tabs after the separator. */ + const size_t val_loc_s = value_part.find_first_not_of(' '); + const size_t val_loc_e = value_part.find_first_of("\r\n"); + + if (std::string_view::npos == val_loc_s || + std::string_view::npos == val_loc_e) { + /* Empty value case. 
*/ + found_headers.emplace(name, header_value_t()); + } else { + found_headers.emplace(name, header_value_t( + value_part.substr(val_loc_s, val_loc_e - val_loc_s))); + } + + return 0; +} + +int RGWHTTPTransceiver::send_data(void* ptr, size_t len, bool* pause) +{ + int length_to_copy = 0; + if (post_data_index < post_data.length()) { + length_to_copy = min(post_data.length() - post_data_index, len); + memcpy(ptr, post_data.data() + post_data_index, length_to_copy); + post_data_index += length_to_copy; + } + return length_to_copy; +} + + +static int clear_signal(int fd) +{ + // since we're in non-blocking mode, we can try to read a lot more than + // one signal from signal_thread() to avoid later wakeups + std::array buf{}; + int ret = ::read(fd, (void *)buf.data(), buf.size()); + if (ret < 0) { + ret = -errno; + return ret == -EAGAIN ? 0 : ret; // clear EAGAIN + } + return 0; +} + +static int do_curl_wait(CephContext *cct, CURLM *handle, int signal_fd) +{ + int num_fds; + struct curl_waitfd wait_fd; + + wait_fd.fd = signal_fd; + wait_fd.events = CURL_WAIT_POLLIN; + wait_fd.revents = 0; + + int ret = curl_multi_wait(handle, &wait_fd, 1, cct->_conf->rgw_curl_wait_timeout_ms, &num_fds); + if (ret) { + ldout(cct, 0) << "ERROR: curl_multi_wait() returned " << ret << dendl; + return -EIO; + } + + if (wait_fd.revents > 0) { + ret = clear_signal(signal_fd); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): read() returned " << ret << dendl; + return ret; + } + } + return 0; +} + +void *RGWHTTPManager::ReqsThread::entry() +{ + manager->reqs_thread_entry(); + return NULL; +} + +/* + * RGWHTTPManager has two modes of operation: threaded and non-threaded. 
+ */ +RGWHTTPManager::RGWHTTPManager(CephContext *_cct, RGWCompletionManager *_cm) : cct(_cct), + completion_mgr(_cm) +{ + multi_handle = (void *)curl_multi_init(); + thread_pipe[0] = -1; + thread_pipe[1] = -1; +} + +RGWHTTPManager::~RGWHTTPManager() { + stop(); + if (multi_handle) + curl_multi_cleanup((CURLM *)multi_handle); +} + +void RGWHTTPManager::register_request(rgw_http_req_data *req_data) +{ + std::unique_lock rl{reqs_lock}; + req_data->id = num_reqs; + req_data->registered = true; + reqs[num_reqs] = req_data; + num_reqs++; + ldout(cct, 20) << __func__ << " mgr=" << this << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl; +} + +bool RGWHTTPManager::unregister_request(rgw_http_req_data *req_data) +{ + std::unique_lock rl{reqs_lock}; + if (!req_data->registered) { + return false; + } + req_data->get(); + req_data->registered = false; + unregistered_reqs.push_back(req_data); + ldout(cct, 20) << __func__ << " mgr=" << this << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl; + return true; +} + +void RGWHTTPManager::complete_request(rgw_http_req_data *req_data) +{ + std::unique_lock rl{reqs_lock}; + _complete_request(req_data); +} + +void RGWHTTPManager::_complete_request(rgw_http_req_data *req_data) +{ + map::iterator iter = reqs.find(req_data->id); + if (iter != reqs.end()) { + reqs.erase(iter); + } + { + std::lock_guard l{req_data->lock}; + req_data->mgr = nullptr; + } + if (completion_mgr) { + completion_mgr->complete(NULL, req_data->control_io_id, req_data->user_info); + } + + req_data->put(); +} + +void RGWHTTPManager::finish_request(rgw_http_req_data *req_data, int ret, long http_status) +{ + req_data->finish(ret, http_status); + complete_request(req_data); +} + +void RGWHTTPManager::_finish_request(rgw_http_req_data *req_data, int ret) +{ + req_data->finish(ret); + _complete_request(req_data); +} + +void RGWHTTPManager::_set_req_state(set_state& ss) +{ + 
ss.req->set_state(ss.bitmask); +} +/* + * hook request to the curl multi handle + */ +int RGWHTTPManager::link_request(rgw_http_req_data *req_data) +{ + ldout(cct, 20) << __func__ << " req_data=" << req_data << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl; + CURLMcode mstatus = curl_multi_add_handle((CURLM *)multi_handle, req_data->get_easy_handle()); + if (mstatus) { + dout(0) << "ERROR: failed on curl_multi_add_handle, status=" << mstatus << dendl; + return -EIO; + } + return 0; +} + +/* + * unhook request from the curl multi handle, and finish request if it wasn't finished yet as + * there will be no more processing on this request + */ +void RGWHTTPManager::_unlink_request(rgw_http_req_data *req_data) +{ + if (req_data->curl_handle) { + curl_multi_remove_handle((CURLM *)multi_handle, req_data->get_easy_handle()); + } + if (!req_data->is_done()) { + _finish_request(req_data, -ECANCELED); + } +} + +void RGWHTTPManager::unlink_request(rgw_http_req_data *req_data) +{ + std::unique_lock wl{reqs_lock}; + _unlink_request(req_data); +} + +void RGWHTTPManager::manage_pending_requests() +{ + reqs_lock.lock_shared(); + if (max_threaded_req == num_reqs && + unregistered_reqs.empty() && + reqs_change_state.empty()) { + reqs_lock.unlock_shared(); + return; + } + reqs_lock.unlock_shared(); + + std::unique_lock wl{reqs_lock}; + + if (!reqs_change_state.empty()) { + for (auto siter : reqs_change_state) { + _set_req_state(siter); + } + reqs_change_state.clear(); + } + + if (!unregistered_reqs.empty()) { + for (auto& r : unregistered_reqs) { + _unlink_request(r); + r->put(); + } + + unregistered_reqs.clear(); + } + + map::iterator iter = reqs.find(max_threaded_req); + + list > remove_reqs; + + for (; iter != reqs.end(); ++iter) { + rgw_http_req_data *req_data = iter->second; + int r = link_request(req_data); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to link http request" << dendl; + 
remove_reqs.push_back(std::make_pair(iter->second, r)); + } else { + max_threaded_req = iter->first + 1; + } + } + + for (auto piter : remove_reqs) { + rgw_http_req_data *req_data = piter.first; + int r = piter.second; + + _finish_request(req_data, r); + } +} + +int RGWHTTPManager::add_request(RGWHTTPClient *client) +{ + rgw_http_req_data *req_data = new rgw_http_req_data; + + int ret = client->init_request(req_data); + if (ret < 0) { + req_data->put(); + req_data = NULL; + return ret; + } + + req_data->mgr = this; + req_data->client = client; + req_data->control_io_id = client->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_CONTROL); + req_data->user_info = client->get_io_user_info(); + + register_request(req_data); + + if (!is_started) { + ret = link_request(req_data); + if (ret < 0) { + req_data->put(); + req_data = NULL; + } + return ret; + } + ret = signal_thread(); + if (ret < 0) { + finish_request(req_data, ret); + } + + return ret; +} + +int RGWHTTPManager::remove_request(RGWHTTPClient *client) +{ + rgw_http_req_data *req_data = client->get_req_data(); + + if (!is_started) { + unlink_request(req_data); + return 0; + } + if (!unregister_request(req_data)) { + return 0; + } + int ret = signal_thread(); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWHTTPManager::set_request_state(RGWHTTPClient *client, RGWHTTPRequestSetState state) +{ + rgw_http_req_data *req_data = client->get_req_data(); + + ceph_assert(ceph_mutex_is_locked(req_data->lock)); + + /* can only do that if threaded */ + if (!is_started) { + return -EINVAL; + } + + bool suggested_wr_paused = req_data->write_paused; + bool suggested_rd_paused = req_data->read_paused; + + switch (state) { + case SET_WRITE_PAUSED: + suggested_wr_paused = true; + break; + case SET_WRITE_RESUME: + suggested_wr_paused = false; + break; + case SET_READ_PAUSED: + suggested_rd_paused = true; + break; + case SET_READ_RESUME: + suggested_rd_paused = false; + break; + default: + /* shouldn't really be here */ + return 
-EIO; + } + if (suggested_wr_paused == req_data->write_paused && + suggested_rd_paused == req_data->read_paused) { + return 0; + } + + req_data->write_paused = suggested_wr_paused; + req_data->read_paused = suggested_rd_paused; + + int bitmask = CURLPAUSE_CONT; + + if (req_data->write_paused) { + bitmask |= CURLPAUSE_SEND; + } + + if (req_data->read_paused) { + bitmask |= CURLPAUSE_RECV; + } + + reqs_change_state.push_back(set_state(req_data, bitmask)); + int ret = signal_thread(); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWHTTPManager::start() +{ + if (pipe_cloexec(thread_pipe, 0) < 0) { + int e = errno; + ldout(cct, 0) << "ERROR: pipe(): " << cpp_strerror(e) << dendl; + return -e; + } + + // enable non-blocking reads + if (::fcntl(thread_pipe[0], F_SETFL, O_NONBLOCK) < 0) { + int e = errno; + ldout(cct, 0) << "ERROR: fcntl(): " << cpp_strerror(e) << dendl; + TEMP_FAILURE_RETRY(::close(thread_pipe[0])); + TEMP_FAILURE_RETRY(::close(thread_pipe[1])); + return -e; + } + + is_started = true; + reqs_thread = new ReqsThread(this); + reqs_thread->create("http_manager"); + return 0; +} + +void RGWHTTPManager::stop() +{ + if (is_stopped) { + return; + } + + is_stopped = true; + + if (is_started) { + going_down = true; + signal_thread(); + reqs_thread->join(); + delete reqs_thread; + TEMP_FAILURE_RETRY(::close(thread_pipe[1])); + TEMP_FAILURE_RETRY(::close(thread_pipe[0])); + } +} + +int RGWHTTPManager::signal_thread() +{ + uint32_t buf = 0; + int ret = write(thread_pipe[1], (void *)&buf, sizeof(buf)); + if (ret < 0) { + ret = -errno; + ldout(cct, 0) << "ERROR: " << __func__ << ": write() returned ret=" << ret << dendl; + return ret; + } + return 0; +} + +void *RGWHTTPManager::reqs_thread_entry() +{ + int still_running; + int mstatus; + + ldout(cct, 20) << __func__ << ": start" << dendl; + + while (!going_down) { + int ret = do_curl_wait(cct, (CURLM *)multi_handle, thread_pipe[0]); + if (ret < 0) { + dout(0) << "ERROR: do_curl_wait() returned: " << ret 
<< dendl; + return NULL; + } + + manage_pending_requests(); + + mstatus = curl_multi_perform((CURLM *)multi_handle, &still_running); + switch (mstatus) { + case CURLM_OK: + case CURLM_CALL_MULTI_PERFORM: + break; + default: + dout(10) << "curl_multi_perform returned: " << mstatus << dendl; + break; + } + int msgs_left; + CURLMsg *msg; + while ((msg = curl_multi_info_read((CURLM *)multi_handle, &msgs_left))) { + if (msg->msg == CURLMSG_DONE) { + int result = msg->data.result; + CURL *e = msg->easy_handle; + rgw_http_req_data *req_data; + curl_easy_getinfo(e, CURLINFO_PRIVATE, (void **)&req_data); + curl_multi_remove_handle((CURLM *)multi_handle, e); + + long http_status; + int status; + if (!req_data->user_ret) { + curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, (void **)&http_status); + + status = rgw_http_error_to_errno(http_status); + if (result != CURLE_OK && status == 0) { + dout(0) << "ERROR: curl error: " << curl_easy_strerror((CURLcode)result) << ", maybe network unstable" << dendl; + status = -EAGAIN; + } + } else { + status = *req_data->user_ret; + rgw_err err; + set_req_state_err(err, status, 0); + http_status = err.http_ret; + } + int id = req_data->id; + finish_request(req_data, status, http_status); + switch (result) { + case CURLE_OK: + break; + case CURLE_OPERATION_TIMEDOUT: + dout(0) << "WARNING: curl operation timed out, network average transfer speed less than " + << cct->_conf->rgw_curl_low_speed_limit << " Bytes per second during " << cct->_conf->rgw_curl_low_speed_time << " seconds." 
<< dendl; + default: + dout(20) << "ERROR: msg->data.result=" << result << " req_data->id=" << id << " http_status=" << http_status << dendl; + dout(20) << "ERROR: curl error: " << curl_easy_strerror((CURLcode)result) << " req_data->error_buf=" << req_data->error_buf << dendl; + break; + } + } + } + } + + + std::unique_lock rl{reqs_lock}; + for (auto r : unregistered_reqs) { + _unlink_request(r); + } + + unregistered_reqs.clear(); + + auto all_reqs = std::move(reqs); + for (auto iter : all_reqs) { + _unlink_request(iter.second); + } + + reqs.clear(); + + if (completion_mgr) { + completion_mgr->go_down(); + } + + return 0; +} + +void rgw_http_client_init(CephContext *cct) +{ + curl_global_init(CURL_GLOBAL_ALL); + rgw_http_manager = new RGWHTTPManager(cct); + rgw_http_manager->start(); +} + +void rgw_http_client_cleanup() +{ + rgw_http_manager->stop(); + delete rgw_http_manager; + curl_global_cleanup(); +} + + +int RGWHTTP::send(RGWHTTPClient *req) { + if (!req) { + return 0; + } + int r = rgw_http_manager->add_request(req); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWHTTP::process(RGWHTTPClient *req, optional_yield y) { + if (!req) { + return 0; + } + int r = send(req); + if (r < 0) { + return r; + } + + return req->wait(y); +} + diff --git a/src/rgw/rgw_http_client.h b/src/rgw/rgw_http_client.h new file mode 100644 index 000000000..dbd705a18 --- /dev/null +++ b/src/rgw/rgw_http_client.h @@ -0,0 +1,348 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/async/yield_context.h" +#include "common/Cond.h" +#include "rgw_common.h" +#include "rgw_string.h" +#include "rgw_http_client_types.h" + +#include + +using param_pair_t = std::pair; +using param_vec_t = std::vector; + +void rgw_http_client_init(CephContext *cct); +void rgw_http_client_cleanup(); + +struct rgw_http_req_data; +class RGWHTTPManager; + +class RGWHTTPClient : public RGWIOProvider, + public 
NoDoutPrefix +{ + friend class RGWHTTPManager; + + bufferlist send_bl; + bufferlist::iterator send_iter; + bool has_send_len; + long http_status; + bool send_data_hint{false}; + size_t receive_pause_skip{0}; /* how many bytes to skip next time receive_data is called + due to being paused */ + + void *user_info{nullptr}; + + rgw_http_req_data *req_data; + + bool verify_ssl; // Do not validate self signed certificates, default to false + + std::string ca_path; + + std::string client_cert; + + std::string client_key; + + std::atomic stopped { 0 }; + + +protected: + CephContext *cct; + + std::string method; + std::string url; + + std::string protocol; + std::string host; + std::string resource_prefix; + + size_t send_len{0}; + + param_vec_t headers; + + long req_timeout{0L}; + + void init(); + + RGWHTTPManager *get_manager(); + + int init_request(rgw_http_req_data *req_data); + + virtual int receive_header(void *ptr, size_t len) { + return 0; + } + virtual int receive_data(void *ptr, size_t len, bool *pause) { + return 0; + } + + virtual int send_data(void *ptr, size_t len, bool *pause=nullptr) { + return 0; + } + + /* Callbacks for libcurl. 
*/ + static size_t receive_http_header(void *ptr, + size_t size, + size_t nmemb, + void *_info); + + static size_t receive_http_data(void *ptr, + size_t size, + size_t nmemb, + void *_info); + + static size_t send_http_data(void *ptr, + size_t size, + size_t nmemb, + void *_info); + + ceph::mutex& get_req_lock(); + + /* needs to be called under req_lock() */ + void _set_write_paused(bool pause); + void _set_read_paused(bool pause); +public: + static const long HTTP_STATUS_NOSTATUS = 0; + static const long HTTP_STATUS_UNAUTHORIZED = 401; + static const long HTTP_STATUS_NOTFOUND = 404; + + static constexpr int HTTPCLIENT_IO_READ = 0x1; + static constexpr int HTTPCLIENT_IO_WRITE = 0x2; + static constexpr int HTTPCLIENT_IO_CONTROL = 0x4; + + virtual ~RGWHTTPClient(); + explicit RGWHTTPClient(CephContext *cct, + const std::string& _method, + const std::string& _url); + + std::ostream& gen_prefix(std::ostream& out) const override; + + + void append_header(const std::string& name, const std::string& val) { + headers.push_back(std::pair(name, val)); + } + + void set_send_length(size_t len) { + send_len = len; + has_send_len = true; + } + + void set_send_data_hint(bool hint) { + send_data_hint = hint; + } + + long get_http_status() const { + return http_status; + } + + void set_http_status(long _http_status) { + http_status = _http_status; + } + + void set_verify_ssl(bool flag) { + verify_ssl = flag; + } + + // set request timeout in seconds + // zero (default) mean that request will never timeout + void set_req_timeout(long timeout) { + req_timeout = timeout; + } + + int process(optional_yield y); + + int wait(optional_yield y); + void cancel(); + bool is_done(); + + rgw_http_req_data *get_req_data() { return req_data; } + + std::string to_str(); + + int get_req_retcode(); + + void set_url(const std::string& _url) { + url = _url; + } + + void set_method(const std::string& _method) { + method = _method; + } + + void set_io_user_info(void *_user_info) override { + user_info 
= _user_info; + } + + void *get_io_user_info() override { + return user_info; + } + + void set_ca_path(const std::string& _ca_path) { + ca_path = _ca_path; + } + + void set_client_cert(const std::string& _client_cert) { + client_cert = _client_cert; + } + + void set_client_key(const std::string& _client_key) { + client_key = _client_key; + } +}; + + +class RGWHTTPHeadersCollector : public RGWHTTPClient { +public: + typedef std::string header_name_t; + typedef std::string header_value_t; + typedef std::set header_spec_t; + + RGWHTTPHeadersCollector(CephContext * const cct, + const std::string& method, + const std::string& url, + const header_spec_t &relevant_headers) + : RGWHTTPClient(cct, method, url), + relevant_headers(relevant_headers) { + } + + std::map get_headers() const { + return found_headers; + } + + /* Throws std::out_of_range */ + const header_value_t& get_header_value(const header_name_t& name) const { + return found_headers.at(name); + } + +protected: + int receive_header(void *ptr, size_t len) override; + +private: + const std::set relevant_headers; + std::map found_headers; +}; + + +class RGWHTTPTransceiver : public RGWHTTPHeadersCollector { + bufferlist * const read_bl; + std::string post_data; + size_t post_data_index; + +public: + RGWHTTPTransceiver(CephContext * const cct, + const std::string& method, + const std::string& url, + bufferlist * const read_bl, + const header_spec_t intercept_headers = {}) + : RGWHTTPHeadersCollector(cct, method, url, intercept_headers), + read_bl(read_bl), + post_data_index(0) { + } + + RGWHTTPTransceiver(CephContext * const cct, + const std::string& method, + const std::string& url, + bufferlist * const read_bl, + const bool verify_ssl, + const header_spec_t intercept_headers = {}) + : RGWHTTPHeadersCollector(cct, method, url, intercept_headers), + read_bl(read_bl), + post_data_index(0) { + set_verify_ssl(verify_ssl); + } + + void set_post_data(const std::string& _post_data) { + this->post_data = _post_data; + } + 
+protected: + int send_data(void* ptr, size_t len, bool *pause=nullptr) override; + + int receive_data(void *ptr, size_t len, bool *pause) override { + read_bl->append((char *)ptr, len); + return 0; + } +}; + +typedef RGWHTTPTransceiver RGWPostHTTPData; + + +class RGWCompletionManager; + +enum RGWHTTPRequestSetState { + SET_NOP = 0, + SET_WRITE_PAUSED = 1, + SET_WRITE_RESUME = 2, + SET_READ_PAUSED = 3, + SET_READ_RESUME = 4, +}; + +class RGWHTTPManager { + struct set_state { + rgw_http_req_data *req; + int bitmask; + + set_state(rgw_http_req_data *_req, int _bitmask) : req(_req), bitmask(_bitmask) {} + }; + CephContext *cct; + RGWCompletionManager *completion_mgr; + void *multi_handle; + bool is_started = false; + std::atomic going_down { 0 }; + std::atomic is_stopped { 0 }; + + ceph::shared_mutex reqs_lock = ceph::make_shared_mutex("RGWHTTPManager::reqs_lock"); + std::map reqs; + std::list unregistered_reqs; + std::list reqs_change_state; + std::map complete_reqs; + int64_t num_reqs = 0; + int64_t max_threaded_req = 0; + int thread_pipe[2]; + + void register_request(rgw_http_req_data *req_data); + void complete_request(rgw_http_req_data *req_data); + void _complete_request(rgw_http_req_data *req_data); + bool unregister_request(rgw_http_req_data *req_data); + void _unlink_request(rgw_http_req_data *req_data); + void unlink_request(rgw_http_req_data *req_data); + void finish_request(rgw_http_req_data *req_data, int r, long http_status = -1); + void _finish_request(rgw_http_req_data *req_data, int r); + void _set_req_state(set_state& ss); + int link_request(rgw_http_req_data *req_data); + + void manage_pending_requests(); + + class ReqsThread : public Thread { + RGWHTTPManager *manager; + + public: + explicit ReqsThread(RGWHTTPManager *_m) : manager(_m) {} + void *entry() override; + }; + + ReqsThread *reqs_thread = nullptr; + + void *reqs_thread_entry(); + + int signal_thread(); + +public: + RGWHTTPManager(CephContext *_cct, RGWCompletionManager *completion_mgr = 
NULL); + ~RGWHTTPManager(); + + int start(); + void stop(); + + int add_request(RGWHTTPClient *client); + int remove_request(RGWHTTPClient *client); + int set_request_state(RGWHTTPClient *client, RGWHTTPRequestSetState state); +}; + +class RGWHTTP +{ +public: + static int send(RGWHTTPClient *req); + static int process(RGWHTTPClient *req, optional_yield y); +}; diff --git a/src/rgw/rgw_http_client_curl.cc b/src/rgw/rgw_http_client_curl.cc new file mode 100644 index 000000000..2477cfceb --- /dev/null +++ b/src/rgw/rgw_http_client_curl.cc @@ -0,0 +1,112 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_http_client_curl.h" +#include +#include +#include + +#include "rgw_common.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +#ifdef WITH_CURL_OPENSSL +#include +#endif + +#if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L +namespace openssl { + +class RGWSSLSetup +{ + std::vector locks; +public: + explicit RGWSSLSetup(int n) : locks (n){} + + void set_lock(int id){ + try { + locks.at(id).lock(); + } catch (std::out_of_range& e) { + dout(0) << __func__ << " failed to set locks" << dendl; + } + } + + void clear_lock(int id){ + try { + locks.at(id).unlock(); + } catch (std::out_of_range& e) { + dout(0) << __func__ << " failed to unlock" << dendl; + } + } +}; + + +void rgw_ssl_locking_callback(int mode, int id, const char *file, int line) +{ + static RGWSSLSetup locks(CRYPTO_num_locks()); + if (mode & CRYPTO_LOCK) + locks.set_lock(id); + else + locks.clear_lock(id); +} + +unsigned long rgw_ssl_thread_id_callback(){ + return (unsigned long)pthread_self(); +} + +void init_ssl(){ + CRYPTO_set_id_callback((unsigned long (*) ()) rgw_ssl_thread_id_callback); + CRYPTO_set_locking_callback(rgw_ssl_locking_callback); +} + +} /* namespace openssl */ +#endif // WITH_CURL_OPENSSL + + +namespace rgw { +namespace curl { + +#if defined(WITH_CURL_OPENSSL) && 
OPENSSL_API_COMPAT < 0x10100000L +void init_ssl() { + ::openssl::init_ssl(); +} + +bool fe_inits_ssl(boost::optional m, long& curl_global_flags){ + if (m) { + for (const auto& kv: *m){ + if (kv.first == "beast"){ + std::string cert; + kv.second->get_val("ssl_certificate","", &cert); + if (!cert.empty()){ + /* TODO this flag is no op for curl > 7.57 */ + curl_global_flags &= ~CURL_GLOBAL_SSL; + return true; + } + } + } + } + return false; +} +#endif // WITH_CURL_OPENSSL + +std::once_flag curl_init_flag; + +void setup_curl(boost::optional m) { + long curl_global_flags = CURL_GLOBAL_ALL; + + #if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L + if (!fe_inits_ssl(m, curl_global_flags)) + init_ssl(); + #endif + + std::call_once(curl_init_flag, curl_global_init, curl_global_flags); + rgw_setup_saved_curl_handles(); +} + +void cleanup_curl() { + rgw_release_all_curl_handles(); + curl_global_cleanup(); +} + +} /* namespace curl */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_http_client_curl.h b/src/rgw/rgw_http_client_curl.h new file mode 100644 index 000000000..a28826b0d --- /dev/null +++ b/src/rgw/rgw_http_client_curl.h @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 SUSE Linux GmBH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include +#include "rgw_frontend.h" + +namespace rgw { +namespace curl { +using fe_map_t = std::multimap ; + +void setup_curl(boost::optional m); +void cleanup_curl(); +} +} diff --git a/src/rgw/rgw_http_client_types.h b/src/rgw/rgw_http_client_types.h new file mode 100644 index 000000000..84e6ed678 --- /dev/null +++ b/src/rgw/rgw_http_client_types.h @@ -0,0 +1,69 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include + +/* Pairs a numeric IO id with an integer `channels` value; ordered by id, then by channels. */ +struct rgw_io_id { + int64_t id{0}; + int channels{0}; + + rgw_io_id() {} + rgw_io_id(int64_t _id, int _channels) : id(_id), channels(_channels) {} + + /* True when both ids are equal and at least one side has a channel bit set. Declared const so it can be used on const ids, like operator< below. NOTE(review): the check ORs the two channel values, so they need not share a bit; if a real overlap test is intended this would be `&` - confirm against callers before changing. */ + bool intersects(const rgw_io_id& rhs) const { + return (id == rhs.id && ((channels | rhs.channels) != 0)); + } + + bool operator<(const rgw_io_id& rhs) const { + if (id < rhs.id) { + return true; + } + return (id == rhs.id && + channels < rhs.channels); + } +}; + +/* Hands out ids from a counter; get_next() pre-increments and returns the new value. */ +class RGWIOIDProvider +{ + std::atomic max = {0}; + +public: + RGWIOIDProvider() {} + int64_t get_next() { + return ++max; + } +}; + +/* Base for objects that carry an IO id (initially -1) and an opaque user-info pointer supplied by subclasses. */ +class RGWIOProvider +{ + int64_t id{-1}; + +public: + RGWIOProvider() {} + virtual ~RGWIOProvider() = default; + + void assign_io(RGWIOIDProvider& io_id_provider, int io_type = -1); + rgw_io_id get_io_id(int io_type) { + return rgw_io_id{id, io_type}; + } + + virtual void set_io_user_info(void *_user_info) = 0; + virtual void *get_io_user_info() = 0; +}; + diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h new file mode 100644 index 000000000..5e052819e --- /dev/null +++ b/src/rgw/rgw_http_errors.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" + +typedef const std::map> rgw_http_errors; + +extern rgw_http_errors rgw_http_s3_errors; + +extern rgw_http_errors rgw_http_swift_errors; + +extern rgw_http_errors rgw_http_sts_errors; + +extern rgw_http_errors rgw_http_iam_errors; + +static inline int rgw_http_error_to_errno(int http_err) +{ + if (http_err >= 200 && http_err <= 299) + return 0; + switch (http_err) { + case 304: + return -ERR_NOT_MODIFIED; + case 400: + return -EINVAL; + case 401: + return -EPERM; + case 403: + return -EACCES; + case 404: + return -ENOENT; + case 405: + return -ERR_METHOD_NOT_ALLOWED; + case 409: + return -ENOTEMPTY; + case 503: + return -EBUSY; + default: + return -EIO; + } + + return 0; /* unreachable */ +} diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc new file mode 100644 index 000000000..35aeb15fc --- /dev/null +++ b/src/rgw/rgw_iam_policy.cc @@ -0,0 +1,1663 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "rapidjson/reader.h" + +#include "include/expected.hpp" + +#include "rgw_auth.h" +#include "rgw_iam_policy.h" + + +namespace { +constexpr int dout_subsys = ceph_subsys_rgw; +} + +using std::dec; +using std::hex; +using std::int64_t; +using std::size_t; +using std::string; +using std::stringstream; +using std::ostream; +using std::uint16_t; +using std::uint64_t; + +using boost::container::flat_set; +using std::regex; +using std::regex_match; +using std::smatch; + +using rapidjson::BaseReaderHandler; +using rapidjson::UTF8; +using rapidjson::SizeType; +using rapidjson::Reader; +using rapidjson::kParseCommentsFlag; +using rapidjson::kParseNumbersAsStringsFlag; +using rapidjson::StringStream; + +using rgw::auth::Principal; + +namespace rgw { +namespace IAM { +#include "rgw_iam_policy_keywords.frag.cc" 
+ +struct actpair { + const char* name; + const uint64_t bit; +}; + + + +static const actpair actpairs[] = +{{ "s3:AbortMultipartUpload", s3AbortMultipartUpload }, + { "s3:CreateBucket", s3CreateBucket }, + { "s3:DeleteBucketPolicy", s3DeleteBucketPolicy }, + { "s3:DeleteBucket", s3DeleteBucket }, + { "s3:DeleteBucketWebsite", s3DeleteBucketWebsite }, + { "s3:DeleteObject", s3DeleteObject }, + { "s3:DeleteObjectVersion", s3DeleteObjectVersion }, + { "s3:DeleteObjectTagging", s3DeleteObjectTagging }, + { "s3:DeleteObjectVersionTagging", s3DeleteObjectVersionTagging }, + { "s3:DeleteBucketPublicAccessBlock", s3DeleteBucketPublicAccessBlock}, + { "s3:DeletePublicAccessBlock", s3DeletePublicAccessBlock}, + { "s3:DeleteReplicationConfiguration", s3DeleteReplicationConfiguration }, + { "s3:GetAccelerateConfiguration", s3GetAccelerateConfiguration }, + { "s3:GetBucketAcl", s3GetBucketAcl }, + { "s3:GetBucketCORS", s3GetBucketCORS }, + { "s3:GetBucketEncryption", s3GetBucketEncryption }, + { "s3:GetBucketLocation", s3GetBucketLocation }, + { "s3:GetBucketLogging", s3GetBucketLogging }, + { "s3:GetBucketNotification", s3GetBucketNotification }, + { "s3:GetBucketPolicy", s3GetBucketPolicy }, + { "s3:GetBucketPolicyStatus", s3GetBucketPolicyStatus }, + { "s3:GetBucketPublicAccessBlock", s3GetBucketPublicAccessBlock }, + { "s3:GetBucketRequestPayment", s3GetBucketRequestPayment }, + { "s3:GetBucketTagging", s3GetBucketTagging }, + { "s3:GetBucketVersioning", s3GetBucketVersioning }, + { "s3:GetBucketWebsite", s3GetBucketWebsite }, + { "s3:GetLifecycleConfiguration", s3GetLifecycleConfiguration }, + { "s3:GetBucketObjectLockConfiguration", s3GetBucketObjectLockConfiguration }, + { "s3:GetPublicAccessBlock", s3GetPublicAccessBlock }, + { "s3:GetObjectAcl", s3GetObjectAcl }, + { "s3:GetObject", s3GetObject }, + { "s3:GetObjectTorrent", s3GetObjectTorrent }, + { "s3:GetObjectVersionAcl", s3GetObjectVersionAcl }, + { "s3:GetObjectVersion", s3GetObjectVersion }, + { 
"s3:GetObjectVersionTorrent", s3GetObjectVersionTorrent }, + { "s3:GetObjectTagging", s3GetObjectTagging }, + { "s3:GetObjectVersionTagging", s3GetObjectVersionTagging}, + { "s3:GetObjectRetention", s3GetObjectRetention}, + { "s3:GetObjectLegalHold", s3GetObjectLegalHold}, + { "s3:GetReplicationConfiguration", s3GetReplicationConfiguration }, + { "s3:ListAllMyBuckets", s3ListAllMyBuckets }, + { "s3:ListBucketMultipartUploads", s3ListBucketMultipartUploads }, + { "s3:ListBucket", s3ListBucket }, + { "s3:ListBucketVersions", s3ListBucketVersions }, + { "s3:ListMultipartUploadParts", s3ListMultipartUploadParts }, + { "s3:PutAccelerateConfiguration", s3PutAccelerateConfiguration }, + { "s3:PutBucketAcl", s3PutBucketAcl }, + { "s3:PutBucketCORS", s3PutBucketCORS }, + { "s3:PutBucketEncryption", s3PutBucketEncryption }, + { "s3:PutBucketLogging", s3PutBucketLogging }, + { "s3:PutBucketNotification", s3PutBucketNotification }, + { "s3:PutBucketPolicy", s3PutBucketPolicy }, + { "s3:PutBucketRequestPayment", s3PutBucketRequestPayment }, + { "s3:PutBucketTagging", s3PutBucketTagging }, + { "s3:PutBucketVersioning", s3PutBucketVersioning }, + { "s3:PutBucketWebsite", s3PutBucketWebsite }, + { "s3:PutLifecycleConfiguration", s3PutLifecycleConfiguration }, + { "s3:PutBucketObjectLockConfiguration", s3PutBucketObjectLockConfiguration }, + { "s3:PutObjectAcl", s3PutObjectAcl }, + { "s3:PutObject", s3PutObject }, + { "s3:PutObjectVersionAcl", s3PutObjectVersionAcl }, + { "s3:PutObjectTagging", s3PutObjectTagging }, + { "s3:PutObjectVersionTagging", s3PutObjectVersionTagging }, + { "s3:PutObjectRetention", s3PutObjectRetention }, + { "s3:PutObjectLegalHold", s3PutObjectLegalHold }, + { "s3:BypassGovernanceRetention", s3BypassGovernanceRetention }, + { "s3:PutBucketPublicAccessBlock", s3PutBucketPublicAccessBlock }, + { "s3:PutPublicAccessBlock", s3PutPublicAccessBlock }, + { "s3:PutReplicationConfiguration", s3PutReplicationConfiguration }, + { "s3:RestoreObject", s3RestoreObject 
}, + { "iam:PutUserPolicy", iamPutUserPolicy }, + { "iam:GetUserPolicy", iamGetUserPolicy }, + { "iam:DeleteUserPolicy", iamDeleteUserPolicy }, + { "iam:ListUserPolicies", iamListUserPolicies }, + { "iam:CreateRole", iamCreateRole}, + { "iam:DeleteRole", iamDeleteRole}, + { "iam:GetRole", iamGetRole}, + { "iam:ModifyRoleTrustPolicy", iamModifyRoleTrustPolicy}, + { "iam:ListRoles", iamListRoles}, + { "iam:PutRolePolicy", iamPutRolePolicy}, + { "iam:GetRolePolicy", iamGetRolePolicy}, + { "iam:ListRolePolicies", iamListRolePolicies}, + { "iam:DeleteRolePolicy", iamDeleteRolePolicy}, + { "iam:CreateOIDCProvider", iamCreateOIDCProvider}, + { "iam:DeleteOIDCProvider", iamDeleteOIDCProvider}, + { "iam:GetOIDCProvider", iamGetOIDCProvider}, + { "iam:ListOIDCProviders", iamListOIDCProviders}, + { "iam:TagRole", iamTagRole}, + { "iam:ListRoleTags", iamListRoleTags}, + { "iam:UntagRole", iamUntagRole}, + { "iam:UpdateRole", iamUpdateRole}, + { "sts:AssumeRole", stsAssumeRole}, + { "sts:AssumeRoleWithWebIdentity", stsAssumeRoleWithWebIdentity}, + { "sts:GetSessionToken", stsGetSessionToken}, + { "sts:TagSession", stsTagSession}, +}; + +struct PolicyParser; + +const Keyword top[1]{{"", TokenKind::pseudo, TokenID::Top, 0, false, + false}}; +const Keyword cond_key[1]{{"", TokenKind::cond_key, + TokenID::CondKey, 0, true, false}}; + +struct ParseState { + PolicyParser* pp; + const Keyword* w; + + bool arraying = false; + bool objecting = false; + bool cond_ifexists = false; + + void reset(); + + void annotate(std::string&& a); + + boost::optional parse_principal(string&& s, string* errmsg); + + ParseState(PolicyParser* pp, const Keyword* w) + : pp(pp), w(w) {} + + bool obj_start(); + + bool obj_end(); + + bool array_start() { + if (w->arrayable && !arraying) { + arraying = true; + return true; + } + annotate(fmt::format("`{}` does not take array.", + w->name)); + return false; + } + + bool array_end(); + + bool key(const char* s, size_t l); + bool do_string(CephContext* cct, const 
char* s, size_t l); + bool number(const char* str, size_t l); +}; + +// If this confuses you, look up the Curiously Recurring Template Pattern +struct PolicyParser : public BaseReaderHandler, PolicyParser> { + keyword_hash tokens; + std::vector s; + CephContext* cct; + const string& tenant; + Policy& policy; + uint32_t v = 0; + + const bool reject_invalid_principals; + + uint32_t seen = 0; + + std::string annotation{"No error?"}; + + uint32_t dex(TokenID in) const { + switch (in) { + case TokenID::Version: + return 0x1; + case TokenID::Id: + return 0x2; + case TokenID::Statement: + return 0x4; + case TokenID::Sid: + return 0x8; + case TokenID::Effect: + return 0x10; + case TokenID::Principal: + return 0x20; + case TokenID::NotPrincipal: + return 0x40; + case TokenID::Action: + return 0x80; + case TokenID::NotAction: + return 0x100; + case TokenID::Resource: + return 0x200; + case TokenID::NotResource: + return 0x400; + case TokenID::Condition: + return 0x800; + case TokenID::AWS: + return 0x1000; + case TokenID::Federated: + return 0x2000; + case TokenID::Service: + return 0x4000; + case TokenID::CanonicalUser: + return 0x8000; + default: + ceph_abort(); + } + } + bool test(TokenID in) { + return seen & dex(in); + } + void set(TokenID in) { + seen |= dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v |= dex(in); + } + } + void set(std::initializer_list l) { + for (auto in : l) { + seen |= dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | 
dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v |= dex(in); + } + } + } + void reset(TokenID in) { + seen &= ~dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v &= ~dex(in); + } + } + void reset(std::initializer_list l) { + for (auto in : l) { + seen &= ~dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v &= ~dex(in); + } + } + } + void reset(uint32_t& v) { + seen &= ~v; + v = 0; + } + + PolicyParser(CephContext* cct, const string& tenant, Policy& policy, + bool reject_invalid_principals) + : cct(cct), tenant(tenant), policy(policy), + reject_invalid_principals(reject_invalid_principals) {} + PolicyParser(const PolicyParser& policy) = delete; + + bool StartObject() { + if (s.empty()) { + s.push_back({this, top}); + s.back().objecting = true; + return true; + } + + return s.back().obj_start(); + } + bool EndObject(SizeType memberCount) { + if (s.empty()) { + annotation = "Attempt to end unopened object at top level."; + return false; + } + return s.back().obj_end(); + } + bool Key(const char* str, SizeType length, bool copy) { + if (s.empty()) { + annotation = "Key not allowed at top level."; + return false; + } + return s.back().key(str, length); + } + + bool String(const char* str, SizeType length, bool copy) { + if (s.empty()) { + annotation = "String not allowed at top 
level."; + return false; + } + return s.back().do_string(cct, str, length); + } + bool RawNumber(const char* str, SizeType length, bool copy) { + if (s.empty()) { + annotation = "Number not allowed at top level."; + return false; + } + + return s.back().number(str, length); + } + bool StartArray() { + if (s.empty()) { + annotation = "Array not allowed at top level."; + return false; + } + + return s.back().array_start(); + } + bool EndArray(SizeType) { + if (s.empty()) { + return false; + } + + return s.back().array_end(); + } + + bool Default() { + return false; + } +}; + + +// I really despise this misfeature of C++. +// +void ParseState::annotate(std::string&& a) { + pp->annotation = std::move(a); +} + +bool ParseState::obj_end() { + if (objecting) { + objecting = false; + if (!arraying) { + pp->s.pop_back(); + } else { + reset(); + } + return true; + } + annotate( + fmt::format("Attempt to end unopened object on keyword `{}`.", + w->name)); + return false; +} + +bool ParseState::key(const char* s, size_t l) { + auto token_len = l; + bool ifexists = false; + if (w->id == TokenID::Condition && w->kind == TokenKind::statement) { + static constexpr char IfExists[] = "IfExists"; + if (boost::algorithm::ends_with(std::string_view{s, l}, IfExists)) { + ifexists = true; + token_len -= sizeof(IfExists)-1; + } + } + auto k = pp->tokens.lookup(s, token_len); + + if (!k) { + if (w->kind == TokenKind::cond_op) { + auto id = w->id; + auto& t = pp->policy.statements.back(); + auto c_ife = cond_ifexists; + pp->s.emplace_back(pp, cond_key); + t.conditions.emplace_back(id, s, l, c_ife); + return true; + } else { + annotate(fmt::format("Unknown key `{}`.", std::string_view{s, token_len})); + return false; + } + } + + // If the token we're going with belongs within the condition at the + // top of the stack and we haven't already encountered it, push it + // on the stack + // Top + if ((((w->id == TokenID::Top) && (k->kind == TokenKind::top)) || + // Statement + ((w->id == 
TokenID::Statement) && (k->kind == TokenKind::statement)) || + + /// Principal + ((w->id == TokenID::Principal || w->id == TokenID::NotPrincipal) && + (k->kind == TokenKind::princ_type))) && + + // Check that it hasn't been encountered. Note that this + // conjoins with the run of disjunctions above. + !pp->test(k->id)) { + pp->set(k->id); + pp->s.emplace_back(pp, k); + return true; + } else if ((w->id == TokenID::Condition) && + (k->kind == TokenKind::cond_op)) { + pp->s.emplace_back(pp, k); + pp->s.back().cond_ifexists = ifexists; + return true; + } + annotate(fmt::format("Token `{}` is not allowed in the context of `{}`.", + k->name, w->name)); + return false; +} + +// I should just rewrite a few helper functions to use iterators, +// which will make all of this ever so much nicer. +boost::optional ParseState::parse_principal(string&& s, + string* errmsg) { + if ((w->id == TokenID::AWS) && (s == "*")) { + // Wildcard! + return Principal::wildcard(); + } else if (w->id == TokenID::CanonicalUser) { + // Do nothing for now. 
+ if (errmsg) + *errmsg = "RGW does not support canonical users."; + return boost::none; + } else if (w->id == TokenID::AWS || w->id == TokenID::Federated) { + // AWS and Federated ARNs + if (auto a = ARN::parse(s)) { + if (a->resource == "root") { + return Principal::tenant(std::move(a->account)); + } + + static const char rx_str[] = "([^/]*)/(.*)"; + static const regex rx(rx_str, sizeof(rx_str) - 1, + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + smatch match; + if (regex_match(a->resource, match, rx) && match.size() == 3) { + if (match[1] == "user") { + return Principal::user(std::move(a->account), + match[2]); + } + + if (match[1] == "role") { + return Principal::role(std::move(a->account), + match[2]); + } + + if (match[1] == "oidc-provider") { + return Principal::oidc_provider(std::move(match[2])); + } + if (match[1] == "assumed-role") { + return Principal::assumed_role(std::move(a->account), match[2]); + } + } + } else if (std::none_of(s.begin(), s.end(), + [](const char& c) { + return (c == ':') || (c == '/'); + })) { + // Since tenants are simply prefixes, there's no really good + // way to see if one exists or not. So we return the thing and + // let them try to match against it. + return Principal::tenant(std::move(s)); + } + if (errmsg) + *errmsg = + fmt::format( + "`{}` is not a supported AWS or Federated ARN. 
Supported ARNs are " + "forms like: " + "`arn:aws:iam::tenant:root` or a bare tenant name for a tenant, " + "`arn:aws:iam::tenant:role/role-name` for a role, " + "`arn:aws:sts::tenant:assumed-role/role-name/role-session-name` " + "for an assumed role, " + "`arn:aws:iam::tenant:user/user-name` for a user, " + "`arn:aws:iam::tenant:oidc-provider/idp-url` for OIDC.", s); + /* Stop here: falling through would replace the detailed ARN message above with the generic one below. */ + return boost::none; + } + + /* Reached only for principal types that none of the branches above handle. */ + if (errmsg) + *errmsg = fmt::format("RGW does not support principals of type `{}`.", + w->name); + return boost::none; +} + +/* Handles a JSON string value according to the keyword `w` of this parse state; returns false (after annotating the parser) when the value is not acceptable. */ +bool ParseState::do_string(CephContext* cct, const char* s, size_t l) { + auto k = pp->tokens.lookup(s, l); + Policy& p = pp->policy; + bool is_action = false; + bool is_validaction = false; + Statement* t = p.statements.empty() ? nullptr : &(p.statements.back()); + + // Top level! + if (w->id == TokenID::Version) { + if (k && k->kind == TokenKind::version_key) { + p.version = static_cast(k->specific); + } else { + annotate( + fmt::format("`{}` is not a valid version. Valid versions are " + "`2008-10-17` and `2012-10-17`.", + std::string_view{s, l})); + + return false; + } + } else if (w->id == TokenID::Id) { + p.id = string(s, l); + + // Statement + + } else if (w->id == TokenID::Sid) { + t->sid.emplace(s, l); + } else if (w->id == TokenID::Effect) { + if (k && k->kind == TokenKind::effect_key) { + t->effect = static_cast(k->specific); + } else { + annotate(fmt::format("`{}` is not a valid effect.", + std::string_view{s, l})); + return false; + } + } else if (w->id == TokenID::Principal && s && *s == '*') { + t->princ.emplace(Principal::wildcard()); + } else if (w->id == TokenID::NotPrincipal && s && *s == '*') { + t->noprinc.emplace(Principal::wildcard()); + } else if ((w->id == TokenID::Action) || + (w->id == TokenID::NotAction)) { + is_action = true; + if (*s == '*') { + is_validaction = true; + (w->id == TokenID::Action ? 
+ t->action = allValue : t->notaction = allValue); + } else { + for (auto& p : actpairs) { + if (match_policy({s, l}, p.name, MATCH_POLICY_ACTION)) { + is_validaction = true; + (w->id == TokenID::Action ? t->action[p.bit] = 1 : t->notaction[p.bit] = 1); + } + if ((t->action & s3AllValue) == s3AllValue) { + t->action[s3All] = 1; + } + if ((t->notaction & s3AllValue) == s3AllValue) { + t->notaction[s3All] = 1; + } + if ((t->action & iamAllValue) == iamAllValue) { + t->action[iamAll] = 1; + } + if ((t->notaction & iamAllValue) == iamAllValue) { + t->notaction[iamAll] = 1; + } + if ((t->action & stsAllValue) == stsAllValue) { + t->action[stsAll] = 1; + } + if ((t->notaction & stsAllValue) == stsAllValue) { + t->notaction[stsAll] = 1; + } + } + } + } else if (w->id == TokenID::Resource || w->id == TokenID::NotResource) { + auto a = ARN::parse({s, l}, true); + if (!a) { + annotate( + fmt::format("`{}` is not a valid ARN. Resource ARNs should have a " + "format like `arn:aws:s3::tenant:resource' or " + "`arn:aws:s3:::resource`.", + std::string_view{s, l})); + return false; + } + // You can't specify resources for someone ELSE'S account. + if (a->account.empty() || a->account == pp->tenant || + a->account == "*") { + if (a->account.empty() || a->account == "*") + a->account = pp->tenant; + (w->id == TokenID::Resource ? 
t->resource : t->notresource) + .emplace(std::move(*a)); + } else { + annotate(fmt::format("Policy owned by tenant `{}` cannot grant access to " + "resource owned by tenant `{}`.", + pp->tenant, a->account)); + return false; + } + } else if (w->kind == TokenKind::cond_key) { + auto& t = pp->policy.statements.back(); + /* A value starting with `$` must have the form `${...}`; it marks the condition as resolved at evaluation time. */ + if (l > 0 && *s == '$') { + if (l >= 2 && *(s+1) == '{') { + if (l > 0 && *(s+l-1) == '}') { + t.conditions.back().isruntime = true; + } else { + annotate(fmt::format("Invalid interpolation `{}`.", + std::string_view{s, l})); + return false; + } + } else { + annotate(fmt::format("Invalid interpolation `{}`.", + std::string_view{s, l})); + return false; + } + } + t.conditions.back().vals.emplace_back(s, l); + + // Principals + + } else if (w->kind == TokenKind::princ_type) { + if (pp->s.size() <= 1) { + annotate(fmt::format("Principal isn't allowed at top level.")); + return false; + } + /* The enclosing keyword (one below the top of the stack) decides whether this fills Principal or NotPrincipal. */ + auto& pri = pp->s[pp->s.size() - 2].w->id == TokenID::Principal ? + t->princ : t->noprinc; + + string errmsg; + if (auto o = parse_principal({s, l}, &errmsg)) { + pri.emplace(std::move(*o)); + } else if (pp->reject_invalid_principals) { + annotate(std::move(errmsg)); + return false; + } else { + ldout(cct, 0) << "Ignored principal `" << std::string_view{s, l} << "`: " + << errmsg << dendl; + } + } else { + // Failure + annotate(fmt::format("`{}` is not valid in the context of `{}`.", + std::string_view{s, l}, w->name)); + return false; + } + + /* Outside an array a single string completes this keyword, so its state is popped. */ + if (!arraying) { + pp->s.pop_back(); + } + + if (is_action && !is_validaction) { + annotate(fmt::format("`{}` is not a valid action.", + std::string_view{s, l})); + return false; + } + + return true; +} + +bool ParseState::number(const char* s, size_t l) { + // Top level! 
+ if (w->kind == TokenKind::cond_key) { + auto& t = pp->policy.statements.back(); + t.conditions.back().vals.emplace_back(s, l); + } else { + // Failure + annotate("Numbers are not allowed outside condition arguments."); + return false; + } + + if (!arraying) { + pp->s.pop_back(); + } + + return true; +} + +void ParseState::reset() { + pp->reset(pp->v); +} + +bool ParseState::obj_start() { + if (w->objectable && !objecting) { + objecting = true; + if (w->id == TokenID::Statement) { + pp->policy.statements.emplace_back(); + } + + return true; + } + + annotate(fmt::format("The {} keyword cannot introduce an object.", + w->name)); + + return false; +} + + +bool ParseState::array_end() { + if (arraying && !objecting) { + pp->s.pop_back(); + return true; + } + + annotate("Attempt to close unopened array."); + return false; +} + +ostream& operator <<(ostream& m, const MaskedIP& ip) { + // I have a theory about why std::bitset is the way it is. + if (ip.v6) { + for (int i = 7; i >= 0; --i) { + uint16_t hextet = 0; + for (int j = 15; j >= 0; --j) { + hextet |= (ip.addr[(i * 16) + j] << j); + } + m << hex << (unsigned int) hextet; + if (i != 0) { + m << ":"; + } + } + } else { + // It involves Satan. + for (int i = 3; i >= 0; --i) { + uint8_t b = 0; + for (int j = 7; j >= 0; --j) { + b |= (ip.addr[(i * 8) + j] << j); + } + m << (unsigned int) b; + if (i != 0) { + m << "."; + } + } + } + m << "/" << dec << ip.prefix; + // It would explain a lot + return m; +} + +bool Condition::eval(const Environment& env) const { + std::vector runtime_vals; + auto i = env.find(key); + if (op == TokenID::Null) { + return i == env.end() ? 
true : false; + } + + if (i == env.end()) { + if (op == TokenID::ForAllValuesStringEquals || + op == TokenID::ForAllValuesStringEqualsIgnoreCase || + op == TokenID::ForAllValuesStringLike) { + return true; + } else { + return ifexists; + } + } + + if (isruntime) { + string k = vals.back(); + k.erase(0,2); //erase $, { + k.erase(k.length() - 1, 1); //erase } + const auto& it = env.equal_range(k); + for (auto itr = it.first; itr != it.second; itr++) { + runtime_vals.emplace_back(itr->second); + } + } + const auto& s = i->second; + + const auto& itr = env.equal_range(key); + + switch (op) { + // String! + case TokenID::ForAnyValueStringEquals: + case TokenID::StringEquals: + return orrible(std::equal_to(), itr, isruntime? runtime_vals : vals); + + case TokenID::StringNotEquals: + return orrible(std::not_fn(std::equal_to()), + itr, isruntime? runtime_vals : vals); + + case TokenID::ForAnyValueStringEqualsIgnoreCase: + case TokenID::StringEqualsIgnoreCase: + return orrible(ci_equal_to(), itr, isruntime? runtime_vals : vals); + + case TokenID::StringNotEqualsIgnoreCase: + return orrible(std::not_fn(ci_equal_to()), itr, isruntime? runtime_vals : vals); + + case TokenID::ForAnyValueStringLike: + case TokenID::StringLike: + return orrible(string_like(), itr, isruntime? runtime_vals : vals); + + case TokenID::StringNotLike: + return orrible(std::not_fn(string_like()), itr, isruntime? runtime_vals : vals); + + case TokenID::ForAllValuesStringEquals: + return andible(std::equal_to(), itr, isruntime? runtime_vals : vals); + + case TokenID::ForAllValuesStringLike: + return andible(string_like(), itr, isruntime? runtime_vals : vals); + + case TokenID::ForAllValuesStringEqualsIgnoreCase: + return andible(ci_equal_to(), itr, isruntime? 
runtime_vals : vals); + + // Numeric + case TokenID::NumericEquals: + return shortible(std::equal_to(), as_number, s, vals); + + case TokenID::NumericNotEquals: + return shortible(std::not_fn(std::equal_to()), + as_number, s, vals); + + + case TokenID::NumericLessThan: + return shortible(std::less(), as_number, s, vals); + + + case TokenID::NumericLessThanEquals: + return shortible(std::less_equal(), as_number, s, vals); + + case TokenID::NumericGreaterThan: + return shortible(std::greater(), as_number, s, vals); + + case TokenID::NumericGreaterThanEquals: + return shortible(std::greater_equal(), as_number, s, vals); + + // Date! + case TokenID::DateEquals: + return shortible(std::equal_to(), as_date, s, vals); + + case TokenID::DateNotEquals: + return shortible(std::not_fn(std::equal_to()), + as_date, s, vals); + + case TokenID::DateLessThan: + return shortible(std::less(), as_date, s, vals); + + + case TokenID::DateLessThanEquals: + return shortible(std::less_equal(), as_date, s, vals); + + case TokenID::DateGreaterThan: + return shortible(std::greater(), as_date, s, vals); + + case TokenID::DateGreaterThanEquals: + return shortible(std::greater_equal(), as_date, s, + vals); + + // Bool! + case TokenID::Bool: + return shortible(std::equal_to(), as_bool, s, vals); + + // Binary! + case TokenID::BinaryEquals: + return shortible(std::equal_to(), as_binary, s, + vals); + + // IP Address! + case TokenID::IpAddress: + return shortible(std::equal_to(), as_network, s, vals); + + case TokenID::NotIpAddress: + { + auto xc = as_network(s); + if (!xc) { + return false; + } + + for (const string& d : vals) { + auto xd = as_network(d); + if (!xd) { + continue; + } + + if (xc == xd) { + return false; + } + } + return true; + } + +#if 0 + // Amazon Resource Names! (Does S3 need this?) 
+ TokenID::ArnEquals, TokenID::ArnNotEquals, TokenID::ArnLike, + TokenID::ArnNotLike, +#endif + + default: + return false; + } +} + +// Parse a textual IPv4 or IPv6 address with an optional "/prefix" suffix +// into a MaskedIP. Returns boost::none when the string is empty, when the +// prefix is malformed or out of range for the address family, or when +// inet_pton() rejects the address part. +boost::optional<MaskedIP> Condition::as_network(const string& s) { + MaskedIP m; + if (s.empty()) { + return boost::none; + } + + m.v6 = (s.find(':') == string::npos) ? false : true; + + auto slash = s.find('/'); + if (slash == string::npos) { + m.prefix = m.v6 ? 128 : 32; + } else { + // strtoul() skips leading whitespace, accepts a sign and yields 0 for + // an empty string, so "addr/" or "addr/-0" would silently become a /0 + // mask that matches every address. Insist on a leading digit instead. + const char* const digits = s.data() + slash + 1; + if (*digits < '0' || *digits > '9') { + return boost::none; + } + + // Range-check the full-width result before narrowing it into m.prefix; + // otherwise a value such as 4294967296 wraps to 0 on LP64 platforms. + char* end = nullptr; + const unsigned long parsed = strtoul(digits, &end, 10); + if (*end != 0 || parsed > (m.v6 ? 128UL : 32UL)) { + return boost::none; + } + m.prefix = static_cast<unsigned int>(parsed); + } + + string t; + auto p = &s; + + if (slash != string::npos) { + t.assign(s, 0, slash); + p = &t; + } + + if (m.v6) { + struct in6_addr a; + if (inet_pton(AF_INET6, p->c_str(), static_cast<void*>(&a)) != 1) { + return boost::none; + } + + m.addr |= Address(a.s6_addr[15]) << 0; + m.addr |= Address(a.s6_addr[14]) << 8; + m.addr |= Address(a.s6_addr[13]) << 16; + m.addr |= Address(a.s6_addr[12]) << 24; + m.addr |= Address(a.s6_addr[11]) << 32; + m.addr |= Address(a.s6_addr[10]) << 40; + m.addr |= Address(a.s6_addr[9]) << 48; + m.addr |= Address(a.s6_addr[8]) << 56; + m.addr |= Address(a.s6_addr[7]) << 64; + m.addr |= Address(a.s6_addr[6]) << 72; + m.addr |= Address(a.s6_addr[5]) << 80; + m.addr |= Address(a.s6_addr[4]) << 88; + m.addr |= Address(a.s6_addr[3]) << 96; + m.addr |= Address(a.s6_addr[2]) << 104; + m.addr |= Address(a.s6_addr[1]) << 112; + m.addr |= Address(a.s6_addr[0]) << 120; + } else { + struct in_addr a; + if (inet_pton(AF_INET, p->c_str(), static_cast<void*>(&a)) != 1) { + return boost::none; + } + + m.addr = ntohl(a.s_addr); + } + + return m; +} + +namespace { +// Name of a condition operator as printed by operator<<(ostream&, const +// Condition&); unknown operators map to "InvalidConditionOperator". +const char* condop_string(const TokenID t) { + switch (t) { + case TokenID::StringEquals: + return "StringEquals"; + + case TokenID::StringNotEquals: + return "StringNotEquals"; + + case TokenID::StringEqualsIgnoreCase: + return "StringEqualsIgnoreCase"; + + case TokenID::StringNotEqualsIgnoreCase: + return 
"StringNotEqualsIgnoreCase"; + + case TokenID::StringLike: + return "StringLike"; + + case TokenID::StringNotLike: + return "StringNotLike"; + + // Set-qualified string operators evaluated by Condition::eval; they + // previously fell through to "InvalidConditionOperator". + case TokenID::ForAllValuesStringEquals: + return "ForAllValues:StringEquals"; + + case TokenID::ForAnyValueStringEquals: + return "ForAnyValue:StringEquals"; + + case TokenID::ForAllValuesStringEqualsIgnoreCase: + return "ForAllValues:StringEqualsIgnoreCase"; + + case TokenID::ForAnyValueStringEqualsIgnoreCase: + return "ForAnyValue:StringEqualsIgnoreCase"; + + case TokenID::ForAllValuesStringLike: + return "ForAllValues:StringLike"; + + case TokenID::ForAnyValueStringLike: + return "ForAnyValue:StringLike"; + + // Numeric! + case TokenID::NumericEquals: + return "NumericEquals"; + + case TokenID::NumericNotEquals: + return "NumericNotEquals"; + + case TokenID::NumericLessThan: + return "NumericLessThan"; + + case TokenID::NumericLessThanEquals: + return "NumericLessThanEquals"; + + case TokenID::NumericGreaterThan: + return "NumericGreaterThan"; + + case TokenID::NumericGreaterThanEquals: + return "NumericGreaterThanEquals"; + + case TokenID::DateEquals: + return "DateEquals"; + + case TokenID::DateNotEquals: + return "DateNotEquals"; + + case TokenID::DateLessThan: + return "DateLessThan"; + + case TokenID::DateLessThanEquals: + return "DateLessThanEquals"; + + case TokenID::DateGreaterThan: + return "DateGreaterThan"; + + case TokenID::DateGreaterThanEquals: + return "DateGreaterThanEquals"; + + case TokenID::Bool: + return "Bool"; + + case TokenID::BinaryEquals: + return "BinaryEquals"; + + case TokenID::IpAddress: + return "IpAddress"; + + case TokenID::NotIpAddress: + return "NotIpAddress"; + + case TokenID::ArnEquals: + return "ArnEquals"; + + case TokenID::ArnNotEquals: + return "ArnNotEquals"; + + case TokenID::ArnLike: + return "ArnLike"; + + case TokenID::ArnNotLike: + return "ArnNotLike"; + + case TokenID::Null: + return "Null"; + + default: + return "InvalidConditionOperator"; + } +} + +// Stream [begin, end) as "[ a, b ]", or "[]" when the range is empty. +template <typename Iterator> +ostream& print_array(ostream& m, Iterator begin, Iterator end) { + if (begin == end) { + m << "[]"; + } else { + m << "[ "; + std::copy(begin, end, std::experimental::make_ostream_joiner(m, ", ")); + m << " ]"; + } + return m; +} + +// Stream [begin, end) as "{ a, b }". +template <typename Iterator> +ostream& print_dict(ostream& m, Iterator begin, Iterator end) { + m << "{ "; + std::copy(begin, end, std::experimental::make_ostream_joiner(m, ", ")); + m << " }"; + return m; +} + +} + +ostream& operator <<(ostream& m, const Condition& c) { + m << condop_string(c.op); + if 
(c.ifexists) { + m << "IfExists"; + } + m << ": { " << c.key; + print_array(m, c.vals.cbegin(), c.vals.cend()); + return m << " }"; +} + +Effect Statement::eval(const Environment& e, + boost::optional ida, + uint64_t act, boost::optional res, boost::optional princ_type) const { + + if (eval_principal(e, ida, princ_type) == Effect::Deny) { + return Effect::Pass; + } + + if (res && resource.empty() && notresource.empty()) { + return Effect::Pass; + } + if (!res && (!resource.empty() || !notresource.empty())) { + return Effect::Pass; + } + if (!resource.empty() && res) { + if (!std::any_of(resource.begin(), resource.end(), + [&res](const ARN& pattern) { + return pattern.match(*res); + })) { + return Effect::Pass; + } + } else if (!notresource.empty() && res) { + if (std::any_of(notresource.begin(), notresource.end(), + [&res](const ARN& pattern) { + return pattern.match(*res); + })) { + return Effect::Pass; + } + } + + if (!(action[act] == 1) || (notaction[act] == 1)) { + return Effect::Pass; + } + + if (std::all_of(conditions.begin(), + conditions.end(), + [&e](const Condition& c) { return c.eval(e);})) { + return effect; + } + + return Effect::Pass; +} + +Effect Statement::eval_principal(const Environment& e, + boost::optional ida, boost::optional princ_type) const { + if (princ_type) { + *princ_type = PolicyPrincipal::Other; + } + if (ida) { + if (princ.empty() && noprinc.empty()) { + return Effect::Deny; + } + if (ida->get_identity_type() != TYPE_ROLE && !princ.empty() && !ida->is_identity(princ)) { + return Effect::Deny; + } + if (ida->get_identity_type() == TYPE_ROLE && !princ.empty()) { + bool princ_matched = false; + for (auto p : princ) { // Check each principal to determine the type of the one that has matched + boost::container::flat_set id; + id.insert(p); + if (ida->is_identity(id)) { + if (p.is_assumed_role() || p.is_user()) { + if (princ_type) *princ_type = PolicyPrincipal::Session; + } else { + if (princ_type) *princ_type = PolicyPrincipal::Role; + } + 
princ_matched = true; + } + } + if (!princ_matched) { + return Effect::Deny; + } + } else if (!noprinc.empty() && ida->is_identity(noprinc)) { + return Effect::Deny; + } + } + return Effect::Allow; +} + +Effect Statement::eval_conditions(const Environment& e) const { + if (std::all_of(conditions.begin(), + conditions.end(), + [&e](const Condition& c) { return c.eval(e);})) { + return Effect::Allow; + } + return Effect::Deny; +} + +namespace { +const char* action_bit_string(uint64_t action) { + switch (action) { + case s3GetObject: + return "s3:GetObject"; + + case s3GetObjectVersion: + return "s3:GetObjectVersion"; + + case s3PutObject: + return "s3:PutObject"; + + case s3GetObjectAcl: + return "s3:GetObjectAcl"; + + case s3GetObjectVersionAcl: + return "s3:GetObjectVersionAcl"; + + case s3PutObjectAcl: + return "s3:PutObjectAcl"; + + case s3PutObjectVersionAcl: + return "s3:PutObjectVersionAcl"; + + case s3DeleteObject: + return "s3:DeleteObject"; + + case s3DeleteObjectVersion: + return "s3:DeleteObjectVersion"; + + case s3ListMultipartUploadParts: + return "s3:ListMultipartUploadParts"; + + case s3AbortMultipartUpload: + return "s3:AbortMultipartUpload"; + + case s3GetObjectTorrent: + return "s3:GetObjectTorrent"; + + case s3GetObjectVersionTorrent: + return "s3:GetObjectVersionTorrent"; + + case s3RestoreObject: + return "s3:RestoreObject"; + + case s3CreateBucket: + return "s3:CreateBucket"; + + case s3DeleteBucket: + return "s3:DeleteBucket"; + + case s3ListBucket: + return "s3:ListBucket"; + + case s3ListBucketVersions: + return "s3:ListBucketVersions"; + case s3ListAllMyBuckets: + return "s3:ListAllMyBuckets"; + + case s3ListBucketMultipartUploads: + return "s3:ListBucketMultipartUploads"; + + case s3GetAccelerateConfiguration: + return "s3:GetAccelerateConfiguration"; + + case s3PutAccelerateConfiguration: + return "s3:PutAccelerateConfiguration"; + + case s3GetBucketAcl: + return "s3:GetBucketAcl"; + + case s3PutBucketAcl: + return "s3:PutBucketAcl"; + + 
case s3GetBucketCORS: + return "s3:GetBucketCORS"; + + case s3PutBucketCORS: + return "s3:PutBucketCORS"; + + case s3GetBucketEncryption: + return "s3:GetBucketEncryption"; + + case s3PutBucketEncryption: + return "s3:PutBucketEncryption"; + + case s3GetBucketVersioning: + return "s3:GetBucketVersioning"; + + case s3PutBucketVersioning: + return "s3:PutBucketVersioning"; + + case s3GetBucketRequestPayment: + return "s3:GetBucketRequestPayment"; + + case s3PutBucketRequestPayment: + return "s3:PutBucketRequestPayment"; + + case s3GetBucketLocation: + return "s3:GetBucketLocation"; + + case s3GetBucketPolicy: + return "s3:GetBucketPolicy"; + + case s3DeleteBucketPolicy: + return "s3:DeleteBucketPolicy"; + + case s3PutBucketPolicy: + return "s3:PutBucketPolicy"; + + // The policy-status and public-access-block actions are declared in + // rgw_iam_policy.h; without these cases they are printed as "s3Invalid". + case s3GetBucketPolicyStatus: + return "s3:GetBucketPolicyStatus"; + + case s3PutPublicAccessBlock: + return "s3:PutPublicAccessBlock"; + + case s3GetPublicAccessBlock: + return "s3:GetPublicAccessBlock"; + + case s3DeletePublicAccessBlock: + return "s3:DeletePublicAccessBlock"; + + case s3GetBucketPublicAccessBlock: + return "s3:GetBucketPublicAccessBlock"; + + case s3PutBucketPublicAccessBlock: + return "s3:PutBucketPublicAccessBlock"; + + case s3DeleteBucketPublicAccessBlock: + return "s3:DeleteBucketPublicAccessBlock"; + + case s3GetBucketNotification: + return "s3:GetBucketNotification"; + + case s3PutBucketNotification: + return "s3:PutBucketNotification"; + + case s3GetBucketLogging: + return "s3:GetBucketLogging"; + + case s3PutBucketLogging: + return "s3:PutBucketLogging"; + + case s3GetBucketTagging: + return "s3:GetBucketTagging"; + + case s3PutBucketTagging: + return "s3:PutBucketTagging"; + + case s3GetBucketWebsite: + return "s3:GetBucketWebsite"; + + case s3PutBucketWebsite: + return "s3:PutBucketWebsite"; + + case s3DeleteBucketWebsite: + return "s3:DeleteBucketWebsite"; + + case s3GetLifecycleConfiguration: + return "s3:GetLifecycleConfiguration"; + + case s3PutLifecycleConfiguration: + return "s3:PutLifecycleConfiguration"; + + case s3PutReplicationConfiguration: + return "s3:PutReplicationConfiguration"; + + case s3GetReplicationConfiguration: + return "s3:GetReplicationConfiguration"; + + case s3DeleteReplicationConfiguration: + return "s3:DeleteReplicationConfiguration"; + + case s3PutObjectTagging: + return "s3:PutObjectTagging"; + + case s3PutObjectVersionTagging: + return "s3:PutObjectVersionTagging"; + + case s3GetObjectTagging: + return "s3:GetObjectTagging"; + + case s3GetObjectVersionTagging: + return 
"s3:GetObjectVersionTagging"; + + case s3DeleteObjectTagging: + return "s3:DeleteObjectTagging"; + + case s3DeleteObjectVersionTagging: + return "s3:DeleteObjectVersionTagging"; + + case s3PutBucketObjectLockConfiguration: + return "s3:PutBucketObjectLockConfiguration"; + + case s3GetBucketObjectLockConfiguration: + return "s3:GetBucketObjectLockConfiguration"; + + case s3PutObjectRetention: + return "s3:PutObjectRetention"; + + case s3GetObjectRetention: + return "s3:GetObjectRetention"; + + case s3PutObjectLegalHold: + return "s3:PutObjectLegalHold"; + + case s3GetObjectLegalHold: + return "s3:GetObjectLegalHold"; + + case s3BypassGovernanceRetention: + return "s3:BypassGovernanceRetention"; + + case iamPutUserPolicy: + return "iam:PutUserPolicy"; + + case iamGetUserPolicy: + return "iam:GetUserPolicy"; + + case iamListUserPolicies: + return "iam:ListUserPolicies"; + + case iamDeleteUserPolicy: + return "iam:DeleteUserPolicy"; + + case iamCreateRole: + return "iam:CreateRole"; + + case iamDeleteRole: + return "iam:DeleteRole"; + + case iamGetRole: + return "iam:GetRole"; + + case iamModifyRoleTrustPolicy: + return "iam:ModifyRoleTrustPolicy"; + + case iamListRoles: + return "iam:ListRoles"; + + case iamPutRolePolicy: + return "iam:PutRolePolicy"; + + case iamGetRolePolicy: + return "iam:GetRolePolicy"; + + case iamListRolePolicies: + return "iam:ListRolePolicies"; + + case iamDeleteRolePolicy: + return "iam:DeleteRolePolicy"; + + case iamCreateOIDCProvider: + return "iam:CreateOIDCProvider"; + + case iamDeleteOIDCProvider: + return "iam:DeleteOIDCProvider"; + + case iamGetOIDCProvider: + return "iam:GetOIDCProvider"; + + case iamListOIDCProviders: + return "iam:ListOIDCProviders"; + + case iamTagRole: + return "iam:TagRole"; + + case iamListRoleTags: + return "iam:ListRoleTags"; + + case iamUntagRole: + return "iam:UntagRole"; + + case iamUpdateRole: + return "iam:UpdateRole"; + + case stsAssumeRole: + return "sts:AssumeRole"; + + case 
stsAssumeRoleWithWebIdentity: + return "sts:AssumeRoleWithWebIdentity"; + + case stsGetSessionToken: + return "sts:GetSessionToken"; + + case stsTagSession: + return "sts:TagSession"; + } + return "s3Invalid"; +} + +ostream& print_actions(ostream& m, const Action_t a) { + bool begun = false; + m << "[ "; + for (auto i = 0U; i < allCount; ++i) { + if (a[i] == 1) { + if (begun) { + m << ", "; + } else { + begun = true; + } + m << action_bit_string(i); + } + } + if (begun) { + m << " ]"; + } else { + m << "]"; + } + return m; +} +} + +ostream& operator <<(ostream& m, const Statement& s) { + m << "{ "; + if (s.sid) { + m << "Sid: " << *s.sid << ", "; + } + if (!s.princ.empty()) { + m << "Principal: "; + print_dict(m, s.princ.cbegin(), s.princ.cend()); + m << ", "; + } + if (!s.noprinc.empty()) { + m << "NotPrincipal: "; + print_dict(m, s.noprinc.cbegin(), s.noprinc.cend()); + m << ", "; + } + + m << "Effect: " << + (s.effect == Effect::Allow ? + (const char*) "Allow" : + (const char*) "Deny"); + + if (s.action.any() || s.notaction.any() || !s.resource.empty() || + !s.notresource.empty() || !s.conditions.empty()) { + m << ", "; + } + + if (s.action.any()) { + m << "Action: "; + print_actions(m, s.action); + + if (s.notaction.any() || !s.resource.empty() || + !s.notresource.empty() || !s.conditions.empty()) { + m << ", "; + } + } + + if (s.notaction.any()) { + m << "NotAction: "; + print_actions(m, s.notaction); + + if (!s.resource.empty() || !s.notresource.empty() || + !s.conditions.empty()) { + m << ", "; + } + } + + if (!s.resource.empty()) { + m << "Resource: "; + print_array(m, s.resource.cbegin(), s.resource.cend()); + + if (!s.notresource.empty() || !s.conditions.empty()) { + m << ", "; + } + } + + if (!s.notresource.empty()) { + m << "NotResource: "; + print_array(m, s.notresource.cbegin(), s.notresource.cend()); + + if (!s.conditions.empty()) { + m << ", "; + } + } + + if (!s.conditions.empty()) { + m << "Condition: "; + print_dict(m, s.conditions.cbegin(), 
s.conditions.cend()); + } + + return m << " }"; +} + +Policy::Policy(CephContext* cct, const string& tenant, + const bufferlist& _text, + bool reject_invalid_principals) + : text(_text.to_str()) { + StringStream ss(text.data()); + PolicyParser pp(cct, tenant, *this, reject_invalid_principals); + auto pr = Reader{}.Parse(ss, pp); + if (!pr) { + throw PolicyParseException(pr, pp.annotation); + } +} + +Effect Policy::eval(const Environment& e, + boost::optional ida, + std::uint64_t action, boost::optional resource, + boost::optional princ_type) const { + auto allowed = false; + for (auto& s : statements) { + auto g = s.eval(e, ida, action, resource, princ_type); + if (g == Effect::Deny) { + return g; + } else if (g == Effect::Allow) { + allowed = true; + } + } + return allowed ? Effect::Allow : Effect::Pass; +} + +Effect Policy::eval_principal(const Environment& e, + boost::optional ida, boost::optional princ_type) const { + auto allowed = false; + for (auto& s : statements) { + auto g = s.eval_principal(e, ida, princ_type); + if (g == Effect::Deny) { + return g; + } else if (g == Effect::Allow) { + allowed = true; + } + } + return allowed ? Effect::Allow : Effect::Deny; +} + +Effect Policy::eval_conditions(const Environment& e) const { + auto allowed = false; + for (auto& s : statements) { + auto g = s.eval_conditions(e); + if (g == Effect::Deny) { + return g; + } else if (g == Effect::Allow) { + allowed = true; + } + } + return allowed ? Effect::Allow : Effect::Deny; +} + +ostream& operator <<(ostream& m, const Policy& p) { + m << "{ Version: " + << (p.version == Version::v2008_10_17 ? 
"2008-10-17" : "2012-10-17"); + + if (p.id || !p.statements.empty()) { + m << ", "; + } + + if (p.id) { + m << "Id: " << *p.id; + if (!p.statements.empty()) { + m << ", "; + } + } + + if (!p.statements.empty()) { + m << "Statements: "; + print_array(m, p.statements.cbegin(), p.statements.cend()); + m << ", "; + } + return m << " }"; +} + +static const Environment iam_all_env = { + {"aws:SourceIp","1.1.1.1"}, + {"aws:UserId","anonymous"}, + {"s3:x-amz-server-side-encryption-aws-kms-key-id","secret"} +}; + +struct IsPublicStatement +{ + bool operator() (const Statement &s) const { + if (s.effect == Effect::Allow) { + for (const auto& p : s.princ) { + if (p.is_wildcard()) { + return s.eval_conditions(iam_all_env) == Effect::Allow; + } + } + // no princ should not contain fixed values + return std::none_of(s.noprinc.begin(), s.noprinc.end(), [](const rgw::auth::Principal& p) { + return p.is_wildcard(); + }); + } + return false; + } +}; + + +bool is_public(const Policy& p) +{ + return std::any_of(p.statements.begin(), p.statements.end(), IsPublicStatement()); +} + +} // namespace IAM +} // namespace rgw diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h new file mode 100644 index 000000000..c0a7e51b5 --- /dev/null +++ b/src/rgw/rgw_iam_policy.h @@ -0,0 +1,579 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/ceph_time.h" +#include "common/iso_8601.h" + +#include "rapidjson/error/error.h" +#include "rapidjson/error/en.h" + +#include "rgw_acl.h" +#include "rgw_basic_types.h" +#include "rgw_iam_policy_keywords.h" +#include "rgw_string.h" +#include "rgw_arn.h" + +namespace rgw { +namespace auth { +class Identity; +} +} + +namespace rgw { +namespace IAM { + +static constexpr std::uint64_t s3GetObject = 0; +static constexpr 
std::uint64_t s3GetObjectVersion = 1; +static constexpr std::uint64_t s3PutObject = 2; +static constexpr std::uint64_t s3GetObjectAcl = 3; +static constexpr std::uint64_t s3GetObjectVersionAcl = 4; +static constexpr std::uint64_t s3PutObjectAcl = 5; +static constexpr std::uint64_t s3PutObjectVersionAcl = 6; +static constexpr std::uint64_t s3DeleteObject = 7; +static constexpr std::uint64_t s3DeleteObjectVersion = 8; +static constexpr std::uint64_t s3ListMultipartUploadParts = 9; +static constexpr std::uint64_t s3AbortMultipartUpload = 10; +static constexpr std::uint64_t s3GetObjectTorrent = 11; +static constexpr std::uint64_t s3GetObjectVersionTorrent = 12; +static constexpr std::uint64_t s3RestoreObject = 13; +static constexpr std::uint64_t s3CreateBucket = 14; +static constexpr std::uint64_t s3DeleteBucket = 15; +static constexpr std::uint64_t s3ListBucket = 16; +static constexpr std::uint64_t s3ListBucketVersions = 17; +static constexpr std::uint64_t s3ListAllMyBuckets = 18; +static constexpr std::uint64_t s3ListBucketMultipartUploads = 19; +static constexpr std::uint64_t s3GetAccelerateConfiguration = 20; +static constexpr std::uint64_t s3PutAccelerateConfiguration = 21; +static constexpr std::uint64_t s3GetBucketAcl = 22; +static constexpr std::uint64_t s3PutBucketAcl = 23; +static constexpr std::uint64_t s3GetBucketCORS = 24; +static constexpr std::uint64_t s3PutBucketCORS = 25; +static constexpr std::uint64_t s3GetBucketVersioning = 26; +static constexpr std::uint64_t s3PutBucketVersioning = 27; +static constexpr std::uint64_t s3GetBucketRequestPayment = 28; +static constexpr std::uint64_t s3PutBucketRequestPayment = 29; +static constexpr std::uint64_t s3GetBucketLocation = 30; +static constexpr std::uint64_t s3GetBucketPolicy = 31; +static constexpr std::uint64_t s3DeleteBucketPolicy = 32; +static constexpr std::uint64_t s3PutBucketPolicy = 33; +static constexpr std::uint64_t s3GetBucketNotification = 34; +static constexpr std::uint64_t 
s3PutBucketNotification = 35; +static constexpr std::uint64_t s3GetBucketLogging = 36; +static constexpr std::uint64_t s3PutBucketLogging = 37; +static constexpr std::uint64_t s3GetBucketTagging = 38; +static constexpr std::uint64_t s3PutBucketTagging = 39; +static constexpr std::uint64_t s3GetBucketWebsite = 40; +static constexpr std::uint64_t s3PutBucketWebsite = 41; +static constexpr std::uint64_t s3DeleteBucketWebsite = 42; +static constexpr std::uint64_t s3GetLifecycleConfiguration = 43; +static constexpr std::uint64_t s3PutLifecycleConfiguration = 44; +static constexpr std::uint64_t s3PutReplicationConfiguration = 45; +static constexpr std::uint64_t s3GetReplicationConfiguration = 46; +static constexpr std::uint64_t s3DeleteReplicationConfiguration = 47; +static constexpr std::uint64_t s3GetObjectTagging = 48; +static constexpr std::uint64_t s3PutObjectTagging = 49; +static constexpr std::uint64_t s3DeleteObjectTagging = 50; +static constexpr std::uint64_t s3GetObjectVersionTagging = 51; +static constexpr std::uint64_t s3PutObjectVersionTagging = 52; +static constexpr std::uint64_t s3DeleteObjectVersionTagging = 53; +static constexpr std::uint64_t s3PutBucketObjectLockConfiguration = 54; +static constexpr std::uint64_t s3GetBucketObjectLockConfiguration = 55; +static constexpr std::uint64_t s3PutObjectRetention = 56; +static constexpr std::uint64_t s3GetObjectRetention = 57; +static constexpr std::uint64_t s3PutObjectLegalHold = 58; +static constexpr std::uint64_t s3GetObjectLegalHold = 59; +static constexpr std::uint64_t s3BypassGovernanceRetention = 60; +static constexpr std::uint64_t s3GetBucketPolicyStatus = 61; +static constexpr std::uint64_t s3PutPublicAccessBlock = 62; +static constexpr std::uint64_t s3GetPublicAccessBlock = 63; +static constexpr std::uint64_t s3DeletePublicAccessBlock = 64; +static constexpr std::uint64_t s3GetBucketPublicAccessBlock = 65; +static constexpr std::uint64_t s3PutBucketPublicAccessBlock = 66; +static constexpr 
std::uint64_t s3DeleteBucketPublicAccessBlock = 67; +static constexpr std::uint64_t s3GetBucketEncryption = 68; +static constexpr std::uint64_t s3PutBucketEncryption = 69; +static constexpr std::uint64_t s3All = 70; + +static constexpr std::uint64_t iamPutUserPolicy = s3All + 1; +static constexpr std::uint64_t iamGetUserPolicy = s3All + 2; +static constexpr std::uint64_t iamDeleteUserPolicy = s3All + 3; +static constexpr std::uint64_t iamListUserPolicies = s3All + 4; +static constexpr std::uint64_t iamCreateRole = s3All + 5; +static constexpr std::uint64_t iamDeleteRole = s3All + 6; +static constexpr std::uint64_t iamModifyRoleTrustPolicy = s3All + 7; +static constexpr std::uint64_t iamGetRole = s3All + 8; +static constexpr std::uint64_t iamListRoles = s3All + 9; +static constexpr std::uint64_t iamPutRolePolicy = s3All + 10; +static constexpr std::uint64_t iamGetRolePolicy = s3All + 11; +static constexpr std::uint64_t iamListRolePolicies = s3All + 12; +static constexpr std::uint64_t iamDeleteRolePolicy = s3All + 13; +static constexpr std::uint64_t iamCreateOIDCProvider = s3All + 14; +static constexpr std::uint64_t iamDeleteOIDCProvider = s3All + 15; +static constexpr std::uint64_t iamGetOIDCProvider = s3All + 16; +static constexpr std::uint64_t iamListOIDCProviders = s3All + 17; +static constexpr std::uint64_t iamTagRole = s3All + 18; +static constexpr std::uint64_t iamListRoleTags = s3All + 19; +static constexpr std::uint64_t iamUntagRole = s3All + 20; +static constexpr std::uint64_t iamUpdateRole = s3All + 21; +static constexpr std::uint64_t iamAll = s3All + 22; + +static constexpr std::uint64_t stsAssumeRole = iamAll + 1; +static constexpr std::uint64_t stsAssumeRoleWithWebIdentity = iamAll + 2; +static constexpr std::uint64_t stsGetSessionToken = iamAll + 3; +static constexpr std::uint64_t stsTagSession = iamAll + 4; +static constexpr std::uint64_t stsAll = iamAll + 5; + +static constexpr std::uint64_t s3Count = s3All; +static constexpr std::uint64_t allCount = 
stsAll + 1; + +using Action_t = std::bitset; +using NotAction_t = Action_t; + +template +constexpr std::bitset make_bitmask(size_t s) { + // unfortunately none of the shift/logic operators of std::bitset have a constexpr variation + return s < 64 ? std::bitset ((1ULL << s) - 1) : + std::bitset((1ULL << 63) - 1) | make_bitmask (s - 63) << 63; +} + +template +constexpr std::bitset set_cont_bits(size_t start, size_t end) +{ + return (make_bitmask(end - start)) << start; +} + +static const Action_t None(0); +static const Action_t s3AllValue = set_cont_bits(0,s3All); +static const Action_t iamAllValue = set_cont_bits(s3All+1,iamAll); +static const Action_t stsAllValue = set_cont_bits(iamAll+1,stsAll); +static const Action_t allValue = set_cont_bits(0,allCount); + +namespace { +// Please update the table in doc/radosgw/s3/authentication.rst if you +// modify this function. +inline int op_to_perm(std::uint64_t op) { + switch (op) { + case s3GetObject: + case s3GetObjectTorrent: + case s3GetObjectVersion: + case s3GetObjectVersionTorrent: + case s3GetObjectTagging: + case s3GetObjectVersionTagging: + case s3GetObjectRetention: + case s3GetObjectLegalHold: + case s3ListAllMyBuckets: + case s3ListBucket: + case s3ListBucketMultipartUploads: + case s3ListBucketVersions: + case s3ListMultipartUploadParts: + return RGW_PERM_READ; + + case s3AbortMultipartUpload: + case s3CreateBucket: + case s3DeleteBucket: + case s3DeleteObject: + case s3DeleteObjectVersion: + case s3PutObject: + case s3PutObjectTagging: + case s3PutObjectVersionTagging: + case s3DeleteObjectTagging: + case s3DeleteObjectVersionTagging: + case s3RestoreObject: + case s3PutObjectRetention: + case s3PutObjectLegalHold: + case s3BypassGovernanceRetention: + return RGW_PERM_WRITE; + + case s3GetAccelerateConfiguration: + case s3GetBucketAcl: + case s3GetBucketCORS: + case s3GetBucketEncryption: + case s3GetBucketLocation: + case s3GetBucketLogging: + case s3GetBucketNotification: + case s3GetBucketPolicy: + case 
s3GetBucketPolicyStatus: + case s3GetBucketRequestPayment: + case s3GetBucketTagging: + case s3GetBucketVersioning: + case s3GetBucketWebsite: + case s3GetLifecycleConfiguration: + case s3GetObjectAcl: + case s3GetObjectVersionAcl: + case s3GetReplicationConfiguration: + case s3GetBucketObjectLockConfiguration: + case s3GetBucketPublicAccessBlock: + return RGW_PERM_READ_ACP; + + case s3DeleteBucketPolicy: + case s3DeleteBucketWebsite: + case s3DeleteReplicationConfiguration: + case s3PutAccelerateConfiguration: + case s3PutBucketAcl: + case s3PutBucketCORS: + case s3PutBucketEncryption: + case s3PutBucketLogging: + case s3PutBucketNotification: + case s3PutBucketPolicy: + case s3PutBucketRequestPayment: + case s3PutBucketTagging: + case s3PutBucketVersioning: + case s3PutBucketWebsite: + case s3PutLifecycleConfiguration: + case s3PutObjectAcl: + case s3PutObjectVersionAcl: + case s3PutReplicationConfiguration: + case s3PutBucketObjectLockConfiguration: + case s3PutBucketPublicAccessBlock: + return RGW_PERM_WRITE_ACP; + + case s3All: + return RGW_PERM_FULL_CONTROL; + } + return RGW_PERM_INVALID; +} +} + +enum class PolicyPrincipal { + Role, + Session, + Other +}; + +using Environment = std::unordered_multimap; + +using Address = std::bitset<128>; +struct MaskedIP { + bool v6; + Address addr; + // Since we're mapping IPv6 to IPv4 addresses, we may want to + // consider making the prefix always be in terms of a v6 address + // and just use the v6 bit to rewrite it as a v4 prefix for + // output. + unsigned int prefix; +}; + +std::ostream& operator <<(std::ostream& m, const MaskedIP& ip); + +inline bool operator ==(const MaskedIP& l, const MaskedIP& r) { + auto shift = std::max((l.v6 ? 128 : 32) - ((int) l.prefix), + (r.v6 ? 
128 : 32) - ((int) r.prefix)); + ceph_assert(shift >= 0); + return (l.addr >> shift) == (r.addr >> shift); +} + +struct Condition { + TokenID op; + // Originally I was going to use a perfect hash table, but Marcus + // says keys are to be added at run-time not compile time. + + // In future development, use symbol internment. + std::string key; + bool ifexists = false; + bool isruntime = false; //Is evaluated during run-time + // Much to my annoyance there is no actual way to do this in a + // typed way that is compatible with AWS. I know this because I've + // seen examples where the same value is used as a string in one + // context and a date in another. + std::vector vals; + + Condition() = default; + Condition(TokenID op, const char* s, std::size_t len, bool ifexists) + : op(op), key(s, len), ifexists(ifexists) {} + + bool eval(const Environment& e) const; + + static boost::optional as_number(const std::string& s) { + std::size_t p = 0; + + try { + double d = std::stod(s, &p); + if (p < s.length()) { + return boost::none; + } + + return d; + } catch (const std::logic_error& e) { + return boost::none; + } + } + + static boost::optional as_date(const std::string& s) { + std::size_t p = 0; + + try { + double d = std::stod(s, &p); + if (p == s.length()) { + return ceph::real_time( + std::chrono::seconds(static_cast(d)) + + std::chrono::nanoseconds( + static_cast((d - static_cast(d)) + * 1000000000))); + } + + return from_iso_8601(std::string_view(s), false); + } catch (const std::logic_error& e) { + return boost::none; + } + } + + static boost::optional as_bool(const std::string& s) { + std::size_t p = 0; + + if (s.empty() || boost::iequals(s, "false")) { + return false; + } + + try { + double d = std::stod(s, &p); + if (p == s.length()) { + return !((d == +0.0) || (d == -0.0) || std::isnan(d)); + } + } catch (const std::logic_error& e) { + // Fallthrough + } + + return true; + } + + static boost::optional as_binary(const std::string& s) { + // In a just world + 
ceph::bufferlist base64; + // I could populate a bufferlist + base64.push_back(buffer::create_static( + s.length(), + const_cast(s.data()))); // Yuck + // From a base64 encoded std::string. + ceph::bufferlist bin; + + try { + bin.decode_base64(base64); + } catch (const ceph::buffer::malformed_input& e) { + return boost::none; + } + return bin; + } + + static boost::optional as_network(const std::string& s); + + + struct ci_equal_to { + bool operator ()(const std::string& s1, + const std::string& s2) const { + return boost::iequals(s1, s2); + } + }; + + struct string_like { + bool operator ()(const std::string& input, + const std::string& pattern) const { + return match_wildcards(pattern, input, 0); + } + }; + + struct ci_starts_with { + bool operator()(const std::string& s1, + const std::string& s2) const { + return boost::istarts_with(s1, s2); + } + }; + + using unordered_multimap_it_pair = std::pair ::const_iterator, std::unordered_multimap::const_iterator>; + + template + static bool andible(F&& f, const unordered_multimap_it_pair& it, + const std::vector& v) { + for (auto itr = it.first; itr != it.second; itr++) { + bool matched = false; + for (const auto& d : v) { + if (std::forward(f)(itr->second, d)) { + matched = true; + } + } + if (!matched) + return false; + } + return true; + } + + template + static bool orrible(F&& f, const unordered_multimap_it_pair& it, + const std::vector& v) { + for (auto itr = it.first; itr != it.second; itr++) { + for (const auto& d : v) { + if (std::forward(f)(itr->second, d)) { + return true; + } + } + } + return false; + } + + template + static bool shortible(F&& f, X& x, const std::string& c, + const std::vector& v) { + auto xc = std::forward(x)(c); + if (!xc) { + return false; + } + + for (const auto& d : v) { + auto xd = std::forward(x)(d); + if (!xd) { + continue; + } + + if (std::forward(f)(*xc, *xd)) { + return true; + } + } + return false; + } + + template + bool has_key_p(const std::string& _key, F p) const { + return 
p(key, _key); + } + + template + bool has_val_p(const std::string& _val, F p) const { + for (auto val : vals) { + if (p(val, _val)) + return true; + } + return false; + } +}; + +std::ostream& operator <<(std::ostream& m, const Condition& c); + +struct Statement { + boost::optional sid = boost::none; + + boost::container::flat_set princ; + boost::container::flat_set noprinc; + + // Every statement MUST provide an effect. I just initialize it to + // deny as defensive programming. + Effect effect = Effect::Deny; + + Action_t action = 0; + NotAction_t notaction = 0; + + boost::container::flat_set resource; + boost::container::flat_set notresource; + + std::vector conditions; + + Effect eval(const Environment& e, + boost::optional ida, + std::uint64_t action, boost::optional resource, boost::optional princ_type=boost::none) const; + + Effect eval_principal(const Environment& e, + boost::optional ida, boost::optional princ_type=boost::none) const; + + Effect eval_conditions(const Environment& e) const; +}; + +std::ostream& operator <<(std::ostream& m, const Statement& s); + +struct PolicyParseException : public std::exception { + rapidjson::ParseResult pr; + std::string msg; + + explicit PolicyParseException(const rapidjson::ParseResult pr, + const std::string& annotation) + : pr(pr), + msg(fmt::format("At character offset {}, {}", + pr.Offset(), + (pr.Code() == rapidjson::kParseErrorTermination ? + annotation : + rapidjson::GetParseError_En(pr.Code())))) {} + + const char* what() const noexcept override { + return msg.c_str(); + } +}; + +struct Policy { + std::string text; + Version version = Version::v2008_10_17; + boost::optional id = boost::none; + + std::vector statements; + + // reject_invalid_principals should be set to + // `cct->_conf.get_val("rgw_policy_reject_invalid_principals")` + // when executing operations that *set* a bucket policy, but should + // be false when reading a stored bucket policy so as not to break + // backwards configuration. 
+ Policy(CephContext* cct, const std::string& tenant, + const bufferlist& text, + bool reject_invalid_principals); + + Effect eval(const Environment& e, + boost::optional ida, + std::uint64_t action, boost::optional resource, boost::optional princ_type=boost::none) const; + + Effect eval_principal(const Environment& e, + boost::optional ida, boost::optional princ_type=boost::none) const; + + Effect eval_conditions(const Environment& e) const; + + template + bool has_conditional(const std::string& conditional, F p) const { + for (const auto&s: statements){ + if (std::any_of(s.conditions.begin(), s.conditions.end(), + [&](const Condition& c) { return c.has_key_p(conditional, p);})) + return true; + } + return false; + } + + template + bool has_conditional_value(const std::string& conditional, F p) const { + for (const auto&s: statements){ + if (std::any_of(s.conditions.begin(), s.conditions.end(), + [&](const Condition& c) { return c.has_val_p(conditional, p);})) + return true; + } + return false; + } + + bool has_conditional(const std::string& c) const { + return has_conditional(c, Condition::ci_equal_to()); + } + + bool has_partial_conditional(const std::string& c) const { + return has_conditional(c, Condition::ci_starts_with()); + } + + // Example: ${s3:ResourceTag} + bool has_partial_conditional_value(const std::string& c) const { + return has_conditional_value(c, Condition::ci_starts_with()); + } +}; + +std::ostream& operator <<(std::ostream& m, const Policy& p); +bool is_public(const Policy& p); + +} +} diff --git a/src/rgw/rgw_iam_policy_keywords.gperf b/src/rgw/rgw_iam_policy_keywords.gperf new file mode 100644 index 000000000..af73dd130 --- /dev/null +++ b/src/rgw/rgw_iam_policy_keywords.gperf @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +%language=C++ +%compare-strncmp +%define class-name keyword_hash +%define lookup-function-name lookup +%struct-type +struct Keyword { + const char* 
name; + TokenKind kind; + TokenID id; + uint64_t specific; + bool arrayable; + bool objectable; +}; +%% +# Top-level +# +Version, TokenKind::top, TokenID::Version, 0, false, false +Id, TokenKind::top, TokenID::Id, 0, false, false +Statement, TokenKind::top, TokenID::Statement, 0, true, true +# +# Statement level +# +Sid, TokenKind::statement, TokenID::Sid, 0, false, false +Effect, TokenKind::statement, TokenID::Effect, 0, false, false +Principal, TokenKind::statement, TokenID::Principal, 0, false, true +NotPrincipal, TokenKind::statement, TokenID::NotPrincipal, 0, true, true +Action, TokenKind::statement, TokenID::Action, 0, true, false +NotAction, TokenKind::statement, TokenID::NotAction, 0, true, false +Resource, TokenKind::statement, TokenID::Resource, 0, true, false +NotResource, TokenKind::statement, TokenID::NotResource, 0, true, false +Condition, TokenKind::statement, TokenID::Condition, 0, true, true +# +# Condition operators +# +# String +StringEquals, TokenKind::cond_op, TokenID::StringEquals, (uint64_t) Type::string, true, true +StringNotEquals, TokenKind::cond_op, TokenID::StringNotEquals, (uint64_t) Type::string, true, true +StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringEqualsIgnoreCase, (uint64_t) Type::string, true, true +StringNotEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringNotEqualsIgnoreCase, (uint64_t) Type::string, true, true +StringLike, TokenKind::cond_op, TokenID::StringLike, (uint64_t) Type::string, true, true, +StringNotLike, TokenKind::cond_op, TokenID::StringNotLike, (uint64_t) Type::string, true, true +ForAllValues:StringEquals, TokenKind::cond_op, TokenID::ForAllValuesStringEquals, (uint64_t) Type::string, true, true +ForAnyValue:StringEquals, TokenKind::cond_op, TokenID::ForAnyValueStringEquals, (uint64_t) Type::string, true, true +ForAllValues:StringLike, TokenKind::cond_op, TokenID::ForAllValuesStringLike, (uint64_t) Type::string, true, true +ForAnyValue:StringLike, TokenKind::cond_op, 
TokenID::ForAnyValueStringLike, (uint64_t) Type::string, true, true +ForAllValues:StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::ForAllValuesStringEqualsIgnoreCase, (uint64_t) Type::string, true, true +ForAnyValue:StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::ForAnyValueStringEqualsIgnoreCase, (uint64_t) Type::string, true, true +# Numeric +NumericEquals, TokenKind::cond_op, TokenID::NumericEquals, (uint64_t) Type::number, true, true +NumericNotEquals, TokenKind::cond_op, TokenID::NumericNotEquals, (uint64_t) Type::number, true, true +NumericLessThan, TokenKind::cond_op, TokenID::NumericLessThan, (uint64_t) Type::number, true, true +NumericLessThanEquals, TokenKind::cond_op, TokenID::NumericLessThanEquals, (uint64_t) Type::number, true, true +NumericGreaterThan, TokenKind::cond_op, TokenID::NumericGreaterThan, (uint64_t) Type::number, true, true +NumericGreaterThanEquals, TokenKind::cond_op, TokenID::NumericGreaterThanEquals, (uint64_t) Type::number, true, true +# Date +DateEquals, TokenKind::cond_op, TokenID::DateEquals, (uint64_t) Type::date, true, true +DateNotEquals, TokenKind::cond_op, TokenID::DateNotEquals, (uint64_t) Type::date, true, true +DateLessThan, TokenKind::cond_op, TokenID::DateLessThan, (uint64_t) Type::date, true, true +DateLessThanEquals, TokenKind::cond_op, TokenID::DateLessThanEquals, (uint64_t) Type::date, true, true +DateGreaterThan, TokenKind::cond_op, TokenID::DateGreaterThan, (uint64_t) Type::date, true, true +DateGreaterThanEquals, TokenKind::cond_op, TokenID::DateGreaterThanEquals, (uint64_t) Type::date, true, true +# Bool +Bool, TokenKind::cond_op, TokenID::Bool, (uint64_t) Type::boolean, true, true +# Binary +BinaryEquals, TokenKind::cond_op, TokenID::BinaryEquals, (uint64_t) Type::binary, true, true +# IP Address +IpAddress, TokenKind::cond_op, TokenID::IpAddress, (uint64_t) Type::ipaddr, true, true +NotIpAddress, TokenKind::cond_op, TokenID::NotIpAddress, (uint64_t) Type::ipaddr, true, true +# Amazon Resource Names 
+ArnEquals, TokenKind::cond_op, TokenID::ArnEquals, (uint64_t) Type::arn, true, true +ArnNotEquals, TokenKind::cond_op, TokenID::ArnNotEquals, (uint64_t) Type::arn, true, true +ArnLike, TokenKind::cond_op, TokenID::ArnLike, (uint64_t) Type::arn, true, true +ArnNotLike, TokenKind::cond_op, TokenID::ArnNotLike, (uint64_t) Type::arn, true, true +# Null +Null, TokenKind::cond_op, TokenID::Null, (uint64_t) Type::null, true, true +# +# Condition keys +# +# AWS +#aws:CurrentTime, TokenKind::cond_key, TokenID::awsCurrentTime, (uint64_t) Type::date, true, false +#aws:EpochTime, TokenKind::cond_key, TokenID::awsEpochTime, (uint64_t) Type::date, true, false +#aws:TokenIssueTime, TokenKind::cond_key, TokenID::awsTokenIssueTime, (uint64_t) Type::date, true, false +#aws:MultiFactorAuthPresent, TokenKind::cond_key, TokenID::awsMultiFactorAuthPresent, (uint64_t) Type::boolean, true, false +#aws:MultiFactorAuthAge, TokenKind::cond_key, TokenID::awsMultiFactorAuthAge, (uint64_t) Type::number, true, false +#aws:PrincipalType, TokenKind::cond_key, TokenID::awsPrincipalType, (uint64_t) Type::string, true, false +#aws:Referer, TokenKind::cond_key, TokenID::awsReferer, (uint64_t) Type::string, true, false +#aws:SecureTransport, TokenKind::cond_key, TokenID::awsSecureTransport, (uint64_t) Type::boolean, true, false +#aws:SourceArn, TokenKind::cond_key, TokenID::awsSourceArn, (uint64_t) Type::arn, true, false +#aws:SourceIp, TokenKind::cond_key, TokenID::awsSourceIp, (uint64_t) Type::ipaddr, true, false +#aws:SourceVpc, TokenKind::cond_key, TokenID::awsSourceVpc, (uint64_t) Type::string, true, false +#aws:SourceVpce, TokenKind::cond_key, TokenID::awsSourceVpce, (uint64_t) Type::string, true, false +#aws:UserAgent, TokenKind::cond_key, TokenID::awsUserAgent, (uint64_t) Type::string, true, false +#aws:userid, TokenKind::cond_key, TokenID::awsuserid, (uint64_t) Type::string, true, false +#aws:username, TokenKind::cond_key, TokenID::awsusername, (uint64_t) Type::string, true, false +# S3 
+#s3:x-amz-acl, TokenKind::cond_key, TokenID::s3x_amz_acl, (uint64_t) Type::string, true, false +#s3:x-amz-grant-read, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-write, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-read-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-write-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-full-control, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-copy-source, TokenKind::cond_key, TokenID::s3x_amz_copy_source, (uint64_t) Type::string, true, false +#s3:x-amz-server-side-encryption, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption, (uint64_t) Type::boolean, true, false +#s3:x-amz-server-side-encryption-aws-kms-key-id, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption_aws_kms_key_id, (uint64_t) Type::arn, true, false +#s3:x-amz-metadata-directive, TokenKind::cond_key, TokenID::s3x_amz_metadata_directive, (uint64_t) Type::string, true, false +#s3:x-amz-storage-class, TokenKind::cond_key, TokenID::s3x_amz_storage_class, (uint64_t) Type::string, true, false +#s3:VersionId, TokenKind::cond_key, TokenID::s3VersionId, (uint64_t) Type::string, true, false +#s3:LocationConstraint, TokenKind::cond_key, TokenID::s3LocationConstraint, (uint64_t) Type::string, true, false +#s3:prefix, TokenKind::cond_key, TokenID::s3prefix, (uint64_t) Type::string, true, false +#s3:delimiter, TokenKind::cond_key, TokenID::s3delimiter, (uint64_t) Type::string, true, false +#s3:max-keys, TokenKind::cond_key, TokenID::s3max_keys, (uint64_t) Type::number, true, false +#s3:signatureversion, TokenKind::cond_key, TokenID::s3signatureversion, (uint64_t) Type::string, true, false +#s3:authType, TokenKind::cond_key, 
/// Category of a policy keyword. Each row of the keyword table in
/// rgw_iam_policy_keywords.gperf stores one of these in its `kind` column:
/// `top` for top-level keys, `statement` for statement-level keys, `cond_op`
/// for condition operators, `version_key` and `effect_key` for the version
/// and effect values, and `princ_type` for principal types.
/// `cond_key` appears only in rows that are commented out in that table, and
/// `pseudo` is not used by any row there.
enum class TokenKind {
  pseudo, top, statement, cond_op, cond_key, version_key, effect_key,
  princ_type
};
+ StringEquals, StringNotEquals, StringEqualsIgnoreCase, + StringNotEqualsIgnoreCase, StringLike, StringNotLike, + ForAllValuesStringEquals, ForAnyValueStringEquals, + ForAllValuesStringLike, ForAnyValueStringLike, + ForAllValuesStringEqualsIgnoreCase, ForAnyValueStringEqualsIgnoreCase, + + // Numeric! + NumericEquals, NumericNotEquals, NumericLessThan, NumericLessThanEquals, + NumericGreaterThan, NumericGreaterThanEquals, + + // Date! + DateEquals, DateNotEquals, DateLessThan, DateLessThanEquals, + DateGreaterThan, DateGreaterThanEquals, + + // Bool! + Bool, + + // Binary! + BinaryEquals, + + // IP Address! + IpAddress, NotIpAddress, + + // Amazon Resource Names! (Does S3 need this?) + ArnEquals, ArnNotEquals, ArnLike, ArnNotLike, + + // Null! + Null, + +#if 0 // Keys are done at runtime now + + /// Condition Keys! + awsCurrentTime, + awsEpochTime, + awsTokenIssueTime, + awsMultiFactorAuthPresent, + awsMultiFactorAuthAge, + awsPrincipalType, + awsReferer, + awsSecureTransport, + awsSourceArn, + awsSourceIp, + awsSourceVpc, + awsSourceVpce, + awsUserAgent, + awsuserid, + awsusername, + s3x_amz_acl, + s3x_amz_grant_permission, + s3x_amz_copy_source, + s3x_amz_server_side_encryption, + s3x_amz_server_side_encryption_aws_kms_key_id, + s3x_amz_metadata_directive, + s3x_amz_storage_class, + s3VersionId, + s3LocationConstraint, + s3prefix, + s3delimiter, + s3max_keys, + s3signatureversion, + s3authType, + s3signatureAge, + s3x_amz_content_sha256, +#else + CondKey, +#endif + + /// + /// Versions! + /// + v2008_10_17, + v2012_10_17, + + /// + /// Effects! + /// + Allow, + Deny, + + /// Principal Types! 
+ AWS, + Federated, + Service, + CanonicalUser +}; + + +enum class Version { + v2008_10_17, + v2012_10_17 +}; + + +enum class Effect { + Allow, + Deny, + Pass +}; + +enum class Type { + string, + number, + date, + boolean, + binary, + ipaddr, + arn, + null +}; +} +} diff --git a/src/rgw/rgw_jsonparser.cc b/src/rgw/rgw_jsonparser.cc new file mode 100644 index 000000000..6541630b2 --- /dev/null +++ b/src/rgw/rgw_jsonparser.cc @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include +#include + +#include "include/types.h" + +#include "common/Formatter.h" +#include "common/ceph_json.h" + +#include "rgw_common.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void dump_array(JSONObj *obj) +{ + + JSONObjIter iter = obj->find_first(); + + for (; !iter.end(); ++iter) { + JSONObj *o = *iter; + cout << "data=" << o->get_data() << std::endl; + } + +} + +struct Key { + string user; + string access_key; + string secret_key; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("user", user, obj); + JSONDecoder::decode_json("access_key", access_key, obj); + JSONDecoder::decode_json("secret_key", secret_key, obj); + } +}; + +struct UserInfo { + string uid; + string display_name; + int max_buckets; + list keys; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("user_id", uid, obj); + JSONDecoder::decode_json("display_name", display_name, obj); + JSONDecoder::decode_json("max_buckets", max_buckets, obj); + JSONDecoder::decode_json("keys", keys, obj); + } +}; + + +int main(int argc, char **argv) { + JSONParser parser; + + char buf[1024]; + bufferlist bl; + + for (;;) { + int done; + int len; + + len = fread(buf, 1, sizeof(buf), stdin); + if (ferror(stdin)) { + cerr << "read error" << std::endl; + exit(-1); + } + done = feof(stdin); + + bool ret = parser.parse(buf, len); + if (!ret) + cerr << "parse error" << std::endl; + + if (done) { + 
bl.append(buf, len); + break; + } + } + + JSONObjIter iter = parser.find_first(); + + for (; !iter.end(); ++iter) { + JSONObj *obj = *iter; + cout << "is_object=" << obj->is_object() << std::endl; + cout << "is_array=" << obj->is_array() << std::endl; + cout << "name=" << obj->get_name() << std::endl; + cout << "data=" << obj->get_data() << std::endl; + } + + iter = parser.find_first("conditions"); + if (!iter.end()) { + JSONObj *obj = *iter; + + JSONObjIter iter2 = obj->find_first(); + for (; !iter2.end(); ++iter2) { + JSONObj *child = *iter2; + cout << "is_object=" << child->is_object() << std::endl; + cout << "is_array=" << child->is_array() << std::endl; + if (child->is_array()) { + dump_array(child); + } + cout << "name=" << child->get_name() < +#include "include/ceph_assert.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/dout.h" + +#define dout_subsys ceph_subsys_rgw + +// TODO investigation, not necessarily issues: +// (1) in case of single threaded writer context use spsc_queue +// (2) check performance of emptying queue to local list, and go over the list and publish +// (3) use std::shared_mutex (c++17) or equivalent for the connections lock + +// cmparisson operator between topic pointer and name +bool operator==(const rd_kafka_topic_t* rkt, const std::string& name) { + return name == std::string_view(rd_kafka_topic_name(rkt)); +} + +namespace rgw::kafka { + +// status codes for publishing +static const int STATUS_CONNECTION_CLOSED = -0x1002; +static const int STATUS_QUEUE_FULL = -0x1003; +static const int STATUS_MAX_INFLIGHT = -0x1004; +static const int STATUS_MANAGER_STOPPED = -0x1005; +static const int STATUS_CONNECTION_IDLE = -0x1006; +// status code for connection opening +static const int STATUS_CONF_ALLOC_FAILED = -0x2001; +static const int STATUS_CONF_REPLCACE = -0x2002; + +static const int STATUS_OK = 0x0; + +// struct for holding the callback and its tag in the callback list +struct 
reply_callback_with_tag_t { + uint64_t tag; + reply_callback_t cb; + + reply_callback_with_tag_t(uint64_t _tag, reply_callback_t _cb) : tag(_tag), cb(_cb) {} + + bool operator==(uint64_t rhs) { + return tag == rhs; + } +}; + +typedef std::vector CallbackList; + +// struct for holding the connection state object as well as list of topics +// it is used inside an intrusive ref counted pointer (boost::intrusive_ptr) +// since references to deleted objects may still exist in the calling code +struct connection_t { + rd_kafka_t* producer = nullptr; + rd_kafka_conf_t* temp_conf = nullptr; + std::vector topics; + uint64_t delivery_tag = 1; + int status = STATUS_OK; + CephContext* const cct; + CallbackList callbacks; + const std::string broker; + const bool use_ssl; + const bool verify_ssl; // TODO currently iognored, not supported in librdkafka v0.11.6 + const boost::optional ca_location; + const std::string user; + const std::string password; + const boost::optional mechanism; + utime_t timestamp = ceph_clock_now(); + + // cleanup of all internal connection resource + // the object can still remain, and internal connection + // resources created again on successful reconnection + void destroy(int s) { + status = s; + // destroy temporary conf (if connection was never established) + if (temp_conf) { + rd_kafka_conf_destroy(temp_conf); + return; + } + if (!is_ok()) { + // no producer, nothing to destroy + return; + } + // wait for all remaining acks/nacks + rd_kafka_flush(producer, 5*1000 /* wait for max 5 seconds */); + // destroy all topics + std::for_each(topics.begin(), topics.end(), [](auto topic) {rd_kafka_topic_destroy(topic);}); + // destroy producer + rd_kafka_destroy(producer); + producer = nullptr; + // fire all remaining callbacks (if not fired by rd_kafka_flush) + std::for_each(callbacks.begin(), callbacks.end(), [this](auto& cb_tag) { + cb_tag.cb(status); + ldout(cct, 20) << "Kafka destroy: invoking callback with tag=" << cb_tag.tag << + " for: " << broker << 
dendl; + }); + callbacks.clear(); + delivery_tag = 1; + ldout(cct, 20) << "Kafka destroy: complete for: " << broker << dendl; + } + + bool is_ok() const { + return (producer != nullptr); + } + + // ctor for setting immutable values + connection_t(CephContext* _cct, const std::string& _broker, bool _use_ssl, bool _verify_ssl, + const boost::optional& _ca_location, + const std::string& _user, const std::string& _password, const boost::optional& _mechanism) : + cct(_cct), broker(_broker), use_ssl(_use_ssl), verify_ssl(_verify_ssl), ca_location(_ca_location), user(_user), password(_password), mechanism(_mechanism) {} + + // dtor also destroys the internals + ~connection_t() { + destroy(status); + } +}; + +// convert int status to string - including RGW specific values +std::string status_to_string(int s) { + switch (s) { + case STATUS_OK: + return "STATUS_OK"; + case STATUS_CONNECTION_CLOSED: + return "RGW_KAFKA_STATUS_CONNECTION_CLOSED"; + case STATUS_QUEUE_FULL: + return "RGW_KAFKA_STATUS_QUEUE_FULL"; + case STATUS_MAX_INFLIGHT: + return "RGW_KAFKA_STATUS_MAX_INFLIGHT"; + case STATUS_MANAGER_STOPPED: + return "RGW_KAFKA_STATUS_MANAGER_STOPPED"; + case STATUS_CONF_ALLOC_FAILED: + return "RGW_KAFKA_STATUS_CONF_ALLOC_FAILED"; + case STATUS_CONF_REPLCACE: + return "RGW_KAFKA_STATUS_CONF_REPLCACE"; + case STATUS_CONNECTION_IDLE: + return "RGW_KAFKA_STATUS_CONNECTION_IDLE"; + } + return std::string(rd_kafka_err2str((rd_kafka_resp_err_t)s)); +} + +void message_callback(rd_kafka_t* rk, const rd_kafka_message_t* rkmessage, void* opaque) { + ceph_assert(opaque); + + const auto conn = reinterpret_cast(opaque); + const auto result = rkmessage->err; + + if (!rkmessage->_private) { + ldout(conn->cct, 20) << "Kafka run: n/ack received, (no callback) with result=" << result << dendl; + return; + } + + const auto tag = reinterpret_cast(rkmessage->_private); + const auto& callbacks_end = conn->callbacks.end(); + const auto& callbacks_begin = conn->callbacks.begin(); + const auto 
tag_it = std::find(callbacks_begin, callbacks_end, *tag); + if (tag_it != callbacks_end) { + ldout(conn->cct, 20) << "Kafka run: n/ack received, invoking callback with tag=" << + *tag << " and result=" << rd_kafka_err2str(result) << dendl; + tag_it->cb(result); + conn->callbacks.erase(tag_it); + } else { + // TODO add counter for acks with no callback + ldout(conn->cct, 10) << "Kafka run: unsolicited n/ack received with tag=" << + *tag << dendl; + } + delete tag; + // rkmessage is destroyed automatically by librdkafka +} + +void log_callback(const rd_kafka_t* rk, int level, const char *fac, const char *buf) { + ceph_assert(rd_kafka_opaque(rk)); + + const auto conn = reinterpret_cast(rd_kafka_opaque(rk)); + if (level <= 3) + ldout(conn->cct, 1) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl; + else if (level <= 5) + ldout(conn->cct, 2) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl; + else if (level <= 6) + ldout(conn->cct, 10) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl; + else + ldout(conn->cct, 20) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl; +} + +void poll_err_callback(rd_kafka_t *rk, int err, const char *reason, void *opaque) { + const auto conn = reinterpret_cast(rd_kafka_opaque(rk)); + ldout(conn->cct, 10) << "Kafka run: poll error(" << err << "): " << reason << dendl; +} + +using connection_t_ptr = std::unique_ptr; + +// utility function to create a producer, when the connection object already exists +bool new_producer(connection_t* conn) { + // reset all status codes + conn->status = STATUS_OK; + char errstr[512] = {0}; + + conn->temp_conf = rd_kafka_conf_new(); + if (!conn->temp_conf) { + conn->status = STATUS_CONF_ALLOC_FAILED; + return false; + } + + // get list of brokers based on the bootsrap broker + if (rd_kafka_conf_set(conn->temp_conf, "bootstrap.servers", 
conn->broker.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + + if (conn->use_ssl) { + if (!conn->user.empty()) { + // use SSL+SASL + if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SASL_SSL", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.username", conn->user.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.password", conn->password.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SSL+SASL security" << dendl; + + if (conn->mechanism) { + if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", conn->mechanism->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SASL mechanism" << dendl; + } else { + if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", "PLAIN", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: using default SASL mechanism" << dendl; + } + + } else { + // use only SSL + if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SSL", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SSL security" << dendl; + } + if (conn->ca_location) { + if (rd_kafka_conf_set(conn->temp_conf, "ssl.ca.location", conn->ca_location->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured CA location" << dendl; + } else { + ldout(conn->cct, 20) << "Kafka connect: using default CA location" << dendl; + } + // Note: when librdkafka.1.0 is available the following line could be uncommented instead of the callback setting call + // if (rd_kafka_conf_set(conn->temp_conf, "enable.ssl.certificate.verification", "0", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) 
goto conf_error; + + ldout(conn->cct, 20) << "Kafka connect: successfully configured security" << dendl; + } else if (!conn->user.empty()) { + // use SASL+PLAINTEXT + if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SASL_PLAINTEXT", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.username", conn->user.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.password", conn->password.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SASL_PLAINTEXT" << dendl; + + if (conn->mechanism) { + if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", conn->mechanism->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SASL mechanism" << dendl; + } else { + if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", "PLAIN", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: using default SASL mechanism" << dendl; + } + } + + // set the global callback for delivery success/fail + rd_kafka_conf_set_dr_msg_cb(conn->temp_conf, message_callback); + + // set the global opaque pointer to be the connection itself + rd_kafka_conf_set_opaque(conn->temp_conf, conn); + + // redirect kafka logs to RGW + rd_kafka_conf_set_log_cb(conn->temp_conf, log_callback); + // define poll callback to allow reconnect + rd_kafka_conf_set_error_cb(conn->temp_conf, poll_err_callback); + // create the producer + if (conn->producer) { + ldout(conn->cct, 5) << "Kafka connect: producer already exists. 
detroying the existing before creating a new one" << dendl; + conn->destroy(STATUS_CONF_REPLCACE); + } + conn->producer = rd_kafka_new(RD_KAFKA_PRODUCER, conn->temp_conf, errstr, sizeof(errstr)); + if (!conn->producer) { + conn->status = rd_kafka_last_error(); + ldout(conn->cct, 1) << "Kafka connect: failed to create producer: " << errstr << dendl; + return false; + } + ldout(conn->cct, 20) << "Kafka connect: successfully created new producer" << dendl; + { + // set log level of producer + const auto log_level = conn->cct->_conf->subsys.get_log_level(ceph_subsys_rgw); + if (log_level <= 1) + rd_kafka_set_log_level(conn->producer, 3); + else if (log_level <= 2) + rd_kafka_set_log_level(conn->producer, 5); + else if (log_level <= 10) + rd_kafka_set_log_level(conn->producer, 6); + else + rd_kafka_set_log_level(conn->producer, 7); + } + + // conf ownership passed to producer + conn->temp_conf = nullptr; + return true; + +conf_error: + conn->status = rd_kafka_last_error(); + ldout(conn->cct, 1) << "Kafka connect: configuration failed: " << errstr << dendl; + return false; +} + +// struct used for holding messages in the message queue +struct message_wrapper_t { + std::string conn_name; + std::string topic; + std::string message; + const reply_callback_t cb; + + message_wrapper_t(const std::string& _conn_name, + const std::string& _topic, + const std::string& _message, + reply_callback_t _cb) : conn_name(_conn_name), topic(_topic), message(_message), cb(_cb) {} +}; + +typedef std::unordered_map ConnectionList; +typedef boost::lockfree::queue> MessageQueue; + +class Manager { +public: + const size_t max_connections; + const size_t max_inflight; + const size_t max_queue; + const size_t max_idle_time; +private: + std::atomic connection_count; + bool stopped; + int read_timeout_ms; + ConnectionList connections; + MessageQueue messages; + std::atomic queued; + std::atomic dequeued; + CephContext* const cct; + mutable std::mutex connections_lock; + std::thread runner; + + // 
// Publishes one queued message on its connection. Invoked for every message
// drained from the queue by run().
//
// Takes ownership of "message": it is deleted when this function returns,
// whatever the outcome.
// If the message carries a callback, the callback receives the error right
// away when publishing cannot proceed (connection missing or without a
// producer, topic creation failed, rd_kafka_produce() failed, or too many
// callbacks already pending). Otherwise it is stored in the connection's
// callback list to be invoked when the delivery report arrives.
// A failure to create the topic or to produce also destroys the connection's
// internals via conn->destroy().
// TODO use rd_kafka_produce_batch for better performance
void publish_internal(message_wrapper_t* message) {
  // frees the message on every exit path
  const std::unique_ptr<message_wrapper_t> msg_deleter(message);
  // NOTE(review): "connections" is read here without taking connections_lock;
  // confirm this is safe against the code that adds connections
  const auto conn_it = connections.find(message->conn_name);
  if (conn_it == connections.end()) {
    ldout(cct, 1) << "Kafka publish: connection was deleted while message was in the queue. error: " << STATUS_CONNECTION_CLOSED << dendl;
    if (message->cb) {
      message->cb(STATUS_CONNECTION_CLOSED);
    }
    return;
  }
  auto& conn = conn_it->second;

  // record the activity: run() compares this timestamp against max_idle_time
  conn->timestamp = ceph_clock_now();

  if (!conn->is_ok()) {
    // connection had an issue while message was in the queue
    // TODO add error stats
    ldout(conn->cct, 1) << "Kafka publish: producer was closed while message was in the queue. error: " << status_to_string(conn->status) << dendl;
    if (message->cb) {
      message->cb(conn->status);
    }
    return;
  }

  // create a new topic unless it was already created
  // (the search compares each topic handle with the topic name through the
  // operator== defined at the top of this file)
  auto topic_it = std::find(conn->topics.begin(), conn->topics.end(), message->topic);
  rd_kafka_topic_t* topic = nullptr;
  if (topic_it == conn->topics.end()) {
    topic = rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr);
    if (!topic) {
      const auto err = rd_kafka_last_error();
      ldout(conn->cct, 1) << "Kafka publish: failed to create topic: " << message->topic << " error: " << status_to_string(err) << dendl;
      if (message->cb) {
        message->cb(err);
      }
      conn->destroy(err);
      return;
    }
    // TODO use the topics list as an LRU cache
    conn->topics.push_back(topic);
    ldout(conn->cct, 20) << "Kafka publish: successfully created topic: " << message->topic << dendl;
  } else {
    topic = *topic_it;
    ldout(conn->cct, 20) << "Kafka publish: reused existing topic: " << message->topic << dendl;
  }

  // a tag is allocated only for messages that carry a callback. it travels
  // with the message as its opaque pointer, and message_callback() deletes it
  // once the delivery report arrives
  const auto tag = (message->cb == nullptr ? nullptr : new uint64_t(conn->delivery_tag++));
  const auto rc = rd_kafka_produce(
          topic,
          // TODO: non builtin partitioning
          RD_KAFKA_PARTITION_UA,
          // make a copy of the payload
          // so it is safe to pass the pointer from the string
          RD_KAFKA_MSG_F_COPY,
          message->message.data(),
          message->message.length(),
          // optional key and its length
          nullptr,
          0,
          // opaque data: tag, used in the global callback
          // in order to invoke the real callback
          // null if no callback exists
          tag);
  if (rc == -1) {
    const auto err = rd_kafka_last_error();
    ldout(conn->cct, 10) << "Kafka publish: failed to produce: " << rd_kafka_err2str(err) << dendl;
    // TODO: dont error on full queue, and don't destroy connection, retry instead
    // immediately invoke callback on error if needed
    if (message->cb) {
      message->cb(err);
    }
    conn->destroy(err);
    // the tag is freed here, on the failure path
    delete tag;
    return;
  }

  if (tag) {
    auto const q_len = conn->callbacks.size();
    if (q_len < max_inflight) {
      ldout(conn->cct, 20) << "Kafka publish (with callback, tag=" << *tag << "): OK. Queue has: " << q_len << " callbacks" << dendl;
      conn->callbacks.emplace_back(*tag, message->cb);
    } else {
      // immediately invoke callback with error - this is not a connection error
      // note that rd_kafka_produce() above already accepted the message; only
      // the callback registration is refused
      ldout(conn->cct, 1) << "Kafka publish (with callback): failed with error: callback queue full" << dendl;
      message->cb(STATUS_MAX_INFLIGHT);
      // tag will be deleted when the global callback is invoked
    }
  } else {
    ldout(conn->cct, 20) << "Kafka publish (no callback): OK" << dendl;
  }
}
= conn_it->first; + ldout(conn->cct, 20) << "Kafka run: retry connection" << dendl; + if (new_producer(conn.get()) == false) { + ldout(conn->cct, 10) << "Kafka run: connection (" << broker << ") retry failed" << dendl; + // TODO: add error counter for failed retries + // TODO: add exponential backoff for retries + } else { + ldout(conn->cct, 10) << "Kafka run: connection (" << broker << ") retry successfull" << dendl; + } + ++conn_it; + continue; + } + + reply_count += rd_kafka_poll(conn->producer, read_timeout_ms); + + // just increment the iterator + ++conn_it; + } + // if no messages were received or published + // across all connection, sleep for 100ms + if (send_count == 0 && reply_count == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + } + + // used in the dtor for message cleanup + static void delete_message(const message_wrapper_t* message) { + delete message; + } + +public: + Manager(size_t _max_connections, + size_t _max_inflight, + size_t _max_queue, + int _read_timeout_ms, + CephContext* _cct) : + max_connections(_max_connections), + max_inflight(_max_inflight), + max_queue(_max_queue), + max_idle_time(30), + connection_count(0), + stopped(false), + read_timeout_ms(_read_timeout_ms), + connections(_max_connections), + messages(max_queue), + queued(0), + dequeued(0), + cct(_cct), + runner(&Manager::run, this) { + // The hashmap has "max connections" as the initial number of buckets, + // and allows for 10 collisions per bucket before rehash. + // This is to prevent rehashing so that iterators are not invalidated + // when a new connection is added. 
+ connections.max_load_factor(10.0); + // give the runner thread a name for easier debugging + const auto rc = ceph_pthread_setname(runner.native_handle(), "kafka_manager"); + ceph_assert(rc==0); + } + + // non copyable + Manager(const Manager&) = delete; + const Manager& operator=(const Manager&) = delete; + + // stop the main thread + void stop() { + stopped = true; + } + + // connect to a broker, or reuse an existing connection if already connected + bool connect(std::string& broker, + const std::string& url, + bool use_ssl, + bool verify_ssl, + boost::optional ca_location, + boost::optional mechanism) { + if (stopped) { + ldout(cct, 1) << "Kafka connect: manager is stopped" << dendl; + return false; + } + + std::string user; + std::string password; + if (!parse_url_authority(url, broker, user, password)) { + // TODO: increment counter + ldout(cct, 1) << "Kafka connect: URL parsing failed" << dendl; + return false; + } + + // this should be validated by the regex in parse_url() + ceph_assert(user.empty() == password.empty()); + + if (!user.empty() && !use_ssl && !g_conf().get_val("rgw_allow_notification_secrets_in_cleartext")) { + ldout(cct, 1) << "Kafka connect: user/password are only allowed over secure connection" << dendl; + return false; + } + + std::lock_guard lock(connections_lock); + const auto it = connections.find(broker); + // note that ssl vs. non-ssl connection to the same host are two separate conenctions + if (it != connections.end()) { + // connection found - return even if non-ok + ldout(cct, 20) << "Kafka connect: connection found" << dendl; + return it->second.get(); + } + + // connection not found, creating a new one + if (connection_count >= max_connections) { + // TODO: increment counter + ldout(cct, 1) << "Kafka connect: max connections exceeded" << dendl; + return false; + } + // create_connection must always return a connection object + // even if error occurred during creation. 
+ // in such a case the creation will be retried in the main thread + ++connection_count; + ldout(cct, 10) << "Kafka connect: new connection is created. Total connections: " << connection_count << dendl; + auto conn = connections.emplace(broker, std::make_unique(cct, broker, use_ssl, verify_ssl, ca_location, user, password, mechanism)).first->second.get(); + if (!new_producer(conn)) { + ldout(cct, 10) << "Kafka connect: new connection is created. But producer creation failed. will retry" << dendl; + } + return true; + } + + // TODO publish with confirm is needed in "none" case as well, cb should be invoked publish is ok (no ack) + int publish(const std::string& conn_name, + const std::string& topic, + const std::string& message) { + if (stopped) { + return STATUS_MANAGER_STOPPED; + } + if (messages.push(new message_wrapper_t(conn_name, topic, message, nullptr))) { + ++queued; + return STATUS_OK; + } + return STATUS_QUEUE_FULL; + } + + int publish_with_confirm(const std::string& conn_name, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (stopped) { + return STATUS_MANAGER_STOPPED; + } + if (messages.push(new message_wrapper_t(conn_name, topic, message, cb))) { + ++queued; + return STATUS_OK; + } + return STATUS_QUEUE_FULL; + } + + // dtor wait for thread to stop + // then connection are cleaned-up + ~Manager() { + stopped = true; + runner.join(); + messages.consume_all(delete_message); + } + + // get the number of connections + size_t get_connection_count() const { + return connection_count; + } + + // get the number of in-flight messages + size_t get_inflight() const { + size_t sum = 0; + std::lock_guard lock(connections_lock); + std::for_each(connections.begin(), connections.end(), [&sum](auto& conn_pair) { + sum += conn_pair.second->callbacks.size(); + }); + return sum; + } + + // running counter of the queued messages + size_t get_queued() const { + return queued; + } + + // running counter of the dequeued messages + 
// Create the file-level kafka manager (s_manager) using the built-in default
// limits for connections, in-flight callbacks, queue size and read timeout.
// Returns false, leaving the existing manager untouched, if one was already
// created; returns true after a new manager was allocated.
// NOTE: s_manager is checked and assigned here without any synchronization
// (see the TODO next to its declaration).
bool init(CephContext* cct) {
  if (s_manager) {
    return false;
  }
  // TODO: take conf from CephContext
  s_manager = new Manager(MAX_CONNECTIONS_DEFAULT, MAX_INFLIGHT_DEFAULT, MAX_QUEUE_DEFAULT, READ_TIMEOUT_MS_DEFAULT, cct);
  return true;
}
(!s_manager) return MAX_CONNECTIONS_DEFAULT; + return s_manager->max_connections; +} + +size_t get_max_inflight() { + if (!s_manager) return MAX_INFLIGHT_DEFAULT; + return s_manager->max_inflight; +} + +size_t get_max_queue() { + if (!s_manager) return MAX_QUEUE_DEFAULT; + return s_manager->max_queue; +} + +} // namespace kafka + diff --git a/src/rgw/rgw_kafka.h b/src/rgw/rgw_kafka.h new file mode 100644 index 000000000..813fda329 --- /dev/null +++ b/src/rgw/rgw_kafka.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "include/common_fwd.h" + +namespace rgw::kafka { + +// the reply callback is expected to get an integer parameter +// indicating the result, and not to return anything +typedef std::function reply_callback_t; + +// initialize the kafka manager +bool init(CephContext* cct); + +// shutdown the kafka manager +void shutdown(); + +// connect to a kafka endpoint +bool connect(std::string& broker, const std::string& url, bool use_ssl, bool verify_ssl, boost::optional ca_location, boost::optional mechanism); + +// publish a message over a connection that was already created +int publish(const std::string& conn_name, + const std::string& topic, + const std::string& message); + +// publish a message over a connection that was already created +// and pass a callback that will be invoked (async) when broker confirms +// receiving the message +int publish_with_confirm(const std::string& conn_name, + const std::string& topic, + const std::string& message, + reply_callback_t cb); + +// convert the integer status returned from the "publish" function to a string +std::string status_to_string(int s); + +// number of connections +size_t get_connection_count(); + +// return the number of messages that were sent +// to broker, but were not yet acked/nacked/timedout +size_t get_inflight(); + +// running counter of successfully queued 
#define PKI_ANS1_PREFIX "MII"

// Tell whether `token` starts with the PKI prefix (PKI_ANS1_PREFIX).
// Tokens shorter than the prefix, including the empty string, are not PKI tokens.
bool rgw_is_pki_token(const std::string& token)
{
  // searching backwards from position 0 can only ever match at position 0,
  // which makes this a pure "starts with" check
  const auto match_pos = token.rfind(PKI_ANS1_PREFIX, 0);
  return match_pos == 0;
}
wrong Keystone API version: " + << g_ceph_context->_conf->rgw_keystone_api_version + << "; falling back to v2" << dendl; + return ApiVersion::VER_2; + } +} + +std::string CephCtxConfig::get_endpoint_url() const noexcept +{ + static const std::string url = g_ceph_context->_conf->rgw_keystone_url; + + if (url.empty() || boost::algorithm::ends_with(url, "/")) { + return url; + } else { + static const std::string url_normalised = url + '/'; + return url_normalised; + } +} + +/* secrets */ +const std::string CephCtxConfig::empty{""}; + +static inline std::string read_secret(const std::string& file_path) +{ + using namespace std; + + constexpr int16_t size{1024}; + char buf[size]; + string s; + + s.reserve(size); + ifstream ifs(file_path, ios::in | ios::binary); + if (ifs) { + while (true) { + auto sbuf = ifs.rdbuf(); + auto len = sbuf->sgetn(buf, size); + if (!len) + break; + s.append(buf, len); + } + boost::algorithm::trim(s); + if (s.back() == '\n') + s.pop_back(); + } + return s; +} + +std::string CephCtxConfig::get_admin_token() const noexcept +{ + auto& atv = g_ceph_context->_conf->rgw_keystone_admin_token_path; + if (!atv.empty()) { + return read_secret(atv); + } else { + auto& atv = g_ceph_context->_conf->rgw_keystone_admin_token; + if (!atv.empty()) { + return atv; + } + } + return empty; +} + +std::string CephCtxConfig::get_admin_password() const noexcept { + auto& apv = g_ceph_context->_conf->rgw_keystone_admin_password_path; + if (!apv.empty()) { + return read_secret(apv); + } else { + auto& apv = g_ceph_context->_conf->rgw_keystone_admin_password; + if (!apv.empty()) { + return apv; + } + } + return empty; +} + +int Service::get_admin_token(const DoutPrefixProvider *dpp, + CephContext* const cct, + TokenCache& token_cache, + const Config& config, + std::string& token) +{ + /* Let's check whether someone uses the deprecated "admin token" feauture + * based on a shared secret from keystone.conf file. 
*/ + const auto& admin_token = config.get_admin_token(); + if (! admin_token.empty()) { + token = std::string(admin_token.data(), admin_token.length()); + return 0; + } + + TokenEnvelope t; + + /* Try cache first before calling Keystone for a new admin token. */ + if (token_cache.find_admin(t)) { + ldpp_dout(dpp, 20) << "found cached admin token" << dendl; + token = t.token.id; + return 0; + } + + /* Call Keystone now. */ + const auto ret = issue_admin_token_request(dpp, cct, config, t); + if (! ret) { + token_cache.add_admin(t); + token = t.token.id; + } + + return ret; +} + +int Service::issue_admin_token_request(const DoutPrefixProvider *dpp, + CephContext* const cct, + const Config& config, + TokenEnvelope& t) +{ + std::string token_url = config.get_endpoint_url(); + if (token_url.empty()) { + return -EINVAL; + } + + bufferlist token_bl; + RGWGetKeystoneAdminToken token_req(cct, "POST", "", &token_bl); + token_req.append_header("Content-Type", "application/json"); + JSONFormatter jf; + + const auto keystone_version = config.get_api_version(); + if (keystone_version == ApiVersion::VER_2) { + AdminTokenRequestVer2 req_serializer(config); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v2.0/tokens"); + + } else if (keystone_version == ApiVersion::VER_3) { + AdminTokenRequestVer3 req_serializer(config); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v3/auth/tokens"); + } else { + return -ENOTSUP; + } + + token_req.set_url(token_url); + + const int ret = token_req.process(null_yield); + if (ret < 0) { + return ret; + } + + /* Detect rejection earlier than during the token parsing step. 
*/ + if (token_req.get_http_status() == + RGWGetKeystoneAdminToken::HTTP_STATUS_UNAUTHORIZED) { + return -EACCES; + } + + if (t.parse(dpp, cct, token_req.get_subject_token(), token_bl, + keystone_version) != 0) { + return -EINVAL; + } + + return 0; +} + +int Service::get_keystone_barbican_token(const DoutPrefixProvider *dpp, + CephContext * const cct, + std::string& token) +{ + using keystone_config_t = rgw::keystone::CephCtxConfig; + using keystone_cache_t = rgw::keystone::TokenCache; + + auto& config = keystone_config_t::get_instance(); + auto& token_cache = keystone_cache_t::get_instance(); + + std::string token_url = config.get_endpoint_url(); + if (token_url.empty()) { + return -EINVAL; + } + + rgw::keystone::TokenEnvelope t; + + /* Try cache first. */ + if (token_cache.find_barbican(t)) { + ldpp_dout(dpp, 20) << "found cached barbican token" << dendl; + token = t.token.id; + return 0; + } + + bufferlist token_bl; + RGWKeystoneHTTPTransceiver token_req(cct, "POST", "", &token_bl); + token_req.append_header("Content-Type", "application/json"); + JSONFormatter jf; + + const auto keystone_version = config.get_api_version(); + if (keystone_version == ApiVersion::VER_2) { + rgw::keystone::BarbicanTokenRequestVer2 req_serializer(cct); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v2.0/tokens"); + + } else if (keystone_version == ApiVersion::VER_3) { + BarbicanTokenRequestVer3 req_serializer(cct); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v3/auth/tokens"); + } else { + return -ENOTSUP; + } + + token_req.set_url(token_url); + + ldpp_dout(dpp, 20) << "Requesting secret from barbican url=" << token_url << dendl; + const int ret = token_req.process(null_yield); + if (ret < 0) { + ldpp_dout(dpp, 20) << "Barbican 
process error:" << token_bl.c_str() << dendl; + return ret; + } + + /* Detect rejection earlier than during the token parsing step. */ + if (token_req.get_http_status() == + RGWKeystoneHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) { + return -EACCES; + } + + if (t.parse(dpp, cct, token_req.get_subject_token(), token_bl, + keystone_version) != 0) { + return -EINVAL; + } + + token_cache.add_barbican(t); + token = t.token.id; + return 0; +} + + +bool TokenEnvelope::has_role(const std::string& r) const +{ + list::const_iterator iter; + for (iter = roles.cbegin(); iter != roles.cend(); ++iter) { + if (fnmatch(r.c_str(), ((*iter).name.c_str()), 0) == 0) { + return true; + } + } + return false; +} + +int TokenEnvelope::parse(const DoutPrefixProvider *dpp, + CephContext* const cct, + const std::string& token_str, + ceph::bufferlist& bl, + const ApiVersion version) +{ + JSONParser parser; + if (! parser.parse(bl.c_str(), bl.length())) { + ldpp_dout(dpp, 0) << "Keystone token parse error: malformed json" << dendl; + return -EINVAL; + } + + JSONObjIter token_iter = parser.find_first("token"); + JSONObjIter access_iter = parser.find_first("access"); + + try { + if (version == rgw::keystone::ApiVersion::VER_2) { + if (! access_iter.end()) { + decode_v2(*access_iter); + } else if (! token_iter.end()) { + /* TokenEnvelope structure doesn't follow Identity API v2, so let's + * fallback to v3. Otherwise we can assume it's wrongly formatted. + * The whole mechanism is a workaround for s3_token middleware that + * speaks in v2 disregarding the promise to go with v3. */ + decode_v3(*token_iter); + + /* Identity v3 conveys the token inforamtion not as a part of JSON but + * in the X-Subject-Token HTTP header we're getting from caller. */ + token.id = token_str; + } else { + return -EINVAL; + } + } else if (version == rgw::keystone::ApiVersion::VER_3) { + if (! token_iter.end()) { + decode_v3(*token_iter); + /* v3 suceeded. 
We have to fill token.id from external input as it + * isn't a part of the JSON response anymore. It has been moved + * to X-Subject-Token HTTP header instead. */ + token.id = token_str; + } else if (! access_iter.end()) { + /* If the token cannot be parsed according to V3, try V2. */ + decode_v2(*access_iter); + } else { + return -EINVAL; + } + } else { + return -ENOTSUP; + } + } catch (const JSONDecoder::err& err) { + ldpp_dout(dpp, 0) << "Keystone token parse error: " << err.what() << dendl; + return -EINVAL; + } + + return 0; +} + +bool TokenCache::find(const std::string& token_id, + rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + return find_locked(token_id, token, tokens, tokens_lru); +} + +bool TokenCache::find_service(const std::string& token_id, + rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + return find_locked(token_id, token, service_tokens, service_tokens_lru); +} + +bool TokenCache::find_locked(const std::string& token_id, rgw::keystone::TokenEnvelope& token, + std::map& tokens, std::list& tokens_lru) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + map::iterator iter = tokens.find(token_id); + if (iter == tokens.end()) { + if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_miss); + return false; + } + + token_entry& entry = iter->second; + tokens_lru.erase(entry.lru_iter); + + if (entry.token.expired()) { + tokens.erase(iter); + if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit); + return false; + } + token = entry.token; + + tokens_lru.push_front(token_id); + entry.lru_iter = tokens_lru.begin(); + + if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit); + + return true; +} + +bool TokenCache::find_admin(rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + + return find_locked(admin_token_id, token, tokens, tokens_lru); +} + +bool TokenCache::find_barbican(rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + + return 
find_locked(barbican_token_id, token, tokens, tokens_lru); +} + +void TokenCache::add(const std::string& token_id, + const rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + add_locked(token_id, token, tokens, tokens_lru); +} + +void TokenCache::add_service(const std::string& token_id, + const rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + add_locked(token_id, token, service_tokens, service_tokens_lru); +} + +void TokenCache::add_locked(const std::string& token_id, const rgw::keystone::TokenEnvelope& token, + std::map& tokens, std::list& tokens_lru) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + map::iterator iter = tokens.find(token_id); + if (iter != tokens.end()) { + token_entry& e = iter->second; + tokens_lru.erase(e.lru_iter); + } + + tokens_lru.push_front(token_id); + token_entry& entry = tokens[token_id]; + entry.token = token; + entry.lru_iter = tokens_lru.begin(); + + while (tokens_lru.size() > max) { + list::reverse_iterator riter = tokens_lru.rbegin(); + iter = tokens.find(*riter); + ceph_assert(iter != tokens.end()); + tokens.erase(iter); + tokens_lru.pop_back(); + } +} + +void TokenCache::add_admin(const rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + + rgw_get_token_id(token.token.id, admin_token_id); + add_locked(admin_token_id, token, tokens, tokens_lru); +} + +void TokenCache::add_barbican(const rgw::keystone::TokenEnvelope& token) +{ + std::lock_guard l{lock}; + + rgw_get_token_id(token.token.id, barbican_token_id); + add_locked(barbican_token_id, token, tokens, tokens_lru); +} + +void TokenCache::invalidate(const DoutPrefixProvider *dpp, const std::string& token_id) +{ + std::lock_guard l{lock}; + map::iterator iter = tokens.find(token_id); + if (iter == tokens.end()) + return; + + ldpp_dout(dpp, 20) << "invalidating revoked token id=" << token_id << dendl; + token_entry& e = iter->second; + tokens_lru.erase(e.lru_iter); + tokens.erase(iter); +} + +bool TokenCache::going_down() 
const +{ + return down_flag; +} + +}; /* namespace keystone */ +}; /* namespace rgw */ + +void rgw::keystone::TokenEnvelope::Token::decode_json(JSONObj *obj) +{ + string expires_iso8601; + struct tm t; + + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("tenant", tenant_v2, obj, true); + JSONDecoder::decode_json("expires", expires_iso8601, obj, true); + + if (parse_iso8601(expires_iso8601.c_str(), &t)) { + expires = internal_timegm(&t); + } else { + expires = 0; + throw JSONDecoder::err("Failed to parse ISO8601 expiration date from Keystone response."); + } +} + +void rgw::keystone::TokenEnvelope::Role::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("name", name, obj, true); +} + +void rgw::keystone::TokenEnvelope::Domain::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("name", name, obj, true); +} + +void rgw::keystone::TokenEnvelope::Project::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("name", name, obj, true); + JSONDecoder::decode_json("domain", domain, obj); +} + +void rgw::keystone::TokenEnvelope::User::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("name", name, obj, true); + JSONDecoder::decode_json("domain", domain, obj); + JSONDecoder::decode_json("roles", roles_v2, obj); +} + +void rgw::keystone::TokenEnvelope::decode_v3(JSONObj* const root_obj) +{ + std::string expires_iso8601; + + JSONDecoder::decode_json("user", user, root_obj, true); + JSONDecoder::decode_json("expires_at", expires_iso8601, root_obj, true); + JSONDecoder::decode_json("roles", roles, root_obj, true); + JSONDecoder::decode_json("project", project, root_obj, true); + + struct tm t; + if (parse_iso8601(expires_iso8601.c_str(), &t)) { + token.expires = internal_timegm(&t); + } else { + token.expires = 0; + throw JSONDecoder::err("Failed to 
parse ISO8601 expiration date" + "from Keystone response."); + } +} + +void rgw::keystone::TokenEnvelope::decode_v2(JSONObj* const root_obj) +{ + JSONDecoder::decode_json("user", user, root_obj, true); + JSONDecoder::decode_json("token", token, root_obj, true); + + roles = user.roles_v2; + project = token.tenant_v2; +} + +/* This utility function shouldn't conflict with the overload of std::to_string + * provided by string_ref since Boost 1.54 as it's defined outside of the std + * namespace. I hope we'll remove it soon - just after merging the Matt's PR + * for bundled Boost. It would allow us to forget that CentOS 7 has Boost 1.53. */ +static inline std::string to_string(const std::string_view& s) +{ + return std::string(s.data(), s.length()); +} + +void rgw::keystone::AdminTokenRequestVer2::dump(Formatter* const f) const +{ + f->open_object_section("token_request"); + f->open_object_section("auth"); + f->open_object_section("passwordCredentials"); + encode_json("username", ::to_string(conf.get_admin_user()), f); + encode_json("password", ::to_string(conf.get_admin_password()), f); + f->close_section(); + encode_json("tenantName", ::to_string(conf.get_admin_tenant()), f); + f->close_section(); + f->close_section(); +} + +void rgw::keystone::AdminTokenRequestVer3::dump(Formatter* const f) const +{ + f->open_object_section("token_request"); + f->open_object_section("auth"); + f->open_object_section("identity"); + f->open_array_section("methods"); + f->dump_string("", "password"); + f->close_section(); + f->open_object_section("password"); + f->open_object_section("user"); + f->open_object_section("domain"); + encode_json("name", ::to_string(conf.get_admin_domain()), f); + f->close_section(); + encode_json("name", ::to_string(conf.get_admin_user()), f); + encode_json("password", ::to_string(conf.get_admin_password()), f); + f->close_section(); + f->close_section(); + f->close_section(); + f->open_object_section("scope"); + f->open_object_section("project"); + if (! 
// Serialize the Identity API v2 token request for the barbican user into `f`.
// Sections are opened and closed in this nesting:
//   token_request
//     auth
//       passwordCredentials { username, password }
//       tenantName
// All values are read from the rgw_keystone_barbican_* options of `cct`.
// NOTE(review): how the outermost section name is rendered depends on the
// Formatter implementation - confirm against JSONFormatter.
void rgw::keystone::BarbicanTokenRequestVer2::dump(Formatter* const f) const
{
  f->open_object_section("token_request");
    f->open_object_section("auth");
      f->open_object_section("passwordCredentials");
        encode_json("username", cct->_conf->rgw_keystone_barbican_user, f);
        encode_json("password", cct->_conf->rgw_keystone_barbican_password, f);
      f->close_section(); // passwordCredentials
      encode_json("tenantName", cct->_conf->rgw_keystone_barbican_tenant, f);
    f->close_section(); // auth
  f->close_section(); // token_request
}
// Abstract source of the Keystone related settings: the endpoint, the API
// version and the admin credentials. All accessors are pure virtual and
// noexcept.
class Config {
protected:
  // construction and destruction are reserved for derived classes
  Config() = default;
  virtual ~Config() = default;

public:
  // returned by value
  virtual std::string get_endpoint_url() const noexcept = 0;
  virtual ApiVersion get_api_version() const noexcept = 0;

  // the secrets (token, password) are returned by value, the remaining
  // admin settings as non-owning string views
  virtual std::string get_admin_token() const noexcept = 0;
  virtual std::string_view get_admin_user() const noexcept = 0;
  virtual std::string get_admin_password() const noexcept = 0;
  virtual std::string_view get_admin_tenant() const noexcept = 0;
  virtual std::string_view get_admin_project() const noexcept = 0;
  virtual std::string_view get_admin_domain() const noexcept = 0;
};
std::string_view get_admin_user() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_user; + } + + std::string get_admin_password() const noexcept override; + + std::string_view get_admin_tenant() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_tenant; + } + + std::string_view get_admin_project() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_project; + } + + std::string_view get_admin_domain() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_domain; + } +}; + + +class TokenEnvelope; +class TokenCache; + +class Service { +public: + class RGWKeystoneHTTPTransceiver : public RGWHTTPTransceiver { + public: + RGWKeystoneHTTPTransceiver(CephContext * const cct, + const std::string& method, + const std::string& url, + bufferlist * const token_body_bl) + : RGWHTTPTransceiver(cct, method, url, token_body_bl, + cct->_conf->rgw_keystone_verify_ssl, + { "X-Subject-Token" }) { + } + + const header_value_t& get_subject_token() const { + try { + return get_header_value("X-Subject-Token"); + } catch (std::out_of_range&) { + static header_value_t empty_val; + return empty_val; + } + } + }; + + typedef RGWKeystoneHTTPTransceiver RGWValidateKeystoneToken; + typedef RGWKeystoneHTTPTransceiver RGWGetKeystoneAdminToken; + + static int get_admin_token(const DoutPrefixProvider *dpp, + CephContext* const cct, + TokenCache& token_cache, + const Config& config, + std::string& token); + static int issue_admin_token_request(const DoutPrefixProvider *dpp, + CephContext* const cct, + const Config& config, + TokenEnvelope& token); + static int get_keystone_barbican_token(const DoutPrefixProvider *dpp, + CephContext * const cct, + std::string& token); +}; + + +class TokenEnvelope { +public: + class Domain { + public: + std::string id; + std::string name; + void decode_json(JSONObj *obj); + }; + class Project { + public: + Domain domain; + std::string id; + std::string name; + void 
decode_json(JSONObj *obj); + }; + + class Token { + public: + Token() : expires(0) { } + std::string id; + time_t expires; + Project tenant_v2; + void decode_json(JSONObj *obj); + }; + + class Role { + public: + std::string id; + std::string name; + void decode_json(JSONObj *obj); + }; + + class User { + public: + std::string id; + std::string name; + Domain domain; + std::list roles_v2; + void decode_json(JSONObj *obj); + }; + + Token token; + Project project; + User user; + std::list roles; + + void decode_v3(JSONObj* obj); + void decode_v2(JSONObj* obj); + +public: + /* We really need the default ctor because of the internals of TokenCache. */ + TokenEnvelope() = default; + + void set_expires(time_t expires) { token.expires = expires; } + time_t get_expires() const { return token.expires; } + const std::string& get_domain_id() const {return project.domain.id;}; + const std::string& get_domain_name() const {return project.domain.name;}; + const std::string& get_project_id() const {return project.id;}; + const std::string& get_project_name() const {return project.name;}; + const std::string& get_user_id() const {return user.id;}; + const std::string& get_user_name() const {return user.name;}; + bool has_role(const std::string& r) const; + bool expired() const { + const uint64_t now = ceph_clock_now().sec(); + return std::cmp_greater_equal(now, get_expires()); + } + int parse(const DoutPrefixProvider *dpp, CephContext* cct, + const std::string& token_str, + ceph::buffer::list& bl /* in */, + ApiVersion version); +}; + + +class TokenCache { + struct token_entry { + TokenEnvelope token; + std::list::iterator lru_iter; + }; + + std::atomic down_flag = { false }; + const boost::intrusive_ptr cct; + + std::string admin_token_id; + std::string barbican_token_id; + std::map tokens; + std::map service_tokens; + std::list tokens_lru; + std::list service_tokens_lru; + + ceph::mutex lock = ceph::make_mutex("rgw::keystone::TokenCache"); + + const size_t max; + + explicit 
TokenCache(const rgw::keystone::Config& config) + : cct(g_ceph_context), + max(cct->_conf->rgw_keystone_token_cache_size) { + } + + ~TokenCache() { + down_flag = true; + } + +public: + TokenCache(const TokenCache&) = delete; + void operator=(const TokenCache&) = delete; + + template + static TokenCache& get_instance() { + static_assert(std::is_base_of::value, + "ConfigT must be a subclass of rgw::keystone::Config"); + + /* In C++11 this is thread safe. */ + static TokenCache instance(ConfigT::get_instance()); + return instance; + } + + bool find(const std::string& token_id, TokenEnvelope& token); + bool find_service(const std::string& token_id, TokenEnvelope& token); + boost::optional find(const std::string& token_id) { + TokenEnvelope token_envlp; + if (find(token_id, token_envlp)) { + return token_envlp; + } + return boost::none; + } + boost::optional find_service(const std::string& token_id) { + TokenEnvelope token_envlp; + if (find_service(token_id, token_envlp)) { + return token_envlp; + } + return boost::none; + } + bool find_admin(TokenEnvelope& token); + bool find_barbican(TokenEnvelope& token); + void add(const std::string& token_id, const TokenEnvelope& token); + void add_service(const std::string& token_id, const TokenEnvelope& token); + void add_admin(const TokenEnvelope& token); + void add_barbican(const TokenEnvelope& token); + void invalidate(const DoutPrefixProvider *dpp, const std::string& token_id); + bool going_down() const; +private: + void add_locked(const std::string& token_id, const TokenEnvelope& token, + std::map& tokens, std::list& tokens_lru); + bool find_locked(const std::string& token_id, TokenEnvelope& token, + std::map& tokens, std::list& tokens_lru); +}; + + +class AdminTokenRequest { +public: + virtual ~AdminTokenRequest() = default; + virtual void dump(Formatter* f) const = 0; +}; + +class AdminTokenRequestVer2 : public AdminTokenRequest { + const Config& conf; + +public: + explicit AdminTokenRequestVer2(const Config& conf) + : 
conf(conf) { + } + void dump(Formatter *f) const override; +}; + +class AdminTokenRequestVer3 : public AdminTokenRequest { + const Config& conf; + +public: + explicit AdminTokenRequestVer3(const Config& conf) + : conf(conf) { + } + void dump(Formatter *f) const override; +}; + +class BarbicanTokenRequestVer2 : public AdminTokenRequest { + CephContext *cct; + +public: + explicit BarbicanTokenRequestVer2(CephContext * const _cct) + : cct(_cct) { + } + void dump(Formatter *f) const override; +}; + +class BarbicanTokenRequestVer3 : public AdminTokenRequest { + CephContext *cct; + +public: + explicit BarbicanTokenRequestVer3(CephContext * const _cct) + : cct(_cct) { + } + void dump(Formatter *f) const override; +}; + + +}; /* namespace keystone */ +}; /* namespace rgw */ diff --git a/src/rgw/rgw_kmip_client.cc b/src/rgw/rgw_kmip_client.cc new file mode 100644 index 000000000..e801972ea --- /dev/null +++ b/src/rgw/rgw_kmip_client.cc @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/Thread.h" +#include "include/compat.h" +#include "common/errno.h" +#include "rgw_common.h" +#include "rgw_kmip_client.h" + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +RGWKMIPManager *rgw_kmip_manager; + +int +RGWKMIPTransceiver::wait(optional_yield y) +{ + if (done) + return ret; + std::unique_lock l{lock}; + if (!done) + cond.wait(l); + if (ret) { + lderr(cct) << "kmip process failed, " << ret << dendl; + } + return ret; +} + +int +RGWKMIPTransceiver::send() +{ + int r = rgw_kmip_manager->add_request(this); + if (r < 0) { + lderr(cct) << "kmip send failed, " << r << dendl; + } + return r; +} + +int +RGWKMIPTransceiver::process(optional_yield y) +{ + int r = send(); + if (r < 0) + return r; + return wait(y); +} + +RGWKMIPTransceiver::~RGWKMIPTransceiver() +{ + int i; + if (out) + free(out); + out = nullptr; + if (outlist->strings) { + for (i = 0; i < 
outlist->string_count; ++i) { + free(outlist->strings[i]); + } + free(outlist->strings); + outlist->strings = 0; + } + if (outkey->data) { + ::ceph::crypto::zeroize_for_security(outkey->data, outkey->keylen); + free(outkey->data); + outkey->data = 0; + } +} + +void +rgw_kmip_client_init(RGWKMIPManager &m) +{ + rgw_kmip_manager = &m; + rgw_kmip_manager->start(); +} + +void +rgw_kmip_client_cleanup() +{ + rgw_kmip_manager->stop(); + delete rgw_kmip_manager; +} diff --git a/src/rgw/rgw_kmip_client.h b/src/rgw/rgw_kmip_client.h new file mode 100644 index 000000000..299292113 --- /dev/null +++ b/src/rgw/rgw_kmip_client.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +class RGWKMIPManager; + +class RGWKMIPTransceiver { +public: + enum kmip_operation { + CREATE, + LOCATE, + GET, + GET_ATTRIBUTES, + GET_ATTRIBUTE_LIST, + DESTROY + }; + CephContext *cct; + kmip_operation operation; + char *name = 0; + char *unique_id = 0; + // output - must free + char *out = 0; // unique_id, several + struct { // unique_ids, locate + char **strings; + int string_count; + } outlist[1] = {{0, 0}}; + struct { // key, get + unsigned char *data; + int keylen; + } outkey[1] = {0, 0}; + // end must free + int ret; + bool done; + ceph::mutex lock = ceph::make_mutex("rgw_kmip_req::lock"); + ceph::condition_variable cond; + + int wait(optional_yield y); + RGWKMIPTransceiver(CephContext * const cct, + kmip_operation operation) + : cct(cct), + operation(operation), + ret(-EDOM), + done(false) + {} + ~RGWKMIPTransceiver(); + + int send(); + int process(optional_yield y); +}; + +class RGWKMIPManager { +protected: + CephContext *cct; + bool is_started = false; + RGWKMIPManager(CephContext *cct) : cct(cct) {}; +public: + virtual ~RGWKMIPManager() { }; + virtual int start() = 0; + virtual void stop() = 0; + virtual int add_request(RGWKMIPTransceiver*) = 0; +}; + +void rgw_kmip_client_init(RGWKMIPManager &); 
+void rgw_kmip_client_cleanup(); diff --git a/src/rgw/rgw_kmip_client_impl.cc b/src/rgw/rgw_kmip_client_impl.cc new file mode 100644 index 000000000..0824273e6 --- /dev/null +++ b/src/rgw/rgw_kmip_client_impl.cc @@ -0,0 +1,728 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include "include/compat.h" +#include "common/errno.h" +#include "rgw_common.h" +#include "rgw_kmip_client.h" +#include "rgw_kmip_client_impl.h" + +#include +#include +extern "C" { +#include "kmip.h" +#include "kmip_bio.h" +#include "kmip_memset.h" +}; + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +static enum kmip_version protocol_version = KMIP_1_0; + +struct RGWKmipHandle { + int uses; + mono_time lastuse; + SSL_CTX *ctx; + SSL *ssl; + BIO *bio; + KMIP kmip_ctx[1]; + TextString textstrings[2]; + UsernamePasswordCredential upc[1]; + Credential credential[1]; + int need_to_free_kmip; + size_t buffer_blocks, buffer_block_size, buffer_total_size; + uint8 *encoding; + + explicit RGWKmipHandle() : + uses(0), ctx(0), ssl(0), bio(0), + need_to_free_kmip(0), + encoding(0) { + memset(kmip_ctx, 0, sizeof kmip_ctx); + memset(textstrings, 0, sizeof textstrings); + memset(upc, 0, sizeof upc); + memset(credential, 0, sizeof credential); + }; +}; + +struct RGWKmipWorker: public Thread { + RGWKMIPManagerImpl &m; + RGWKmipWorker(RGWKMIPManagerImpl& m) : m(m) {} + void *entry() override; + void signal() { + std::lock_guard l{m.lock}; + m.cond.notify_all(); + } +}; + +static void +kmip_free_handle_stuff(RGWKmipHandle *kmip) +{ + if (kmip->encoding) { + kmip_free_buffer(kmip->kmip_ctx, + kmip->encoding, + kmip->buffer_total_size); + kmip_set_buffer(kmip->kmip_ctx, NULL, 0); + } + if (kmip->need_to_free_kmip) + kmip_destroy(kmip->kmip_ctx); + if (kmip->bio) + BIO_free_all(kmip->bio); + if (kmip->ctx) + SSL_CTX_free(kmip->ctx); +} + +class RGWKmipHandleBuilder { +private: + 
CephContext *cct; + const char *clientcert = 0; + const char *clientkey = 0; + const char *capath = 0; + const char *host = 0; + const char *portstring = 0; + const char *username = 0; + const char *password = 0; +public: + RGWKmipHandleBuilder(CephContext *cct) : cct(cct) {}; + RGWKmipHandleBuilder& set_clientcert(const std::string &v) { + const char *s = v.c_str(); + if (*s) { + clientcert = s; + } + return *this; + } + RGWKmipHandleBuilder& set_clientkey(const std::string &v) { + const char *s = v.c_str(); + if (*s) { + clientkey = s; + } + return *this; + } + RGWKmipHandleBuilder& set_capath(const std::string &v) { + const char *s = v.c_str(); + if (*s) { + capath = s; + } + return *this; + } + RGWKmipHandleBuilder& set_host(const char *v) { + host = v; + return *this; + } + RGWKmipHandleBuilder& set_portstring(const char *v) { + portstring = v; + return *this; + } + RGWKmipHandleBuilder& set_username(const std::string &v) { + const char *s = v.c_str(); + if (*s) { + username = s; + } + return *this; + } + RGWKmipHandleBuilder& set_password(const std::string& v) { + const char *s = v.c_str(); + if (*s) { + password = s; + } + return *this; + } + RGWKmipHandle *build() const; +}; + +static int +kmip_write_an_error_helper(const char *s, size_t l, void *u) { + CephContext *cct = (CephContext *)u; + std::string_view es(s, l); + lderr(cct) << es << dendl; + return l; +} + +void +ERR_print_errors_ceph(CephContext *cct) +{ + ERR_print_errors_cb(kmip_write_an_error_helper, cct); +} + +RGWKmipHandle * +RGWKmipHandleBuilder::build() const +{ + int failed = 1; + RGWKmipHandle *r = new RGWKmipHandle(); + TextString *up = 0; + size_t ns; + + r->ctx = SSL_CTX_new(TLS_client_method()); + + if (!clientcert) + ; + else if (SSL_CTX_use_certificate_file(r->ctx, clientcert, SSL_FILETYPE_PEM) != 1) { + lderr(cct) << "ERROR: can't load client cert from " + << clientcert << dendl; + ERR_print_errors_ceph(cct); + goto Done; + } + + if (!clientkey) + ; + else if 
(SSL_CTX_use_PrivateKey_file(r->ctx, clientkey, + SSL_FILETYPE_PEM) != 1) { + lderr(cct) << "ERROR: can't load client key from " + << clientkey << dendl; + ERR_print_errors_ceph(cct); + goto Done; + } + + if (!capath) + ; + else if (SSL_CTX_load_verify_locations(r->ctx, capath, NULL) != 1) { + lderr(cct) << "ERROR: can't load cacert from " + << capath << dendl; + ERR_print_errors_ceph(cct); + goto Done; + } + r->bio = BIO_new_ssl_connect(r->ctx); + if (!r->bio) { + lderr(cct) << "BIO_new_ssl_connect failed" << dendl; + goto Done; + } + BIO_get_ssl(r->bio, &r->ssl); + SSL_set_mode(r->ssl, SSL_MODE_AUTO_RETRY); + + BIO_set_conn_hostname(r->bio, host); + BIO_set_conn_port(r->bio, portstring); + if (BIO_do_connect(r->bio) != 1) { + lderr(cct) << "BIO_do_connect failed to " << host + << ":" << portstring << dendl; + ERR_print_errors_ceph(cct); + goto Done; + } + + // setup kmip + + kmip_init(r->kmip_ctx, NULL, 0, protocol_version); + r->need_to_free_kmip = 1; + r->buffer_blocks = 1; + r->buffer_block_size = 1024; + r->encoding = static_cast(r->kmip_ctx->calloc_func( + r->kmip_ctx->state, r->buffer_blocks, r->buffer_block_size)); + if (!r->encoding) { + lderr(cct) << "kmip buffer alloc failed: " + << r->buffer_blocks << + " * " << r->buffer_block_size << dendl; + goto Done; + } + ns = r->buffer_blocks * r->buffer_block_size; + kmip_set_buffer(r->kmip_ctx, r->encoding, ns); + r->buffer_total_size = ns; + + up = r->textstrings; + if (username) { + memset(r->upc, 0, sizeof *r->upc); + up->value = (char *) username; + up->size = strlen(username); + r->upc->username = up++; + if (password) { + up->value = (char *) password; + up->size = strlen(password); + r->upc->password = up++; + } + r->credential->credential_type = KMIP_CRED_USERNAME_AND_PASSWORD; + r->credential->credential_value = r->upc; + int i = kmip_add_credential(r->kmip_ctx, r->credential); + if (i != KMIP_OK) { + fprintf(stderr,"failed to add credential to kmip\n"); + goto Done; + } + } + + failed = 0; +Done: + 
if (!failed) + ; + else if (!r) + ; + else { + kmip_free_handle_stuff(r); + delete r; + r = 0; + } + return r; +} + +struct RGWKmipHandles : public Thread { + CephContext *cct; + ceph::mutex cleaner_lock = ceph::make_mutex("RGWKmipHandles::cleaner_lock"); + std::vector saved_kmip; + int cleaner_shutdown; + bool cleaner_active = false; + ceph::condition_variable cleaner_cond; + RGWKmipHandles(CephContext *cct) : + cct(cct), cleaner_shutdown{0} { + } + RGWKmipHandle* get_kmip_handle(); + void release_kmip_handle_now(RGWKmipHandle* kmip); + void release_kmip_handle(RGWKmipHandle* kmip); + void flush_kmip_handles(); + int do_one_entry(RGWKMIPTransceiver &element); + void* entry(); + void start(); + void stop(); +}; + +RGWKmipHandle* +RGWKmipHandles::get_kmip_handle() +{ + RGWKmipHandle* kmip = 0; + const char *hostaddr = cct->_conf->rgw_crypt_kmip_addr.c_str(); + { + std::lock_guard lock{cleaner_lock}; + if (!saved_kmip.empty()) { + kmip = *saved_kmip.begin(); + saved_kmip.erase(saved_kmip.begin()); + } + } + if (!kmip && hostaddr) { + char *hosttemp = strdup(hostaddr); + char *port = strchr(hosttemp, ':'); + if (port) + *port++ = 0; + kmip = RGWKmipHandleBuilder{cct} + .set_clientcert(cct->_conf->rgw_crypt_kmip_client_cert) + .set_clientkey(cct->_conf->rgw_crypt_kmip_client_key) + .set_capath(cct->_conf->rgw_crypt_kmip_ca_path) + .set_host(hosttemp) + .set_portstring(port ? 
port : "5696") + .set_username(cct->_conf->rgw_crypt_kmip_username) + .set_password(cct->_conf->rgw_crypt_kmip_password) + .build(); + free(hosttemp); + } + return kmip; +} + +void +RGWKmipHandles::release_kmip_handle_now(RGWKmipHandle* kmip) +{ + kmip_free_handle_stuff(kmip); + delete kmip; +} + +#define MAXIDLE 5 +void +RGWKmipHandles::release_kmip_handle(RGWKmipHandle* kmip) +{ + if (cleaner_shutdown) { + release_kmip_handle_now(kmip); + } else { + std::lock_guard lock{cleaner_lock}; + kmip->lastuse = mono_clock::now(); + saved_kmip.insert(saved_kmip.begin(), 1, kmip); + } +} + +void* +RGWKmipHandles::entry() +{ + RGWKmipHandle* kmip; + std::unique_lock lock{cleaner_lock}; + + for (;;) { + if (cleaner_shutdown) { + if (saved_kmip.empty()) + break; + } else { + cleaner_cond.wait_for(lock, std::chrono::seconds(MAXIDLE)); + } + mono_time now = mono_clock::now(); + while (!saved_kmip.empty()) { + auto cend = saved_kmip.end(); + --cend; + kmip = *cend; + if (!cleaner_shutdown && now - kmip->lastuse + < std::chrono::seconds(MAXIDLE)) + break; + saved_kmip.erase(cend); + release_kmip_handle_now(kmip); + } + } + return nullptr; +} + +void +RGWKmipHandles::start() +{ + std::lock_guard lock{cleaner_lock}; + if (!cleaner_active) { + cleaner_active = true; + this->create("KMIPcleaner"); // len<16!!! 
+ } +} + +void +RGWKmipHandles::stop() +{ + std::unique_lock lock{cleaner_lock}; + cleaner_shutdown = 1; + cleaner_cond.notify_all(); + if (cleaner_active) { + lock.unlock(); + this->join(); + cleaner_active = false; + } +} + +void +RGWKmipHandles::flush_kmip_handles() +{ + stop(); + join(); + if (!saved_kmip.empty()) { + ldout(cct, 0) << "ERROR: " << __func__ << " failed final cleanup" << dendl; + } + saved_kmip.shrink_to_fit(); +} + +int +RGWKMIPManagerImpl::start() +{ + if (worker) { + lderr(cct) << "kmip worker already started" << dendl; + return -1; + } + worker = new RGWKmipWorker(*this); + worker->create("kmip worker"); + return 0; +} + +void +RGWKMIPManagerImpl::stop() +{ + going_down = true; + if (worker) { + worker->signal(); + worker->join(); + delete worker; + worker = 0; + } +} + +int +RGWKMIPManagerImpl::add_request(RGWKMIPTransceiver *req) +{ + std::unique_lock l{lock}; + if (going_down) + return -ECANCELED; + requests.push_back(*new Request{*req}); + l.unlock(); + if (worker) + worker->signal(); + return 0; +} + +int +RGWKmipHandles::do_one_entry(RGWKMIPTransceiver &element) +{ + auto h = get_kmip_handle(); + std::unique_lock l{element.lock}; + Attribute a[8], *ap; + TextString nvalue[1], uvalue[1]; + Name nattr[1]; + enum cryptographic_algorithm alg = KMIP_CRYPTOALG_AES; + int32 length = 256; + int32 mask = KMIP_CRYPTOMASK_ENCRYPT | KMIP_CRYPTOMASK_DECRYPT; + size_t ns; + ProtocolVersion pv[1]; + RequestHeader rh[1]; + RequestMessage rm[1]; + Authentication auth[1]; + ResponseMessage resp_m[1]; + int i; + union { + CreateRequestPayload create_req[1]; + LocateRequestPayload locate_req[1]; + GetRequestPayload get_req[1]; + GetAttributeListRequestPayload lsattrs_req[1]; + GetAttributesRequestPayload getattrs_req[1]; + } u[1]; + RequestBatchItem rbi[1]; + TemplateAttribute ta[1]; + const char *what = "?"; + int need_to_free_response = 0; + char *response = NULL; + int response_size = 0; + enum result_status rs; + ResponseBatchItem *req; + + if (!h) { + 
element.ret = -ERR_SERVICE_UNAVAILABLE; + return element.ret; + } + memset(a, 0, sizeof *a); + for (i = 0; i < (int)(sizeof a/sizeof *a); ++i) + kmip_init_attribute(a+i); + ap = a; + switch(element.operation) { + case RGWKMIPTransceiver::CREATE: + ap->type = KMIP_ATTR_CRYPTOGRAPHIC_ALGORITHM; + ap->value = &alg; + ++ap; + ap->type = KMIP_ATTR_CRYPTOGRAPHIC_LENGTH; + ap->value = &length; + ++ap; + ap->type = KMIP_ATTR_CRYPTOGRAPHIC_USAGE_MASK; + ap->value = &mask; + ++ap; + break; + default: + break; + } + if (element.name) { + memset(nvalue, 0, sizeof *nvalue); + nvalue->value = element.name; + nvalue->size = strlen(element.name); + memset(nattr, 0, sizeof *nattr); + nattr->value = nvalue; + nattr->type = KMIP_NAME_UNINTERPRETED_TEXT_STRING; + ap->type = KMIP_ATTR_NAME; + ap->value = nattr; + ++ap; + } + if (element.unique_id) { + memset(uvalue, 0, sizeof *uvalue); + uvalue->value = element.unique_id; + uvalue->size = strlen(element.unique_id); + } + memset(pv, 0, sizeof *pv); + memset(rh, 0, sizeof *rh); + memset(rm, 0, sizeof *rm); + memset(auth, 0, sizeof *auth); + memset(resp_m, 0, sizeof *resp_m); + kmip_init_protocol_version(pv, h->kmip_ctx->version); + kmip_init_request_header(rh); + rh->protocol_version = pv; + rh->maximum_response_size = h->kmip_ctx->max_message_size; + rh->time_stamp = time(NULL); + rh->batch_count = 1; + memset(rbi, 0, sizeof *rbi); + kmip_init_request_batch_item(rbi); + memset(u, 0, sizeof *u); + rbi->request_payload = u; + switch(element.operation) { + case RGWKMIPTransceiver::CREATE: + memset(ta, 0, sizeof *ta); + ta->attributes = a; + ta->attribute_count = ap-a; + u->create_req->object_type = KMIP_OBJTYPE_SYMMETRIC_KEY; + u->create_req->template_attribute = ta; + rbi->operation = KMIP_OP_CREATE; + what = "create"; + break; + case RGWKMIPTransceiver::GET: + if (element.unique_id) + u->get_req->unique_identifier = uvalue; + rbi->operation = KMIP_OP_GET; + what = "get"; + break; + case RGWKMIPTransceiver::LOCATE: + if (ap > a) { + 
u->locate_req->attributes = a; + u->locate_req->attribute_count = ap - a; + } + rbi->operation = KMIP_OP_LOCATE; + what = "locate"; + break; + case RGWKMIPTransceiver::GET_ATTRIBUTES: + case RGWKMIPTransceiver::GET_ATTRIBUTE_LIST: + case RGWKMIPTransceiver::DESTROY: + default: + lderr(cct) << "Missing operation logic op=" << element.operation << dendl; + element.ret = -EINVAL; + goto Done; + } + rm->request_header = rh; + rm->batch_items = rbi; + rm->batch_count = 1; + if (h->kmip_ctx->credential_list) { + LinkedListItem *item = h->kmip_ctx->credential_list->head; + if (item) { + auth->credential = (Credential *)item->data; + rh->authentication = auth; + } + } + for (;;) { + i = kmip_encode_request_message(h->kmip_ctx, rm); + if (i != KMIP_ERROR_BUFFER_FULL) break; + h->kmip_ctx->free_func(h->kmip_ctx->state, h->encoding); + h->encoding = 0; + ++h->buffer_blocks; + h->encoding = static_cast(h->kmip_ctx->calloc_func(h->kmip_ctx->state, h->buffer_blocks, h->buffer_block_size)); + if (!h->encoding) { + lderr(cct) << "kmip buffer alloc failed: " + << h->buffer_blocks + << " * " << h->buffer_block_size << dendl; + element.ret = -ENOMEM; + goto Done; + } + ns = h->buffer_blocks * h->buffer_block_size; + kmip_set_buffer(h->kmip_ctx, h->encoding, ns); + h->buffer_total_size = ns; + } + if (i != KMIP_OK) { + lderr(cct) << " Failed to encode " << what + << " request; err=" << i + << " ctx error message " << h->kmip_ctx->error_message + << dendl; + element.ret = -EINVAL; + goto Done; + } + i = kmip_bio_send_request_encoding(h->kmip_ctx, h->bio, + (char*)h->encoding, + h->kmip_ctx->index - h->kmip_ctx->buffer, + &response, &response_size); + if (i < 0) { + lderr(cct) << "Problem sending request to " << what << " " << i << " context error message " << h->kmip_ctx->error_message << dendl; + element.ret = -EINVAL; + goto Done; + } + kmip_free_buffer(h->kmip_ctx, h->encoding, + h->buffer_total_size); + h->encoding = 0; + kmip_set_buffer(h->kmip_ctx, response, response_size); + 
need_to_free_response = 1; + i = kmip_decode_response_message(h->kmip_ctx, resp_m); + if (i != KMIP_OK) { + lderr(cct) << "Failed to decode " << what << " " << i << " context error message " << h->kmip_ctx->error_message << dendl; + element.ret = -EINVAL; + goto Done; + } + if (resp_m->batch_count != 1) { + lderr(cct) << "Failed; weird response count doing " << what << " " << resp_m->batch_count << dendl; + element.ret = -EINVAL; + goto Done; + } + req = resp_m->batch_items; + rs = req->result_status; + if (rs != KMIP_STATUS_SUCCESS) { + lderr(cct) << "Failed; result status not success " << rs << dendl; + element.ret = -EINVAL; + goto Done; + } + if (req->operation != rbi->operation) { + lderr(cct) << "Failed; response operation mismatch, got " << req->operation << " expected " << rbi->operation << dendl; + element.ret = -EINVAL; + goto Done; + } + switch(req->operation) + { + case KMIP_OP_CREATE: { + CreateResponsePayload *pld = (CreateResponsePayload *)req->response_payload; + element.out = static_cast(malloc(pld->unique_identifier->size+1)); + memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size); + element.out[pld->unique_identifier->size] = 0; + } break; + case KMIP_OP_LOCATE: { + LocateResponsePayload *pld = (LocateResponsePayload *)req->response_payload; + char **list = static_cast(malloc(sizeof (char*) * (1 + pld->unique_identifiers_count))); + for (i = 0; i < pld->unique_identifiers_count; ++i) { + list[i] = static_cast(malloc(pld->unique_identifiers[i].size+1)); + memcpy(list[i], pld->unique_identifiers[i].value, pld->unique_identifiers[i].size); + list[i][pld->unique_identifiers[i].size] = 0; + } + list[i] = 0; + element.outlist->strings = list; + element.outlist->string_count = pld->unique_identifiers_count; + } break; + case KMIP_OP_GET: { + GetResponsePayload *pld = (GetResponsePayload *)req->response_payload; + element.out = static_cast(malloc(pld->unique_identifier->size+1)); + memcpy(element.out, 
pld->unique_identifier->value, pld->unique_identifier->size); + element.out[pld->unique_identifier->size] = 0; + if (pld->object_type != KMIP_OBJTYPE_SYMMETRIC_KEY) { + lderr(cct) << "get: expected symmetric key got " << pld->object_type << dendl; + element.ret = -EINVAL; + goto Done; + } + KeyBlock *kp = static_cast(pld->object)->key_block; + ByteString *bp; + if (kp->key_format_type != KMIP_KEYFORMAT_RAW) { + lderr(cct) << "get: expected raw key fromat got " << kp->key_format_type << dendl; + element.ret = -EINVAL; + goto Done; + } + KeyValue *kv = static_cast(kp->key_value); + bp = static_cast(kv->key_material); + element.outkey->data = static_cast(malloc(bp->size)); + element.outkey->keylen = bp->size; + memcpy(element.outkey->data, bp->value, bp->size); + } break; + case KMIP_OP_GET_ATTRIBUTES: { + GetAttributesResponsePayload *pld = (GetAttributesResponsePayload *)req->response_payload; + element.out = static_cast(malloc(pld->unique_identifier->size+1)); + memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size); + element.out[pld->unique_identifier->size] = 0; + } break; + case KMIP_OP_GET_ATTRIBUTE_LIST: { + GetAttributeListResponsePayload *pld = (GetAttributeListResponsePayload *)req->response_payload; + element.out = static_cast(malloc(pld->unique_identifier->size+1)); + memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size); + element.out[pld->unique_identifier->size] = 0; + } break; + case KMIP_OP_DESTROY: { + DestroyResponsePayload *pld = (DestroyResponsePayload *)req->response_payload; + element.out = static_cast(malloc(pld->unique_identifier->size+1)); + memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size); + element.out[pld->unique_identifier->size] = 0; + } break; + default: + lderr(cct) << "Missing response logic op=" << element.operation << dendl; + element.ret = -EINVAL; + goto Done; + } + element.ret = 0; +Done: + if (need_to_free_response) + 
kmip_free_response_message(h->kmip_ctx, resp_m); + element.done = true; + element.cond.notify_all(); + release_kmip_handle(h); + return element.ret; +} + +void * +RGWKmipWorker::entry() +{ + std::unique_lock entry_lock{m.lock}; + ldout(m.cct, 10) << __func__ << " start" << dendl; + RGWKmipHandles handles{m.cct}; + handles.start(); + while (!m.going_down) { + if (m.requests.empty()) { + m.cond.wait_for(entry_lock, std::chrono::seconds(MAXIDLE)); + continue; + } + auto iter = m.requests.begin(); + auto element = *iter; + m.requests.erase(iter); + entry_lock.unlock(); + (void) handles.do_one_entry(element.details); + entry_lock.lock(); + } + for (;;) { + if (m.requests.empty()) break; + auto iter = m.requests.begin(); + auto element = std::move(*iter); + m.requests.erase(iter); + element.details.ret = -666; + element.details.done = true; + element.details.cond.notify_all(); + } + handles.stop(); + ldout(m.cct, 10) << __func__ << " finish" << dendl; + return nullptr; +} diff --git a/src/rgw/rgw_kmip_client_impl.h b/src/rgw/rgw_kmip_client_impl.h new file mode 100644 index 000000000..d36903a4b --- /dev/null +++ b/src/rgw/rgw_kmip_client_impl.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +struct RGWKmipWorker; +class RGWKMIPManagerImpl: public RGWKMIPManager { +protected: + ceph::mutex lock = ceph::make_mutex("RGWKMIPManager"); + ceph::condition_variable cond; + + struct Request : boost::intrusive::list_base_hook<> { + boost::intrusive::list_member_hook<> req_hook; + RGWKMIPTransceiver &details; + Request(RGWKMIPTransceiver &details) : details(details) {} + }; + boost::intrusive::list, &Request::req_hook>> requests; + bool going_down = false; + RGWKmipWorker *worker = 0; +public: + RGWKMIPManagerImpl(CephContext *cct) : RGWKMIPManager(cct) {}; + int add_request(RGWKMIPTransceiver *); + int start(); + void stop(); + friend RGWKmipWorker; +}; diff --git a/src/rgw/rgw_kms.cc 
b/src/rgw/rgw_kms.cc new file mode 100644 index 000000000..936580276 --- /dev/null +++ b/src/rgw/rgw_kms.cc @@ -0,0 +1,1279 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/** + * Server-side encryption integrations with Key Management Systems (SSE-KMS) + */ + +#include +#include "include/str_map.h" +#include "common/safe_io.h" +#include "rgw/rgw_crypt.h" +#include "rgw/rgw_keystone.h" +#include "rgw/rgw_b64.h" +#include "rgw/rgw_kms.h" +#include "rgw/rgw_kmip_client.h" +#include +#include +#include +#include "rapidjson/error/error.h" +#include "rapidjson/error/en.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw; + +#ifndef FORTEST_VIRTUAL +#define FORTEST_VIRTUAL /**/ +#endif + +/** + * Memory pool for use with rapidjson. This version + * carefully zeros out all memory before returning it to + * the system. + */ +#define ALIGNTYPE double +#define MINCHUNKSIZE 4096 +class ZeroPoolAllocator { +private: + struct element { + struct element *next; + int size; + char data[4]; + } *b; + size_t left; +public: + static const bool kNeedFree { false }; + ZeroPoolAllocator(){ + b = 0; + left = 0; + } + ~ZeroPoolAllocator(){ + element *p; + while ((p = b)) { + b = p->next; + memset(p->data, 0, p->size); + free(p); + } + } + void * Malloc(size_t size) { + void *r; + if (!size) return 0; + size = (size + sizeof(ALIGNTYPE)-1)&(-sizeof(ALIGNTYPE)); + if (size > left) { + size_t ns { size }; + if (ns < MINCHUNKSIZE) ns = MINCHUNKSIZE; + element *nw { (element *) malloc(sizeof *b + ns) }; + if (!nw) { +// std::cerr << "out of memory" << std::endl; + return 0; + } + left = ns - sizeof *b; + nw->size = ns; + nw->next = b; + b = nw; + } + left -= size; + r = static_cast(b->data + left); + return r; + } + void* Realloc(void* p, size_t old, size_t nw) { + void *r = nullptr; + if (nw) r = malloc(nw); + if (nw > old) nw = old; + if (r && old) memcpy(r, p, 
nw); + return r; + } + static void Free(void *p) { + ceph_assert(0 == "Free should not be called"); + } +private: + //! Copy constructor is not permitted. + ZeroPoolAllocator(const ZeroPoolAllocator& rhs) /* = delete */; + //! Copy assignment operator is not permitted. + ZeroPoolAllocator& operator=(const ZeroPoolAllocator& rhs) /* = delete */; +}; + +typedef rapidjson::GenericDocument, + ZeroPoolAllocator, + rapidjson::CrtAllocator + > ZeroPoolDocument; +typedef rapidjson::GenericValue, ZeroPoolAllocator> ZeroPoolValue; + +/** + * Construct a full URL string by concatenating a "base" URL with another path, + * ensuring there is one and only one forward slash between them. If path is + * empty, the URL is not changed. + */ +static void concat_url(std::string &url, std::string path) { + bool url_has_slash = !url.empty() && url.back() == '/'; + if (!path.empty()) { + if (url_has_slash && path.front() == '/') { + url.pop_back(); + } else if (!url_has_slash && path.front() != '/') { + url.push_back('/'); + } + url.append(path); + } +} + +/** + * Determine if a string (url) ends with a given suffix. + * Must deal with (ignore) trailing slashes. 
+ */ +static bool string_ends_maybe_slash(std::string_view hay, + std::string_view needle) +{ + auto hay_len { hay.size() }; + auto needle_len { needle.size() }; + if (hay_len < needle_len) return false; + auto hay_suffix_start { hay.data() + (hay_len - needle_len) }; + while (hay_len > needle_len && hay[hay_len-1] == '/') { + --hay_len; + --hay_suffix_start; + } + std::string_view hay_suffix { hay_suffix_start, needle_len }; + return hay_suffix == needle; +} + +template +static inline void +add_name_val_to_obj(std::string &n, std::string &v, rapidjson::GenericValue &d, + A &allocator) +{ + rapidjson::GenericValue name, val; + name.SetString(n.c_str(), n.length(), allocator); + val.SetString(v.c_str(), v.length(), allocator); + d.AddMember(name, val, allocator); +} + +template +static inline void +add_name_val_to_obj(std::string &n, bool v, rapidjson::GenericValue &d, + A &allocator) +{ + rapidjson::GenericValue name, val; + name.SetString(n.c_str(), n.length(), allocator); + val.SetBool(v); + d.AddMember(name, val, allocator); +} + +template +static inline void +add_name_val_to_obj(const char *n, std::string &v, rapidjson::GenericValue &d, + A &allocator) +{ + std::string ns{n, strlen(n) }; + add_name_val_to_obj(ns, v, d, allocator); +} + +template +static inline void +add_name_val_to_obj(const char *n, bool v, rapidjson::GenericValue &d, + A &allocator) +{ + std::string ns{n, strlen(n) }; + add_name_val_to_obj(ns, v, d, allocator); +} + +typedef std::map EngineParmMap; + + +class SSEContext { +protected: + virtual ~SSEContext(){}; +public: + virtual const std::string & backend() = 0; + virtual const std::string & addr() = 0; + virtual const std::string & auth() = 0; + virtual const std::string & k_namespace() = 0; + virtual const std::string & prefix() = 0; + virtual const std::string & secret_engine() = 0; + virtual const std::string & ssl_cacert() = 0; + virtual const std::string & ssl_clientcert() = 0; + virtual const std::string & ssl_clientkey() = 0; + 
virtual const std::string & token_file() = 0; + virtual const bool verify_ssl() = 0; +}; + +class VaultSecretEngine: public SecretEngine { + +protected: + CephContext *cct; + SSEContext & kctx; + + int load_token_from_file(const DoutPrefixProvider *dpp, std::string *vault_token) + { + + int res = 0; + std::string token_file = kctx.token_file(); + if (token_file.empty()) { + ldpp_dout(dpp, 0) << "ERROR: Vault token file not set in rgw_crypt_vault_token_file" << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 20) << "Vault token file: " << token_file << dendl; + + struct stat token_st; + if (stat(token_file.c_str(), &token_st) != 0) { + ldpp_dout(dpp, 0) << "ERROR: Vault token file '" << token_file << "' not found " << dendl; + return -ENOENT; + } + + if (token_st.st_mode & (S_IRWXG | S_IRWXO)) { + ldpp_dout(dpp, 0) << "ERROR: Vault token file '" << token_file << "' permissions are " + << "too open, it must not be accessible by other users" << dendl; + return -EACCES; + } + + char buf[2048]; + res = safe_read_file("", token_file.c_str(), buf, sizeof(buf)); + if (res < 0) { + if (-EACCES == res) { + ldpp_dout(dpp, 0) << "ERROR: Permission denied reading Vault token file" << dendl; + } else { + ldpp_dout(dpp, 0) << "ERROR: Failed to read Vault token file with error " << res << dendl; + } + return res; + } + // drop trailing newlines + while (res && isspace(buf[res-1])) { + --res; + } + vault_token->assign(std::string{buf, static_cast(res)}); + memset(buf, 0, sizeof(buf)); + ::ceph::crypto::zeroize_for_security(buf, sizeof(buf)); + return res; + } + + FORTEST_VIRTUAL + int send_request(const DoutPrefixProvider *dpp, const char *method, std::string_view infix, + std::string_view key_id, + const std::string& postdata, + bufferlist &secret_bl) + { + int res; + string vault_token = ""; + if (RGW_SSE_KMS_VAULT_AUTH_TOKEN == kctx.auth()){ + ldpp_dout(dpp, 0) << "Loading Vault Token from filesystem" << dendl; + res = load_token_from_file(dpp, &vault_token); + if (res < 0){ + 
return res; + } + } + + std::string secret_url = kctx.addr(); + if (secret_url.empty()) { + ldpp_dout(dpp, 0) << "ERROR: Vault address not set in rgw_crypt_vault_addr" << dendl; + return -EINVAL; + } + + concat_url(secret_url, kctx.prefix()); + concat_url(secret_url, std::string(infix)); + concat_url(secret_url, std::string(key_id)); + + RGWHTTPTransceiver secret_req(cct, method, secret_url, &secret_bl); + + if (postdata.length()) { + secret_req.set_post_data(postdata); + secret_req.set_send_length(postdata.length()); + } + + secret_req.append_header("X-Vault-Token", vault_token); + if (!vault_token.empty()){ + secret_req.append_header("X-Vault-Token", vault_token); + vault_token.replace(0, vault_token.length(), vault_token.length(), '\000'); + } + + string vault_namespace = kctx.k_namespace(); + if (!vault_namespace.empty()){ + ldpp_dout(dpp, 20) << "Vault Namespace: " << vault_namespace << dendl; + secret_req.append_header("X-Vault-Namespace", vault_namespace); + } + + secret_req.set_verify_ssl(kctx.verify_ssl()); + + if (!kctx.ssl_cacert().empty()) { + secret_req.set_ca_path(kctx.ssl_cacert()); + } + + if (!kctx.ssl_clientcert().empty()) { + secret_req.set_client_cert(kctx.ssl_clientcert()); + } + if (!kctx.ssl_clientkey().empty()) { + secret_req.set_client_key(kctx.ssl_clientkey()); + } + + res = secret_req.process(null_yield); + + // map 401 to EACCES instead of EPERM + if (secret_req.get_http_status() == + RGWHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) { + ldpp_dout(dpp, 0) << "ERROR: Vault request failed authorization" << dendl; + return -EACCES; + } + if (res < 0) { + ldpp_dout(dpp, 0) << "ERROR: Request to Vault failed with error " << res << dendl; + return res; + } + + ldpp_dout(dpp, 20) << "Request to Vault returned " << res << " and HTTP status " + << secret_req.get_http_status() << dendl; + + return res; + } + + int send_request(const DoutPrefixProvider *dpp, std::string_view key_id, bufferlist &secret_bl) + { + return send_request(dpp, "GET", "", 
key_id, string{}, secret_bl); + } + + int decode_secret(const DoutPrefixProvider *dpp, std::string encoded, std::string& actual_key){ + try { + actual_key = from_base64(encoded); + } catch (std::exception&) { + ldpp_dout(dpp, 0) << "ERROR: Failed to base64 decode key retrieved from Vault" << dendl; + return -EINVAL; + } + memset(encoded.data(), 0, encoded.length()); + return 0; + } + +public: + + VaultSecretEngine(CephContext *_c, SSEContext & _k) : cct(_c), kctx(_k) { + } +}; + +class TransitSecretEngine: public VaultSecretEngine { +public: + int compat; + static const int COMPAT_NEW_ONLY = 0; + static const int COMPAT_OLD_AND_NEW = 1; + static const int COMPAT_ONLY_OLD = 2; + static const int COMPAT_UNSET = -1; + +private: + EngineParmMap parms; + + int get_key_version(std::string_view key_id, string& version) + { + size_t pos = 0; + + pos = key_id.rfind("/"); + if (pos != std::string_view::npos){ + std::string_view token = key_id.substr(pos+1, key_id.length()-pos); + if (!token.empty() && token.find_first_not_of("0123456789") == std::string_view::npos){ + version.assign(std::string(token)); + return 0; + } + } + return -1; + } + +public: + TransitSecretEngine(CephContext *cct, SSEContext & kctx, EngineParmMap parms): VaultSecretEngine(cct, kctx), parms(parms) { + compat = COMPAT_UNSET; + for (auto& e: parms) { + if (e.first == "compat") { + if (e.second.empty()) { + compat = COMPAT_OLD_AND_NEW; + } else { + size_t ep; + + compat = std::stoi(e.second, &ep); + if (ep != e.second.length()) { + lderr(cct) << "warning: vault transit secrets engine : compat=" + << e.second << " trailing junk? 
(ignored)" << dendl; + } + } + continue; + } + lderr(cct) << "ERROR: vault transit secrets engine : parameter " + << e.first << "=" << e.second << " ignored" << dendl; + } + if (compat == COMPAT_UNSET) { + std::string_view v { kctx.prefix() }; + if (string_ends_maybe_slash(v,"/export/encryption-key")) { + compat = COMPAT_ONLY_OLD; + } else { + compat = COMPAT_NEW_ONLY; + } + } + } + + int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) + { + ZeroPoolDocument d; + ZeroPoolValue *v; + string version; + bufferlist secret_bl; + + if (get_key_version(key_id, version) < 0){ + ldpp_dout(dpp, 20) << "Missing or invalid key version" << dendl; + return -EINVAL; + } + + int res = send_request(dpp, "GET", compat == COMPAT_ONLY_OLD ? "" : "/export/encryption-key", + key_id, string{}, secret_bl); + if (res < 0) { + return res; + } + + ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl; + + secret_bl.append('\0'); + rapidjson::StringStream isw(secret_bl.c_str()); + d.ParseStream<>(isw); + + if (d.HasParseError()) { + ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: " + << rapidjson::GetParseError_En(d.GetParseError()) << dendl; + return -EINVAL; + } + secret_bl.zero(); + + const char *elements[] = {"data", "keys", version.c_str()}; + v = &d; + for (auto &elem: elements) { + if (!v->IsObject()) { + v = nullptr; + break; + } + auto endr { v->MemberEnd() }; + auto itr { v->FindMember(elem) }; + if (itr == endr) { + v = nullptr; + break; + } + v = &itr->value; + } + if (!v || !v->IsString()) { + ldpp_dout(dpp, 0) << "ERROR: Key not found in JSON response from Vault using Transit Engine" << dendl; + return -EINVAL; + } + return decode_secret(dpp, v->GetString(), actual_key); + } + + int make_actual_key(const DoutPrefixProvider *dpp, map& attrs, std::string& actual_key) + { + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + if (compat == COMPAT_ONLY_OLD) return get_key(dpp, key_id, 
actual_key); + if (key_id.find("/") != std::string::npos) { + ldpp_dout(dpp, 0) << "sorry, can't allow / in keyid" << dendl; + return -EINVAL; + } +/* + data: {context } + post to prefix + /datakey/plaintext/ + key_id + jq: .data.plaintext -> key + jq: .data.ciphertext -> (to-be) named attribute + return decode_secret(json_obj, actual_key) +*/ + std::string context = get_str_attribute(attrs, RGW_ATTR_CRYPT_CONTEXT); + ZeroPoolDocument d { rapidjson::kObjectType }; + auto &allocator { d.GetAllocator() }; + bufferlist secret_bl; + + add_name_val_to_obj("context", context, d, allocator); + rapidjson::StringBuffer buf; + rapidjson::Writer writer(buf); + if (!d.Accept(writer)) { + ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl; + return -EINVAL; + } + std::string post_data { buf.GetString() }; + + int res = send_request(dpp, "POST", "/datakey/plaintext/", key_id, + post_data, secret_bl); + if (res < 0) { + return res; + } + + ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl; + + secret_bl.append('\0'); + rapidjson::StringStream isw(secret_bl.c_str()); + d.SetNull(); + d.ParseStream<>(isw); + + if (d.HasParseError()) { + ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: " + << rapidjson::GetParseError_En(d.GetParseError()) << dendl; + return -EINVAL; + } + secret_bl.zero(); + + if (!d.IsObject()) { + ldpp_dout(dpp, 0) << "ERROR: response from Vault is not an object" << dendl; + return -EINVAL; + } + { + auto data_itr { d.FindMember("data") }; + if (data_itr == d.MemberEnd()) { + ldpp_dout(dpp, 0) << "ERROR: no .data in response from Vault" << dendl; + return -EINVAL; + } + auto ciphertext_itr { data_itr->value.FindMember("ciphertext") }; + auto plaintext_itr { data_itr->value.FindMember("plaintext") }; + if (ciphertext_itr == data_itr->value.MemberEnd()) { + ldpp_dout(dpp, 0) << "ERROR: no .data.ciphertext in response from Vault" << dendl; + return -EINVAL; + } + if (plaintext_itr == data_itr->value.MemberEnd()) { + 
ldpp_dout(dpp, 0) << "ERROR: no .data.plaintext in response from Vault" << dendl; + return -EINVAL; + } + auto &ciphertext_v { ciphertext_itr->value }; + auto &plaintext_v { plaintext_itr->value }; + if (!ciphertext_v.IsString()) { + ldpp_dout(dpp, 0) << "ERROR: .data.ciphertext not a string in response from Vault" << dendl; + return -EINVAL; + } + if (!plaintext_v.IsString()) { + ldpp_dout(dpp, 0) << "ERROR: .data.plaintext not a string in response from Vault" << dendl; + return -EINVAL; + } + set_attr(attrs, RGW_ATTR_CRYPT_DATAKEY, ciphertext_v.GetString()); + return decode_secret(dpp, plaintext_v.GetString(), actual_key); + } + } + + int reconstitute_actual_key(const DoutPrefixProvider *dpp, map& attrs, std::string& actual_key) + { + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + std::string wrapped_key = get_str_attribute(attrs, RGW_ATTR_CRYPT_DATAKEY); + if (compat == COMPAT_ONLY_OLD || key_id.rfind("/") != std::string::npos) { + return get_key(dpp, key_id, actual_key); + } +/* + .data.ciphertext <- (to-be) named attribute + data: {context ciphertext} + post to prefix + /decrypt/ + key_id + jq: .data.plaintext + return decode_secret(json_obj, actual_key) +*/ + std::string context = get_str_attribute(attrs, RGW_ATTR_CRYPT_CONTEXT); + ZeroPoolDocument d { rapidjson::kObjectType }; + auto &allocator { d.GetAllocator() }; + bufferlist secret_bl; + + add_name_val_to_obj("context", context, d, allocator); + add_name_val_to_obj("ciphertext", wrapped_key, d, allocator); + rapidjson::StringBuffer buf; + rapidjson::Writer writer(buf); + if (!d.Accept(writer)) { + ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl; + return -EINVAL; + } + std::string post_data { buf.GetString() }; + + int res = send_request(dpp, "POST", "/decrypt/", key_id, + post_data, secret_bl); + if (res < 0) { + return res; + } + + ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl; + + secret_bl.append('\0'); + rapidjson::StringStream 
isw(secret_bl.c_str()); + d.SetNull(); + d.ParseStream<>(isw); + + if (d.HasParseError()) { + ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: " + << rapidjson::GetParseError_En(d.GetParseError()) << dendl; + return -EINVAL; + } + secret_bl.zero(); + + if (!d.IsObject()) { + ldpp_dout(dpp, 0) << "ERROR: response from Vault is not an object" << dendl; + return -EINVAL; + } + { + auto data_itr { d.FindMember("data") }; + if (data_itr == d.MemberEnd()) { + ldpp_dout(dpp, 0) << "ERROR: no .data in response from Vault" << dendl; + return -EINVAL; + } + auto plaintext_itr { data_itr->value.FindMember("plaintext") }; + if (plaintext_itr == data_itr->value.MemberEnd()) { + ldpp_dout(dpp, 0) << "ERROR: no .data.plaintext in response from Vault" << dendl; + return -EINVAL; + } + auto &plaintext_v { plaintext_itr->value }; + if (!plaintext_v.IsString()) { + ldpp_dout(dpp, 0) << "ERROR: .data.plaintext not a string in response from Vault" << dendl; + return -EINVAL; + } + return decode_secret(dpp, plaintext_v.GetString(), actual_key); + } + } + + int create_bucket_key(const DoutPrefixProvider *dpp, const std::string& key_name) + { +/* + .data.ciphertext <- (to-be) named attribute + data: {"type": "chacha20-poly1305", "derived": true} + post to prefix + key_name + empty output. 
+*/ + ZeroPoolDocument d { rapidjson::kObjectType }; + auto &allocator { d.GetAllocator() }; + bufferlist dummy_bl; + std::string chacha20_poly1305 { "chacha20-poly1305" }; + + add_name_val_to_obj("type", chacha20_poly1305, d, allocator); + add_name_val_to_obj("derived", true, d, allocator); + rapidjson::StringBuffer buf; + rapidjson::Writer writer(buf); + if (!d.Accept(writer)) { + ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl; + return -EINVAL; + } + std::string post_data { buf.GetString() }; + + int res = send_request(dpp, "POST", "/keys/", key_name, + post_data, dummy_bl); + if (res < 0) { + return res; + } + if (dummy_bl.length() != 0) { + ldpp_dout(dpp, 0) << "ERROR: unexpected response from Vault making a key: " + << dummy_bl + << dendl; + } + return 0; + } + + int delete_bucket_key(const DoutPrefixProvider *dpp, const std::string& key_name) + { +/* + /keys//config + data: {"deletion_allowed": true} + post to prefix + key_name + empty output. +*/ + ZeroPoolDocument d { rapidjson::kObjectType }; + auto &allocator { d.GetAllocator() }; + bufferlist dummy_bl; + std::ostringstream path_temp; + path_temp << "/keys/"; + path_temp << key_name; + std::string delete_path { path_temp.str() }; + path_temp << "/config"; + std::string config_path { path_temp.str() }; + + add_name_val_to_obj("deletion_allowed", true, d, allocator); + rapidjson::StringBuffer buf; + rapidjson::Writer writer(buf); + if (!d.Accept(writer)) { + ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl; + return -EINVAL; + } + std::string post_data { buf.GetString() }; + + int res = send_request(dpp, "POST", "", config_path, + post_data, dummy_bl); + if (res < 0) { + return res; + } + if (dummy_bl.length() != 0) { + ldpp_dout(dpp, 0) << "ERROR: unexpected response from Vault marking key to delete: " + << dummy_bl + << dendl; + return -EINVAL; + } + + res = send_request(dpp, "DELETE", "", delete_path, + string{}, dummy_bl); + if (res < 0) { + return res; + } + if 
(dummy_bl.length() != 0) { + ldpp_dout(dpp, 0) << "ERROR: unexpected response from Vault deleting key: " + << dummy_bl + << dendl; + return -EINVAL; + } + return 0; + } +}; + +class KvSecretEngine: public VaultSecretEngine { + +public: + + KvSecretEngine(CephContext *cct, SSEContext & kctx, EngineParmMap parms): VaultSecretEngine(cct, kctx){ + if (!parms.empty()) { + lderr(cct) << "ERROR: vault kv secrets engine takes no parameters (ignoring them)" << dendl; + } + } + + virtual ~KvSecretEngine(){} + + int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key){ + ZeroPoolDocument d; + ZeroPoolValue *v; + bufferlist secret_bl; + + int res = send_request(dpp, key_id, secret_bl); + if (res < 0) { + return res; + } + + ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl; + + secret_bl.append('\0'); + rapidjson::StringStream isw(secret_bl.c_str()); + d.ParseStream<>(isw); + + if (d.HasParseError()) { + ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: " + << rapidjson::GetParseError_En(d.GetParseError()) << dendl; + return -EINVAL; + } + secret_bl.zero(); + + static const char *elements[] = {"data", "data", "key"}; + v = &d; + for (auto &elem: elements) { + if (!v->IsObject()) { + v = nullptr; + break; + } + auto endr { v->MemberEnd() }; + auto itr { v->FindMember(elem) }; + if (itr == endr) { + v = nullptr; + break; + } + v = &itr->value; + } + if (!v || !v->IsString()) { + ldpp_dout(dpp, 0) << "ERROR: Key not found in JSON response from Vault using KV Engine" << dendl; + return -EINVAL; + } + return decode_secret(dpp, v->GetString(), actual_key); + } + +}; + +class KmipSecretEngine; +class KmipGetTheKey { +private: + CephContext *cct; + std::string work; + bool failed = false; + int ret; +protected: + KmipGetTheKey(CephContext *cct) : cct(cct) {} + KmipGetTheKey& keyid_to_keyname(std::string_view key_id); + KmipGetTheKey& get_uniqueid_for_keyname(); + int get_key_for_uniqueid(std::string &); + friend 
KmipSecretEngine; +}; + +KmipGetTheKey& +KmipGetTheKey::keyid_to_keyname(std::string_view key_id) +{ + work = cct->_conf->rgw_crypt_kmip_kms_key_template; + std::string keyword = "$keyid"; + std::string replacement = std::string(key_id); + size_t pos = 0; + if (work.length() == 0) { + work = std::move(replacement); + } else { + while (pos < work.length()) { + pos = work.find(keyword, pos); + if (pos == std::string::npos) break; + work.replace(pos, keyword.length(), replacement); + pos += key_id.length(); + } + } + return *this; +} + +KmipGetTheKey& +KmipGetTheKey::get_uniqueid_for_keyname() +{ + RGWKMIPTransceiver secret_req(cct, RGWKMIPTransceiver::LOCATE); + + secret_req.name = work.data(); + ret = secret_req.process(null_yield); + if (ret < 0) { + failed = true; + } else if (!secret_req.outlist->string_count) { + ret = -ENOENT; + lderr(cct) << "error: locate returned no results for " + << secret_req.name << dendl; + failed = true; + } else if (secret_req.outlist->string_count != 1) { + ret = -EINVAL; + lderr(cct) << "error: locate found " + << secret_req.outlist->string_count + << " results for " << secret_req.name << dendl; + failed = true; + } else { + work = std::string(secret_req.outlist->strings[0]); + } + return *this; +} + +int +KmipGetTheKey::get_key_for_uniqueid(std::string& actual_key) +{ + if (failed) return ret; + RGWKMIPTransceiver secret_req(cct, RGWKMIPTransceiver::GET); + secret_req.unique_id = work.data(); + ret = secret_req.process(null_yield); + if (ret < 0) { + failed = true; + } else { + actual_key = std::string((char*)(secret_req.outkey->data), + secret_req.outkey->keylen); + } + return ret; +} + +class KmipSecretEngine: public SecretEngine { + +protected: + CephContext *cct; + +public: + + KmipSecretEngine(CephContext *cct) { + this->cct = cct; + } + + int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) + { + int r; + r = KmipGetTheKey{cct} + .keyid_to_keyname(key_id) + .get_uniqueid_for_keyname() + 
.get_key_for_uniqueid(actual_key); + return r; + } +}; + +static int get_actual_key_from_conf(const DoutPrefixProvider* dpp, + CephContext *cct, + std::string_view key_id, + std::string_view key_selector, + std::string& actual_key) +{ + int res = 0; + + static map str_map = get_str_map( + cct->_conf->rgw_crypt_s3_kms_encryption_keys); + + map::iterator it = str_map.find(std::string(key_id)); + if (it == str_map.end()) + return -EINVAL; + + std::string master_key; + try { + master_key = from_base64((*it).second); + } catch (std::exception&) { + ldpp_dout(dpp, 5) << "ERROR: get_actual_key_from_conf invalid encryption key id " + << "which contains character that is not base64 encoded." + << dendl; + return -EINVAL; + } + + if (master_key.length() == AES_256_KEYSIZE) { + uint8_t _actual_key[AES_256_KEYSIZE]; + if (AES_256_ECB_encrypt(dpp, cct, + reinterpret_cast(master_key.c_str()), AES_256_KEYSIZE, + reinterpret_cast(key_selector.data()), + _actual_key, AES_256_KEYSIZE)) { + actual_key = std::string((char*)&_actual_key[0], AES_256_KEYSIZE); + } else { + res = -EIO; + } + ::ceph::crypto::zeroize_for_security(_actual_key, sizeof(_actual_key)); + } else { + ldpp_dout(dpp, 20) << "Wrong size for key=" << key_id << dendl; + res = -EIO; + } + + return res; +} + +static int request_key_from_barbican(const DoutPrefixProvider *dpp, + CephContext *cct, + std::string_view key_id, + const std::string& barbican_token, + std::string& actual_key) { + int res; + + std::string secret_url = cct->_conf->rgw_barbican_url; + if (secret_url.empty()) { + ldpp_dout(dpp, 0) << "ERROR: conf rgw_barbican_url is not set" << dendl; + return -EINVAL; + } + concat_url(secret_url, "/v1/secrets/"); + concat_url(secret_url, std::string(key_id)); + + bufferlist secret_bl; + RGWHTTPTransceiver secret_req(cct, "GET", secret_url, &secret_bl); + secret_req.append_header("Accept", "application/octet-stream"); + secret_req.append_header("X-Auth-Token", barbican_token); + + res = 
secret_req.process(null_yield); + // map 401 to EACCES instead of EPERM + if (secret_req.get_http_status() == + RGWHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) { + return -EACCES; + } + if (res < 0) { + return res; + } + + if (secret_req.get_http_status() >=200 && + secret_req.get_http_status() < 300 && + secret_bl.length() == AES_256_KEYSIZE) { + actual_key.assign(secret_bl.c_str(), secret_bl.length()); + secret_bl.zero(); + } else { + res = -EACCES; + } + return res; +} + +static int get_actual_key_from_barbican(const DoutPrefixProvider *dpp, + CephContext *cct, + std::string_view key_id, + std::string& actual_key) +{ + int res = 0; + std::string token; + + if (rgw::keystone::Service::get_keystone_barbican_token(dpp, cct, token) < 0) { + ldpp_dout(dpp, 5) << "Failed to retrieve token for Barbican" << dendl; + return -EINVAL; + } + + res = request_key_from_barbican(dpp, cct, key_id, token, actual_key); + if (res != 0) { + ldpp_dout(dpp, 5) << "Failed to retrieve secret from Barbican:" << key_id << dendl; + } + return res; +} + + +std::string config_to_engine_and_parms(CephContext *cct, + const char* which, + std::string& secret_engine_str, + EngineParmMap& secret_engine_parms) +{ + std::ostringstream oss; + std::vector secret_engine_v; + std::string secret_engine; + + get_str_vec(secret_engine_str, " ", secret_engine_v); + + cct->_conf.early_expand_meta(secret_engine_str, &oss); + auto meta_errors {oss.str()}; + if (meta_errors.length()) { + meta_errors.erase(meta_errors.find_last_not_of("\n")+1); + lderr(cct) << "ERROR: while expanding " << which << ": " + << meta_errors << dendl; + } + for (auto& e: secret_engine_v) { + if (!secret_engine.length()) { + secret_engine = std::move(e); + continue; + } + auto p { e.find('=') }; + if (p == std::string::npos) { + secret_engine_parms.emplace(std::move(e), ""); + continue; + } + std::string key{ e.substr(0,p) }; + std::string val{ e.substr(p+1) }; + secret_engine_parms.emplace(std::move(key), std::move(val)); + } + return 
secret_engine; +} + + +static int get_actual_key_from_vault(const DoutPrefixProvider *dpp, + CephContext *cct, + SSEContext & kctx, + map& attrs, + std::string& actual_key, bool make_it) +{ + std::string secret_engine_str = kctx.secret_engine(); + EngineParmMap secret_engine_parms; + auto secret_engine { config_to_engine_and_parms( + cct, "rgw_crypt_vault_secret_engine", + secret_engine_str, secret_engine_parms) }; + ldpp_dout(dpp, 20) << "Vault authentication method: " << kctx.auth() << dendl; + ldpp_dout(dpp, 20) << "Vault Secrets Engine: " << secret_engine << dendl; + + if (RGW_SSE_KMS_VAULT_SE_KV == secret_engine){ + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + KvSecretEngine engine(cct, kctx, std::move(secret_engine_parms)); + return engine.get_key(dpp, key_id, actual_key); + } + else if (RGW_SSE_KMS_VAULT_SE_TRANSIT == secret_engine){ + TransitSecretEngine engine(cct, kctx, std::move(secret_engine_parms)); + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + return make_it + ? 
engine.make_actual_key(dpp, attrs, actual_key) + : engine.reconstitute_actual_key(dpp, attrs, actual_key); + } + else { + ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl; + return -EINVAL; + } +} + + +static int make_actual_key_from_vault(const DoutPrefixProvider *dpp, + CephContext *cct, + SSEContext & kctx, + map& attrs, + std::string& actual_key) +{ + return get_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key, true); +} + + +static int reconstitute_actual_key_from_vault(const DoutPrefixProvider *dpp, + CephContext *cct, + SSEContext & kctx, + map& attrs, + std::string& actual_key) +{ + return get_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key, false); +} + + +static int get_actual_key_from_kmip(const DoutPrefixProvider *dpp, + CephContext *cct, + std::string_view key_id, + std::string& actual_key) +{ + std::string secret_engine = RGW_SSE_KMS_KMIP_SE_KV; + + if (RGW_SSE_KMS_KMIP_SE_KV == secret_engine){ + KmipSecretEngine engine(cct); + return engine.get_key(dpp, key_id, actual_key); + } + else{ + ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl; + return -EINVAL; + } +} +class KMSContext : public SSEContext { + CephContext *cct; +public: + KMSContext(CephContext*_cct) : cct{_cct} {}; + ~KMSContext() override {}; + const std::string & backend() override { + return cct->_conf->rgw_crypt_s3_kms_backend; + }; + const std::string & addr() override { + return cct->_conf->rgw_crypt_vault_addr; + }; + const std::string & auth() override { + return cct->_conf->rgw_crypt_vault_auth; + }; + const std::string & k_namespace() override { + return cct->_conf->rgw_crypt_vault_namespace; + }; + const std::string & prefix() override { + return cct->_conf->rgw_crypt_vault_prefix; + }; + const std::string & secret_engine() override { + return cct->_conf->rgw_crypt_vault_secret_engine; + }; + const std::string & ssl_cacert() override { + return cct->_conf->rgw_crypt_vault_ssl_cacert; + }; + const std::string & ssl_clientcert() 
override { + return cct->_conf->rgw_crypt_vault_ssl_clientcert; + }; + const std::string & ssl_clientkey() override { + return cct->_conf->rgw_crypt_vault_ssl_clientkey; + }; + const std::string & token_file() override { + return cct->_conf->rgw_crypt_vault_token_file; + }; + const bool verify_ssl() override { + return cct->_conf->rgw_crypt_vault_verify_ssl; + }; +}; + +class SseS3Context : public SSEContext { + CephContext *cct; +public: + static const std::string sse_s3_secret_engine; + SseS3Context(CephContext*_cct) : cct{_cct} {}; + ~SseS3Context(){}; + const std::string & backend() override { + return cct->_conf->rgw_crypt_sse_s3_backend; + }; + const std::string & addr() override { + return cct->_conf->rgw_crypt_sse_s3_vault_addr; + }; + const std::string & auth() override { + return cct->_conf->rgw_crypt_sse_s3_vault_auth; + }; + const std::string & k_namespace() override { + return cct->_conf->rgw_crypt_sse_s3_vault_namespace; + }; + const std::string & prefix() override { + return cct->_conf->rgw_crypt_sse_s3_vault_prefix; + }; + const std::string & secret_engine() override { + return cct->_conf->rgw_crypt_sse_s3_vault_secret_engine; + }; + const std::string & ssl_cacert() override { + return cct->_conf->rgw_crypt_sse_s3_vault_ssl_cacert; + }; + const std::string & ssl_clientcert() override { + return cct->_conf->rgw_crypt_sse_s3_vault_ssl_clientcert; + }; + const std::string & ssl_clientkey() override { + return cct->_conf->rgw_crypt_sse_s3_vault_ssl_clientkey; + }; + const std::string & token_file() override { + return cct->_conf->rgw_crypt_sse_s3_vault_token_file; + }; + const bool verify_ssl() override { + return cct->_conf->rgw_crypt_sse_s3_vault_verify_ssl; + }; +}; + +int reconstitute_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct, + map& attrs, + std::string& actual_key) +{ + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + KMSContext kctx { cct }; + const std::string &kms_backend { kctx.backend() }; + + 
ldpp_dout(dpp, 20) << "Getting KMS encryption key for key " << key_id << dendl; + ldpp_dout(dpp, 20) << "SSE-KMS backend is " << kms_backend << dendl; + + if (RGW_SSE_KMS_BACKEND_BARBICAN == kms_backend) { + return get_actual_key_from_barbican(dpp, cct, key_id, actual_key); + } + + if (RGW_SSE_KMS_BACKEND_VAULT == kms_backend) { + return reconstitute_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key); + } + + if (RGW_SSE_KMS_BACKEND_KMIP == kms_backend) { + return get_actual_key_from_kmip(dpp, cct, key_id, actual_key); + } + + if (RGW_SSE_KMS_BACKEND_TESTING == kms_backend) { + std::string key_selector = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYSEL); + return get_actual_key_from_conf(dpp, cct, key_id, key_selector, actual_key); + } + + ldpp_dout(dpp, 0) << "ERROR: Invalid rgw_crypt_s3_kms_backend: " << kms_backend << dendl; + return -EINVAL; +} + +int make_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct, + map& attrs, + std::string& actual_key) +{ + KMSContext kctx { cct }; + const std::string &kms_backend { kctx.backend() }; + if (RGW_SSE_KMS_BACKEND_VAULT == kms_backend) + return make_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key); + return reconstitute_actual_key_from_kms(dpp, cct, attrs, actual_key); +} + +int reconstitute_actual_key_from_sse_s3(const DoutPrefixProvider *dpp, + CephContext *cct, + map& attrs, + std::string& actual_key) +{ + std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID); + SseS3Context kctx { cct }; + const std::string &kms_backend { kctx.backend() }; + + ldpp_dout(dpp, 20) << "Getting SSE-S3 encryption key for key " << key_id << dendl; + ldpp_dout(dpp, 20) << "SSE-KMS backend is " << kms_backend << dendl; + + if (RGW_SSE_KMS_BACKEND_VAULT == kms_backend) { + return reconstitute_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key); + } + + ldpp_dout(dpp, 0) << "ERROR: Invalid rgw_crypt_sse_s3_backend: " << kms_backend << dendl; + return -EINVAL; +} + +int 
make_actual_key_from_sse_s3(const DoutPrefixProvider *dpp, + CephContext *cct, + map& attrs, + std::string& actual_key) +{ + SseS3Context kctx { cct }; + const std::string kms_backend { kctx.backend() }; + if (RGW_SSE_KMS_BACKEND_VAULT != kms_backend) { + ldpp_dout(dpp, 0) << "ERROR: Unsupported rgw_crypt_sse_s3_backend: " << kms_backend << dendl; + return -EINVAL; + } + return make_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key); +} + + +int create_sse_s3_bucket_key(const DoutPrefixProvider *dpp, + CephContext *cct, + const std::string& bucket_key) +{ + SseS3Context kctx { cct }; + + const std::string kms_backend { kctx.backend() }; + if (RGW_SSE_KMS_BACKEND_VAULT != kms_backend) { + ldpp_dout(dpp, 0) << "ERROR: Unsupported rgw_crypt_sse_s3_backend: " << kms_backend << dendl; + return -EINVAL; + } + + std::string secret_engine_str = kctx.secret_engine(); + EngineParmMap secret_engine_parms; + auto secret_engine { config_to_engine_and_parms( + cct, "rgw_crypt_sse_s3_vault_secret_engine", + secret_engine_str, secret_engine_parms) }; + if (RGW_SSE_KMS_VAULT_SE_TRANSIT == secret_engine){ + TransitSecretEngine engine(cct, kctx, std::move(secret_engine_parms)); + return engine.create_bucket_key(dpp, bucket_key); + } + else { + ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl; + return -EINVAL; + } +} + +int remove_sse_s3_bucket_key(const DoutPrefixProvider *dpp, + CephContext *cct, + const std::string& bucket_key) +{ + SseS3Context kctx { cct }; + std::string secret_engine_str = kctx.secret_engine(); + EngineParmMap secret_engine_parms; + auto secret_engine { config_to_engine_and_parms( + cct, "rgw_crypt_sse_s3_vault_secret_engine", + secret_engine_str, secret_engine_parms) }; + if (RGW_SSE_KMS_VAULT_SE_TRANSIT == secret_engine){ + TransitSecretEngine engine(cct, kctx, std::move(secret_engine_parms)); + return engine.delete_bucket_key(dpp, bucket_key); + } + else { + ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl; + return 
-EINVAL; + } +} diff --git a/src/rgw/rgw_kms.h b/src/rgw/rgw_kms.h new file mode 100644 index 000000000..f8e8655f2 --- /dev/null +++ b/src/rgw/rgw_kms.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/** + * Server-side encryption integrations with Key Management Systems (SSE-KMS) + */ + +#pragma once + +#include + +static const std::string RGW_SSE_KMS_BACKEND_TESTING = "testing"; +static const std::string RGW_SSE_KMS_BACKEND_BARBICAN = "barbican"; +static const std::string RGW_SSE_KMS_BACKEND_VAULT = "vault"; +static const std::string RGW_SSE_KMS_BACKEND_KMIP = "kmip"; + +static const std::string RGW_SSE_KMS_VAULT_AUTH_TOKEN = "token"; +static const std::string RGW_SSE_KMS_VAULT_AUTH_AGENT = "agent"; + +static const std::string RGW_SSE_KMS_VAULT_SE_TRANSIT = "transit"; +static const std::string RGW_SSE_KMS_VAULT_SE_KV = "kv"; + +static const std::string RGW_SSE_KMS_KMIP_SE_KV = "kv"; + +/** + * Retrieves the actual server-side encryption key from a KMS system given a + * key ID. Currently supported KMS systems are OpenStack Barbican and HashiCorp + * Vault, but keys can also be retrieved from Ceph configuration file (if + * kms is set to 'local'). 
+ * + * \params + * TODO + * \return + */ +int make_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct, + std::map& attrs, + std::string& actual_key); +int reconstitute_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct, + std::map& attrs, + std::string& actual_key); +int make_actual_key_from_sse_s3(const DoutPrefixProvider *dpp, CephContext *cct, + std::map& attrs, + std::string& actual_key); +int reconstitute_actual_key_from_sse_s3(const DoutPrefixProvider *dpp, CephContext *cct, + std::map& attrs, + std::string& actual_key); + +int create_sse_s3_bucket_key(const DoutPrefixProvider *dpp, CephContext *cct, + const std::string& actual_key); + +int remove_sse_s3_bucket_key(const DoutPrefixProvider *dpp, CephContext *cct, + const std::string& actual_key); + +/** + * SecretEngine Interface + * Defining interface here such that we can use both a real implementation + * of this interface, and a mock implementation in tests. +**/ +class SecretEngine { + +public: + virtual int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) = 0; + virtual ~SecretEngine(){}; +}; diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc new file mode 100644 index 000000000..7f4a79501 --- /dev/null +++ b/src/rgw/rgw_lc.cc @@ -0,0 +1,2869 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "include/scope_guard.h" +#include "include/function2.hpp" +#include "common/Formatter.h" +#include "common/containers.h" +#include "common/split.h" +#include +#include "include/random.h" +#include "cls/lock/cls_lock_client.h" +#include "rgw_perf_counters.h" +#include "rgw_common.h" +#include "rgw_bucket.h" +#include "rgw_lc.h" +#include "rgw_zone.h" +#include "rgw_string.h" +#include "rgw_multi.h" +#include "rgw_sal.h" +#include "rgw_lc_tier.h" +#include "rgw_notify.h" 
+ +#include "fmt/format.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_zone.h" +#include "services/svc_tier_rados.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +const char* LC_STATUS[] = { + "UNINITIAL", + "PROCESSING", + "FAILED", + "COMPLETE" +}; + +using namespace librados; + +bool LCRule::valid() const +{ + if (id.length() > MAX_ID_LEN) { + return false; + } + else if(expiration.empty() && noncur_expiration.empty() && + mp_expiration.empty() && !dm_expiration && + transitions.empty() && noncur_transitions.empty()) { + return false; + } + else if (!expiration.valid() || !noncur_expiration.valid() || + !mp_expiration.valid()) { + return false; + } + if (!transitions.empty()) { + bool using_days = expiration.has_days(); + bool using_date = expiration.has_date(); + for (const auto& elem : transitions) { + if (!elem.second.valid()) { + return false; + } + using_days = using_days || elem.second.has_days(); + using_date = using_date || elem.second.has_date(); + if (using_days && using_date) { + return false; + } + } + } + for (const auto& elem : noncur_transitions) { + if (!elem.second.valid()) { + return false; + } + } + + return true; +} + +void LCRule::init_simple_days_rule(std::string_view _id, + std::string_view _prefix, int num_days) +{ + id = _id; + prefix = _prefix; + char buf[32]; + snprintf(buf, sizeof(buf), "%d", num_days); + expiration.set_days(buf); + set_enabled(true); +} + +void RGWLifecycleConfiguration::add_rule(const LCRule& rule) +{ + auto& id = rule.get_id(); // note that this will return false for groups, but that's ok, we won't search groups + rule_map.insert(pair(id, rule)); +} + +bool RGWLifecycleConfiguration::_add_rule(const LCRule& rule) +{ + lc_op op(rule.get_id()); + op.status = rule.is_enabled(); + if (rule.get_expiration().has_days()) { + op.expiration = rule.get_expiration().get_days(); + } + if (rule.get_expiration().has_date()) { + op.expiration_date = 
ceph::from_iso_8601(rule.get_expiration().get_date()); + } + if (rule.get_noncur_expiration().has_days()) { + op.noncur_expiration = rule.get_noncur_expiration().get_days(); + } + if (rule.get_mp_expiration().has_days()) { + op.mp_expiration = rule.get_mp_expiration().get_days(); + } + op.dm_expiration = rule.get_dm_expiration(); + for (const auto &elem : rule.get_transitions()) { + transition_action action; + if (elem.second.has_days()) { + action.days = elem.second.get_days(); + } else { + action.date = ceph::from_iso_8601(elem.second.get_date()); + } + action.storage_class + = rgw_placement_rule::get_canonical_storage_class(elem.first); + op.transitions.emplace(elem.first, std::move(action)); + } + for (const auto &elem : rule.get_noncur_transitions()) { + transition_action action; + action.days = elem.second.get_days(); + action.date = ceph::from_iso_8601(elem.second.get_date()); + action.storage_class + = rgw_placement_rule::get_canonical_storage_class(elem.first); + op.noncur_transitions.emplace(elem.first, std::move(action)); + } + std::string prefix; + if (rule.get_filter().has_prefix()){ + prefix = rule.get_filter().get_prefix(); + } else { + prefix = rule.get_prefix(); + } + if (rule.get_filter().has_tags()){ + op.obj_tags = rule.get_filter().get_tags(); + } + op.rule_flags = rule.get_filter().get_flags(); + prefix_map.emplace(std::move(prefix), std::move(op)); + return true; +} + +int RGWLifecycleConfiguration::check_and_add_rule(const LCRule& rule) +{ + if (!rule.valid()) { + return -EINVAL; + } + auto& id = rule.get_id(); + if (rule_map.find(id) != rule_map.end()) { //id shouldn't be the same + return -EINVAL; + } + if (rule.get_filter().has_tags() && (rule.get_dm_expiration() || + !rule.get_mp_expiration().empty())) { + return -ERR_INVALID_REQUEST; + } + rule_map.insert(pair(id, rule)); + + if (!_add_rule(rule)) { + return -ERR_INVALID_REQUEST; + } + return 0; +} + +bool RGWLifecycleConfiguration::has_same_action(const lc_op& first, + const lc_op& 
second) { + if ((first.expiration > 0 || first.expiration_date != boost::none) && + (second.expiration > 0 || second.expiration_date != boost::none)) { + return true; + } else if (first.noncur_expiration > 0 && second.noncur_expiration > 0) { + return true; + } else if (first.mp_expiration > 0 && second.mp_expiration > 0) { + return true; + } else if (!first.transitions.empty() && !second.transitions.empty()) { + for (auto &elem : first.transitions) { + if (second.transitions.find(elem.first) != second.transitions.end()) { + return true; + } + } + } else if (!first.noncur_transitions.empty() && + !second.noncur_transitions.empty()) { + for (auto &elem : first.noncur_transitions) { + if (second.noncur_transitions.find(elem.first) != + second.noncur_transitions.end()) { + return true; + } + } + } + return false; +} + +/* Formerly, this method checked for duplicate rules using an invalid + * method (prefix uniqueness). */ +bool RGWLifecycleConfiguration::valid() +{ + return true; +} + +void *RGWLC::LCWorker::entry() { + do { + std::unique_ptr all_buckets; // empty restriction + utime_t start = ceph_clock_now(); + if (should_work(start)) { + ldpp_dout(dpp, 2) << "life cycle: start" << dendl; + int r = lc->process(this, all_buckets, false /* once */); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: do life cycle process() returned error r=" + << r << dendl; + } + ldpp_dout(dpp, 2) << "life cycle: stop" << dendl; + cloud_targets.clear(); // clear cloud targets + } + if (lc->going_down()) + break; + + utime_t end = ceph_clock_now(); + int secs = schedule_next_start_time(start, end); + utime_t next; + next.set_from_double(end + secs); + + ldpp_dout(dpp, 5) << "schedule life cycle next start time: " + << rgw_to_asctime(next) << dendl; + + std::unique_lock l{lock}; + cond.wait_for(l, std::chrono::seconds(secs)); + } while (!lc->going_down()); + + return NULL; +} + +void RGWLC::initialize(CephContext *_cct, rgw::sal::Driver* _driver) { + cct = _cct; + driver = _driver; + sal_lc 
= driver->get_lifecycle(); + max_objs = cct->_conf->rgw_lc_max_objs; + if (max_objs > HASH_PRIME) + max_objs = HASH_PRIME; + + obj_names = new string[max_objs]; + + for (int i = 0; i < max_objs; i++) { + obj_names[i] = lc_oid_prefix; + char buf[32]; + snprintf(buf, 32, ".%d", i); + obj_names[i].append(buf); + } + +#define COOKIE_LEN 16 + char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(cct, cookie_buf, sizeof(cookie_buf) - 1); + cookie = cookie_buf; +} + +void RGWLC::finalize() +{ + delete[] obj_names; +} + +static inline std::ostream& operator<<(std::ostream &os, rgw::sal::Lifecycle::LCEntry& ent) { + os << ""; + return os; +} + +static bool obj_has_expired(const DoutPrefixProvider *dpp, CephContext *cct, ceph::real_time mtime, int days, + ceph::real_time *expire_time = nullptr) +{ + double timediff, cmp; + utime_t base_time; + if (cct->_conf->rgw_lc_debug_interval <= 0) { + /* Normal case, run properly */ + cmp = double(days)*24*60*60; + base_time = ceph_clock_now().round_to_day(); + } else { + /* We're in debug mode; Treat each rgw_lc_debug_interval seconds as a day */ + cmp = double(days)*cct->_conf->rgw_lc_debug_interval; + base_time = ceph_clock_now(); + } + auto tt_mtime = ceph::real_clock::to_time_t(mtime); + timediff = base_time - tt_mtime; + + if (expire_time) { + *expire_time = mtime + make_timespan(cmp); + } + + ldpp_dout(dpp, 20) << __func__ + << "(): mtime=" << mtime << " days=" << days + << " base_time=" << base_time << " timediff=" << timediff + << " cmp=" << cmp + << " is_expired=" << (timediff >= cmp) + << dendl; + + return (timediff >= cmp); +} + +static bool pass_object_lock_check(rgw::sal::Driver* driver, rgw::sal::Object* obj, const DoutPrefixProvider *dpp) +{ + if (!obj->get_bucket()->get_info().obj_lock_enabled()) { + return true; + } + std::unique_ptr read_op = obj->get_read_op(); + int ret = read_op->prepare(null_yield, dpp); + if (ret < 0) { + if (ret == -ENOENT) { + return true; + } else { + return false; + } + } else { + auto 
iter = obj->get_attrs().find(RGW_ATTR_OBJECT_RETENTION); + if (iter != obj->get_attrs().end()) { + RGWObjectRetention retention; + try { + decode(retention, iter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectRetention" + << dendl; + return false; + } + if (ceph::real_clock::to_time_t(retention.get_retain_until_date()) > + ceph_clock_now()) { + return false; + } + } + iter = obj->get_attrs().find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (iter != obj->get_attrs().end()) { + RGWObjectLegalHold obj_legal_hold; + try { + decode(obj_legal_hold, iter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectLegalHold" + << dendl; + return false; + } + if (obj_legal_hold.is_enabled()) { + return false; + } + } + return true; + } +} + +class LCObjsLister { + rgw::sal::Driver* driver; + rgw::sal::Bucket* bucket; + rgw::sal::Bucket::ListParams list_params; + rgw::sal::Bucket::ListResults list_results; + string prefix; + vector::iterator obj_iter; + rgw_bucket_dir_entry pre_obj; + int64_t delay_ms; + +public: + LCObjsLister(rgw::sal::Driver* _driver, rgw::sal::Bucket* _bucket) : + driver(_driver), bucket(_bucket) { + list_params.list_versions = bucket->versioned(); + list_params.allow_unordered = true; + delay_ms = driver->ctx()->_conf.get_val("rgw_lc_thread_delay"); + } + + void set_prefix(const string& p) { + prefix = p; + list_params.prefix = prefix; + } + + int init(const DoutPrefixProvider *dpp) { + return fetch(dpp); + } + + int fetch(const DoutPrefixProvider *dpp) { + int ret = bucket->list(dpp, list_params, 1000, list_results, null_yield); + if (ret < 0) { + return ret; + } + + obj_iter = list_results.objs.begin(); + + return 0; + } + + void delay() { + std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms)); + } + + bool get_obj(const DoutPrefixProvider *dpp, rgw_bucket_dir_entry **obj, + std::function fetch_barrier + = []() { /* nada */}) { + if (obj_iter == 
list_results.objs.end()) { + if (!list_results.is_truncated) { + delay(); + return false; + } else { + fetch_barrier(); + list_params.marker = pre_obj.key; + int ret = fetch(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: list_op returned ret=" << ret + << dendl; + return false; + } + } + delay(); + } + /* returning address of entry in objs */ + *obj = &(*obj_iter); + return obj_iter != list_results.objs.end(); + } + + rgw_bucket_dir_entry get_prev_obj() { + return pre_obj; + } + + void next() { + pre_obj = *obj_iter; + ++obj_iter; + } + + boost::optional next_key_name() { + if (obj_iter == list_results.objs.end() || + (obj_iter + 1) == list_results.objs.end()) { + /* this should have been called after get_obj() was called, so this should + * only happen if is_truncated is false */ + return boost::none; + } + + return ((obj_iter + 1)->key.name); + } + +}; /* LCObjsLister */ + +struct op_env { + + using LCWorker = RGWLC::LCWorker; + + lc_op op; + rgw::sal::Driver* driver; + LCWorker* worker; + rgw::sal::Bucket* bucket; + LCObjsLister& ol; + + op_env(lc_op& _op, rgw::sal::Driver* _driver, LCWorker* _worker, + rgw::sal::Bucket* _bucket, LCObjsLister& _ol) + : op(_op), driver(_driver), worker(_worker), bucket(_bucket), + ol(_ol) {} +}; /* op_env */ + +class LCRuleOp; +class WorkQ; + +struct lc_op_ctx { + CephContext *cct; + op_env env; + rgw_bucket_dir_entry o; + boost::optional next_key_name; + ceph::real_time effective_mtime; + + rgw::sal::Driver* driver; + rgw::sal::Bucket* bucket; + lc_op& op; // ok--refers to expanded env.op + LCObjsLister& ol; + + std::unique_ptr obj; + RGWObjectCtx rctx; + const DoutPrefixProvider *dpp; + WorkQ* wq; + + std::unique_ptr tier; + + lc_op_ctx(op_env& env, rgw_bucket_dir_entry& o, + boost::optional next_key_name, + ceph::real_time effective_mtime, + const DoutPrefixProvider *dpp, WorkQ* wq) + : cct(env.driver->ctx()), env(env), o(o), next_key_name(next_key_name), + effective_mtime(effective_mtime), + driver(env.driver), 
bucket(env.bucket), op(env.op), ol(env.ol), + rctx(env.driver), dpp(dpp), wq(wq) + { + obj = bucket->get_object(o.key); + } + + bool next_has_same_name(const std::string& key_name) { + return (next_key_name && key_name.compare( + boost::get(next_key_name)) == 0); + } + +}; /* lc_op_ctx */ + + +static std::string lc_id = "rgw lifecycle"; +static std::string lc_req_id = "0"; + +static int remove_expired_obj( + const DoutPrefixProvider *dpp, lc_op_ctx& oc, bool remove_indeed, + rgw::notify::EventType event_type) +{ + auto& driver = oc.driver; + auto& bucket_info = oc.bucket->get_info(); + auto& o = oc.o; + auto obj_key = o.key; + auto& meta = o.meta; + int ret; + std::string version_id; + std::unique_ptr notify; + + if (!remove_indeed) { + obj_key.instance.clear(); + } else if (obj_key.instance.empty()) { + obj_key.instance = "null"; + } + + std::unique_ptr bucket; + std::unique_ptr obj; + + ret = driver->get_bucket(nullptr, bucket_info, &bucket); + if (ret < 0) { + return ret; + } + + // XXXX currently, rgw::sal::Bucket.owner is always null here + std::unique_ptr user; + if (! 
bucket->get_owner()) { + auto& bucket_info = bucket->get_info(); + user = driver->get_user(bucket_info.owner); + // forgive me, lord + if (user) { + bucket->set_owner(user.get()); + } + } + + obj = bucket->get_object(obj_key); + + RGWObjState* obj_state{nullptr}; + ret = obj->get_obj_state(dpp, &obj_state, null_yield, true); + if (ret < 0) { + return ret; + } + + std::unique_ptr del_op + = obj->get_delete_op(); + del_op->params.versioning_status + = obj->get_bucket()->get_info().versioning_status(); + del_op->params.obj_owner.set_id(rgw_user {meta.owner}); + del_op->params.obj_owner.set_name(meta.owner_display_name); + del_op->params.bucket_owner.set_id(bucket_info.owner); + del_op->params.unmod_since = meta.mtime; + del_op->params.marker_version_id = version_id; + + // notification supported only for RADOS driver for now + notify = driver->get_notification(dpp, obj.get(), nullptr, event_type, + bucket.get(), lc_id, + const_cast(oc.bucket->get_tenant()), + lc_req_id, null_yield); + + ret = notify->publish_reserve(dpp, nullptr); + if ( ret < 0) { + ldpp_dout(dpp, 1) + << "ERROR: notify reservation failed, deferring delete of object k=" + << o.key + << dendl; + return ret; + } + ret = del_op->delete_obj(dpp, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 1) << + "ERROR: publishing notification failed, with error: " << ret << dendl; + } else { + // send request to notification manager + (void) notify->publish_commit(dpp, obj_state->size, + ceph::real_clock::now(), + obj_state->attrset[RGW_ATTR_ETAG].to_str(), + version_id); + } + + return ret; + +} /* remove_expired_obj */ + +class LCOpAction { +public: + virtual ~LCOpAction() {} + + virtual bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) { + return false; + } + + /* called after check(). Check should tell us whether this action + * is applicable. 
If there are multiple actions, we'll end up executing + * the latest applicable action + * For example: + * one action after 10 days, another after 20, third after 40. + * After 10 days, the latest applicable action would be the first one, + * after 20 days it will be the second one. After 21 days it will still be the + * second one. So check() should return true for the second action at that point, + * but should_process() if the action has already been applied. In object removal + * it doesn't matter, but in object transition it does. + */ + virtual bool should_process() { + return true; + } + + virtual int process(lc_op_ctx& oc) { + return 0; + } + + friend class LCOpRule; +}; /* LCOpAction */ + +class LCOpFilter { +public: +virtual ~LCOpFilter() {} + virtual bool check(const DoutPrefixProvider *dpp, lc_op_ctx& oc) { + return false; + } +}; /* LCOpFilter */ + +class LCOpRule { + friend class LCOpAction; + + op_env env; + boost::optional next_key_name; + ceph::real_time effective_mtime; + + std::vector > filters; // n.b., sharing ovhd + std::vector > actions; + +public: + LCOpRule(op_env& _env) : env(_env) {} + + boost::optional get_next_key_name() { + return next_key_name; + } + + std::vector>& get_actions() { + return actions; + } + + void build(); + void update(); + int process(rgw_bucket_dir_entry& o, const DoutPrefixProvider *dpp, + WorkQ* wq); +}; /* LCOpRule */ + +using WorkItem = + boost::variant, + /* uncompleted MPU expiration */ + std::tuple, + rgw_bucket_dir_entry>; + +class WorkQ : public Thread +{ +public: + using unique_lock = std::unique_lock; + using work_f = std::function; + using dequeue_result = boost::variant; + + static constexpr uint32_t FLAG_NONE = 0x0000; + static constexpr uint32_t FLAG_EWAIT_SYNC = 0x0001; + static constexpr uint32_t FLAG_DWAIT_SYNC = 0x0002; + static constexpr uint32_t FLAG_EDRAIN_SYNC = 0x0004; + +private: + const work_f bsf = [](RGWLC::LCWorker* wk, WorkQ* wq, WorkItem& wi) {}; + RGWLC::LCWorker* wk; + uint32_t qmax; 
+ int ix; + std::mutex mtx; + std::condition_variable cv; + uint32_t flags; + vector items; + work_f f; + +public: + WorkQ(RGWLC::LCWorker* wk, uint32_t ix, uint32_t qmax) + : wk(wk), qmax(qmax), ix(ix), flags(FLAG_NONE), f(bsf) + { + create(thr_name().c_str()); + } + + std::string thr_name() { + return std::string{"wp_thrd: "} + + std::to_string(wk->ix) + ", " + std::to_string(ix); + } + + void setf(work_f _f) { + f = _f; + } + + void enqueue(WorkItem&& item) { + unique_lock uniq(mtx); + while ((!wk->get_lc()->going_down()) && + (items.size() > qmax)) { + flags |= FLAG_EWAIT_SYNC; + cv.wait_for(uniq, 200ms); + } + items.push_back(item); + if (flags & FLAG_DWAIT_SYNC) { + flags &= ~FLAG_DWAIT_SYNC; + cv.notify_one(); + } + } + + void drain() { + unique_lock uniq(mtx); + flags |= FLAG_EDRAIN_SYNC; + while (flags & FLAG_EDRAIN_SYNC) { + cv.wait_for(uniq, 200ms); + } + } + +private: + dequeue_result dequeue() { + unique_lock uniq(mtx); + while ((!wk->get_lc()->going_down()) && + (items.size() == 0)) { + /* clear drain state, as we are NOT doing work and qlen==0 */ + if (flags & FLAG_EDRAIN_SYNC) { + flags &= ~FLAG_EDRAIN_SYNC; + } + flags |= FLAG_DWAIT_SYNC; + cv.wait_for(uniq, 200ms); + } + if (items.size() > 0) { + auto item = items.back(); + items.pop_back(); + if (flags & FLAG_EWAIT_SYNC) { + flags &= ~FLAG_EWAIT_SYNC; + cv.notify_one(); + } + return {item}; + } + return nullptr; + } + + void* entry() override { + while (!wk->get_lc()->going_down()) { + auto item = dequeue(); + if (item.which() == 0) { + /* going down */ + break; + } + f(wk, this, boost::get(item)); + } + return nullptr; + } +}; /* WorkQ */ + +class RGWLC::WorkPool +{ + using TVector = ceph::containers::tiny_vector; + TVector wqs; + uint64_t ix; + +public: + WorkPool(RGWLC::LCWorker* wk, uint16_t n_threads, uint32_t qmax) + : wqs(TVector{ + n_threads, + [&](const size_t ix, auto emplacer) { + emplacer.emplace(wk, ix, qmax); + }}), + ix(0) + {} + + ~WorkPool() { + for (auto& wq : wqs) { + 
/*
 * Tell whether a lifecycle worker has used up its time budget.
 *
 * A "once" run is never interrupted; otherwise the worker has to stop as soon
 * as the current time is past stop_at.
 */
static inline bool worker_should_stop(time_t stop_at, bool once)
{
  if (once) {
    return false;
  }
  return stop_at < time(nullptr);
}
abort_multipart_upload failed, ret=" << ret + << ", thread:" << wq->thr_name() + << ", meta:" << obj.key + << dendl; + } + } /* abort failed */ + } /* expired */ + }; + + worker->workpool->setf(pf); + + for (auto prefix_iter = prefix_map.begin(); prefix_iter != prefix_map.end(); + ++prefix_iter) { + + if (worker_should_stop(stop_at, once)) { + ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker " + << worker->ix + << dendl; + return 0; + } + + if (!prefix_iter->second.status || prefix_iter->second.mp_expiration <= 0) { + continue; + } + params.prefix = prefix_iter->first; + do { + auto offset = 0; + results.objs.clear(); + ret = target->list(this, params, 1000, results, null_yield); + if (ret < 0) { + if (ret == (-ENOENT)) + return 0; + ldpp_dout(this, 0) << "ERROR: driver->list_objects():" < t1 = + {prefix_iter->second, *obj_iter}; + worker->workpool->enqueue(WorkItem{t1}); + if (going_down()) { + return 0; + } + } /* for objs */ + + if ((offset % 100) == 0) { + if (worker_should_stop(stop_at, once)) { + ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker " + << worker->ix + << dendl; + return 0; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms)); + } while(results.is_truncated); + } /* for prefix_map */ + + worker->workpool->drain(); + return 0; +} /* RGWLC::handle_multipart_expiration */ + +static int read_obj_tags(const DoutPrefixProvider *dpp, rgw::sal::Object* obj, bufferlist& tags_bl) +{ + std::unique_ptr rop = obj->get_read_op(); + + return rop->get_attr(dpp, RGW_ATTR_TAGS, tags_bl, null_yield); +} + +static bool is_valid_op(const lc_op& op) +{ + return (op.status && + (op.expiration > 0 + || op.expiration_date != boost::none + || op.noncur_expiration > 0 + || op.dm_expiration + || !op.transitions.empty() + || !op.noncur_transitions.empty())); +} + +static bool zone_check(const lc_op& op, rgw::sal::Zone* zone) +{ + + if (zone->get_tier_type() == "archive") { + return (op.rule_flags & 
uint32_t(LCFlagType::ArchiveZone)); + } else { + return (! (op.rule_flags & uint32_t(LCFlagType::ArchiveZone))); + } +} + +static inline bool has_all_tags(const lc_op& rule_action, + const RGWObjTags& object_tags) +{ + if(! rule_action.obj_tags) + return false; + if(object_tags.count() < rule_action.obj_tags->count()) + return false; + size_t tag_count = 0; + for (const auto& tag : object_tags.get_tags()) { + const auto& rule_tags = rule_action.obj_tags->get_tags(); + const auto& iter = rule_tags.find(tag.first); + if(iter == rule_tags.end()) + continue; + if(iter->second == tag.second) + { + tag_count++; + } + /* all tags in the rule appear in obj tags */ + } + return tag_count == rule_action.obj_tags->count(); +} + +static int check_tags(const DoutPrefixProvider *dpp, lc_op_ctx& oc, bool *skip) +{ + auto& op = oc.op; + + if (op.obj_tags != boost::none) { + *skip = true; + + bufferlist tags_bl; + int ret = read_obj_tags(dpp, oc.obj.get(), tags_bl); + if (ret < 0) { + if (ret != -ENODATA) { + ldpp_dout(oc.dpp, 5) << "ERROR: read_obj_tags returned r=" + << ret << " " << oc.wq->thr_name() << dendl; + } + return 0; + } + RGWObjTags dest_obj_tags; + try { + auto iter = tags_bl.cbegin(); + dest_obj_tags.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(oc.dpp,0) << "ERROR: caught buffer::error, couldn't decode TagSet " + << oc.wq->thr_name() << dendl; + return -EIO; + } + + if (! 
has_all_tags(op, dest_obj_tags)) { + ldpp_dout(oc.dpp, 20) << __func__ << "() skipping obj " << oc.obj + << " as tags do not match in rule: " + << op.id << " " + << oc.wq->thr_name() << dendl; + return 0; + } + } + *skip = false; + return 0; +} + +class LCOpFilter_Tags : public LCOpFilter { +public: + bool check(const DoutPrefixProvider *dpp, lc_op_ctx& oc) override { + auto& o = oc.o; + + if (o.is_delete_marker()) { + return true; + } + + bool skip; + + int ret = check_tags(dpp, oc, &skip); + if (ret < 0) { + if (ret == -ENOENT) { + return false; + } + ldpp_dout(oc.dpp, 0) << "ERROR: check_tags on obj=" << oc.obj + << " returned ret=" << ret << " " + << oc.wq->thr_name() << dendl; + return false; + } + + return !skip; + }; +}; + +class LCOpAction_CurrentExpiration : public LCOpAction { +public: + LCOpAction_CurrentExpiration(op_env& env) {} + + bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override { + auto& o = oc.o; + if (!o.is_current()) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": not current, skipping " + << oc.wq->thr_name() << dendl; + return false; + } + if (o.is_delete_marker()) { + if (oc.next_key_name) { + std::string nkn = *oc.next_key_name; + if (oc.next_has_same_name(o.key.name)) { + ldpp_dout(dpp, 7) << __func__ << "(): dm-check SAME: key=" << o.key + << " next_key_name: %%" << nkn << "%% " + << oc.wq->thr_name() << dendl; + return false; + } else { + ldpp_dout(dpp, 7) << __func__ << "(): dm-check DELE: key=" << o.key + << " next_key_name: %%" << nkn << "%% " + << oc.wq->thr_name() << dendl; + *exp_time = real_clock::now(); + return true; + } + } + return false; + } + + auto& mtime = o.meta.mtime; + bool is_expired; + auto& op = oc.op; + if (op.expiration <= 0) { + if (op.expiration_date == boost::none) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": no expiration set in rule, skipping " + << oc.wq->thr_name() << dendl; + return false; + } + is_expired = ceph_clock_now() >= 
+ ceph::real_clock::to_time_t(*op.expiration_date); + *exp_time = *op.expiration_date; + } else { + is_expired = obj_has_expired(dpp, oc.cct, mtime, op.expiration, exp_time); + } + + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key << ": is_expired=" + << (int)is_expired << " " + << oc.wq->thr_name() << dendl; + return is_expired; + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r; + if (o.is_delete_marker()) { + r = remove_expired_obj(oc.dpp, oc, true, + rgw::notify::ObjectExpirationDeleteMarker); + if (r < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: current is-dm remove_expired_obj " + << oc.bucket << ":" << o.key + << " " << cpp_strerror(r) << " " + << oc.wq->thr_name() << dendl; + return r; + } + ldpp_dout(oc.dpp, 2) << "DELETED: current is-dm " + << oc.bucket << ":" << o.key + << " " << oc.wq->thr_name() << dendl; + } else { + /* ! o.is_delete_marker() */ + r = remove_expired_obj(oc.dpp, oc, !oc.bucket->versioned(), + rgw::notify::ObjectExpirationCurrent); + if (r < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: remove_expired_obj " + << oc.bucket << ":" << o.key + << " " << cpp_strerror(r) << " " + << oc.wq->thr_name() << dendl; + return r; + } + if (perfcounter) { + perfcounter->inc(l_rgw_lc_expire_current, 1); + } + ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key + << " " << oc.wq->thr_name() << dendl; + } + return 0; + } +}; + +class LCOpAction_NonCurrentExpiration : public LCOpAction { +protected: +public: + LCOpAction_NonCurrentExpiration(op_env& env) + {} + + bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override { + auto& o = oc.o; + if (o.is_current()) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": current version, skipping " + << oc.wq->thr_name() << dendl; + return false; + } + + int expiration = oc.op.noncur_expiration; + bool is_expired = obj_has_expired(dpp, oc.cct, oc.effective_mtime, expiration, + exp_time); + + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key << 
": is_expired=" + << is_expired << " " + << oc.wq->thr_name() << dendl; + + return is_expired && + pass_object_lock_check(oc.driver, oc.obj.get(), dpp); + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r = remove_expired_obj(oc.dpp, oc, true, + rgw::notify::ObjectExpirationNoncurrent); + if (r < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: remove_expired_obj (non-current expiration) " + << oc.bucket << ":" << o.key + << " " << cpp_strerror(r) + << " " << oc.wq->thr_name() << dendl; + return r; + } + if (perfcounter) { + perfcounter->inc(l_rgw_lc_expire_noncurrent, 1); + } + ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key + << " (non-current expiration) " + << oc.wq->thr_name() << dendl; + return 0; + } +}; + +class LCOpAction_DMExpiration : public LCOpAction { +public: + LCOpAction_DMExpiration(op_env& env) {} + + bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override { + auto& o = oc.o; + if (!o.is_delete_marker()) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": not a delete marker, skipping " + << oc.wq->thr_name() << dendl; + return false; + } + if (oc.next_has_same_name(o.key.name)) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": next is same object, skipping " + << oc.wq->thr_name() << dendl; + return false; + } + + *exp_time = real_clock::now(); + + return true; + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r = remove_expired_obj(oc.dpp, oc, true, + rgw::notify::ObjectExpirationDeleteMarker); + if (r < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: remove_expired_obj (delete marker expiration) " + << oc.bucket << ":" << o.key + << " " << cpp_strerror(r) + << " " << oc.wq->thr_name() + << dendl; + return r; + } + if (perfcounter) { + perfcounter->inc(l_rgw_lc_expire_dm, 1); + } + ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key + << " (delete marker expiration) " + << oc.wq->thr_name() << dendl; + return 0; + } +}; + +class 
LCOpAction_Transition : public LCOpAction { + const transition_action& transition; + bool need_to_process{false}; + +protected: + virtual bool check_current_state(bool is_current) = 0; + virtual ceph::real_time get_effective_mtime(lc_op_ctx& oc) = 0; +public: + LCOpAction_Transition(const transition_action& _transition) + : transition(_transition) {} + + bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override { + auto& o = oc.o; + + if (o.is_delete_marker()) { + return false; + } + + if (!check_current_state(o.is_current())) { + return false; + } + + auto mtime = get_effective_mtime(oc); + bool is_expired; + if (transition.days < 0) { + if (transition.date == boost::none) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": no transition day/date set in rule, skipping " + << oc.wq->thr_name() << dendl; + return false; + } + is_expired = ceph_clock_now() >= + ceph::real_clock::to_time_t(*transition.date); + *exp_time = *transition.date; + } else { + is_expired = obj_has_expired(dpp, oc.cct, mtime, transition.days, exp_time); + } + + ldpp_dout(oc.dpp, 20) << __func__ << "(): key=" << o.key << ": is_expired=" + << is_expired << " " + << oc.wq->thr_name() << dendl; + + need_to_process = + (rgw_placement_rule::get_canonical_storage_class(o.meta.storage_class) != + transition.storage_class); + + return is_expired; + } + + bool should_process() override { + return need_to_process; + } + + int delete_tier_obj(lc_op_ctx& oc) { + int ret = 0; + + /* If bucket is versioned, create delete_marker for current version + */ + if (oc.bucket->versioned() && oc.o.is_current() && !oc.o.is_delete_marker()) { + ret = remove_expired_obj(oc.dpp, oc, false, rgw::notify::ObjectExpiration); + ldpp_dout(oc.dpp, 20) << "delete_tier_obj Object(key:" << oc.o.key << ") current & not delete_marker" << " versioned_epoch: " << oc.o.versioned_epoch << "flags: " << oc.o.flags << dendl; + } else { + ret = remove_expired_obj(oc.dpp, oc, true, 
rgw::notify::ObjectExpiration); + ldpp_dout(oc.dpp, 20) << "delete_tier_obj Object(key:" << oc.o.key << ") not current " << "versioned_epoch: " << oc.o.versioned_epoch << "flags: " << oc.o.flags << dendl; + } + return ret; + } + + int transition_obj_to_cloud(lc_op_ctx& oc) { + /* If CurrentVersion object, remove it & create delete marker */ + bool delete_object = (!oc.tier->retain_head_object() || + (oc.o.is_current() && oc.bucket->versioned())); + + int ret = oc.obj->transition_to_cloud(oc.bucket, oc.tier.get(), oc.o, + oc.env.worker->get_cloud_targets(), oc.cct, + !delete_object, oc.dpp, null_yield); + if (ret < 0) { + return ret; + } + + if (delete_object) { + ret = delete_tier_obj(oc); + if (ret < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: Deleting tier object(" << oc.o.key << ") failed ret=" << ret << dendl; + return ret; + } + } + + return 0; + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r; + + if (oc.o.meta.category == RGWObjCategory::CloudTiered) { + /* Skip objects which are already cloud tiered. */ + ldpp_dout(oc.dpp, 30) << "Object(key:" << oc.o.key << ") is already cloud tiered to cloud-s3 tier: " << oc.o.meta.storage_class << dendl; + return 0; + } + + std::string tier_type = ""; + rgw::sal::ZoneGroup& zonegroup = oc.driver->get_zone()->get_zonegroup(); + + rgw_placement_rule target_placement; + target_placement.inherit_from(oc.bucket->get_placement_rule()); + target_placement.storage_class = transition.storage_class; + + r = zonegroup.get_placement_tier(target_placement, &oc.tier); + + if (!r && oc.tier->get_tier_type() == "cloud-s3") { + ldpp_dout(oc.dpp, 30) << "Found cloud s3 tier: " << target_placement.storage_class << dendl; + if (!oc.o.is_current() && + !pass_object_lock_check(oc.driver, oc.obj.get(), oc.dpp)) { + /* Skip objects which has object lock enabled. */ + ldpp_dout(oc.dpp, 10) << "Object(key:" << oc.o.key << ") is locked. 
Skipping transition to cloud-s3 tier: " << target_placement.storage_class << dendl; + return 0; + } + + r = transition_obj_to_cloud(oc); + if (r < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: failed to transition obj(key:" << oc.o.key << ") to cloud (r=" << r << ")" + << dendl; + return r; + } + } else { + if (!oc.driver->valid_placement(target_placement)) { + ldpp_dout(oc.dpp, 0) << "ERROR: non existent dest placement: " + << target_placement + << " bucket="<< oc.bucket + << " rule_id=" << oc.op.id + << " " << oc.wq->thr_name() << dendl; + return -EINVAL; + } + + int r = oc.obj->transition(oc.bucket, target_placement, o.meta.mtime, + o.versioned_epoch, oc.dpp, null_yield); + if (r < 0) { + ldpp_dout(oc.dpp, 0) << "ERROR: failed to transition obj " + << oc.bucket << ":" << o.key + << " -> " << transition.storage_class + << " " << cpp_strerror(r) + << " " << oc.wq->thr_name() << dendl; + return r; + } + } + ldpp_dout(oc.dpp, 2) << "TRANSITIONED:" << oc.bucket + << ":" << o.key << " -> " + << transition.storage_class + << " " << oc.wq->thr_name() << dendl; + return 0; + } +}; + +class LCOpAction_CurrentTransition : public LCOpAction_Transition { +protected: + bool check_current_state(bool is_current) override { + return is_current; + } + + ceph::real_time get_effective_mtime(lc_op_ctx& oc) override { + return oc.o.meta.mtime; + } +public: + LCOpAction_CurrentTransition(const transition_action& _transition) + : LCOpAction_Transition(_transition) {} + int process(lc_op_ctx& oc) { + int r = LCOpAction_Transition::process(oc); + if (r == 0) { + if (perfcounter) { + perfcounter->inc(l_rgw_lc_transition_current, 1); + } + } + return r; + } +}; + +class LCOpAction_NonCurrentTransition : public LCOpAction_Transition { +protected: + bool check_current_state(bool is_current) override { + return !is_current; + } + + ceph::real_time get_effective_mtime(lc_op_ctx& oc) override { + return oc.effective_mtime; + } +public: + LCOpAction_NonCurrentTransition(op_env& env, + const 
transition_action& _transition) + : LCOpAction_Transition(_transition) + {} + int process(lc_op_ctx& oc) { + int r = LCOpAction_Transition::process(oc); + if (r == 0) { + if (perfcounter) { + perfcounter->inc(l_rgw_lc_transition_noncurrent, 1); + } + } + return r; + } +}; + +void LCOpRule::build() +{ + filters.emplace_back(new LCOpFilter_Tags); + + auto& op = env.op; + + if (op.expiration > 0 || + op.expiration_date != boost::none) { + actions.emplace_back(new LCOpAction_CurrentExpiration(env)); + } + + if (op.dm_expiration) { + actions.emplace_back(new LCOpAction_DMExpiration(env)); + } + + if (op.noncur_expiration > 0) { + actions.emplace_back(new LCOpAction_NonCurrentExpiration(env)); + } + + for (auto& iter : op.transitions) { + actions.emplace_back(new LCOpAction_CurrentTransition(iter.second)); + } + + for (auto& iter : op.noncur_transitions) { + actions.emplace_back(new LCOpAction_NonCurrentTransition(env, iter.second)); + } +} + +void LCOpRule::update() +{ + next_key_name = env.ol.next_key_name(); + effective_mtime = env.ol.get_prev_obj().meta.mtime; +} + +int LCOpRule::process(rgw_bucket_dir_entry& o, + const DoutPrefixProvider *dpp, + WorkQ* wq) +{ + lc_op_ctx ctx(env, o, next_key_name, effective_mtime, dpp, wq); + shared_ptr *selected = nullptr; // n.b., req'd by sharing + real_time exp; + + for (auto& a : actions) { + real_time action_exp; + + if (a->check(ctx, &action_exp, dpp)) { + if (action_exp > exp) { + exp = action_exp; + selected = &a; + } + } + } + + if (selected && + (*selected)->should_process()) { + + /* + * Calling filter checks after action checks because + * all action checks (as they are implemented now) do + * not access the objects themselves, but return result + * from info from bucket index listing. The current tags filter + * check does access the objects, so we avoid unnecessary rados calls + * having filters check later in the process. 
+ */ + + bool cont = false; + for (auto& f : filters) { + if (f->check(dpp, ctx)) { + cont = true; + break; + } + } + + if (!cont) { + ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key + << ": no rule match, skipping " + << wq->thr_name() << dendl; + return 0; + } + + int r = (*selected)->process(ctx); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_expired_obj " + << env.bucket << ":" << o.key + << " " << cpp_strerror(r) + << " " << wq->thr_name() << dendl; + return r; + } + ldpp_dout(dpp, 20) << "processed:" << env.bucket << ":" + << o.key << " " << wq->thr_name() << dendl; + } + + return 0; + +} + +int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker, + time_t stop_at, bool once) +{ + RGWLifecycleConfiguration config(cct); + std::unique_ptr bucket; + string no_ns, list_versions; + vector objs; + vector result; + boost::split(result, shard_id, boost::is_any_of(":")); + string bucket_tenant = result[0]; + string bucket_name = result[1]; + string bucket_marker = result[2]; + + ldpp_dout(this, 5) << "RGWLC::bucket_lc_process ENTER " << bucket_name << dendl; + if (unlikely(cct->_conf->rgwlc_skip_bucket_step)) { + return 0; + } + + int ret = driver->get_bucket(this, nullptr, bucket_tenant, bucket_name, &bucket, null_yield); + if (ret < 0) { + ldpp_dout(this, 0) << "LC:get_bucket for " << bucket_name + << " failed" << dendl; + return ret; + } + + ret = bucket->load_bucket(this, null_yield); + if (ret < 0) { + ldpp_dout(this, 0) << "LC:load_bucket for " << bucket_name + << " failed" << dendl; + return ret; + } + + auto stack_guard = make_scope_guard( + [&worker] + { + worker->workpool->drain(); + } + ); + + if (bucket->get_marker() != bucket_marker) { + ldpp_dout(this, 1) << "LC: deleting stale entry found for bucket=" + << bucket_tenant << ":" << bucket_name + << " cur_marker=" << bucket->get_marker() + << " orig_marker=" << bucket_marker << dendl; + return -ENOENT; + } + + map::iterator aiter + = bucket->get_attrs().find(RGW_ATTR_LC); + if (aiter 
== bucket->get_attrs().end()) { + ldpp_dout(this, 0) << "WARNING: bucket_attrs.find(RGW_ATTR_LC) failed for " + << bucket_name << " (terminates bucket_lc_process(...))" + << dendl; + return 0; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 0) << __func__ << "() decode life cycle config failed" + << dendl; + return -1; + } + + /* fetch information for zone checks */ + rgw::sal::Zone* zone = driver->get_zone(); + + auto pf = [](RGWLC::LCWorker* wk, WorkQ* wq, WorkItem& wi) { + auto wt = + boost::get>(wi); + auto& [op_rule, o] = wt; + + ldpp_dout(wk->get_lc(), 20) + << __func__ << "(): key=" << o.key << wq->thr_name() + << dendl; + int ret = op_rule.process(o, wk->dpp, wq); + if (ret < 0) { + ldpp_dout(wk->get_lc(), 20) + << "ERROR: orule.process() returned ret=" << ret + << "thread:" << wq->thr_name() + << dendl; + } + }; + worker->workpool->setf(pf); + + multimap& prefix_map = config.get_prefix_map(); + ldpp_dout(this, 10) << __func__ << "() prefix_map size=" + << prefix_map.size() + << dendl; + + rgw_obj_key pre_marker; + rgw_obj_key next_marker; + for(auto prefix_iter = prefix_map.begin(); prefix_iter != prefix_map.end(); + ++prefix_iter) { + + if (worker_should_stop(stop_at, once)) { + ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker " + << worker->ix + << dendl; + return 0; + } + + auto& op = prefix_iter->second; + if (!is_valid_op(op)) { + continue; + } + ldpp_dout(this, 20) << __func__ << "(): prefix=" << prefix_iter->first + << dendl; + if (prefix_iter != prefix_map.begin() && + (prefix_iter->first.compare(0, prev(prefix_iter)->first.length(), + prev(prefix_iter)->first) == 0)) { + next_marker = pre_marker; + } else { + pre_marker = next_marker; + } + + LCObjsLister ol(driver, bucket.get()); + ol.set_prefix(prefix_iter->first); + + if (! 
zone_check(op, zone)) { + ldpp_dout(this, 7) << "LC rule not executable in " << zone->get_tier_type() + << " zone, skipping" << dendl; + continue; + } + + ret = ol.init(this); + if (ret < 0) { + if (ret == (-ENOENT)) + return 0; + ldpp_dout(this, 0) << "ERROR: driver->list_objects():" << dendl; + return ret; + } + + op_env oenv(op, driver, worker, bucket.get(), ol); + LCOpRule orule(oenv); + orule.build(); // why can't ctor do it? + rgw_bucket_dir_entry* o{nullptr}; + for (auto offset = 0; ol.get_obj(this, &o /* , fetch_barrier */); ++offset, ol.next()) { + orule.update(); + std::tuple t1 = {orule, *o}; + worker->workpool->enqueue(WorkItem{t1}); + if ((offset % 100) == 0) { + if (worker_should_stop(stop_at, once)) { + ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker " + << worker->ix + << dendl; + return 0; + } + } + } + worker->workpool->drain(); + } + + ret = handle_multipart_expiration(bucket.get(), prefix_map, worker, stop_at, once); + return ret; +} + +class SimpleBackoff +{ + const int max_retries; + std::chrono::milliseconds sleep_ms; + int retries{0}; +public: + SimpleBackoff(int max_retries, std::chrono::milliseconds initial_sleep_ms) + : max_retries(max_retries), sleep_ms(initial_sleep_ms) + {} + SimpleBackoff(const SimpleBackoff&) = delete; + SimpleBackoff& operator=(const SimpleBackoff&) = delete; + + int get_retries() const { + return retries; + } + + void reset() { + retries = 0; + } + + bool wait_backoff(const fu2::unique_function& barrier) { + reset(); + while (retries < max_retries) { + auto r = barrier(); + if (r) { + return r; + } + std::this_thread::sleep_for(sleep_ms * 2 * retries++); + } + return false; + } +}; + +int RGWLC::bucket_lc_post(int index, int max_lock_sec, + rgw::sal::Lifecycle::LCEntry& entry, int& result, + LCWorker* worker) +{ + utime_t lock_duration(cct->_conf->rgw_lc_lock_max_time, 0); + + std::unique_ptr lock = + sal_lc->get_serializer(lc_index_lock_name, obj_names[index], cookie); + + ldpp_dout(this, 5) << 
"RGWLC::bucket_lc_post(): POST " << entry + << " index: " << index << " worker ix: " << worker->ix + << dendl; + + do { + int ret = lock->try_lock(this, lock_duration, null_yield); + if (ret == -EBUSY || ret == -EEXIST) { + /* already locked by another lc processor */ + ldpp_dout(this, 0) << "RGWLC::bucket_lc_post() failed to acquire lock on " + << obj_names[index] << ", sleep 5, try again " << dendl; + sleep(5); + continue; + } + + if (ret < 0) + return 0; + ldpp_dout(this, 20) << "RGWLC::bucket_lc_post() lock " << obj_names[index] + << dendl; + + if (result == -ENOENT) { + /* XXXX are we SURE the only way result could == ENOENT is when + * there is no such bucket? It is currently the value returned + * from bucket_lc_process(...) */ + ret = sal_lc->rm_entry(obj_names[index], entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::bucket_lc_post() failed to remove entry " + << obj_names[index] << dendl; + } + goto clean; + } else if (result < 0) { + entry.set_status(lc_failed); + } else { + entry.set_status(lc_complete); + } + + ret = sal_lc->set_entry(obj_names[index], entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on " + << obj_names[index] << dendl; + } +clean: + lock->unlock(); + ldpp_dout(this, 20) << "RGWLC::bucket_lc_post() unlock " + << obj_names[index] << dendl; + return 0; + } while (true); +} /* RGWLC::bucket_lc_post */ + +int RGWLC::list_lc_progress(string& marker, uint32_t max_entries, + vector>& progress_map, + int& index) +{ + progress_map.clear(); + for(; index < max_objs; index++, marker="") { + vector> entries; + int ret = sal_lc->list_entries(obj_names[index], marker, max_entries, entries); + if (ret < 0) { + if (ret == -ENOENT) { + ldpp_dout(this, 10) << __func__ << "() ignoring unfound lc object=" + << obj_names[index] << dendl; + continue; + } else { + return ret; + } + } + progress_map.reserve(progress_map.size() + entries.size()); + std::move(begin(entries), end(entries), 
std::back_inserter(progress_map)); + //progress_map.insert(progress_map.end(), entries.begin(), entries.end()); + + /* update index, marker tuple */ + if (progress_map.size() > 0) + marker = progress_map.back()->get_bucket(); + + if (progress_map.size() >= max_entries) + break; + } + return 0; +} + +static inline vector random_sequence(uint32_t n) +{ + vector v(n, 0); + std::generate(v.begin(), v.end(), + [ix = 0]() mutable { + return ix++; + }); + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(v.begin(), v.end(), rng); + return v; +} + +static inline int get_lc_index(CephContext *cct, + const std::string& shard_id) +{ + int max_objs = + (cct->_conf->rgw_lc_max_objs > HASH_PRIME ? HASH_PRIME : + cct->_conf->rgw_lc_max_objs); + /* n.b. review hash algo */ + int index = ceph_str_hash_linux(shard_id.c_str(), + shard_id.size()) % HASH_PRIME % max_objs; + return index; +} + +static inline void get_lc_oid(CephContext *cct, + const std::string& shard_id, string *oid) +{ + /* n.b. 
review hash algo */ + int index = get_lc_index(cct, shard_id); + *oid = lc_oid_prefix; + char buf[32]; + snprintf(buf, 32, ".%d", index); + oid->append(buf); + return; +} + +static std::string get_bucket_lc_key(const rgw_bucket& bucket){ + return string_join_reserve(':', bucket.tenant, bucket.name, bucket.marker); +} + +int RGWLC::process(LCWorker* worker, + const std::unique_ptr& optional_bucket, + bool once = false) +{ + int ret = 0; + int max_secs = cct->_conf->rgw_lc_lock_max_time; + + if (optional_bucket) { + /* if a bucket is provided, this is a single-bucket run, and + * can be processed without traversing any state entries (we + * do need the entry {pro,epi}logue which update the state entry + * for this bucket) */ + auto bucket_lc_key = get_bucket_lc_key(optional_bucket->get_key()); + auto index = get_lc_index(driver->ctx(), bucket_lc_key); + ret = process_bucket(index, max_secs, worker, bucket_lc_key, once); + return ret; + } else { + /* generate an index-shard sequence unrelated to any other + * that might be running in parallel */ + std::string all_buckets{""}; + vector shard_seq = random_sequence(max_objs); + for (auto index : shard_seq) { + ret = process(index, max_secs, worker, once); + if (ret < 0) + return ret; + } + } + + return 0; +} + +bool RGWLC::expired_session(time_t started) +{ + if (! cct->_conf->rgwlc_auto_session_clear) { + return false; + } + + time_t interval = (cct->_conf->rgw_lc_debug_interval > 0) + ? cct->_conf->rgw_lc_debug_interval + : 24*60*60; + + auto now = time(nullptr); + + ldpp_dout(this, 16) << "RGWLC::expired_session" + << " started: " << started + << " interval: " << interval << "(*2==" << 2*interval << ")" + << " now: " << now + << dendl; + + return (started + 2*interval < now); +} + +time_t RGWLC::thread_stop_at() +{ + uint64_t interval = (cct->_conf->rgw_lc_debug_interval > 0) + ? 
cct->_conf->rgw_lc_debug_interval + : 24*60*60; + + return time(nullptr) + interval; +} + +int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker, + const std::string& bucket_entry_marker, + bool once = false) +{ + ldpp_dout(this, 5) << "RGWLC::process_bucket(): ENTER: " + << "index: " << index << " worker ix: " << worker->ix + << dendl; + + int ret = 0; + std::unique_ptr serializer = + sal_lc->get_serializer(lc_index_lock_name, obj_names[index], + worker->thr_name()); + std::unique_ptr entry; + if (max_lock_secs <= 0) { + return -EAGAIN; + } + + utime_t time(max_lock_secs, 0); + ret = serializer->try_lock(this, time, null_yield); + if (ret == -EBUSY || ret == -EEXIST) { + /* already locked by another lc processor */ + ldpp_dout(this, 0) << "RGWLC::process() failed to acquire lock on " + << obj_names[index] << dendl; + return -EBUSY; + } + if (ret < 0) + return 0; + + std::unique_lock lock( + *(serializer.get()), std::adopt_lock); + + ret = sal_lc->get_entry(obj_names[index], bucket_entry_marker, &entry); + if (ret >= 0) { + if (entry->get_status() == lc_processing) { + if (expired_session(entry->get_start_time())) { + ldpp_dout(this, 5) << "RGWLC::process_bucket(): STALE lc session found for: " << entry + << " index: " << index << " worker ix: " << worker->ix + << " (clearing)" + << dendl; + } else { + ldpp_dout(this, 5) << "RGWLC::process_bucket(): ACTIVE entry: " + << entry + << " index: " << index + << " worker ix: " << worker->ix + << dendl; + return ret; + } + } + } + + /* do nothing if no bucket */ + if (entry->get_bucket().empty()) { + return ret; + } + + ldpp_dout(this, 5) << "RGWLC::process_bucket(): START entry 1: " << entry + << " index: " << index << " worker ix: " << worker->ix + << dendl; + + entry->set_status(lc_processing); + ret = sal_lc->set_entry(obj_names[index], *entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process_bucket() failed to set obj entry " + << obj_names[index] << entry->get_bucket() << 
entry->get_status() + << dendl; + return ret; + } + + ldpp_dout(this, 5) << "RGWLC::process_bucket(): START entry 2: " << entry + << " index: " << index << " worker ix: " << worker->ix + << dendl; + + lock.unlock(); + ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once); + bucket_lc_post(index, max_lock_secs, *entry, ret, worker); + + return ret; +} /* RGWLC::process_bucket */ + +static inline bool allow_shard_rollover(CephContext* cct, time_t now, time_t shard_rollover_date) +{ + /* return true iff: + * - non-debug scheduling is in effect, and + * - the current shard has not rolled over in the last 24 hours + */ + if (((shard_rollover_date < now) && + (now - shard_rollover_date > 24*60*60)) || + (! shard_rollover_date /* no rollover date stored */) || + (cct->_conf->rgw_lc_debug_interval > 0 /* defaults to -1 == disabled */)) { + return true; + } + return false; +} /* allow_shard_rollover */ + +static inline bool already_run_today(CephContext* cct, time_t start_date) +{ + struct tm bdt; + time_t begin_of_day; + utime_t now = ceph_clock_now(); + localtime_r(&start_date, &bdt); + + if (cct->_conf->rgw_lc_debug_interval > 0) { + if (now - start_date < cct->_conf->rgw_lc_debug_interval) + return true; + else + return false; + } + + bdt.tm_hour = 0; + bdt.tm_min = 0; + bdt.tm_sec = 0; + begin_of_day = mktime(&bdt); + if (now - begin_of_day < 24*60*60) + return true; + else + return false; +} /* already_run_today */ + +inline int RGWLC::advance_head(const std::string& lc_shard, + rgw::sal::Lifecycle::LCHead& head, + rgw::sal::Lifecycle::LCEntry& entry, + time_t start_date) +{ + int ret{0}; + std::unique_ptr next_entry; + + ret = sal_lc->get_next_entry(lc_shard, entry.get_bucket(), &next_entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to get obj entry " + << lc_shard << dendl; + goto exit; + } + + /* save the next position */ + head.set_marker(next_entry->get_bucket()); + head.set_start_date(start_date); + + ret = 
sal_lc->put_head(lc_shard, head); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to put head " + << lc_shard + << dendl; + goto exit; + } +exit: + return ret; +} /* advance head */ + +int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, + bool once = false) +{ + int ret{0}; + const auto& lc_shard = obj_names[index]; + + std::unique_ptr head; + std::unique_ptr entry; //string = bucket_name:bucket_id, start_time, int = LC_BUCKET_STATUS + + ldpp_dout(this, 5) << "RGWLC::process(): ENTER: " + << "index: " << index << " worker ix: " << worker->ix + << dendl; + + std::unique_ptr lock = + sal_lc->get_serializer(lc_index_lock_name, lc_shard, worker->thr_name()); + + utime_t lock_for_s(max_lock_secs, 0); + const auto& lock_lambda = [&]() { + ret = lock->try_lock(this, lock_for_s, null_yield); + if (ret == 0) { + return true; + } + if (ret == -EBUSY || ret == -EEXIST) { + /* already locked by another lc processor */ + return false; + } + return false; + }; + + SimpleBackoff shard_lock(5 /* max retries */, 50ms); + if (! shard_lock.wait_backoff(lock_lambda)) { + ldpp_dout(this, 0) << "RGWLC::process(): failed to aquire lock on " + << lc_shard << " after " << shard_lock.get_retries() + << dendl; + return 0; + } + + do { + utime_t now = ceph_clock_now(); + + /* preamble: find an inital bucket/marker */ + ret = sal_lc->get_head(lc_shard, &head); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to get obj head " + << lc_shard << ", ret=" << ret << dendl; + goto exit; + } + + /* if there is nothing at head, try to reinitialize head.marker with the + * first entry in the queue */ + if (head->get_marker().empty() && + allow_shard_rollover(cct, now, head->get_shard_rollover_date()) /* prevent multiple passes by diff. 
+ * rgws,in same cycle */) { + + ldpp_dout(this, 5) << "RGWLC::process() process shard rollover lc_shard=" << lc_shard + << " head.marker=" << head->get_marker() + << " head.shard_rollover_date=" << head->get_shard_rollover_date() + << dendl; + + vector> entries; + int ret = sal_lc->list_entries(lc_shard, head->get_marker(), 1, entries); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() sal_lc->list_entries(lc_shard, head.marker, 1, " + << "entries) returned error ret==" << ret << dendl; + goto exit; + } + if (entries.size() > 0) { + entry = std::move(entries.front()); + head->set_marker(entry->get_bucket()); + head->set_start_date(now); + head->set_shard_rollover_date(0); + } + } else { + ldpp_dout(this, 0) << "RGWLC::process() head.marker !empty() at START for shard==" + << lc_shard << " head last stored at " + << rgw_to_asctime(utime_t(time_t(head->get_start_date()), 0)) + << dendl; + + /* fetches the entry pointed to by head.bucket */ + ret = sal_lc->get_entry(lc_shard, head->get_marker(), &entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() sal_lc->get_entry(lc_shard, head.marker, entry) " + << "returned error ret==" << ret << dendl; + goto exit; + } + } + + if (entry && !entry->get_bucket().empty()) { + if (entry->get_status() == lc_processing) { + if (expired_session(entry->get_start_time())) { + ldpp_dout(this, 5) + << "RGWLC::process(): STALE lc session found for: " << entry + << " index: " << index << " worker ix: " << worker->ix + << " (clearing)" << dendl; + } else { + ldpp_dout(this, 5) + << "RGWLC::process(): ACTIVE entry: " << entry + << " index: " << index << " worker ix: " << worker->ix << dendl; + /* skip to next entry */ + if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) { + goto exit; + } + /* done with this shard */ + if (head->get_marker().empty()) { + ldpp_dout(this, 5) << + "RGWLC::process() cycle finished lc_shard=" + << lc_shard + << dendl; + head->set_shard_rollover_date(ceph_clock_now()); + ret = 
sal_lc->put_head(lc_shard, *head.get()); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to put head " + << lc_shard + << dendl; + } + goto exit; + } + continue; + } + } else { + if ((entry->get_status() == lc_complete) && + already_run_today(cct, entry->get_start_time())) { + /* skip to next entry */ + if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) { + goto exit; + } + ldpp_dout(this, 5) << "RGWLC::process() worker ix; " << worker->ix + << " SKIP processing for already-processed bucket " << entry->get_bucket() + << dendl; + /* done with this shard */ + if (head->get_marker().empty()) { + ldpp_dout(this, 5) << + "RGWLC::process() cycle finished lc_shard=" + << lc_shard + << dendl; + head->set_shard_rollover_date(ceph_clock_now()); + ret = sal_lc->put_head(lc_shard, *head.get()); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to put head " + << lc_shard + << dendl; + } + goto exit; + } + continue; + } + } + } else { + ldpp_dout(this, 5) << "RGWLC::process() entry.bucket.empty() == true at START 1" + << " (this is possible mainly before any lc policy has been stored" + << " or after removal of an lc_shard object)" + << dendl; + goto exit; + } + + /* When there are no more entries to process, entry will be + * equivalent to an empty marker and so the following resets the + * processing for the shard automatically when processing is + * finished for the shard */ + ldpp_dout(this, 5) << "RGWLC::process(): START entry 1: " << entry + << " index: " << index << " worker ix: " << worker->ix + << dendl; + + entry->set_status(lc_processing); + entry->set_start_time(now); + + ret = sal_lc->set_entry(lc_shard, *entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to set obj entry " + << lc_shard << entry->get_bucket() << entry->get_status() << dendl; + goto exit; + } + + /* advance head for next waiter, then process */ + if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) { + goto exit; + } + 
+ ldpp_dout(this, 5) << "RGWLC::process(): START entry 2: " << entry + << " index: " << index << " worker ix: " << worker->ix + << dendl; + + /* drop lock so other instances can make progress while this + * bucket is being processed */ + lock->unlock(); + ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once); + + /* postamble */ + //bucket_lc_post(index, max_lock_secs, entry, ret, worker); + if (! shard_lock.wait_backoff(lock_lambda)) { + ldpp_dout(this, 0) << "RGWLC::process(): failed to aquire lock on " + << lc_shard << " after " << shard_lock.get_retries() + << dendl; + return 0; + } + + if (ret == -ENOENT) { + /* XXXX are we SURE the only way result could == ENOENT is when + * there is no such bucket? It is currently the value returned + * from bucket_lc_process(...) */ + ret = sal_lc->rm_entry(lc_shard, *entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to remove entry " + << lc_shard << " (nonfatal)" + << dendl; + /* not fatal, could result from a race */ + } + } else { + if (ret < 0) { + entry->set_status(lc_failed); + } else { + entry->set_status(lc_complete); + } + ret = sal_lc->set_entry(lc_shard, *entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on " + << lc_shard + << dendl; + /* fatal, locked */ + goto exit; + } + } + + /* done with this shard */ + if (head->get_marker().empty()) { + ldpp_dout(this, 5) << + "RGWLC::process() cycle finished lc_shard=" + << lc_shard + << dendl; + head->set_shard_rollover_date(ceph_clock_now()); + ret = sal_lc->put_head(lc_shard, *head.get()); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to put head " + << lc_shard + << dendl; + } + goto exit; + } + } while(1 && !once && !going_down()); + +exit: + lock->unlock(); + return 0; +} + +void RGWLC::start_processor() +{ + auto maxw = cct->_conf->rgw_lc_max_worker; + workers.reserve(maxw); + for (int ix = 0; ix < maxw; ++ix) { + auto worker = + std::make_unique(this /* dpp 
*/, cct, this, ix); + worker->create((string{"lifecycle_thr_"} + to_string(ix)).c_str()); + workers.emplace_back(std::move(worker)); + } +} + +void RGWLC::stop_processor() +{ + down_flag = true; + for (auto& worker : workers) { + worker->stop(); + worker->join(); + } + workers.clear(); +} + +unsigned RGWLC::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWLC::gen_prefix(std::ostream& out) const +{ + return out << "lifecycle: "; +} + +void RGWLC::LCWorker::stop() +{ + std::lock_guard l{lock}; + cond.notify_all(); +} + +bool RGWLC::going_down() +{ + return down_flag; +} + +bool RGWLC::LCWorker::should_work(utime_t& now) +{ + int start_hour; + int start_minute; + int end_hour; + int end_minute; + string worktime = cct->_conf->rgw_lifecycle_work_time; + sscanf(worktime.c_str(),"%d:%d-%d:%d",&start_hour, &start_minute, + &end_hour, &end_minute); + struct tm bdt; + time_t tt = now.sec(); + localtime_r(&tt, &bdt); + + if (cct->_conf->rgw_lc_debug_interval > 0) { + /* We're debugging, so say we can run */ + return true; + } else if ((bdt.tm_hour*60 + bdt.tm_min >= start_hour*60 + start_minute) && + (bdt.tm_hour*60 + bdt.tm_min <= end_hour*60 + end_minute)) { + return true; + } else { + return false; + } + +} + +int RGWLC::LCWorker::schedule_next_start_time(utime_t &start, utime_t& now) +{ + int secs; + + if (cct->_conf->rgw_lc_debug_interval > 0) { + secs = start + cct->_conf->rgw_lc_debug_interval - now; + if (secs < 0) + secs = 0; + return (secs); + } + + int start_hour; + int start_minute; + int end_hour; + int end_minute; + string worktime = cct->_conf->rgw_lifecycle_work_time; + sscanf(worktime.c_str(),"%d:%d-%d:%d",&start_hour, &start_minute, &end_hour, + &end_minute); + struct tm bdt; + time_t tt = now.sec(); + time_t nt; + localtime_r(&tt, &bdt); + bdt.tm_hour = start_hour; + bdt.tm_min = start_minute; + bdt.tm_sec = 0; + nt = mktime(&bdt); + secs = nt - tt; + + return secs>0 ? 
secs : secs+24*60*60; +} + +RGWLC::LCWorker::~LCWorker() +{ + delete workpool; +} /* ~LCWorker */ + +void RGWLifecycleConfiguration::generate_test_instances( + list& o) +{ + o.push_back(new RGWLifecycleConfiguration); +} + +template +static int guard_lc_modify(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + rgw::sal::Lifecycle* sal_lc, + const rgw_bucket& bucket, const string& cookie, + const F& f) { + CephContext *cct = driver->ctx(); + + auto bucket_lc_key = get_bucket_lc_key(bucket); + string oid; + get_lc_oid(cct, bucket_lc_key, &oid); + + /* XXX it makes sense to take shard_id for a bucket_id? */ + std::unique_ptr entry = sal_lc->get_entry(); + entry->set_bucket(bucket_lc_key); + entry->set_status(lc_uninitial); + int max_lock_secs = cct->_conf->rgw_lc_lock_max_time; + + std::unique_ptr lock = + sal_lc->get_serializer(lc_index_lock_name, oid, cookie); + utime_t time(max_lock_secs, 0); + + int ret; + uint16_t retries{0}; + + // due to reports of starvation trying to save lifecycle policy, try hard + do { + ret = lock->try_lock(dpp, time, null_yield); + if (ret == -EBUSY || ret == -EEXIST) { + ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to acquire lock on " + << oid << ", retry in 100ms, ret=" << ret << dendl; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // the typical S3 client will time out in 60s + if(retries++ < 500) { + continue; + } + } + if (ret < 0) { + ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to acquire lock on " + << oid << ", ret=" << ret << dendl; + break; + } + ret = f(sal_lc, oid, *entry.get()); + if (ret < 0) { + ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to set entry on " + << oid << ", ret=" << ret << dendl; + } + break; + } while(true); + lock->unlock(); + return ret; +} + +int RGWLC::set_bucket_config(rgw::sal::Bucket* bucket, + const rgw::sal::Attrs& bucket_attrs, + RGWLifecycleConfiguration *config) +{ + int ret{0}; + rgw::sal::Attrs attrs = bucket_attrs; + if (config) { + /* if no 
RGWLifecycleconfiguration provided, it means + * RGW_ATTR_LC is already valid and present */ + bufferlist lc_bl; + config->encode(lc_bl); + attrs[RGW_ATTR_LC] = std::move(lc_bl); + + ret = + bucket->merge_and_store_attrs(this, attrs, null_yield); + if (ret < 0) { + return ret; + } + } + + rgw_bucket& b = bucket->get_key(); + + + ret = guard_lc_modify(this, driver, sal_lc.get(), b, cookie, + [&](rgw::sal::Lifecycle* sal_lc, const string& oid, + rgw::sal::Lifecycle::LCEntry& entry) { + return sal_lc->set_entry(oid, entry); + }); + + return ret; +} + +int RGWLC::remove_bucket_config(rgw::sal::Bucket* bucket, + const rgw::sal::Attrs& bucket_attrs, + bool merge_attrs) +{ + rgw::sal::Attrs attrs = bucket_attrs; + rgw_bucket& b = bucket->get_key(); + int ret{0}; + + if (merge_attrs) { + attrs.erase(RGW_ATTR_LC); + ret = bucket->merge_and_store_attrs(this, attrs, null_yield); + + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::RGWDeleteLC() failed to set attrs on bucket=" + << b.name << " returned err=" << ret << dendl; + return ret; + } + } + + ret = guard_lc_modify(this, driver, sal_lc.get(), b, cookie, + [&](rgw::sal::Lifecycle* sal_lc, const string& oid, + rgw::sal::Lifecycle::LCEntry& entry) { + return sal_lc->rm_entry(oid, entry); + }); + + return ret; +} /* RGWLC::remove_bucket_config */ + +RGWLC::~RGWLC() +{ + stop_processor(); + finalize(); +} /* ~RGWLC() */ + +namespace rgw::lc { + +int fix_lc_shard_entry(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + rgw::sal::Lifecycle* sal_lc, + rgw::sal::Bucket* bucket) +{ + if (auto aiter = bucket->get_attrs().find(RGW_ATTR_LC); + aiter == bucket->get_attrs().end()) { + return 0; // No entry, nothing to fix + } + + auto bucket_lc_key = get_bucket_lc_key(bucket->get_key()); + std::string lc_oid; + get_lc_oid(driver->ctx(), bucket_lc_key, &lc_oid); + + std::unique_ptr entry; + // There are multiple cases we need to encounter here + // 1. 
entry exists and is already set to marker, happens in plain buckets & newly resharded buckets + // 2. entry doesn't exist, which usually happens when reshard has happened prior to update and next LC process has already dropped the update + // 3. entry exists matching the current bucket id which was after a reshard (needs to be updated to the marker) + // We are not dropping the old marker here as that would be caught by the next LC process update + int ret = sal_lc->get_entry(lc_oid, bucket_lc_key, &entry); + if (ret == 0) { + ldpp_dout(dpp, 5) << "Entry already exists, nothing to do" << dendl; + return ret; // entry is already existing correctly set to marker + } + ldpp_dout(dpp, 5) << "lc_get_entry errored ret code=" << ret << dendl; + if (ret == -ENOENT) { + ldpp_dout(dpp, 1) << "No entry for bucket=" << bucket + << " creating " << dendl; + // TODO: we have too many ppl making cookies like this! + char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(driver->ctx(), cookie_buf, sizeof(cookie_buf) - 1); + std::string cookie = cookie_buf; + + ret = guard_lc_modify(dpp, + driver, sal_lc, bucket->get_key(), cookie, + [&lc_oid](rgw::sal::Lifecycle* slc, + const string& oid, + rgw::sal::Lifecycle::LCEntry& entry) { + return slc->set_entry(lc_oid, entry); + }); + + } + + return ret; +} + +std::string s3_expiration_header( + DoutPrefixProvider* dpp, + const rgw_obj_key& obj_key, + const RGWObjTags& obj_tagset, + const ceph::real_time& mtime, + const std::map& bucket_attrs) +{ + CephContext* cct = dpp->get_cct(); + RGWLifecycleConfiguration config(cct); + std::string hdr{""}; + + const auto& aiter = bucket_attrs.find(RGW_ATTR_LC); + if (aiter == bucket_attrs.end()) + return hdr; + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 0) << __func__ + << "() decode life cycle config failed" + << dendl; + return hdr; + } /* catch */ + + /* dump tags at debug level 16 */ + 
RGWObjTags::tag_map_t obj_tag_map = obj_tagset.get_tags(); + if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 16)) { + for (const auto& elt : obj_tag_map) { + ldpp_dout(dpp, 16) << __func__ + << "() key=" << elt.first << " val=" << elt.second + << dendl; + } + } + + boost::optional expiration_date; + boost::optional rule_id; + + const auto& rule_map = config.get_rule_map(); + for (const auto& ri : rule_map) { + const auto& rule = ri.second; + auto& id = rule.get_id(); + auto& filter = rule.get_filter(); + auto& prefix = filter.has_prefix() ? filter.get_prefix(): rule.get_prefix(); + auto& expiration = rule.get_expiration(); + auto& noncur_expiration = rule.get_noncur_expiration(); + + ldpp_dout(dpp, 10) << "rule: " << ri.first + << " prefix: " << prefix + << " expiration: " + << " date: " << expiration.get_date() + << " days: " << expiration.get_days() + << " noncur_expiration: " + << " date: " << noncur_expiration.get_date() + << " days: " << noncur_expiration.get_days() + << dendl; + + /* skip if rule !enabled + * if rule has prefix, skip iff object !match prefix + * if rule has tags, skip iff object !match tags + * note if object is current or non-current, compare accordingly + * if rule has days, construct date expression and save iff older + * than last saved + * if rule has date, convert date expression and save iff older + * than last saved + * if the date accum has a value, format it into hdr + */ + + if (! rule.is_enabled()) + continue; + + if(! prefix.empty()) { + if (! 
boost::starts_with(obj_key.name, prefix)) + continue; + } + + if (filter.has_tags()) { + bool tag_match = false; + const RGWObjTags& rule_tagset = filter.get_tags(); + for (auto& tag : rule_tagset.get_tags()) { + /* remember, S3 tags are {key,value} tuples */ + tag_match = true; + auto obj_tag = obj_tag_map.find(tag.first); + if (obj_tag == obj_tag_map.end() || obj_tag->second != tag.second) { + ldpp_dout(dpp, 10) << "tag does not match obj_key=" << obj_key + << " rule_id=" << id + << " tag=" << tag + << dendl; + tag_match = false; + break; + } + } + if (! tag_match) + continue; + } + + // compute a uniform expiration date + boost::optional rule_expiration_date; + const LCExpiration& rule_expiration = + (obj_key.instance.empty()) ? expiration : noncur_expiration; + + if (rule_expiration.has_date()) { + rule_expiration_date = + boost::optional( + ceph::from_iso_8601(rule.get_expiration().get_date())); + } else { + if (rule_expiration.has_days()) { + rule_expiration_date = + boost::optional( + mtime + make_timespan(double(rule_expiration.get_days())*24*60*60 - ceph::real_clock::to_time_t(mtime)%(24*60*60) + 24*60*60)); + } + } + + // update earliest expiration + if (rule_expiration_date) { + if ((! 
expiration_date) || + (*expiration_date > *rule_expiration_date)) { + expiration_date = + boost::optional(rule_expiration_date); + rule_id = boost::optional(id); + } + } + } + + // cond format header + if (expiration_date && rule_id) { + // Fri, 23 Dec 2012 00:00:00 GMT + char exp_buf[100]; + time_t exp = ceph::real_clock::to_time_t(*expiration_date); + if (std::strftime(exp_buf, sizeof(exp_buf), + "%a, %d %b %Y %T %Z", std::gmtime(&exp))) { + hdr = fmt::format("expiry-date=\"{0}\", rule-id=\"{1}\"", exp_buf, + *rule_id); + } else { + ldpp_dout(dpp, 0) << __func__ << + "() strftime of life cycle expiration header failed" + << dendl; + } + } + + return hdr; + +} /* rgwlc_s3_expiration_header */ + +bool s3_multipart_abort_header( + DoutPrefixProvider* dpp, + const rgw_obj_key& obj_key, + const ceph::real_time& mtime, + const std::map& bucket_attrs, + ceph::real_time& abort_date, + std::string& rule_id) +{ + CephContext* cct = dpp->get_cct(); + RGWLifecycleConfiguration config(cct); + + const auto& aiter = bucket_attrs.find(RGW_ATTR_LC); + if (aiter == bucket_attrs.end()) + return false; + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 0) << __func__ + << "() decode life cycle config failed" + << dendl; + return false; + } /* catch */ + + std::optional abort_date_tmp; + std::optional rule_id_tmp; + const auto& rule_map = config.get_rule_map(); + for (const auto& ri : rule_map) { + const auto& rule = ri.second; + const auto& id = rule.get_id(); + const auto& filter = rule.get_filter(); + const auto& prefix = filter.has_prefix()?filter.get_prefix():rule.get_prefix(); + const auto& mp_expiration = rule.get_mp_expiration(); + if (!rule.is_enabled()) { + continue; + } + if(!prefix.empty() && !boost::starts_with(obj_key.name, prefix)) { + continue; + } + + std::optional rule_abort_date; + if (mp_expiration.has_days()) { + rule_abort_date = std::optional( + mtime + 
make_timespan(mp_expiration.get_days()*24*60*60 - ceph::real_clock::to_time_t(mtime)%(24*60*60) + 24*60*60)); + } + + // update earliest abort date + if (rule_abort_date) { + if ((! abort_date_tmp) || + (*abort_date_tmp > *rule_abort_date)) { + abort_date_tmp = + std::optional(rule_abort_date); + rule_id_tmp = std::optional(id); + } + } + } + if (abort_date_tmp && rule_id_tmp) { + abort_date = *abort_date_tmp; + rule_id = *rule_id_tmp; + return true; + } else { + return false; + } +} + +} /* namespace rgw::lc */ + +void lc_op::dump(Formatter *f) const +{ + f->dump_bool("status", status); + f->dump_bool("dm_expiration", dm_expiration); + + f->dump_int("expiration", expiration); + f->dump_int("noncur_expiration", noncur_expiration); + f->dump_int("mp_expiration", mp_expiration); + if (expiration_date) { + utime_t ut(*expiration_date); + f->dump_stream("expiration_date") << ut; + } + if (obj_tags) { + f->dump_object("obj_tags", *obj_tags); + } + f->open_object_section("transitions"); + for(auto& [storage_class, transition] : transitions) { + f->dump_object(storage_class, transition); + } + f->close_section(); + + f->open_object_section("noncur_transitions"); + for (auto& [storage_class, transition] : noncur_transitions) { + f->dump_object(storage_class, transition); + } + f->close_section(); +} + +void LCFilter::dump(Formatter *f) const +{ + f->dump_string("prefix", prefix); + f->dump_object("obj_tags", obj_tags); + if (have_flag(LCFlagType::ArchiveZone)) { + f->dump_string("archivezone", ""); + } +} + +void LCExpiration::dump(Formatter *f) const +{ + f->dump_string("days", days); + f->dump_string("date", date); +} + +void LCRule::dump(Formatter *f) const +{ + f->dump_string("id", id); + f->dump_string("prefix", prefix); + f->dump_string("status", status); + f->dump_object("expiration", expiration); + f->dump_object("noncur_expiration", noncur_expiration); + f->dump_object("mp_expiration", mp_expiration); + f->dump_object("filter", filter); + 
f->open_object_section("transitions"); + for (auto& [storage_class, transition] : transitions) { + f->dump_object(storage_class, transition); + } + f->close_section(); + + f->open_object_section("noncur_transitions"); + for (auto& [storage_class, transition] : noncur_transitions) { + f->dump_object(storage_class, transition); + } + f->close_section(); + f->dump_bool("dm_expiration", dm_expiration); +} + + +void RGWLifecycleConfiguration::dump(Formatter *f) const +{ + f->open_object_section("prefix_map"); + for (auto& prefix : prefix_map) { + f->dump_object(prefix.first.c_str(), prefix.second); + } + f->close_section(); + + f->open_array_section("rule_map"); + for (auto& rule : rule_map) { + f->open_object_section("entry"); + f->dump_string("id", rule.first); + f->open_object_section("rule"); + rule.second.dump(f); + f->close_section(); + f->close_section(); + } + f->close_section(); +} + diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h new file mode 100644 index 000000000..bd8efd9b6 --- /dev/null +++ b/src/rgw/rgw_lc.h @@ -0,0 +1,640 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include "common/debug.h" + +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "common/ceph_mutex.h" +#include "common/Cond.h" +#include "common/iso_8601.h" +#include "common/Thread.h" +#include "rgw_common.h" +#include "cls/rgw/cls_rgw_types.h" +#include "rgw_tag.h" +#include "rgw_sal.h" + +#include +#include + +#define HASH_PRIME 7877 +#define MAX_ID_LEN 255 +static std::string lc_oid_prefix = "lc"; +static std::string lc_index_lock_name = "lc_process"; + +extern const char* LC_STATUS[]; + +typedef enum { + lc_uninitial = 0, + lc_processing, + lc_failed, + lc_complete, +} LC_BUCKET_STATUS; + +class LCExpiration +{ +protected: + std::string days; + //At present only current object has expiration date + std::string date; +public: + 
LCExpiration() {} + LCExpiration(const std::string& _days, const std::string& _date) : days(_days), date(_date) {} + + void encode(bufferlist& bl) const { + ENCODE_START(3, 2, bl); + encode(days, bl); + encode(date, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(days, bl); + if (struct_v >= 3) { + decode(date, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +// static void generate_test_instances(list& o); + void set_days(const std::string& _days) { days = _days; } + std::string get_days_str() const { + return days; + } + int get_days() const {return atoi(days.c_str()); } + bool has_days() const { + return !days.empty(); + } + void set_date(const std::string& _date) { date = _date; } + std::string get_date() const { + return date; + } + bool has_date() const { + return !date.empty(); + } + bool empty() const { + return days.empty() && date.empty(); + } + bool valid() const { + if (!days.empty() && !date.empty()) { + return false; + } else if (!days.empty() && get_days() <= 0) { + return false; + } + //We've checked date in xml parsing + return true; + } +}; +WRITE_CLASS_ENCODER(LCExpiration) + +class LCTransition +{ +protected: + std::string days; + std::string date; + std::string storage_class; + +public: + int get_days() const { + return atoi(days.c_str()); + } + + std::string get_date() const { + return date; + } + + std::string get_storage_class() const { + return storage_class; + } + + bool has_days() const { + return !days.empty(); + } + + bool has_date() const { + return !date.empty(); + } + + bool empty() const { + return days.empty() && date.empty(); + } + + bool valid() const { + if (!days.empty() && !date.empty()) { + return false; + } else if (!days.empty() && get_days() < 0) { + return false; + } + //We've checked date in xml parsing + return true; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(days, bl); + 
encode(date, bl); + encode(storage_class, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(days, bl); + decode(date, bl); + decode(storage_class, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const { + f->dump_string("days", days); + f->dump_string("date", date); + f->dump_string("storage_class", storage_class); + } +}; +WRITE_CLASS_ENCODER(LCTransition) + +enum class LCFlagType : uint16_t +{ + none = 0, + ArchiveZone, +}; + +class LCFlag { +public: + LCFlagType bit; + const char* name; + + constexpr LCFlag(LCFlagType ord, const char* name) : bit(ord), name(name) + {} +}; + +class LCFilter +{ + public: + + static constexpr uint32_t make_flag(LCFlagType type) { + switch (type) { + case LCFlagType::none: + return 0; + break; + default: + return 1 << (uint32_t(type) - 1); + } + } + + static constexpr std::array filter_flags = + { + LCFlag(LCFlagType::none, "none"), + LCFlag(LCFlagType::ArchiveZone, "ArchiveZone"), + }; + +protected: + std::string prefix; + RGWObjTags obj_tags; + uint32_t flags; + +public: + + LCFilter() : flags(make_flag(LCFlagType::none)) + {} + + const std::string& get_prefix() const { + return prefix; + } + + const RGWObjTags& get_tags() const { + return obj_tags; + } + + const uint32_t get_flags() const { + return flags; + } + + bool empty() const { + return !(has_prefix() || has_tags() || has_flags()); + } + + // Determine if we need AND tag when creating xml + bool has_multi_condition() const { + if (obj_tags.count() + int(has_prefix()) + int(has_flags()) > 1) // Prefix is a member of Filter + return true; + return false; + } + + bool has_prefix() const { + return !prefix.empty(); + } + + bool has_tags() const { + return !obj_tags.empty(); + } + + bool has_flags() const { + return !(flags == uint32_t(LCFlagType::none)); + } + + bool have_flag(LCFlagType flag) const { + return flags & make_flag(flag); + } + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + 
encode(prefix, bl); + encode(obj_tags, bl); + encode(flags, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(prefix, bl); + if (struct_v >= 2) { + decode(obj_tags, bl); + if (struct_v >= 3) { + decode(flags, bl); + } + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(LCFilter) + +class LCRule +{ +protected: + std::string id; + std::string prefix; + std::string status; + LCExpiration expiration; + LCExpiration noncur_expiration; + LCExpiration mp_expiration; + LCFilter filter; + std::map transitions; + std::map noncur_transitions; + bool dm_expiration = false; + +public: + + LCRule(){}; + virtual ~LCRule() {} + + const std::string& get_id() const { + return id; + } + + const std::string& get_status() const { + return status; + } + + bool is_enabled() const { + return status == "Enabled"; + } + + void set_enabled(bool flag) { + status = (flag ? "Enabled" : "Disabled"); + } + + const std::string& get_prefix() const { + return prefix; + } + + const LCFilter& get_filter() const { + return filter; + } + + const LCExpiration& get_expiration() const { + return expiration; + } + + const LCExpiration& get_noncur_expiration() const { + return noncur_expiration; + } + + const LCExpiration& get_mp_expiration() const { + return mp_expiration; + } + + bool get_dm_expiration() const { + return dm_expiration; + } + + const std::map& get_transitions() const { + return transitions; + } + + const std::map& get_noncur_transitions() const { + return noncur_transitions; + } + + void set_id(const std::string& _id) { + id = _id; + } + + void set_prefix(const std::string& _prefix) { + prefix = _prefix; + } + + void set_status(const std::string& _status) { + status = _status; + } + + void set_expiration(const LCExpiration& _expiration) { + expiration = _expiration; + } + + void set_noncur_expiration(const LCExpiration& _noncur_expiration) { + noncur_expiration = _noncur_expiration; + } + + void 
set_mp_expiration(const LCExpiration& _mp_expiration) { + mp_expiration = _mp_expiration; + } + + void set_dm_expiration(bool _dm_expiration) { + dm_expiration = _dm_expiration; + } + + bool add_transition(const LCTransition& _transition) { + auto ret = transitions.emplace(_transition.get_storage_class(), _transition); + return ret.second; + } + + bool add_noncur_transition(const LCTransition& _noncur_transition) { + auto ret = noncur_transitions.emplace(_noncur_transition.get_storage_class(), _noncur_transition); + return ret.second; + } + + bool valid() const; + + void encode(bufferlist& bl) const { + ENCODE_START(6, 1, bl); + encode(id, bl); + encode(prefix, bl); + encode(status, bl); + encode(expiration, bl); + encode(noncur_expiration, bl); + encode(mp_expiration, bl); + encode(dm_expiration, bl); + encode(filter, bl); + encode(transitions, bl); + encode(noncur_transitions, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(6, 1, 1, bl); + decode(id, bl); + decode(prefix, bl); + decode(status, bl); + decode(expiration, bl); + if (struct_v >=2) { + decode(noncur_expiration, bl); + } + if (struct_v >= 3) { + decode(mp_expiration, bl); + } + if (struct_v >= 4) { + decode(dm_expiration, bl); + } + if (struct_v >= 5) { + decode(filter, bl); + } + if (struct_v >= 6) { + decode(transitions, bl); + decode(noncur_transitions, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + + void init_simple_days_rule(std::string_view _id, std::string_view _prefix, int num_days); +}; +WRITE_CLASS_ENCODER(LCRule) + +struct transition_action +{ + int days; + boost::optional date; + std::string storage_class; + transition_action() : days(0) {} + void dump(Formatter *f) const { + if (!date) { + f->dump_int("days", days); + } else { + utime_t ut(*date); + f->dump_stream("date") << ut; + } + } +}; + +/* XXX why not LCRule? 
*/ +struct lc_op +{ + std::string id; + bool status{false}; + bool dm_expiration{false}; + int expiration{0}; + int noncur_expiration{0}; + int mp_expiration{0}; + boost::optional expiration_date; + boost::optional obj_tags; + std::map transitions; + std::map noncur_transitions; + uint32_t rule_flags; + + /* ctors are nice */ + lc_op() = delete; + + lc_op(const std::string id) : id(id) + {} + + void dump(Formatter *f) const; +}; + +class RGWLifecycleConfiguration +{ +protected: + CephContext *cct; + std::multimap prefix_map; + std::multimap rule_map; + bool _add_rule(const LCRule& rule); + bool has_same_action(const lc_op& first, const lc_op& second); +public: + explicit RGWLifecycleConfiguration(CephContext *_cct) : cct(_cct) {} + RGWLifecycleConfiguration() : cct(NULL) {} + + void set_ctx(CephContext *ctx) { + cct = ctx; + } + + virtual ~RGWLifecycleConfiguration() {} + +// int get_perm(std::string& id, int perm_mask); +// int get_group_perm(ACLGroupTypeEnum group, int perm_mask); + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rule_map, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + decode(rule_map, bl); + std::multimap::iterator iter; + for (iter = rule_map.begin(); iter != rule_map.end(); ++iter) { + LCRule& rule = iter->second; + _add_rule(rule); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + void add_rule(const LCRule& rule); + + int check_and_add_rule(const LCRule& rule); + + bool valid(); + + std::multimap& get_rule_map() { return rule_map; } + std::multimap& get_prefix_map() { return prefix_map; } +/* + void create_default(std::string id, std::string name) { + ACLGrant grant; + grant.set_canon(id, name, RGW_PERM_FULL_CONTROL); + add_grant(&grant); + } +*/ +}; +WRITE_CLASS_ENCODER(RGWLifecycleConfiguration) + +class RGWLC : public DoutPrefixProvider { + CephContext *cct; + 
rgw::sal::Driver* driver; + std::unique_ptr sal_lc; + int max_objs{0}; + std::string *obj_names{nullptr}; + std::atomic down_flag = { false }; + std::string cookie; + +public: + + class WorkPool; + + class LCWorker : public Thread + { + const DoutPrefixProvider *dpp; + CephContext *cct; + RGWLC *lc; + int ix; + std::mutex lock; + std::condition_variable cond; + WorkPool* workpool{nullptr}; + /* save the target bucket names created as part of object transition + * to cloud. This list is maintained for the duration of each RGWLC::process() + * post which it is discarded. */ + std::set cloud_targets; + + public: + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + LCWorker(const DoutPrefixProvider* dpp, CephContext *_cct, RGWLC *_lc, + int ix); + RGWLC* get_lc() { return lc; } + + std::string thr_name() { + return std::string{"lc_thrd: "} + std::to_string(ix); + } + + void *entry() override; + void stop(); + bool should_work(utime_t& now); + int schedule_next_start_time(utime_t& start, utime_t& now); + std::set& get_cloud_targets() { return cloud_targets; } + virtual ~LCWorker() override; + + friend class RGWRados; + friend class RGWLC; + friend class WorkQ; + }; /* LCWorker */ + + friend class RGWRados; + + std::vector> workers; + + RGWLC() : cct(nullptr), driver(nullptr) {} + virtual ~RGWLC() override; + + void initialize(CephContext *_cct, rgw::sal::Driver* _driver); + void finalize(); + + int process(LCWorker* worker, + const std::unique_ptr& optional_bucket, + bool once); + int advance_head(const std::string& lc_shard, + rgw::sal::Lifecycle::LCHead& head, + rgw::sal::Lifecycle::LCEntry& entry, + time_t start_date); + int process(int index, int max_lock_secs, LCWorker* worker, bool once); + int process_bucket(int index, int max_lock_secs, LCWorker* worker, + const std::string& bucket_entry_marker, bool once); + bool expired_session(time_t started); + time_t thread_stop_at(); + int list_lc_progress(std::string& marker, uint32_t 
max_entries, + std::vector>&, + int& index); + int bucket_lc_process(std::string& shard_id, LCWorker* worker, time_t stop_at, + bool once); + int bucket_lc_post(int index, int max_lock_sec, + rgw::sal::Lifecycle::LCEntry& entry, int& result, LCWorker* worker); + bool going_down(); + void start_processor(); + void stop_processor(); + int set_bucket_config(rgw::sal::Bucket* bucket, + const rgw::sal::Attrs& bucket_attrs, + RGWLifecycleConfiguration *config); + int remove_bucket_config(rgw::sal::Bucket* bucket, + const rgw::sal::Attrs& bucket_attrs, + bool merge_attrs = true); + + CephContext *get_cct() const override { return cct; } + rgw::sal::Lifecycle* get_lc() const { return sal_lc.get(); } + unsigned get_subsys() const; + std::ostream& gen_prefix(std::ostream& out) const; + + private: + + int handle_multipart_expiration(rgw::sal::Bucket* target, + const std::multimap& prefix_map, + LCWorker* worker, time_t stop_at, bool once); +}; + +namespace rgw::lc { + +int fix_lc_shard_entry(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + rgw::sal::Lifecycle* sal_lc, + rgw::sal::Bucket* bucket); + +std::string s3_expiration_header( + DoutPrefixProvider* dpp, + const rgw_obj_key& obj_key, + const RGWObjTags& obj_tagset, + const ceph::real_time& mtime, + const std::map& bucket_attrs); + +bool s3_multipart_abort_header( + DoutPrefixProvider* dpp, + const rgw_obj_key& obj_key, + const ceph::real_time& mtime, + const std::map& bucket_attrs, + ceph::real_time& abort_date, + std::string& rule_id); + +} // namespace rgw::lc diff --git a/src/rgw/rgw_lc_s3.cc b/src/rgw/rgw_lc_s3.cc new file mode 100644 index 000000000..cf152b84a --- /dev/null +++ b/src/rgw/rgw_lc_s3.cc @@ -0,0 +1,353 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_user.h" +#include "rgw_lc_s3.h" + + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + 
+static bool check_date(const string& _date) +{ + boost::optional date = ceph::from_iso_8601(_date); + if (boost::none == date) { + return false; + } + struct timespec time = ceph::real_clock::to_timespec(*date); + if (time.tv_sec % (24*60*60) || time.tv_nsec) { + return false; + } + return true; +} + +void LCExpiration_S3::dump_xml(Formatter *f) const { + if (dm_expiration) { + encode_xml("ExpiredObjectDeleteMarker", "true", f); + } else if (!days.empty()) { + encode_xml("Days", days, f); + } else { + encode_xml("Date", date, f); + } +} + +void LCExpiration_S3::decode_xml(XMLObj *obj) +{ + bool has_days = RGWXMLDecoder::decode_xml("Days", days, obj); + bool has_date = RGWXMLDecoder::decode_xml("Date", date, obj); + string dm; + bool has_dm = RGWXMLDecoder::decode_xml("ExpiredObjectDeleteMarker", dm, obj); + + int num = !!has_days + !!has_date + !!has_dm; + + if (num != 1) { + throw RGWXMLDecoder::err("bad Expiration section"); + } + + if (has_date && !check_date(date)) { + //We need return xml error according to S3 + throw RGWXMLDecoder::err("bad date in Date section"); + } + + if (has_dm) { + dm_expiration = (dm == "true"); + } +} + +void LCNoncurExpiration_S3::decode_xml(XMLObj *obj) +{ + RGWXMLDecoder::decode_xml("NoncurrentDays", days, obj, true); +} + +void LCNoncurExpiration_S3::dump_xml(Formatter *f) const +{ + encode_xml("NoncurrentDays", days, f); +} + +void LCMPExpiration_S3::decode_xml(XMLObj *obj) +{ + RGWXMLDecoder::decode_xml("DaysAfterInitiation", days, obj, true); +} + +void LCMPExpiration_S3::dump_xml(Formatter *f) const +{ + encode_xml("DaysAfterInitiation", days, f); +} + +void RGWLifecycleConfiguration_S3::decode_xml(XMLObj *obj) +{ + if (!cct) { + throw RGWXMLDecoder::err("ERROR: RGWLifecycleConfiguration_S3 can't be decoded without cct initialized"); + } + vector rules; + + RGWXMLDecoder::decode_xml("Rule", rules, obj, true); + + for (auto& rule : rules) { + if (rule.get_id().empty()) { + // S3 generates a 48 bit random ID, maybe we could 
generate shorter IDs + static constexpr auto LC_ID_LENGTH = 48; + string id = gen_rand_alphanumeric_lower(cct, LC_ID_LENGTH); + rule.set_id(id); + } + + add_rule(rule); + } + + if (cct->_conf->rgw_lc_max_rules < rule_map.size()) { + stringstream ss; + ss << "Warn: The lifecycle config has too many rules, rule number is:" + << rule_map.size() << ", max number is:" << cct->_conf->rgw_lc_max_rules; + throw RGWXMLDecoder::err(ss.str()); + } +} + +void LCFilter_S3::dump_xml(Formatter *f) const +{ + bool multi = has_multi_condition(); + if (multi) { + f->open_array_section("And"); + } + if (has_prefix()) { + encode_xml("Prefix", prefix, f); + } + if (has_tags()) { + const auto& tagset_s3 = static_cast(obj_tags); + tagset_s3.dump_xml(f); + } + if (has_flags()) { + if (have_flag(LCFlagType::ArchiveZone)) { + encode_xml("ArchiveZone", "", f); + } + } + if (multi) { + f->close_section(); // And + } +} + +void LCFilter_S3::decode_xml(XMLObj *obj) +{ + /* + * The prior logic here looked for an And element, but did not + * structurally parse the Filter clause (and incorrectly rejected + * the base case where a Prefix and one Tag were supplied). It + * could not reject generally malformed Filter syntax. + * + * Empty filters are allowed: + * https://docs.aws.amazon.com/AmazonS3/latest/dev/intro-lifecycle-rules.html + */ + XMLObj* o = obj->find_first("And"); + if (o == nullptr){ + o = obj; + } + + RGWXMLDecoder::decode_xml("Prefix", prefix, o); + + /* parse optional ArchiveZone flag (extension) */ + if (o->find_first("ArchiveZone")) { + flags |= make_flag(LCFlagType::ArchiveZone); + } + + obj_tags.clear(); // why is this needed? 
+ auto tags_iter = o->find("Tag"); + while (auto tag_xml = tags_iter.get_next()){ + std::string _key,_val; + RGWXMLDecoder::decode_xml("Key", _key, tag_xml); + RGWXMLDecoder::decode_xml("Value", _val, tag_xml); + obj_tags.emplace_tag(std::move(_key), std::move(_val)); + } +} + +void LCTransition_S3::decode_xml(XMLObj *obj) +{ + bool has_days = RGWXMLDecoder::decode_xml("Days", days, obj); + bool has_date = RGWXMLDecoder::decode_xml("Date", date, obj); + if ((has_days && has_date) || (!has_days && !has_date)) { + throw RGWXMLDecoder::err("bad Transition section"); + } + + if (has_date && !check_date(date)) { + //We need return xml error according to S3 + throw RGWXMLDecoder::err("bad Date in Transition section"); + } + + if (!RGWXMLDecoder::decode_xml("StorageClass", storage_class, obj)) { + throw RGWXMLDecoder::err("missing StorageClass in Transition section"); + } +} + +void LCTransition_S3::dump_xml(Formatter *f) const { + if (!days.empty()) { + encode_xml("Days", days, f); + } else { + encode_xml("Date", date, f); + } + encode_xml("StorageClass", storage_class, f); +} + +void LCNoncurTransition_S3::decode_xml(XMLObj *obj) +{ + if (!RGWXMLDecoder::decode_xml("NoncurrentDays", days, obj)) { + throw RGWXMLDecoder::err("missing NoncurrentDays in NoncurrentVersionTransition section"); + } + if (!RGWXMLDecoder::decode_xml("StorageClass", storage_class, obj)) { + throw RGWXMLDecoder::err("missing StorageClass in NoncurrentVersionTransition section"); + } +} + +void LCNoncurTransition_S3::dump_xml(Formatter *f) const +{ + encode_xml("NoncurrentDays", days, f); + encode_xml("StorageClass", storage_class, f); +} + +void LCRule_S3::decode_xml(XMLObj *obj) +{ + id.clear(); + prefix.clear(); + status.clear(); + dm_expiration = false; + + RGWXMLDecoder::decode_xml("ID", id, obj); + + LCFilter_S3 filter_s3; + if (!RGWXMLDecoder::decode_xml("Filter", filter_s3, obj)) { + // Ideally the following code should be deprecated and we should return + // False here, The new S3 LC 
configuration xml spec. makes Filter mandatory + // and Prefix optional. However older clients including boto2 still generate + // xml according to the older spec, where Prefix existed outside of Filter + // and S3 itself seems to be sloppy on enforcing the mandatory Filter + // argument. A day will come when S3 enforces their own xml-spec, but it is + // not this day + + if (!RGWXMLDecoder::decode_xml("Prefix", prefix, obj)) { + throw RGWXMLDecoder::err("missing Prefix in Filter"); + } + } + filter = (LCFilter)filter_s3; + + if (!RGWXMLDecoder::decode_xml("Status", status, obj)) { + throw RGWXMLDecoder::err("missing Status in Filter"); + } + if (status.compare("Enabled") != 0 && status.compare("Disabled") != 0) { + throw RGWXMLDecoder::err("bad Status in Filter"); + } + + LCExpiration_S3 s3_expiration; + LCNoncurExpiration_S3 s3_noncur_expiration; + LCMPExpiration_S3 s3_mp_expiration; + LCFilter_S3 s3_filter; + + bool has_expiration = RGWXMLDecoder::decode_xml("Expiration", s3_expiration, obj); + bool has_noncur_expiration = RGWXMLDecoder::decode_xml("NoncurrentVersionExpiration", s3_noncur_expiration, obj); + bool has_mp_expiration = RGWXMLDecoder::decode_xml("AbortIncompleteMultipartUpload", s3_mp_expiration, obj); + + vector transitions; + vector noncur_transitions; + + bool has_transition = RGWXMLDecoder::decode_xml("Transition", transitions, obj); + bool has_noncur_transition = RGWXMLDecoder::decode_xml("NoncurrentVersionTransition", noncur_transitions, obj); + + if (!has_expiration && + !has_noncur_expiration && + !has_mp_expiration && + !has_transition && + !has_noncur_transition) { + throw RGWXMLDecoder::err("bad Rule"); + } + + if (has_expiration) { + if (s3_expiration.has_days() || + s3_expiration.has_date()) { + expiration = s3_expiration; + } else { + dm_expiration = s3_expiration.get_dm_expiration(); + } + } + if (has_noncur_expiration) { + noncur_expiration = s3_noncur_expiration; + } + if (has_mp_expiration) { + mp_expiration = s3_mp_expiration; + } 
+ for (auto& t : transitions) { + if (!add_transition(t)) { + throw RGWXMLDecoder::err("Failed to add transition"); + } + } + for (auto& t : noncur_transitions) { + if (!add_noncur_transition(t)) { + throw RGWXMLDecoder::err("Failed to add non-current version transition"); + } + } +} + +void LCRule_S3::dump_xml(Formatter *f) const { + encode_xml("ID", id, f); + // In case of an empty filter and an empty Prefix, we defer to Prefix. + if (!filter.empty()) { + const LCFilter_S3& lc_filter = static_cast(filter); + encode_xml("Filter", lc_filter, f); + } else { + encode_xml("Prefix", prefix, f); + } + encode_xml("Status", status, f); + if (!expiration.empty() || dm_expiration) { + LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(), dm_expiration); + encode_xml("Expiration", expir, f); + } + if (!noncur_expiration.empty()) { + const LCNoncurExpiration_S3& noncur_expir = static_cast(noncur_expiration); + encode_xml("NoncurrentVersionExpiration", noncur_expir, f); + } + if (!mp_expiration.empty()) { + const LCMPExpiration_S3& mp_expir = static_cast(mp_expiration); + encode_xml("AbortIncompleteMultipartUpload", mp_expir, f); + } + if (!transitions.empty()) { + for (auto &elem : transitions) { + const LCTransition_S3& tran = static_cast(elem.second); + encode_xml("Transition", tran, f); + } + } + if (!noncur_transitions.empty()) { + for (auto &elem : noncur_transitions) { + const LCNoncurTransition_S3& noncur_tran = static_cast(elem.second); + encode_xml("NoncurrentVersionTransition", noncur_tran, f); + } + } +} + +int RGWLifecycleConfiguration_S3::rebuild(RGWLifecycleConfiguration& dest) +{ + int ret = 0; + multimap::iterator iter; + for (iter = rule_map.begin(); iter != rule_map.end(); ++iter) { + LCRule& src_rule = iter->second; + ret = dest.check_and_add_rule(src_rule); + if (ret < 0) + return ret; + } + if (!dest.valid()) { + ret = -ERR_INVALID_REQUEST; + } + return ret; +} + + +void RGWLifecycleConfiguration_S3::dump_xml(Formatter *f) const +{ + for 
(auto iter = rule_map.begin(); iter != rule_map.end(); ++iter) { + const LCRule_S3& rule = static_cast(iter->second); + encode_xml("Rule", rule, f); + } +} + diff --git a/src/rgw/rgw_lc_s3.h b/src/rgw/rgw_lc_s3.h new file mode 100644 index 000000000..5486aef35 --- /dev/null +++ b/src/rgw/rgw_lc_s3.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include "include/str_list.h" +#include "rgw_lc.h" +#include "rgw_xml.h" +#include "rgw_tag_s3.h" + + +class LCFilter_S3 : public LCFilter +{ +public: + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class LCExpiration_S3 : public LCExpiration +{ +private: + bool dm_expiration{false}; +public: + LCExpiration_S3() {} + LCExpiration_S3(std::string _days, std::string _date, bool _dm_expiration) : LCExpiration(_days, _date), dm_expiration(_dm_expiration) {} + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); + + void set_dm_expiration(bool _dm_expiration) { + dm_expiration = _dm_expiration; + } + + bool get_dm_expiration() { + return dm_expiration; + } +}; + +class LCNoncurExpiration_S3 : public LCExpiration +{ +public: + LCNoncurExpiration_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +class LCMPExpiration_S3 : public LCExpiration +{ +public: + LCMPExpiration_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +class LCTransition_S3 : public LCTransition +{ +public: + LCTransition_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +class LCNoncurTransition_S3 : public LCTransition +{ +public: + LCNoncurTransition_S3() {} + ~LCNoncurTransition_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + + +class LCRule_S3 : public LCRule +{ +public: + LCRule_S3() {} + + void dump_xml(Formatter *f) const; + void 
decode_xml(XMLObj *obj); +}; + +class RGWLifecycleConfiguration_S3 : public RGWLifecycleConfiguration +{ +public: + explicit RGWLifecycleConfiguration_S3(CephContext *_cct) : RGWLifecycleConfiguration(_cct) {} + RGWLifecycleConfiguration_S3() : RGWLifecycleConfiguration(nullptr) {} + + void decode_xml(XMLObj *obj); + int rebuild(RGWLifecycleConfiguration& dest); + void dump_xml(Formatter *f) const; +}; diff --git a/src/rgw/rgw_ldap.cc b/src/rgw/rgw_ldap.cc new file mode 100644 index 000000000..7ad6b74b1 --- /dev/null +++ b/src/rgw/rgw_ldap.cc @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_ldap.h" + +#include "common/ceph_crypto.h" +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/dout.h" +#include "common/safe_io.h" +#include + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +std::string parse_rgw_ldap_bindpw(CephContext* ctx) +{ + string ldap_bindpw; + string ldap_secret = ctx->_conf->rgw_ldap_secret; + + if (ldap_secret.empty()) { + ldout(ctx, 10) + << __func__ << " LDAP auth no rgw_ldap_secret file found in conf" + << dendl; + } else { + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. 
+ char bindpw[1024]; + memset(bindpw, 0, 1024); + int pwlen = safe_read_file("" /* base */, ldap_secret.c_str(), + bindpw, 1023); + if (pwlen > 0) { + ldap_bindpw = bindpw; + boost::algorithm::trim(ldap_bindpw); + if (ldap_bindpw.back() == '\n') + ldap_bindpw.pop_back(); + } + ::ceph::crypto::zeroize_for_security(bindpw, sizeof(bindpw)); + } + + return ldap_bindpw; +} + +#if defined(HAVE_OPENLDAP) +namespace rgw { + + int LDAPHelper::auth(const std::string &uid, const std::string &pwd) { + int ret; + std::string filter; + if (msad) { + filter = "(&(objectClass=user)(sAMAccountName="; + filter += uid; + filter += "))"; + } else { + /* openldap */ + if (searchfilter.empty()) { + /* no search filter provided in config, we construct our own */ + filter = "("; + filter += dnattr; + filter += "="; + filter += uid; + filter += ")"; + } else { + if (searchfilter.find("@USERNAME@") != std::string::npos) { + /* we need to substitute the @USERNAME@ placeholder */ + filter = searchfilter; + filter.replace(searchfilter.find("@USERNAME@"), std::string("@USERNAME@").length(), uid); + } else { + /* no placeholder for username, so we need to append our own username filter to the custom searchfilter */ + filter = "(&("; + filter += searchfilter; + filter += ")("; + filter += dnattr; + filter += "="; + filter += uid; + filter += "))"; + } + } + } + ldout(g_ceph_context, 12) + << __func__ << " search filter: " << filter + << dendl; + char *attrs[] = { const_cast(dnattr.c_str()), nullptr }; + LDAPMessage *answer = nullptr, *entry = nullptr; + bool once = true; + + lock_guard guard(mtx); + + retry_bind: + ret = ldap_search_s(ldap, searchdn.c_str(), LDAP_SCOPE_SUBTREE, + filter.c_str(), attrs, 0, &answer); + if (ret == LDAP_SUCCESS) { + entry = ldap_first_entry(ldap, answer); + if (entry) { + char *dn = ldap_get_dn(ldap, entry); + ret = simple_bind(dn, pwd); + if (ret != LDAP_SUCCESS) { + ldout(g_ceph_context, 10) + << __func__ << " simple_bind failed uid=" << uid + << "ldap err=" << ret 
+ << dendl; + } + ldap_memfree(dn); + } else { + ldout(g_ceph_context, 12) + << __func__ << " ldap_search_s no user matching uid=" << uid + << dendl; + ret = LDAP_NO_SUCH_ATTRIBUTE; // fixup result + } + ldap_msgfree(answer); + } else { + ldout(g_ceph_context, 5) + << __func__ << " ldap_search_s error uid=" << uid + << " ldap err=" << ret + << dendl; + /* search should never fail--try to rebind */ + if (once) { + rebind(); + once = false; + goto retry_bind; + } + } + return (ret == LDAP_SUCCESS) ? ret : -EACCES; + } /* LDAPHelper::auth */ +} + +#endif /* defined(HAVE_OPENLDAP) */ diff --git a/src/rgw/rgw_ldap.h b/src/rgw/rgw_ldap.h new file mode 100644 index 000000000..05a48ce19 --- /dev/null +++ b/src/rgw/rgw_ldap.h @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "acconfig.h" + +#if defined(HAVE_OPENLDAP) +#define LDAP_DEPRECATED 1 +#include "ldap.h" +#endif + +#include +#include +#include +#include +#include +#include + +namespace rgw { + +#if defined(HAVE_OPENLDAP) + + class LDAPHelper + { + std::string uri; + std::string binddn; + std::string bindpw; + std::string searchdn; + std::string searchfilter; + std::string dnattr; + LDAP *ldap; + bool msad = false; /* TODO: possible future specialization */ + std::mutex mtx; + + public: + using lock_guard = std::lock_guard; + + LDAPHelper(std::string _uri, std::string _binddn, std::string _bindpw, + const std::string &_searchdn, const std::string &_searchfilter, const std::string &_dnattr) + : uri(std::move(_uri)), binddn(std::move(_binddn)), + bindpw(std::move(_bindpw)), searchdn(_searchdn), searchfilter(_searchfilter), dnattr(_dnattr), + ldap(nullptr) { + // nothing + } + + int init() { + int ret; + ret = ldap_initialize(&ldap, uri.c_str()); + if (ret == LDAP_SUCCESS) { + unsigned long ldap_ver = LDAP_VERSION3; + ret = ldap_set_option(ldap, LDAP_OPT_PROTOCOL_VERSION, + (void*) &ldap_ver); + } + if (ret == 
LDAP_SUCCESS) { + ret = ldap_set_option(ldap, LDAP_OPT_REFERRALS, LDAP_OPT_OFF); + } + return (ret == LDAP_SUCCESS) ? ret : -EINVAL; + } + + int bind() { + int ret; + ret = ldap_simple_bind_s(ldap, binddn.c_str(), bindpw.c_str()); + return (ret == LDAP_SUCCESS) ? ret : -EINVAL; + } + + int rebind() { + if (ldap) { + (void) ldap_unbind(ldap); + (void) init(); + return bind(); + } + return -EINVAL; + } + + int simple_bind(const char *dn, const std::string& pwd) { + LDAP* tldap; + int ret = ldap_initialize(&tldap, uri.c_str()); + if (ret == LDAP_SUCCESS) { + unsigned long ldap_ver = LDAP_VERSION3; + ret = ldap_set_option(tldap, LDAP_OPT_PROTOCOL_VERSION, + (void*) &ldap_ver); + if (ret == LDAP_SUCCESS) { + ret = ldap_simple_bind_s(tldap, dn, pwd.c_str()); + } + (void) ldap_unbind(tldap); + } + return ret; // OpenLDAP client error space + } + + int auth(const std::string &uid, const std::string &pwd); + + ~LDAPHelper() { + if (ldap) + (void) ldap_unbind(ldap); + } + + }; /* LDAPHelper */ + +#else + + class LDAPHelper + { + public: + LDAPHelper(const std::string &_uri, const std::string &_binddn, const std::string &_bindpw, + const std::string &_searchdn, const std::string &_searchfilter, const std::string &_dnattr) + {} + + int init() { + return -ENOTSUP; + } + + int bind() { + return -ENOTSUP; + } + + int auth(const std::string &uid, const std::string &pwd) { + return -EACCES; + } + + ~LDAPHelper() {} + + }; /* LDAPHelper */ + + +#endif /* HAVE_OPENLDAP */ + +} /* namespace rgw */ + +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/dout.h" +#include "common/safe_io.h" +#include + +#include "include/ceph_assert.h" + +std::string parse_rgw_ldap_bindpw(CephContext* ctx); diff --git a/src/rgw/rgw_lib.cc b/src/rgw/rgw_lib.cc new file mode 100644 index 000000000..f449cce21 --- /dev/null +++ b/src/rgw/rgw_lib.cc @@ -0,0 +1,610 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* 
+ * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include + +#include "include/rados/librgw.h" +#include "rgw_acl.h" + +#include "include/str_list.h" +#include "global/signal_handler.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "common/ceph_argparse.h" +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/dout.h" + +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_log.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_auth.h" +#include "rgw_lib.h" +#include "rgw_lib_frontend.h" +#include "rgw_perf_counters.h" +#include "rgw_signal.h" +#include "rgw_main.h" + +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw { + + RGWLib* g_rgwlib = nullptr; + + class C_InitTimeout : public Context { + public: + C_InitTimeout() {} + void finish(int r) override { + derr << "Initialization timeout, failed to initialize" << dendl; + exit(1); + } + }; + + void RGWLibProcess::checkpoint() + { + m_tp.drain(&req_wq); + } + +#define MIN_EXPIRE_S 120 + + void RGWLibProcess::run() + { + /* write completion interval */ + RGWLibFS::write_completion_interval_s = + cct->_conf->rgw_nfs_write_completion_interval_s; + + /* start write timer */ + RGWLibFS::write_timer.resume(); + + /* gc loop */ + while (! 
shutdown) { + lsubdout(cct, rgw, 5) << "RGWLibProcess GC" << dendl; + + /* dirent invalidate timeout--basically, the upper-bound on + * inconsistency with the S3 namespace */ + auto expire_s = cct->_conf->rgw_nfs_namespace_expire_secs; + + /* delay between gc cycles */ + auto delay_s = std::max(int64_t(1), std::min(int64_t(MIN_EXPIRE_S), expire_s/2)); + + unique_lock uniq(mtx); + restart: + int cur_gen = gen; + for (auto iter = mounted_fs.begin(); iter != mounted_fs.end(); + ++iter) { + RGWLibFS* fs = iter->first->ref(); + uniq.unlock(); + fs->gc(); + const DoutPrefix dp(cct, dout_subsys, "librgw: "); + fs->update_user(&dp); + fs->rele(); + uniq.lock(); + if (cur_gen != gen) + goto restart; /* invalidated */ + } + cv.wait_for(uniq, std::chrono::seconds(delay_s)); + uniq.unlock(); + } + } + + void RGWLibProcess::handle_request(const DoutPrefixProvider *dpp, RGWRequest* r) + { + /* + * invariant: valid requests are derived from RGWLibRequst + */ + RGWLibRequest* req = static_cast(r); + + // XXX move RGWLibIO and timing setup into process_request + +#if 0 /* XXX */ + utime_t tm = ceph_clock_now(); +#endif + + RGWLibIO io_ctx; + + int ret = process_request(req, &io_ctx); + if (ret < 0) { + /* we don't really care about return code */ + dout(20) << "process_request() returned " << ret << dendl; + + } + delete req; + } /* handle_request */ + + int RGWLibProcess::process_request(RGWLibRequest* req) + { + // XXX move RGWLibIO and timing setup into process_request + +#if 0 /* XXX */ + utime_t tm = ceph_clock_now(); +#endif + + RGWLibIO io_ctx; + + int ret = process_request(req, &io_ctx); + if (ret < 0) { + /* we don't really care about return code */ + dout(20) << "process_request() returned " << ret << dendl; + } + return ret; + } /* process_request */ + + static inline void abort_req(req_state *s, RGWOp *op, int err_no) + { + if (!s) + return; + + /* XXX the dump_errno and dump_bucket_from_state behaviors in + * the abort_early (rgw_rest.cc) might be valuable, but aren't 
+ * safe to call presently as they return HTTP data */ + + perfcounter->inc(l_rgw_failed_req); + } /* abort_req */ + + int RGWLibProcess::process_request(RGWLibRequest* req, RGWLibIO* io) + { + int ret = 0; + bool should_log = true; // XXX + + dout(1) << "====== " << __func__ + << " starting new request req=" << hex << req << dec + << " ======" << dendl; + + /* + * invariant: valid requests are derived from RGWOp--well-formed + * requests should have assigned RGWRequest::op in their descendant + * constructor--if not, the compiler can find it, at the cost of + * a runtime check + */ + RGWOp *op = (req->op) ? req->op : dynamic_cast(req); + if (! op) { + ldpp_dout(op, 1) << "failed to derive cognate RGWOp (invalid op?)" << dendl; + return -EINVAL; + } + + io->init(req->cct); + + perfcounter->inc(l_rgw_req); + + RGWEnv& rgw_env = io->get_env(); + + /* XXX + * until major refactoring of req_state and req_info, we need + * to build their RGWEnv boilerplate from the RGWLibRequest, + * pre-staging any strings (HTTP_HOST) that provoke a crash when + * not found + */ + + /* XXX for now, use ""; could be a legit hostname, or, in future, + * perhaps a tenant (Yehuda) */ + rgw_env.set("HTTP_HOST", ""); + + /* XXX and -then- bloat up req_state with string copies from it */ + req_state rstate(req->cct, env, &rgw_env, req->id); + req_state *s = &rstate; + + // XXX fix this + s->cio = io; + + /* XXX and -then- stash req_state pointers everywhere they are needed */ + ret = req->init(rgw_env, env.driver, io, s); + if (ret < 0) { + ldpp_dout(op, 10) << "failed to initialize request" << dendl; + abort_req(s, op, ret); + goto done; + } + + /* req is-a RGWOp, currently initialized separately */ + ret = req->op_init(); + if (ret < 0) { + dout(10) << "failed to initialize RGWOp" << dendl; + abort_req(s, op, ret); + goto done; + } + + /* now expected by rgw_log_op() */ + rgw_env.set("REQUEST_METHOD", s->info.method); + rgw_env.set("REQUEST_URI", s->info.request_uri); + 
rgw_env.set("QUERY_STRING", ""); + + try { + /* XXX authorize does less here then in the REST path, e.g., + * the user's info is cached, but still incomplete */ + ldpp_dout(s, 2) << "authorizing" << dendl; + ret = req->authorize(op, null_yield); + if (ret < 0) { + dout(10) << "failed to authorize request" << dendl; + abort_req(s, op, ret); + goto done; + } + + /* FIXME: remove this after switching all handlers to the new + * authentication infrastructure. */ + if (! s->auth.identity) { + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } + + ldpp_dout(s, 2) << "reading op permissions" << dendl; + ret = req->read_permissions(op, null_yield); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "init op" << dendl; + ret = op->init_processing(null_yield); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "verifying op mask" << dendl; + ret = op->verify_op_mask(); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "verifying op permissions" << dendl; + ret = op->verify_permission(null_yield); + if (ret < 0) { + if (s->system_request) { + ldpp_dout(op, 2) << "overriding permissions due to system operation" << dendl; + } else if (s->auth.identity->is_admin_of(s->user->get_id())) { + ldpp_dout(op, 2) << "overriding permissions due to admin operation" << dendl; + } else { + abort_req(s, op, ret); + goto done; + } + } + + ldpp_dout(s, 2) << "verifying op params" << dendl; + ret = op->verify_params(); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "executing" << dendl; + op->pre_exec(); + op->execute(null_yield); + op->complete(); + + } catch (const ceph::crypto::DigestException& e) { + dout(0) << "authentication failed" << e.what() << dendl; + abort_req(s, op, -ERR_INVALID_SECRET_KEY); + } + + done: + try { + io->complete_request(); + } catch (rgw::io::Exception& e) { + dout(0) << "ERROR: io->complete_request() returned " + << e.what() << 
dendl; + } + if (should_log) { + rgw_log_op(nullptr /* !rest */, s, op, env.olog); + } + + int http_ret = s->err.http_ret; + + ldpp_dout(s, 2) << "http status=" << http_ret << dendl; + + ldpp_dout(op, 1) << "====== " << __func__ + << " req done req=" << hex << req << dec << " http_status=" + << http_ret + << " ======" << dendl; + + return (ret < 0 ? ret : s->err.ret); + } /* process_request */ + + int RGWLibProcess::start_request(RGWLibContinuedReq* req) + { + + dout(1) << "====== " << __func__ + << " starting new continued request req=" << hex << req << dec + << " ======" << dendl; + + /* + * invariant: valid requests are derived from RGWOp--well-formed + * requests should have assigned RGWRequest::op in their descendant + * constructor--if not, the compiler can find it, at the cost of + * a runtime check + */ + RGWOp *op = (req->op) ? req->op : dynamic_cast(req); + if (! op) { + ldpp_dout(op, 1) << "failed to derive cognate RGWOp (invalid op?)" << dendl; + return -EINVAL; + } + + req_state* s = req->get_state(); + RGWLibIO& io_ctx = req->get_io(); + RGWEnv& rgw_env = io_ctx.get_env(); + + rgw_env.set("HTTP_HOST", ""); + + int ret = req->init(rgw_env, env.driver, &io_ctx, s); + if (ret < 0) { + ldpp_dout(op, 10) << "failed to initialize request" << dendl; + abort_req(s, op, ret); + goto done; + } + + /* req is-a RGWOp, currently initialized separately */ + ret = req->op_init(); + if (ret < 0) { + dout(10) << "failed to initialize RGWOp" << dendl; + abort_req(s, op, ret); + goto done; + } + + /* XXX authorize does less here then in the REST path, e.g., + * the user's info is cached, but still incomplete */ + ldpp_dout(s, 2) << "authorizing" << dendl; + ret = req->authorize(op, null_yield); + if (ret < 0) { + dout(10) << "failed to authorize request" << dendl; + abort_req(s, op, ret); + goto done; + } + + /* FIXME: remove this after switching all handlers to the new authentication + * infrastructure. */ + if (! 
s->auth.identity) { + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } + + ldpp_dout(s, 2) << "reading op permissions" << dendl; + ret = req->read_permissions(op, null_yield); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "init op" << dendl; + ret = op->init_processing(null_yield); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "verifying op mask" << dendl; + ret = op->verify_op_mask(); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + ldpp_dout(s, 2) << "verifying op permissions" << dendl; + ret = op->verify_permission(null_yield); + if (ret < 0) { + if (s->system_request) { + ldpp_dout(op, 2) << "overriding permissions due to system operation" << dendl; + } else if (s->auth.identity->is_admin_of(s->user->get_id())) { + ldpp_dout(op, 2) << "overriding permissions due to admin operation" << dendl; + } else { + abort_req(s, op, ret); + goto done; + } + } + + ldpp_dout(s, 2) << "verifying op params" << dendl; + ret = op->verify_params(); + if (ret < 0) { + abort_req(s, op, ret); + goto done; + } + + op->pre_exec(); + req->exec_start(); + + done: + return (ret < 0 ? ret : s->err.ret); + } + + int RGWLibProcess::finish_request(RGWLibContinuedReq* req) + { + RGWOp *op = (req->op) ? req->op : dynamic_cast(req); + if (! 
op) { + ldpp_dout(op, 1) << "failed to derive cognate RGWOp (invalid op?)" << dendl; + return -EINVAL; + } + + int ret = req->exec_finish(); + int op_ret = op->get_ret(); + + ldpp_dout(op, 1) << "====== " << __func__ + << " finishing continued request req=" << hex << req << dec + << " op status=" << op_ret + << " ======" << dendl; + + perfcounter->inc(l_rgw_req); + + return ret; + } + + int RGWLibFrontend::init() + { + std::string uri_prefix; // empty + pprocess = new RGWLibProcess(g_ceph_context, env, + g_conf()->rgw_thread_pool_size, uri_prefix, conf); + return 0; + } + + void RGWLib::set_fe(rgw::RGWLibFrontend* fe) + { + this->fe = fe; + } + + int RGWLib::init() + { + vector args; + return init(args); + } + + int RGWLib::init(vector& args) + { + /* alternative default for module */ + map defaults = { + { "debug_rgw", "1/5" }, + { "keyring", "$rgw_data/keyring" }, + { "log_file", "/var/log/radosgw/$cluster-$name.log" }, + { "objecter_inflight_ops", "24576" }, + // require a secure mon connection by default + { "ms_mon_client_mode", "secure" }, + { "auth_client_required", "cephx" }, + }; + + cct = rgw_global_init(&defaults, args, + CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + + ceph::mutex mutex = ceph::make_mutex("main"); + SafeTimer init_timer(g_ceph_context, mutex); + init_timer.init(); + mutex.lock(); + init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout); + mutex.unlock(); + + /* stage all front-ends (before common-init-finish) */ + main.init_frontends1(true /* nfs */); + + common_init_finish(g_ceph_context); + + main.init_perfcounters(); + main.init_http_clients(); + + main.init_storage(); + if (! 
main.get_driver()) { + mutex.lock(); + init_timer.cancel_all_events(); + init_timer.shutdown(); + mutex.unlock(); + + derr << "Couldn't init storage provider (RADOS)" << dendl; + return -EIO; + } + + main.cond_init_apis(); + + mutex.lock(); + init_timer.cancel_all_events(); + init_timer.shutdown(); + mutex.unlock(); + + main.init_ldap(); + main.init_opslog(); + + init_async_signal_handler(); + register_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm); + + main.init_tracepoints(); + main.init_frontends2(this /* rgwlib */); + main.init_notification_endpoints(); + main.init_lua(); + + return 0; + } /* RGWLib::init() */ + + int RGWLib::stop() + { + derr << "shutting down" << dendl; + + const auto finalize_async_signals = []() { + unregister_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm); + shutdown_async_signal_handler(); + }; + + main.shutdown(finalize_async_signals); + + return 0; + } /* RGWLib::stop() */ + + int RGWLibIO::set_uid(rgw::sal::Driver* driver, const rgw_user& uid) + { + const DoutPrefix dp(driver->ctx(), dout_subsys, "librgw: "); + std::unique_ptr user = driver->get_user(uid); + /* object exists, but policy is broken */ + int ret = user->load_user(&dp, null_yield); + if (ret < 0) { + derr << "ERROR: failed reading user info: uid=" << uid << " ret=" + << ret << dendl; + } + user_info = user->get_info(); + return ret; + } + + int RGWLibRequest::read_permissions(RGWOp* op, optional_yield y) { + /* bucket and object ops */ + int ret = + rgw_build_bucket_policies(op, g_rgwlib->get_driver(), get_state(), y); + if (ret < 0) { + ldpp_dout(op, 10) << "read_permissions (bucket policy) on " + << get_state()->bucket << ":" + << get_state()->object + << " only_bucket=" << only_bucket() + << " ret=" << ret << dendl; + if (ret == -ENODATA) + ret = -EACCES; + } else if (! 
only_bucket()) { + /* object ops */ + ret = rgw_build_object_policies(op, g_rgwlib->get_driver(), get_state(), + op->prefetch_data(), y); + if (ret < 0) { + ldpp_dout(op, 10) << "read_permissions (object policy) on" + << get_state()->bucket << ":" + << get_state()->object + << " ret=" << ret << dendl; + if (ret == -ENODATA) + ret = -EACCES; + } + } + return ret; + } /* RGWLibRequest::read_permissions */ + + int RGWHandler_Lib::authorize(const DoutPrefixProvider *dpp, optional_yield y) + { + /* TODO: handle + * 1. subusers + * 2. anonymous access + * 3. system access + * 4. ? + * + * Much or all of this depends on handling the cached authorization + * correctly (e.g., dealing with keystone) at mount time. + */ + s->perm_mask = RGW_PERM_FULL_CONTROL; + + // populate the owner info + s->owner.set_id(s->user->get_id()); + s->owner.set_name(s->user->get_display_name()); + + return 0; + } /* RGWHandler_Lib::authorize */ + +} /* namespace rgw */ diff --git a/src/rgw/rgw_lib.h b/src/rgw/rgw_lib.h new file mode 100644 index 000000000..1ad54b49b --- /dev/null +++ b/src/rgw/rgw_lib.h @@ -0,0 +1,209 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_common.h" +#include "rgw_client_io.h" +#include "rgw_rest.h" +#include "rgw_request.h" +#include "rgw_ldap.h" +#include "include/ceph_assert.h" +#include "rgw_main.h" + +class OpsLogSink; + +namespace rgw { + + class RGWLibFrontend; + + class RGWLib : public DoutPrefixProvider { + boost::intrusive_ptr cct; + AppMain main; + RGWLibFrontend* fe; + + public: + RGWLib() : main(this), fe(nullptr) + {} + ~RGWLib() {} + + rgw::sal::Driver* get_driver() { return main.get_driver(); } + + RGWLibFrontend* get_fe() { return fe; } + + rgw::LDAPHelper* get_ldh() { return main.get_ldh(); } + CephContext *get_cct() const override { return cct.get(); } + unsigned get_subsys() const { return ceph_subsys_rgw; } + std::ostream& 
gen_prefix(std::ostream& out) const { return out << "lib rgw: "; } + + void set_fe(RGWLibFrontend* fe); + + int init(); + int init(std::vector& args); + int stop(); + }; + + extern RGWLib* g_rgwlib; + +/* request interface */ + + class RGWLibIO : public rgw::io::BasicClient, + public rgw::io::Accounter + { + RGWUserInfo user_info; + RGWEnv env; + public: + RGWLibIO() { + get_env().set("HTTP_HOST", ""); + } + explicit RGWLibIO(const RGWUserInfo &_user_info) + : user_info(_user_info) {} + + int init_env(CephContext *cct) override { + env.init(cct); + return 0; + } + + const RGWUserInfo& get_user() { + return user_info; + } + + int set_uid(rgw::sal::Driver* driver, const rgw_user& uid); + + int write_data(const char *buf, int len); + int read_data(char *buf, int len); + int send_status(int status, const char *status_name); + int send_100_continue(); + int complete_header(); + int send_content_length(uint64_t len); + + RGWEnv& get_env() noexcept override { + return env; + } + + size_t complete_request() override { /* XXX */ + return 0; + }; + + void set_account(bool) override { + return; + } + + uint64_t get_bytes_sent() const override { + return 0; + } + + uint64_t get_bytes_received() const override { + return 0; + } + + }; /* RGWLibIO */ + + class RGWRESTMgr_Lib : public RGWRESTMgr { + public: + RGWRESTMgr_Lib() {} + ~RGWRESTMgr_Lib() override {} + }; /* RGWRESTMgr_Lib */ + + class RGWHandler_Lib : public RGWHandler { + friend class RGWRESTMgr_Lib; + public: + + int authorize(const DoutPrefixProvider *dpp, optional_yield y) override; + + RGWHandler_Lib() {} + ~RGWHandler_Lib() override {} + static int init_from_header(rgw::sal::Driver* driver, + req_state *s); + }; /* RGWHandler_Lib */ + + class RGWLibRequest : public RGWRequest, + public RGWHandler_Lib { + private: + std::unique_ptr tuser; // Don't use this. It's empty except during init. 
+ public: + CephContext* cct; + + /* unambiguiously return req_state */ + inline req_state* get_state() { return this->RGWRequest::s; } + + RGWLibRequest(CephContext* _cct, std::unique_ptr _user) + : RGWRequest(g_rgwlib->get_driver()->get_new_req_id()), + tuser(std::move(_user)), cct(_cct) + {} + + int postauth_init(optional_yield) override { return 0; } + + /* descendant equivalent of *REST*::init_from_header(...): + * prepare request for execute()--should mean, fixup URI-alikes + * and any other expected stat vars in local req_state, for + * now */ + virtual int header_init() = 0; + + /* descendant initializer responsible to call RGWOp::init()--which + * descendants are required to inherit */ + virtual int op_init() = 0; + + using RGWHandler::init; + + int init(const RGWEnv& rgw_env, rgw::sal::Driver* _driver, + RGWLibIO* io, req_state* _s) { + + RGWRequest::init_state(_s); + RGWHandler::init(_driver, _s, io); + + get_state()->req_id = driver->zone_unique_id(id); + get_state()->trans_id = driver->zone_unique_trans_id(id); + get_state()->bucket_tenant = tuser->get_tenant(); + get_state()->set_user(tuser); + + ldpp_dout(_s, 2) << "initializing for trans_id = " + << get_state()->trans_id.c_str() << dendl; + + int ret = header_init(); + if (ret == 0) { + ret = init_from_header(driver, _s); + } + return ret; + } + + virtual bool only_bucket() = 0; + + int read_permissions(RGWOp *op, optional_yield y) override; + + }; /* RGWLibRequest */ + + class RGWLibContinuedReq : public RGWLibRequest { + RGWLibIO io_ctx; + req_state rstate; + public: + + RGWLibContinuedReq(CephContext* _cct, const RGWProcessEnv& penv, + std::unique_ptr _user) + : RGWLibRequest(_cct, std::move(_user)), io_ctx(), + rstate(_cct, penv, &io_ctx.get_env(), id) + { + io_ctx.init(_cct); + + RGWRequest::init_state(&rstate); + RGWHandler::init(g_rgwlib->get_driver(), &rstate, &io_ctx); + + get_state()->req_id = driver->zone_unique_id(id); + get_state()->trans_id = driver->zone_unique_trans_id(id); + + 
ldpp_dout(get_state(), 2) << "initializing for trans_id = " + << get_state()->trans_id.c_str() << dendl; + } + + inline rgw::sal::Driver* get_driver() { return driver; } + inline RGWLibIO& get_io() { return io_ctx; } + + virtual int execute() final { ceph_abort(); } + virtual int exec_start() = 0; + virtual int exec_continue() = 0; + virtual int exec_finish() = 0; + + }; /* RGWLibContinuedReq */ + +} /* namespace rgw */ diff --git a/src/rgw/rgw_lib_frontend.h b/src/rgw/rgw_lib_frontend.h new file mode 100644 index 000000000..1772724d2 --- /dev/null +++ b/src/rgw/rgw_lib_frontend.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "rgw_lib.h" +#include "rgw_file.h" + +namespace rgw { + + class RGWLibProcess : public RGWProcess { + RGWAccessKey access_key; + std::mutex mtx; + std::condition_variable cv; + int gen; + bool shutdown; + + typedef flat_map FSMAP; + FSMAP mounted_fs; + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + public: + RGWLibProcess(CephContext* cct, RGWProcessEnv& pe, int num_threads, + std::string uri_prefix, RGWFrontendConfig* _conf) : + RGWProcess(cct, pe, num_threads, std::move(uri_prefix), _conf), + gen(0), shutdown(false) {} + + void run() override; + void checkpoint(); + + void stop() { + shutdown = true; + for (const auto& fs: mounted_fs) { + fs.second->stop(); + } + cv.notify_all(); + } + + void register_fs(RGWLibFS* fs) { + lock_guard guard(mtx); + mounted_fs.insert(FSMAP::value_type(fs, fs)); + ++gen; + } + + void unregister_fs(RGWLibFS* fs) { + lock_guard guard(mtx); + FSMAP::iterator it = mounted_fs.find(fs); + if (it != mounted_fs.end()) { + mounted_fs.erase(it); + ++gen; + } + } + + void enqueue_req(RGWLibRequest* req) { + + lsubdout(g_ceph_context, rgw, 10) + << __func__ << " enqueue request req=" + << std::hex << req << std::dec << dendl; + + req_throttle.get(1); + req_wq.queue(req); + 
} /* enqueue_req */ + + /* "regular" requests */ + void handle_request(const DoutPrefixProvider *dpp, RGWRequest* req) override; // async handler, deletes req + int process_request(RGWLibRequest* req); + int process_request(RGWLibRequest* req, RGWLibIO* io); + void set_access_key(RGWAccessKey& key) { access_key = key; } + + /* requests w/continue semantics */ + int start_request(RGWLibContinuedReq* req); + int finish_request(RGWLibContinuedReq* req); + }; /* RGWLibProcess */ + + class RGWLibFrontend : public RGWProcessFrontend { + public: + RGWLibFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf) + : RGWProcessFrontend(pe, _conf) {} + + int init() override; + + void stop() override { + RGWProcessFrontend::stop(); + get_process()->stop(); + } + + RGWLibProcess* get_process() { + return static_cast(pprocess); + } + + inline void enqueue_req(RGWLibRequest* req) { + static_cast(pprocess)->enqueue_req(req); // async + } + + inline int execute_req(RGWLibRequest* req) { + return static_cast(pprocess)->process_request(req); // !async + } + + inline int start_req(RGWLibContinuedReq* req) { + return static_cast(pprocess)->start_request(req); + } + + inline int finish_req(RGWLibContinuedReq* req) { + return static_cast(pprocess)->finish_request(req); + } + + }; /* RGWLibFrontend */ + +} /* namespace rgw */ diff --git a/src/rgw/rgw_loadgen.cc b/src/rgw/rgw_loadgen.cc new file mode 100644 index 000000000..015057e9c --- /dev/null +++ b/src/rgw/rgw_loadgen.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include "rgw_loadgen.h" +#include "rgw_auth_s3.h" + + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWLoadGenRequestEnv::set_date(utime_t& tm) +{ + date_str = rgw_to_asctime(tm); +} + +int RGWLoadGenRequestEnv::sign(const DoutPrefixProvider *dpp, RGWAccessKey& access_key) +{ + meta_map_t meta_map; + map sub_resources; + + string 
canonical_header; + string digest; + + rgw_create_s3_canonical_header(dpp, + request_method.c_str(), + nullptr, /* const char *content_md5 */ + content_type.c_str(), + date_str.c_str(), + meta_map, + meta_map_t{}, + uri.c_str(), + sub_resources, + canonical_header); + + headers["HTTP_DATE"] = date_str; + try { + /* FIXME(rzarzynski): kill the dependency on g_ceph_context. */ + const auto signature = static_cast( + rgw::auth::s3::get_v2_signature(g_ceph_context, canonical_header, + access_key.key)); + headers["HTTP_AUTHORIZATION"] = \ + std::string("AWS ") + access_key.id + ":" + signature; + } catch (int ret) { + return ret; + } + + return 0; +} + +size_t RGWLoadGenIO::write_data(const char* const buf, + const size_t len) +{ + return len; +} + +size_t RGWLoadGenIO::read_data(char* const buf, const size_t len) +{ + const size_t read_len = std::min(left_to_read, + static_cast(len)); + left_to_read -= read_len; + return read_len; +} + +void RGWLoadGenIO::flush() +{ +} + +size_t RGWLoadGenIO::complete_request() +{ + return 0; +} + +int RGWLoadGenIO::init_env(CephContext *cct) +{ + env.init(cct); + + left_to_read = req->content_length; + + char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)req->content_length); + env.set("CONTENT_LENGTH", buf); + + env.set("CONTENT_TYPE", req->content_type.c_str()); + env.set("HTTP_DATE", req->date_str.c_str()); + + for (map::iterator iter = req->headers.begin(); iter != req->headers.end(); ++iter) { + env.set(iter->first.c_str(), iter->second.c_str()); + } + + env.set("REQUEST_METHOD", req->request_method.c_str()); + env.set("REQUEST_URI", req->uri.c_str()); + env.set("QUERY_STRING", req->query_string.c_str()); + env.set("SCRIPT_URI", req->uri.c_str()); + + char port_buf[16]; + snprintf(port_buf, sizeof(port_buf), "%d", req->port); + env.set("SERVER_PORT", port_buf); + return 0; +} + +size_t RGWLoadGenIO::send_status(const int status, + const char* const status_name) +{ + return 0; +} + +size_t 
RGWLoadGenIO::send_100_continue() +{ + return 0; +} + +size_t RGWLoadGenIO::send_header(const std::string_view& name, + const std::string_view& value) +{ + return 0; +} + +size_t RGWLoadGenIO::complete_header() +{ + return 0; +} + +size_t RGWLoadGenIO::send_content_length(const uint64_t len) +{ + return 0; +} diff --git a/src/rgw/rgw_loadgen.h b/src/rgw/rgw_loadgen.h new file mode 100644 index 000000000..7f3f847c2 --- /dev/null +++ b/src/rgw/rgw_loadgen.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include + +#include "rgw_client_io.h" + + +struct RGWLoadGenRequestEnv { + int port; + uint64_t content_length; + std::string content_type; + std::string request_method; + std::string uri; + std::string query_string; + std::string date_str; + + std::map headers; + + RGWLoadGenRequestEnv() + : port(0), + content_length(0) { + } + + void set_date(utime_t& tm); + int sign(const DoutPrefixProvider *dpp, RGWAccessKey& access_key); +}; + +/* XXX does RGWLoadGenIO actually want to perform stream/HTTP I/O, + * or (e.g) are these NOOPs? 
*/ +class RGWLoadGenIO : public rgw::io::RestfulClient +{ + uint64_t left_to_read; + RGWLoadGenRequestEnv* req; + RGWEnv env; + + int init_env(CephContext *cct) override; + size_t read_data(char *buf, size_t len); + size_t write_data(const char *buf, size_t len); + +public: + explicit RGWLoadGenIO(RGWLoadGenRequestEnv* const req) + : left_to_read(0), + req(req) { + } + + size_t send_status(int status, const char *status_name) override; + size_t send_100_continue() override; + size_t send_header(const std::string_view& name, + const std::string_view& value) override; + size_t complete_header() override; + size_t send_content_length(uint64_t len) override; + + size_t recv_body(char* buf, size_t max) override { + return read_data(buf, max); + } + + size_t send_body(const char* buf, size_t len) override { + return write_data(buf, len); + } + + void flush() override; + + RGWEnv& get_env() noexcept override { + return env; + } + + size_t complete_request() override; +}; diff --git a/src/rgw/rgw_loadgen_process.cc b/src/rgw/rgw_loadgen_process.cc new file mode 100644 index 000000000..f8165185d --- /dev/null +++ b/src/rgw/rgw_loadgen_process.cc @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" + +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_loadgen.h" +#include "rgw_client_io.h" +#include "rgw_signal.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWLoadGenProcess::checkpoint() +{ + m_tp.drain(&req_wq); +} + +void RGWLoadGenProcess::run() +{ + m_tp.start(); /* start thread pool */ + + int i; + + int num_objs; + + conf->get_val("num_objs", 1000, &num_objs); + + int num_buckets; + conf->get_val("num_buckets", 1, &num_buckets); + + vector buckets(num_buckets); + + std::atomic failed = { false }; + + for (i 
= 0; i < num_buckets; i++) { + buckets[i] = "/loadgen"; + string& bucket = buckets[i]; + append_rand_alpha(cct, bucket, bucket, 16); + + /* first create a bucket */ + gen_request("PUT", bucket, 0, &failed); + checkpoint(); + } + + string *objs = new string[num_objs]; + + if (failed) { + derr << "ERROR: bucket creation failed" << dendl; + goto done; + } + + for (i = 0; i < num_objs; i++) { + char buf[16 + 1]; + gen_rand_alphanumeric(cct, buf, sizeof(buf)); + buf[16] = '\0'; + objs[i] = buckets[i % num_buckets] + "/" + buf; + } + + for (i = 0; i < num_objs; i++) { + gen_request("PUT", objs[i], 4096, &failed); + } + + checkpoint(); + + if (failed) { + derr << "ERROR: object creation failed" << dendl; /* this check follows the object PUTs, not the bucket PUTs */ + goto done; + } + + for (i = 0; i < num_objs; i++) { + gen_request("GET", objs[i], 4096, NULL); + } + + checkpoint(); + + for (i = 0; i < num_objs; i++) { + gen_request("DELETE", objs[i], 0, NULL); + } + + checkpoint(); + + for (i = 0; i < num_buckets; i++) { + gen_request("DELETE", buckets[i], 0, NULL); + } + +done: + checkpoint(); + + m_tp.stop(); + + delete[] objs; + + rgw::signal::signal_shutdown(); +} /* RGWLoadGenProcess::run() */ + +void RGWLoadGenProcess::gen_request(const string& method, + const string& resource, + int content_length, std::atomic* fail_flag) +{ + RGWLoadGenRequest* req = + new RGWLoadGenRequest(env.driver->get_new_req_id(), method, resource, + content_length, fail_flag); + dout(10) << "allocated request req=" << hex << req << dec << dendl; + req_throttle.get(1); + req_wq.queue(req); +} /* RGWLoadGenProcess::gen_request */ + +void RGWLoadGenProcess::handle_request(const DoutPrefixProvider *dpp, RGWRequest* r) +{ + RGWLoadGenRequest* req = static_cast(r); + + RGWLoadGenRequestEnv renv; + + utime_t tm = ceph_clock_now(); + + renv.port = 80; + renv.content_length = req->content_length; + renv.content_type = "binary/octet-stream"; + renv.request_method = req->method; + renv.uri = req->resource; + renv.set_date(tm); + renv.sign(dpp, access_key); + + 
RGWLoadGenIO real_client_io(&renv); + RGWRestfulIO client_io(cct, &real_client_io); + int ret = process_request(env, req, uri_prefix, &client_io, + null_yield, nullptr, nullptr, nullptr); + if (ret < 0) { + /* we don't really care about return code */ + dout(20) << "process_request() returned " << ret << dendl; + + if (req->fail_flag) { + *req->fail_flag = true; /* set the shared flag itself; "fail_flag++" only advanced the pointer */ + } + } + + delete req; +} /* RGWLoadGenProcess::handle_request */ diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc new file mode 100644 index 000000000..de67fcd4b --- /dev/null +++ b/src/rgw/rgw_log.cc @@ -0,0 +1,722 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/Clock.h" +#include "common/Timer.h" +#include "common/utf8.h" +#include "common/OutputDataSocket.h" +#include "common/Formatter.h" + +#include "rgw_bucket.h" +#include "rgw_log.h" +#include "rgw_acl.h" +#include "rgw_client_io.h" +#include "rgw_rest.h" +#include "rgw_zone.h" +#include "rgw_rados.h" + +#include "services/svc_zone.h" + +#include +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static void set_param_str(req_state *s, const char *name, string& str) +{ + const char *p = s->info.env->get(name); + if (p) + str = p; +} + +string render_log_object_name(const string& format, + struct tm *dt, const string& bucket_id, + const string& bucket_name) +{ + string o; + for (unsigned i=0; itm_year + 1900); + break; + case 'y': + sprintf(buf, "%.2d", dt->tm_year % 100); + break; + case 'm': + sprintf(buf, "%.2d", dt->tm_mon + 1); + break; + case 'd': + sprintf(buf, "%.2d", dt->tm_mday); + break; + case 'H': + sprintf(buf, "%.2d", dt->tm_hour); + break; + case 'I': + sprintf(buf, "%.2d", (dt->tm_hour % 12) + 1); + break; + case 'k': + sprintf(buf, "%d", dt->tm_hour); + break; + case 'l': + sprintf(buf, "%d", (dt->tm_hour % 12) + 1); + break; + case 'M': + sprintf(buf, "%.2d", dt->tm_min); + break; + + case 'i': + o += bucket_id; + continue; + 
case 'n': + o += bucket_name; + continue; + default: + // unknown code + sprintf(buf, "%%%c", format[i]); + break; + } + o += buf; + continue; + } + o += format[i]; + } + return o; +} + +/* usage logger */ +class UsageLogger : public DoutPrefixProvider { + CephContext *cct; + rgw::sal::Driver* driver; + map usage_map; + ceph::mutex lock = ceph::make_mutex("UsageLogger"); + int32_t num_entries; + ceph::mutex timer_lock = ceph::make_mutex("UsageLogger::timer_lock"); + SafeTimer timer; + utime_t round_timestamp; + + class C_UsageLogTimeout : public Context { + UsageLogger *logger; + public: + explicit C_UsageLogTimeout(UsageLogger *_l) : logger(_l) {} + void finish(int r) override { + logger->flush(); + logger->set_timer(); + } + }; + + void set_timer() { + timer.add_event_after(cct->_conf->rgw_usage_log_tick_interval, new C_UsageLogTimeout(this)); + } +public: + + UsageLogger(CephContext *_cct, rgw::sal::Driver* _driver) : cct(_cct), driver(_driver), num_entries(0), timer(cct, timer_lock) { + timer.init(); + std::lock_guard l{timer_lock}; + set_timer(); + utime_t ts = ceph_clock_now(); + recalc_round_timestamp(ts); + } + + ~UsageLogger() { + std::lock_guard l{timer_lock}; + flush(); + timer.cancel_all_events(); + timer.shutdown(); + } + + void recalc_round_timestamp(utime_t& ts) { + round_timestamp = ts.round_to_hour(); + } + + void insert_user(utime_t& timestamp, const rgw_user& user, rgw_usage_log_entry& entry) { + lock.lock(); + if (timestamp.sec() > round_timestamp + 3600) + recalc_round_timestamp(timestamp); + entry.epoch = round_timestamp.sec(); + bool account; + string u = user.to_str(); + rgw_user_bucket ub(u, entry.bucket); + real_time rt = round_timestamp.to_real_time(); + usage_map[ub].insert(rt, entry, &account); + if (account) + num_entries++; + bool need_flush = (num_entries > cct->_conf->rgw_usage_log_flush_threshold); + lock.unlock(); + if (need_flush) { + std::lock_guard l{timer_lock}; + flush(); + } + } + + void insert(utime_t& timestamp, 
rgw_usage_log_entry& entry) { + if (entry.payer.empty()) { + insert_user(timestamp, entry.owner, entry); + } else { + insert_user(timestamp, entry.payer, entry); + } + } + + void flush() { + map old_map; + lock.lock(); + old_map.swap(usage_map); + num_entries = 0; + lock.unlock(); + + driver->log_usage(this, old_map); + } + + CephContext *get_cct() const override { return cct; } + unsigned get_subsys() const override { return dout_subsys; } + std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw UsageLogger: "; } +}; + +static UsageLogger *usage_logger = NULL; + +void rgw_log_usage_init(CephContext *cct, rgw::sal::Driver* driver) +{ + usage_logger = new UsageLogger(cct, driver); +} + +void rgw_log_usage_finalize() +{ + delete usage_logger; + usage_logger = NULL; +} + +static void log_usage(req_state *s, const string& op_name) +{ + if (s->system_request) /* don't log system user operations */ + return; + + if (!usage_logger) + return; + + rgw_user user; + rgw_user payer; + string bucket_name; + + bucket_name = s->bucket_name; + + if (!bucket_name.empty()) { + bucket_name = s->bucket_name; + user = s->bucket_owner.get_id(); + if (!rgw::sal::Bucket::empty(s->bucket.get()) && + s->bucket->get_info().requester_pays) { + payer = s->user->get_id(); + } + } else { + user = s->user->get_id(); + } + + bool error = s->err.is_err(); + if (error && s->err.http_ret == 404) { + bucket_name = "-"; /* bucket not found, use the invalid '-' as bucket name */ + } + + string u = user.to_str(); + string p = payer.to_str(); + rgw_usage_log_entry entry(u, p, bucket_name); + + uint64_t bytes_sent = ACCOUNTING_IO(s)->get_bytes_sent(); + uint64_t bytes_received = ACCOUNTING_IO(s)->get_bytes_received(); + + rgw_usage_data data(bytes_sent, bytes_received); + + data.ops = 1; + if (!s->is_err()) + data.successful_ops = 1; + + ldpp_dout(s, 30) << "log_usage: bucket_name=" << bucket_name + << " tenant=" << s->bucket_tenant + << ", bytes_sent=" << bytes_sent << ", 
bytes_received=" + << bytes_received << ", success=" << data.successful_ops << dendl; + + entry.add(op_name, data); + + utime_t ts = ceph_clock_now(); + + usage_logger->insert(ts, entry); +} + +void rgw_format_ops_log_entry(struct rgw_log_entry& entry, Formatter *formatter) +{ + formatter->open_object_section("log_entry"); + formatter->dump_string("bucket", entry.bucket); + { + auto t = utime_t{entry.time}; + t.gmtime(formatter->dump_stream("time")); // UTC + t.localtime(formatter->dump_stream("time_local")); + } + formatter->dump_string("remote_addr", entry.remote_addr); + string obj_owner = entry.object_owner.to_str(); + if (obj_owner.length()) + formatter->dump_string("object_owner", obj_owner); + formatter->dump_string("user", entry.user); + formatter->dump_string("operation", entry.op); + formatter->dump_string("uri", entry.uri); + formatter->dump_string("http_status", entry.http_status); + formatter->dump_string("error_code", entry.error_code); + formatter->dump_int("bytes_sent", entry.bytes_sent); + formatter->dump_int("bytes_received", entry.bytes_received); + formatter->dump_int("object_size", entry.obj_size); + { + using namespace std::chrono; + uint64_t total_time = duration_cast(entry.total_time).count(); + formatter->dump_int("total_time", total_time); + } + formatter->dump_string("user_agent", entry.user_agent); + formatter->dump_string("referrer", entry.referrer); + if (entry.x_headers.size() > 0) { + formatter->open_array_section("http_x_headers"); + for (const auto& iter: entry.x_headers) { + formatter->open_object_section(iter.first.c_str()); + formatter->dump_string(iter.first.c_str(), iter.second); + formatter->close_section(); + } + formatter->close_section(); + } + formatter->dump_string("trans_id", entry.trans_id); + switch(entry.identity_type) { + case TYPE_RGW: + formatter->dump_string("authentication_type","Local"); + break; + case TYPE_LDAP: + formatter->dump_string("authentication_type","LDAP"); + break; + case TYPE_KEYSTONE: + 
formatter->dump_string("authentication_type","Keystone"); + break; + case TYPE_WEB: + formatter->dump_string("authentication_type","OIDC Provider"); + break; + case TYPE_ROLE: + formatter->dump_string("authentication_type","STS"); + break; + default: + break; + } + if (entry.token_claims.size() > 0) { + if (entry.token_claims[0] == "sts") { + formatter->open_object_section("sts_info"); + for (const auto& iter: entry.token_claims) { + auto pos = iter.find(":"); + if (pos != string::npos) { + formatter->dump_string(iter.substr(0, pos), iter.substr(pos + 1)); + } + } + formatter->close_section(); + } + } + if (!entry.access_key_id.empty()) { + formatter->dump_string("access_key_id", entry.access_key_id); + } + if (!entry.subuser.empty()) { + formatter->dump_string("subuser", entry.subuser); + } + formatter->dump_bool("temp_url", entry.temp_url); + + if (entry.op == "multi_object_delete") { + formatter->open_object_section("op_data"); + formatter->dump_int("num_ok", entry.delete_multi_obj_meta.num_ok); + formatter->dump_int("num_err", entry.delete_multi_obj_meta.num_err); + formatter->open_array_section("objects"); + for (const auto& iter: entry.delete_multi_obj_meta.objects) { + formatter->open_object_section(""); + formatter->dump_string("key", iter.key); + formatter->dump_string("version_id", iter.version_id); + formatter->dump_int("http_status", iter.http_status); + formatter->dump_bool("error", iter.error); + if (iter.error) { + formatter->dump_string("error_message", iter.error_message); + } else { + formatter->dump_bool("delete_marker", iter.delete_marker); + formatter->dump_string("marker_version_id", iter.marker_version_id); + } + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + } + formatter->close_section(); +} + +OpsLogManifold::~OpsLogManifold() +{ + for (const auto &sink : sinks) { + delete sink; + } +} + +void OpsLogManifold::add_sink(OpsLogSink* sink) +{ + sinks.push_back(sink); +} + +int 
OpsLogManifold::log(req_state* s, struct rgw_log_entry& entry) +{ + int ret = 0; + for (const auto &sink : sinks) { + if (sink->log(s, entry) < 0) { + ret = -1; + } + } + return ret; +} + +OpsLogFile::OpsLogFile(CephContext* cct, std::string& path, uint64_t max_data_size) : + cct(cct), data_size(0), max_data_size(max_data_size), path(path), need_reopen(false) +{ +} + +void OpsLogFile::reopen() { + need_reopen = true; +} + +void OpsLogFile::flush() +{ + { + std::scoped_lock log_lock(mutex); + assert(flush_buffer.empty()); + flush_buffer.swap(log_buffer); + data_size = 0; + } + for (auto bl : flush_buffer) { + int try_num = 0; + while (true) { + if (!file.is_open() || need_reopen) { + need_reopen = false; + file.close(); + file.open(path, std::ofstream::app); + } + bl.write_stream(file); + if (!file) { + ldpp_dout(this, 0) << "ERROR: failed to log RGW ops log file entry" << dendl; + file.clear(); + if (stopped) { + break; + } + int sleep_time_secs = std::min(1 << std::min(try_num, 6), 60); /* capped shift: same 1,2,4..32,60 backoff, but no int overflow once try_num reaches 31 */ + std::this_thread::sleep_for(std::chrono::seconds(sleep_time_secs)); + try_num++; + } else { + break; + } + } + } + flush_buffer.clear(); + file << std::endl; +} + +void* OpsLogFile::entry() { + std::unique_lock lock(mutex); + while (!stopped) { + if (!log_buffer.empty()) { + lock.unlock(); + flush(); + lock.lock(); + continue; + } + cond.wait(lock); + } + lock.unlock(); + flush(); + return NULL; +} + +void OpsLogFile::start() { + stopped = false; + create("ops_log_file"); +} + +void OpsLogFile::stop() { + { + std::unique_lock lock(mutex); + cond.notify_one(); + stopped = true; + } + join(); +} + +OpsLogFile::~OpsLogFile() +{ + if (!stopped) { + stop(); + } + file.close(); +} + +int OpsLogFile::log_json(req_state* s, bufferlist& bl) +{ + std::unique_lock lock(mutex); + if (data_size + bl.length() >= max_data_size) { + ldout(s->cct, 0) << "ERROR: RGW ops log file buffer too full, dropping log for txn: " << s->trans_id << dendl; + return -1; + } + log_buffer.push_back(bl); + data_size 
+= bl.length(); + cond.notify_all(); + return 0; +} + +unsigned OpsLogFile::get_subsys() const { + return dout_subsys; +} + +JsonOpsLogSink::JsonOpsLogSink() { + formatter = new JSONFormatter; +} + +JsonOpsLogSink::~JsonOpsLogSink() { + delete formatter; +} + +void JsonOpsLogSink::formatter_to_bl(bufferlist& bl) +{ + stringstream ss; + formatter->flush(ss); + const string& s = ss.str(); + bl.append(s); +} + +int JsonOpsLogSink::log(req_state* s, struct rgw_log_entry& entry) +{ + bufferlist bl; + + lock.lock(); + rgw_format_ops_log_entry(entry, formatter); + formatter_to_bl(bl); + lock.unlock(); + + return log_json(s, bl); +} + +void OpsLogSocket::init_connection(bufferlist& bl) +{ + bl.append("["); +} + +OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog) +{ + delim.append(",\n"); +} + +int OpsLogSocket::log_json(req_state* s, bufferlist& bl) +{ + append_output(bl); + return 0; +} + +OpsLogRados::OpsLogRados(rgw::sal::Driver* const& driver): driver(driver) +{ +} + +int OpsLogRados::log(req_state* s, struct rgw_log_entry& entry) +{ + if (!s->cct->_conf->rgw_ops_log_rados) { + return 0; + } + bufferlist bl; + encode(entry, bl); + + struct tm bdt; + time_t t = req_state::Clock::to_time_t(entry.time); + if (s->cct->_conf->rgw_log_object_name_utc) + gmtime_r(&t, &bdt); + else + localtime_r(&t, &bdt); + string oid = render_log_object_name(s->cct->_conf->rgw_log_object_name, &bdt, + entry.bucket_id, entry.bucket); + if (driver->log_op(s, oid, bl) < 0) { + ldpp_dout(s, 0) << "ERROR: failed to log RADOS RGW ops log entry for txn: " << s->trans_id << dendl; + return -1; + } + return 0; +} + +int rgw_log_op(RGWREST* const rest, req_state *s, const RGWOp* op, OpsLogSink *olog) +{ + struct rgw_log_entry entry; + string bucket_id; + string op_name = (op ? 
op->name() : "unknown"); + + if (s->enable_usage_log) + log_usage(s, op_name); + + if (!s->enable_ops_log) + return 0; + + if (s->bucket_name.empty()) { + /* this case is needed for, e.g., list_buckets */ + } else { + if (s->err.ret == -ERR_NO_SUCH_BUCKET || + rgw::sal::Bucket::empty(s->bucket.get())) { + if (!s->cct->_conf->rgw_log_nonexistent_bucket) { + ldout(s->cct, 5) << "bucket " << s->bucket_name << " doesn't exist, not logging" << dendl; + return 0; + } + bucket_id = ""; + } else { + bucket_id = s->bucket->get_bucket_id(); + } + entry.bucket = rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name); + + if (check_utf8(entry.bucket.c_str(), entry.bucket.size()) != 0) { + ldpp_dout(s, 5) << "not logging op on bucket with non-utf8 name" << dendl; + return 0; + } + + if (!rgw::sal::Object::empty(s->object.get())) { + entry.obj = s->object->get_key(); + } else { + entry.obj = rgw_obj_key("-"); + } + + entry.obj_size = s->obj_size; + } /* !bucket empty */ + + if (s->cct->_conf->rgw_remote_addr_param.length()) + set_param_str(s, s->cct->_conf->rgw_remote_addr_param.c_str(), + entry.remote_addr); + else + set_param_str(s, "REMOTE_ADDR", entry.remote_addr); + set_param_str(s, "HTTP_USER_AGENT", entry.user_agent); + // legacy apps are still using misspelling referer, such as curl -e option + if (s->info.env->exists("HTTP_REFERRER")) + set_param_str(s, "HTTP_REFERRER", entry.referrer); + else + set_param_str(s, "HTTP_REFERER", entry.referrer); + + std::string uri; + if (s->info.env->exists("REQUEST_METHOD")) { + uri.append(s->info.env->get("REQUEST_METHOD")); + uri.append(" "); + } + + if (s->info.env->exists("REQUEST_URI")) { + uri.append(s->info.env->get("REQUEST_URI")); + } + + /* Formerly, we appended QUERY_STRING to uri, but in RGW, QUERY_STRING is a + * substring of REQUEST_URI--appending qs to uri here duplicates qs to the + * ops log */ + + if (s->info.env->exists("HTTP_VERSION")) { + uri.append(" "); + uri.append("HTTP/"); + 
uri.append(s->info.env->get("HTTP_VERSION")); + } + + entry.uri = std::move(uri); + + entry.op = op_name; + if (op) { + op->write_ops_log_entry(entry); + } + + if (s->auth.identity) { + entry.identity_type = s->auth.identity->get_identity_type(); + s->auth.identity->write_ops_log_entry(entry); + } else { + entry.identity_type = TYPE_NONE; + } + + if (! s->token_claims.empty()) { + entry.token_claims = std::move(s->token_claims); + } + + /* custom header logging */ + if (rest) { + if (rest->log_x_headers()) { + for (const auto& iter : s->info.env->get_map()) { + if (rest->log_x_header(iter.first)) { + entry.x_headers.insert( + rgw_log_entry::headers_map::value_type(iter.first, iter.second)); + } + } + } + } + + entry.user = s->user->get_id().to_str(); + if (s->object_acl) + entry.object_owner = s->object_acl->get_owner().get_id(); + entry.bucket_owner = s->bucket_owner.get_id(); + + uint64_t bytes_sent = ACCOUNTING_IO(s)->get_bytes_sent(); + uint64_t bytes_received = ACCOUNTING_IO(s)->get_bytes_received(); + + entry.time = s->time; + entry.total_time = s->time_elapsed(); + entry.bytes_sent = bytes_sent; + entry.bytes_received = bytes_received; + if (s->err.http_ret) { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", s->err.http_ret); + entry.http_status = buf; + } else { + entry.http_status = "200"; // default + } + entry.error_code = s->err.err_code; + entry.bucket_id = bucket_id; + entry.trans_id = s->trans_id; + if (olog) { + return olog->log(s, entry); + } + return 0; +} + +void rgw_log_entry::generate_test_instances(list& o) +{ + rgw_log_entry *e = new rgw_log_entry; + e->object_owner = "object_owner"; + e->bucket_owner = "bucket_owner"; + e->bucket = "bucket"; + e->remote_addr = "1.2.3.4"; + e->user = "user"; + e->obj = rgw_obj_key("obj"); + e->uri = "http://uri/bucket/obj"; + e->http_status = "200"; + e->error_code = "error_code"; + e->bytes_sent = 1024; + e->bytes_received = 512; + e->obj_size = 2048; + e->user_agent = "user_agent"; + e->referrer = 
"referrer"; + e->bucket_id = "10"; + e->trans_id = "trans_id"; + e->identity_type = TYPE_RGW; + o.push_back(e); + o.push_back(new rgw_log_entry); +} + +void rgw_log_entry::dump(Formatter *f) const +{ + f->dump_string("object_owner", object_owner.to_str()); + f->dump_string("bucket_owner", bucket_owner.to_str()); + f->dump_string("bucket", bucket); + f->dump_stream("time") << time; + f->dump_string("remote_addr", remote_addr); + f->dump_string("user", user); + f->dump_stream("obj") << obj; + f->dump_string("op", op); + f->dump_string("uri", uri); + f->dump_string("http_status", http_status); + f->dump_string("error_code", error_code); + f->dump_unsigned("bytes_sent", bytes_sent); + f->dump_unsigned("bytes_received", bytes_received); + f->dump_unsigned("obj_size", obj_size); + f->dump_stream("total_time") << total_time; + f->dump_string("user_agent", user_agent); + f->dump_string("referrer", referrer); + f->dump_string("bucket_id", bucket_id); + f->dump_string("trans_id", trans_id); + f->dump_unsigned("identity_type", identity_type); +} diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h new file mode 100644 index 000000000..1dd79273e --- /dev/null +++ b/src/rgw/rgw_log.h @@ -0,0 +1,289 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_common.h" +#include "common/OutputDataSocket.h" +#include +#include +#include "rgw_sal_fwd.h" + +class RGWOp; + +struct delete_multi_obj_entry { + std::string key; + std::string version_id; + std::string error_message; + std::string marker_version_id; + uint32_t http_status = 0; + bool error = false; + bool delete_marker = false; + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + encode(key, bl); + encode(version_id, bl); + encode(error_message, bl); + encode(marker_version_id, bl); + encode(http_status, bl); + encode(error, bl); + encode(delete_marker, bl); + ENCODE_FINISH(bl); + } + + void 
decode(bufferlist::const_iterator &p) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p); + decode(key, p); + decode(version_id, p); + decode(error_message, p); + decode(marker_version_id, p); + decode(http_status, p); + decode(error, p); + decode(delete_marker, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(delete_multi_obj_entry) + +struct delete_multi_obj_op_meta { + uint32_t num_ok = 0; + uint32_t num_err = 0; + std::vector objects; + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + encode(num_ok, bl); + encode(num_err, bl); + encode(objects, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &p) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p); + decode(num_ok, p); + decode(num_err, p); + decode(objects, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(delete_multi_obj_op_meta) + +struct rgw_log_entry { + + using headers_map = boost::container::flat_map; + using Clock = req_state::Clock; + + rgw_user object_owner; + rgw_user bucket_owner; + std::string bucket; + Clock::time_point time; + std::string remote_addr; + std::string user; + rgw_obj_key obj; + std::string op; + std::string uri; + std::string http_status; + std::string error_code; + uint64_t bytes_sent = 0; + uint64_t bytes_received = 0; + uint64_t obj_size = 0; + Clock::duration total_time{}; + std::string user_agent; + std::string referrer; + std::string bucket_id; + headers_map x_headers; + std::string trans_id; + std::vector token_claims; + uint32_t identity_type = TYPE_NONE; + std::string access_key_id; + std::string subuser; + bool temp_url {false}; + delete_multi_obj_op_meta delete_multi_obj_meta; + + void encode(bufferlist &bl) const { + ENCODE_START(14, 5, bl); + encode(object_owner.id, bl); + encode(bucket_owner.id, bl); + encode(bucket, bl); + encode(time, bl); + encode(remote_addr, bl); + encode(user, bl); + encode(obj.name, bl); + encode(op, bl); + encode(uri, bl); + encode(http_status, bl); + encode(error_code, bl); + encode(bytes_sent, bl); + 
encode(obj_size, bl); + encode(total_time, bl); + encode(user_agent, bl); + encode(referrer, bl); + encode(bytes_received, bl); + encode(bucket_id, bl); + encode(obj, bl); + encode(object_owner, bl); + encode(bucket_owner, bl); + encode(x_headers, bl); + encode(trans_id, bl); + encode(token_claims, bl); + encode(identity_type,bl); + encode(access_key_id, bl); + encode(subuser, bl); + encode(temp_url, bl); + encode(delete_multi_obj_meta, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &p) { + DECODE_START_LEGACY_COMPAT_LEN(14, 5, 5, p); + decode(object_owner.id, p); + if (struct_v > 3) + decode(bucket_owner.id, p); + decode(bucket, p); + decode(time, p); + decode(remote_addr, p); + decode(user, p); + decode(obj.name, p); + decode(op, p); + decode(uri, p); + decode(http_status, p); + decode(error_code, p); + decode(bytes_sent, p); + decode(obj_size, p); + decode(total_time, p); + decode(user_agent, p); + decode(referrer, p); + if (struct_v >= 2) + decode(bytes_received, p); + else + bytes_received = 0; + + if (struct_v >= 3) { + if (struct_v <= 5) { + uint64_t id; + decode(id, p); + char buf[32]; + snprintf(buf, sizeof(buf), "%" PRIu64, id); + bucket_id = buf; + } else { + decode(bucket_id, p); + } + } else { + bucket_id = ""; + } + if (struct_v >= 7) { + decode(obj, p); + } + if (struct_v >= 8) { + decode(object_owner, p); + decode(bucket_owner, p); + } + if (struct_v >= 9) { + decode(x_headers, p); + } + if (struct_v >= 10) { + decode(trans_id, p); + } + if (struct_v >= 11) { + decode(token_claims, p); + } + if (struct_v >= 12) { + decode(identity_type, p); + } + if (struct_v >= 13) { + decode(access_key_id, p); + decode(subuser, p); + decode(temp_url, p); + } + if (struct_v >= 14) { + decode(delete_multi_obj_meta, p); + } + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(rgw_log_entry) + +class OpsLogSink { +public: + virtual int log(req_state* s, 
struct rgw_log_entry& entry) = 0; + virtual ~OpsLogSink() = default; +}; + +class OpsLogManifold: public OpsLogSink { + std::vector sinks; +public: + ~OpsLogManifold() override; + void add_sink(OpsLogSink* sink); + int log(req_state* s, struct rgw_log_entry& entry) override; +}; + +class JsonOpsLogSink : public OpsLogSink { + ceph::Formatter *formatter; + ceph::mutex lock = ceph::make_mutex("JsonOpsLogSink"); + + void formatter_to_bl(bufferlist& bl); +protected: + virtual int log_json(req_state* s, bufferlist& bl) = 0; +public: + JsonOpsLogSink(); + ~JsonOpsLogSink() override; + int log(req_state* s, struct rgw_log_entry& entry) override; +}; + +class OpsLogFile : public JsonOpsLogSink, public Thread, public DoutPrefixProvider { + CephContext* cct; + ceph::mutex mutex = ceph::make_mutex("OpsLogFile"); + std::vector log_buffer; + std::vector flush_buffer; + ceph::condition_variable cond; + std::ofstream file; + bool stopped; + uint64_t data_size; + uint64_t max_data_size; + std::string path; + std::atomic_bool need_reopen; + + void flush(); +protected: + int log_json(req_state* s, bufferlist& bl) override; + void *entry() override; +public: + OpsLogFile(CephContext* cct, std::string& path, uint64_t max_data_size); + ~OpsLogFile() override; + CephContext *get_cct() const override { return cct; } + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw OpsLogFile: "; } + void reopen(); + void start(); + void stop(); +}; + +class OpsLogSocket : public OutputDataSocket, public JsonOpsLogSink { +protected: + int log_json(req_state* s, bufferlist& bl) override; + void init_connection(bufferlist& bl) override; + +public: + OpsLogSocket(CephContext *cct, uint64_t _backlog); +}; + +class OpsLogRados : public OpsLogSink { + // main()'s driver pointer as a reference, possibly modified by RGWRealmReloader + rgw::sal::Driver* const& driver; + +public: + OpsLogRados(rgw::sal::Driver* const& driver); + int 
log(req_state* s, struct rgw_log_entry& entry) override; +}; + +class RGWREST; + +int rgw_log_op(RGWREST* const rest, struct req_state* s, + const RGWOp* op, OpsLogSink* olog); +void rgw_log_usage_init(CephContext* cct, rgw::sal::Driver* driver); +void rgw_log_usage_finalize(); +void rgw_format_ops_log_entry(struct rgw_log_entry& entry, + ceph::Formatter *formatter); diff --git a/src/rgw/rgw_lua.cc b/src/rgw/rgw_lua.cc new file mode 100644 index 000000000..33af60370 --- /dev/null +++ b/src/rgw/rgw_lua.cc @@ -0,0 +1,214 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include "services/svc_zone.h" +#include "rgw_lua_utils.h" +#include "rgw_sal_rados.h" +#include "rgw_lua.h" +#ifdef WITH_RADOSGW_LUA_PACKAGES +#include +#include +#endif + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::lua { + +context to_context(const std::string& s) +{ + if (strcasecmp(s.c_str(), "prerequest") == 0) { + return context::preRequest; + } + if (strcasecmp(s.c_str(), "postrequest") == 0) { + return context::postRequest; + } + if (strcasecmp(s.c_str(), "background") == 0) { + return context::background; + } + if (strcasecmp(s.c_str(), "getdata") == 0) { + return context::getData; + } + if (strcasecmp(s.c_str(), "putdata") == 0) { + return context::putData; + } + return context::none; +} + +std::string to_string(context ctx) +{ + switch (ctx) { + case context::preRequest: + return "prerequest"; + case context::postRequest: + return "postrequest"; + case context::background: + return "background"; + case context::getData: + return "getdata"; + case context::putData: + return "putdata"; + case context::none: + break; + } + return "none"; +} + +bool verify(const std::string& script, std::string& err_msg) +{ + lua_State *L = luaL_newstate(); + lua_state_guard guard(L); + open_standard_libs(L); + try { + if (luaL_loadstring(L, script.c_str()) != LUA_OK) { + err_msg.assign(lua_tostring(L, -1)); + return false; + } 
#ifdef WITH_RADOSGW_LUA_PACKAGES

namespace bp = boost::process;

// Add a package to the allowlist after checking that luarocks can find it.
//
// Runs "luarocks search --porcelain" for `package_name` (restricted to binary
// rocks unless `allow_compilation` is set) and treats any non-empty output
// line as "found". An existing allowlist entry for the same package name
// (the part of `package_name` before the first space) is removed first.
//
// Returns -ECHILD if luarocks is not in the search path, the negated exit
// code if the search command fails, -EINVAL if it printed nothing, or the
// result of the lua manager calls.
int add_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name, bool allow_compilation)
{
  // verify that luarocks can load this package
  const auto p = bp::search_path("luarocks");
  if (p.empty()) {
    return -ECHILD;
  }
  bp::ipstream is;
  const auto cmd = p.string() + " search --porcelain" + (allow_compilation ? " " : " --binary ") + package_name;
  bp::child c(cmd,
      bp::std_in.close(),
      bp::std_err > bp::null,
      bp::std_out > is);

  std::string line;
  bool package_found = false;
  while (c.running() && std::getline(is, line) && !line.empty()) {
    package_found = true;
  }
  c.wait();
  auto ret = c.exit_code();
  if (ret) {
    return -ret;
  }

  if (!package_found) {
    return -EINVAL;
  }

  //replace previous versions of the package
  const std::string package_name_no_version = package_name.substr(0, package_name.find(" "));
  ret = remove_package(dpp, driver, y, package_name_no_version);
  if (ret < 0) {
    return ret;
  }

  auto lua_mgr = driver->get_lua_manager();

  return lua_mgr->add_package(dpp, y, package_name);
}

// Remove a package from the allowlist; forwards to the driver's lua manager.
int remove_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name)
{
  auto lua_mgr = driver->get_lua_manager();

  return lua_mgr->remove_package(dpp, y, package_name);
}

// Fill `packages` with the allowlist; forwards to the driver's lua manager.
int list_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, packages_t& packages)
{
  auto lua_mgr = driver->get_lua_manager();

  return lua_mgr->list_packages(dpp, y, packages);
}

// Reinstall every allowlisted package into a fresh luarocks tree.
//
// `luarocks_path` is removed first, then "luarocks install" is run once per
// package. The command line and its combined stdout/stderr are appended to
// `output`; packages whose install command exits non-zero are added to
// `failed_packages`.
//
// Returns 0 on success (including an empty allowlist and individual package
// failures), or a negative errno: the negated error code if the directory
// cannot be cleared, -ECHILD if luarocks is not in the search path, or the
// error from list_packages().
int install_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
                     optional_yield y, const std::string& luarocks_path,
                     packages_t& failed_packages, std::string& output) {
  // luarocks directory cleanup
  std::error_code ec;
  if (std::filesystem::remove_all(luarocks_path, ec)
      == static_cast<std::uintmax_t>(-1) &&
      ec != std::errc::no_such_file_or_directory) {
    output.append("failed to clear luarock directory: ");
    output.append(ec.message());
    output.append("\n");
    // negate so that this failure follows the same negative-errno convention
    // as every other error return of this function
    return -ec.value();
  }

  packages_t packages;
  auto ret = list_packages(dpp, driver, y, packages);
  if (ret == -ENOENT) {
    // allowlist is empty
    return 0;
  }
  if (ret < 0) {
    return ret;
  }
  // verify that luarocks exists
  const auto p = bp::search_path("luarocks");
  if (p.empty()) {
    return -ECHILD;
  }

  // the lua rocks install dir will be created by luarocks the first time it is called
  for (const auto& package : packages) {
    bp::ipstream is;
    const auto cmd = p.string() + " install --lua-version " + CEPH_LUA_VERSION + " --tree " + luarocks_path + " --deps-mode one " + package;
    bp::child c(cmd, bp::std_in.close(), (bp::std_err & bp::std_out) > is);

    // once package reload is supported, code should yield when reading output
    std::string line = std::string("CMD: ") + cmd;

    // the first iteration records the command line itself, later iterations
    // record the lines read from the child
    do {
      if (!line.empty()) {
        output.append(line);
        output.append("\n");
      }
    } while (c.running() && std::getline(is, line));

    c.wait();
    if (c.exit_code()) {
      failed_packages.insert(package);
    }
  }

  return 0;
}

#endif
optional_yield y, context ctx, std::string& script); + +// delete the stored lua script from a context +int delete_script(const DoutPrefixProvider *dpp, rgw::sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx); + +using packages_t = std::set; + +#ifdef WITH_RADOSGW_LUA_PACKAGES + +// add a lua package to the allowlist +int add_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name, bool allow_compilation); + +// remove a lua package from the allowlist +int remove_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name); + +// list lua packages in the allowlist +int list_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, packages_t& packages); + +// install all packages from the allowlist +// return the list of packages that failed to install and the output of the install command +int install_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + optional_yield y, const std::string& luarocks_path, + packages_t& failed_packages, std::string& output); +#endif +} + diff --git a/src/rgw/rgw_lua_background.cc b/src/rgw/rgw_lua_background.cc new file mode 100644 index 000000000..35de4a7e9 --- /dev/null +++ b/src/rgw/rgw_lua_background.cc @@ -0,0 +1,181 @@ +#include "rgw_sal_rados.h" +#include "rgw_lua_background.h" +#include "rgw_lua.h" +#include "rgw_lua_utils.h" +#include "rgw_perf_counters.h" +#include "include/ceph_assert.h" +#include + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::lua { + +const char* RGWTable::INCREMENT = "increment"; +const char* RGWTable::DECREMENT = "decrement"; + +int RGWTable::increment_by(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + auto& mtx = *reinterpret_cast(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL))); + auto decrement = lua_toboolean(L, lua_upvalueindex(THIRD_UPVAL)); + 
+ const auto args = lua_gettop(L); + const auto index = luaL_checkstring(L, 1); + + // by default we increment by 1/-1 + const long long int default_inc = (decrement ? -1 : 1); + BackgroundMapValue inc_by = default_inc; + if (args == 2) { + if (lua_isinteger(L, 2)) { + inc_by = lua_tointeger(L, 2)*default_inc; + } else if (lua_isnumber(L, 2)){ + inc_by = lua_tonumber(L, 2)*static_cast(default_inc); + } else { + return luaL_error(L, "can increment only by numeric values"); + } + } + + std::unique_lock l(mtx); + + const auto it = map->find(std::string(index)); + if (it != map->end()) { + auto& value = it->second; + if (std::holds_alternative(value) && std::holds_alternative(inc_by)) { + value = std::get(value) + std::get(inc_by); + } else if (std::holds_alternative(value) && std::holds_alternative(inc_by)) { + value = std::get(value) + std::get(inc_by); + } else if (std::holds_alternative(value) && std::holds_alternative(inc_by)) { + value = std::get(value) + static_cast(std::get(inc_by)); + } else if (std::holds_alternative(value) && std::holds_alternative(inc_by)) { + value = static_cast(std::get(value)) + std::get(inc_by); + } else { + mtx.unlock(); + return luaL_error(L, "can increment only numeric values"); + } + } + + return 0; +} + +Background::Background(rgw::sal::Driver* driver, + CephContext* cct, + const std::string& luarocks_path, + int execute_interval) : + execute_interval(execute_interval), + dp(cct, dout_subsys, "lua background: "), + lua_manager(driver->get_lua_manager()), + cct(cct), + luarocks_path(luarocks_path) {} + +void Background::shutdown(){ + stopped = true; + cond.notify_all(); + if (runner.joinable()) { + runner.join(); + } + started = false; + stopped = false; +} + +void Background::start() { + if (started) { + // start the thread only once + return; + } + started = true; + runner = std::thread(&Background::run, this); + const auto rc = ceph_pthread_setname(runner.native_handle(), + "lua_background"); + ceph_assert(rc == 0); +} + +void 
Background::pause() { + { + std::unique_lock cond_lock(pause_mutex); + paused = true; + } + cond.notify_all(); +} + +void Background::resume(rgw::sal::Driver* driver) { + lua_manager = driver->get_lua_manager(); + paused = false; + cond.notify_all(); +} + +int Background::read_script() { + std::unique_lock cond_lock(pause_mutex); + if (paused) { + return -EAGAIN; + } + std::string tenant; + return rgw::lua::read_script(&dp, lua_manager.get(), tenant, null_yield, rgw::lua::context::background, rgw_script); +} + +const BackgroundMapValue Background::empty_table_value; + +const BackgroundMapValue& Background::get_table_value(const std::string& key) const { + std::unique_lock cond_lock(table_mutex); + const auto it = rgw_map.find(key); + if (it == rgw_map.end()) { + return empty_table_value; + } + return it->second; +} + +//(1) Loads the script from the object if not paused +//(2) Executes the script +//(3) Sleep (configurable) +void Background::run() { + lua_State* const L = luaL_newstate(); + rgw::lua::lua_state_guard lguard(L); + open_standard_libs(L); + set_package_path(L, luarocks_path); + create_debug_action(L, cct); + create_background_metatable(L); + const DoutPrefixProvider* const dpp = &dp; + + while (!stopped) { + if (paused) { + ldpp_dout(dpp, 10) << "Lua background thread paused" << dendl; + std::unique_lock cond_lock(cond_mutex); + cond.wait(cond_lock, [this]{return !paused || stopped;}); + if (stopped) { + ldpp_dout(dpp, 10) << "Lua background thread stopped" << dendl; + return; + } + ldpp_dout(dpp, 10) << "Lua background thread resumed" << dendl; + } + const auto rc = read_script(); + if (rc == -ENOENT || rc == -EAGAIN) { + // either no script or paused, nothing to do + } else if (rc < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to read background script. 
error " << rc << dendl; + } else { + auto failed = false; + try { + //execute the background lua script + if (luaL_dostring(L, rgw_script.c_str()) != LUA_OK) { + const std::string err(lua_tostring(L, -1)); + ldpp_dout(dpp, 1) << "Lua ERROR: " << err << dendl; + failed = true; + } + } catch (const std::exception& e) { + ldpp_dout(dpp, 1) << "Lua ERROR: " << e.what() << dendl; + failed = true; + } + if (perfcounter) { + perfcounter->inc((failed ? l_rgw_lua_script_fail : l_rgw_lua_script_ok), 1); + } + } + std::unique_lock cond_lock(cond_mutex); + cond.wait_for(cond_lock, std::chrono::seconds(execute_interval), [this]{return stopped;}); + } + ldpp_dout(dpp, 10) << "Lua background thread stopped" << dendl; +} + +void Background::create_background_metatable(lua_State* L) { + create_metatable(L, true, &rgw_map, &table_mutex); +} + +} //namespace rgw::lua + diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h new file mode 100644 index 000000000..e1271bceb --- /dev/null +++ b/src/rgw/rgw_lua_background.h @@ -0,0 +1,230 @@ +#pragma once +#include "common/dout.h" +#include "rgw_common.h" +#include +#include +#include +#include "rgw_lua_utils.h" +#include "rgw_realm_reloader.h" + +namespace rgw::lua { + +//Interval between each execution of the script is set to 5 seconds +constexpr const int INIT_EXECUTE_INTERVAL = 5; + +//Writeable meta table named RGW with mutex protection +using BackgroundMapValue = std::variant; +using BackgroundMap = std::unordered_map; + +inline void pushvalue(lua_State* L, const std::string& value) { + pushstring(L, value); +} + +inline void pushvalue(lua_State* L, long long value) { + lua_pushinteger(L, value); +} + +inline void pushvalue(lua_State* L, double value) { + lua_pushnumber(L, value); +} + +inline void pushvalue(lua_State* L, bool value) { + lua_pushboolean(L, value); +} + + +struct RGWTable : EmptyMetaTable { + + static const char* INCREMENT; + static const char* DECREMENT; + + static std::string TableName() {return 
"RGW";} + static std::string Name() {return TableName() + "Meta";} + + static int increment_by(lua_State* L); + + static int IndexClosure(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + auto& mtx = *reinterpret_cast(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL))); + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, INCREMENT) == 0) { + lua_pushlightuserdata(L, map); + lua_pushlightuserdata(L, &mtx); + lua_pushboolean(L, false /*increment*/); + lua_pushcclosure(L, increment_by, THREE_UPVALS); + return ONE_RETURNVAL; + } + if (strcasecmp(index, DECREMENT) == 0) { + lua_pushlightuserdata(L, map); + lua_pushlightuserdata(L, &mtx); + lua_pushboolean(L, true /*decrement*/); + lua_pushcclosure(L, increment_by, THREE_UPVALS); + return ONE_RETURNVAL; + } + + std::lock_guard l(mtx); + + const auto it = map->find(std::string(index)); + if (it == map->end()) { + lua_pushnil(L); + } else { + std::visit([L](auto&& value) { pushvalue(L, value); }, it->second); + } + return ONE_RETURNVAL; + } + + static int LenClosure(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + auto& mtx = *reinterpret_cast(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL))); + + std::lock_guard l(mtx); + + lua_pushinteger(L, map->size()); + + return ONE_RETURNVAL; + } + + static int NewIndexClosure(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + auto& mtx = *reinterpret_cast(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL))); + const auto index = luaL_checkstring(L, 2); + + if (strcasecmp(index, INCREMENT) == 0 || strcasecmp(index, DECREMENT) == 0) { + return luaL_error(L, "increment/decrement are reserved function names for RGW"); + } + + std::unique_lock l(mtx); + + size_t len; + BackgroundMapValue value; + const int value_type = lua_type(L, 3); + + switch (value_type) { + case LUA_TNIL: + 
map->erase(std::string(index)); + return NO_RETURNVAL; + case LUA_TBOOLEAN: + value = static_cast(lua_toboolean(L, 3)); + len = sizeof(bool); + break; + case LUA_TNUMBER: + if (lua_isinteger(L, 3)) { + value = lua_tointeger(L, 3); + len = sizeof(long long int); + } else { + value = lua_tonumber(L, 3); + len = sizeof(double); + } + break; + case LUA_TSTRING: + { + const auto str = lua_tolstring(L, 3, &len); + value = std::string{str, len}; + break; + } + default: + l.unlock(); + return luaL_error(L, "unsupported value type for RGW table"); + } + + if (len + strnlen(index, MAX_LUA_VALUE_SIZE) + > MAX_LUA_VALUE_SIZE) { + return luaL_error(L, "Lua maximum size of entry limit exceeded"); + } else if (map->size() > MAX_LUA_KEY_ENTRIES) { + l.unlock(); + return luaL_error(L, "Lua max number of entries limit exceeded"); + } else { + map->insert_or_assign(index, value); + } + + return NO_RETURNVAL; + } + + static int PairsClosure(lua_State* L) { + auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + ceph_assert(map); + lua_pushlightuserdata(L, map); + lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function + lua_pushnil(L); // indicate this is the first call + // return stateless_iter, nil + + return TWO_RETURNVALS; + } + + static int stateless_iter(lua_State* L) { + // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs + auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + typename BackgroundMap::const_iterator next_it; + if (lua_isnil(L, -1)) { + next_it = map->begin(); + } else { + const char* index = luaL_checkstring(L, 2); + const auto it = map->find(std::string(index)); + ceph_assert(it != map->end()); + next_it = std::next(it); + } + + if (next_it == map->end()) { + // index of the last element was provided + lua_pushnil(L); + lua_pushnil(L); + // return nil, nil + } else { + pushstring(L, next_it->first); + std::visit([L](auto&& value) { pushvalue(L, value); }, 
next_it->second); + // return key, value + } + + return TWO_RETURNVALS; + } +}; + +class Background : public RGWRealmReloader::Pauser { + +private: + BackgroundMap rgw_map; + bool stopped = false; + bool started = false; + bool paused = false; + int execute_interval; + const DoutPrefix dp; + std::unique_ptr lua_manager; + CephContext* const cct; + const std::string luarocks_path; + std::thread runner; + mutable std::mutex table_mutex; + std::mutex cond_mutex; + std::mutex pause_mutex; + std::condition_variable cond; + static const BackgroundMapValue empty_table_value; + + void run(); + +protected: + std::string rgw_script; + virtual int read_script(); + +public: + Background(rgw::sal::Driver* driver, + CephContext* cct, + const std::string& luarocks_path, + int execute_interval = INIT_EXECUTE_INTERVAL); + + virtual ~Background() = default; + void start(); + void shutdown(); + void create_background_metatable(lua_State* L); + const BackgroundMapValue& get_table_value(const std::string& key) const; + template + void put_table_value(const std::string& key, T value) { + std::unique_lock cond_lock(table_mutex); + rgw_map[key] = value; + } + + void pause() override; + void resume(rgw::sal::Driver* _driver) override; +}; + +} //namepsace rgw::lua + diff --git a/src/rgw/rgw_lua_data_filter.cc b/src/rgw/rgw_lua_data_filter.cc new file mode 100644 index 000000000..9ebaf3453 --- /dev/null +++ b/src/rgw/rgw_lua_data_filter.cc @@ -0,0 +1,143 @@ +#include "rgw_lua_data_filter.h" +#include "rgw_lua_utils.h" +#include "rgw_lua_request.h" +#include "rgw_lua_background.h" +#include "rgw_process_env.h" +#include + +namespace rgw::lua { + +void push_bufferlist_byte(lua_State* L, bufferlist::iterator& it) { + char byte[1]; + it.copy(1, byte); + lua_pushlstring(L, byte, 1); +} + +struct BufferlistMetaTable : public EmptyMetaTable { + + static std::string TableName() {return "Data";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + 
auto bl = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(1))); + const auto index = luaL_checkinteger(L, 2); + if (index <= 0 || index > bl->length()) { + // lua arrays start from 1 + lua_pushnil(L); + return ONE_RETURNVAL; + } + auto it = bl->begin(index-1); + if (it != bl->end()) { + push_bufferlist_byte(L, it); + } else { + lua_pushnil(L); + } + + return ONE_RETURNVAL; + } + + static int PairsClosure(lua_State* L) { + auto bl = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(1))); + ceph_assert(bl); + lua_pushlightuserdata(L, bl); + lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function + lua_pushnil(L); // indicate this is the first call + // return stateless_iter, nil + + return TWO_RETURNVALS; + } + + // iterator step used by pairs(): takes the previous 1-based index (or nil on the first call) + // and returns the next index with its byte, or nil, nil when the iteration is over + static int stateless_iter(lua_State* L) { + // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs + auto bl = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(1))); + lua_Integer index; + if (lua_isnil(L, -1)) { + index = 1; + } else { + index = luaL_checkinteger(L, -1) + 1; + } + + // lua arrays start from 1 + // validate the index before creating the iterator, in the same order as IndexClosure does + if (index <= 0 || index > bl->length()) { + // index of the last element (or an out-of-range control value) was provided + lua_pushnil(L); + lua_pushnil(L); + // return nil, nil + } else { + auto it = bl->begin(index-1); + lua_pushinteger(L, index); + push_bufferlist_byte(L, it); + // return key, value + } + + return TWO_RETURNVALS; + } + + static int LenClosure(lua_State* L) { + const auto bl = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(1))); + + lua_pushinteger(L, bl->length()); + + return ONE_RETURNVAL; + } +}; + +int RGWObjFilter::execute(bufferlist& bl, off_t offset, const char* op_name) const { + auto L = luaL_newstate(); + lua_state_guard lguard(L); + + open_standard_libs(L); + + create_debug_action(L, s->cct); + + // create the "Data" table + create_metatable(L, true, &bl); + lua_getglobal(L, BufferlistMetaTable::TableName().c_str()); + ceph_assert(lua_istable(L, -1)); + + // create the "Request" table + 
request::create_top_metatable(L, s, op_name); + + // create the "Offset" variable + lua_pushinteger(L, offset); + lua_setglobal(L, "Offset"); + + if (s->penv.lua.background) { + // create the "RGW" table + s->penv.lua.background->create_background_metatable(L); + lua_getglobal(L, rgw::lua::RGWTable::TableName().c_str()); + ceph_assert(lua_istable(L, -1)); + } + + try { + // execute the lua script + if (luaL_dostring(L, script.c_str()) != LUA_OK) { + const std::string err(lua_tostring(L, -1)); + ldpp_dout(s, 1) << "Lua ERROR: " << err << dendl; + return -EINVAL; + } + } catch (const std::runtime_error& e) { + ldpp_dout(s, 1) << "Lua ERROR: " << e.what() << dendl; + return -EINVAL; + } + + return 0; +} + +int RGWGetObjFilter::handle_data(bufferlist& bl, + off_t bl_ofs, + off_t bl_len) { + filter.execute(bl, bl_ofs, "get_obj"); + // return value is ignored since we don't want to fail execution if lua script fails + return RGWGetObj_Filter::handle_data(bl, bl_ofs, bl_len); +} + +int RGWPutObjFilter::process(bufferlist&& data, uint64_t logical_offset) { + filter.execute(data, logical_offset, "put_obj"); + // return value is ignored since we don't want to fail execution if lua script fails + return rgw::putobj::Pipe::process(std::move(data), logical_offset); +} + +} // namespace rgw::lua + diff --git a/src/rgw/rgw_lua_data_filter.h b/src/rgw/rgw_lua_data_filter.h new file mode 100644 index 000000000..75596b64e --- /dev/null +++ b/src/rgw/rgw_lua_data_filter.h @@ -0,0 +1,52 @@ +#pragma once + +#include "rgw_op.h" + +class DoutPrefixProvider; + +namespace rgw::lua { + +class RGWObjFilter { + req_state* const s; + const std::string script; + +public: + RGWObjFilter(req_state* s, + const std::string& script) : + s(s), script(script) {} + + int execute(bufferlist& bl, off_t offset, const char* op_name) const; +}; + +class RGWGetObjFilter : public RGWGetObj_Filter { + const RGWObjFilter filter; + +public: + RGWGetObjFilter(req_state* s, + const std::string& script, + 
RGWGetObj_Filter* next) : RGWGetObj_Filter(next), filter(s, script) + {} + + ~RGWGetObjFilter() override = default; + + int handle_data(bufferlist& bl, + off_t bl_ofs, + off_t bl_len) override; + +}; + +class RGWPutObjFilter : public rgw::putobj::Pipe { + const RGWObjFilter filter; + +public: + RGWPutObjFilter(req_state* s, + const std::string& script, + rgw::sal::DataProcessor* next) : rgw::putobj::Pipe(next), filter(s, script) + {} + + ~RGWPutObjFilter() override = default; + + int process(bufferlist&& data, uint64_t logical_offset) override; +}; +} // namespace rgw::lua + diff --git a/src/rgw/rgw_lua_request.cc b/src/rgw/rgw_lua_request.cc new file mode 100644 index 000000000..6d324d4fc --- /dev/null +++ b/src/rgw/rgw_lua_request.cc @@ -0,0 +1,906 @@ +#include +#include +#include +#include "common/dout.h" +#include "services/svc_zone.h" +#include "rgw_lua_utils.h" +#include "rgw_lua.h" +#include "rgw_common.h" +#include "rgw_log.h" +#include "rgw_op.h" +#include "rgw_process_env.h" +#include "rgw_zone.h" +#include "rgw_acl.h" +#include "rgw_sal_rados.h" +#include "rgw_lua_background.h" +#include "rgw_perf_counters.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::lua::request { + +// closure that performs the ops log action +// e.g. 
+// Request.Log() +// +constexpr const char* RequestLogAction{"Log"}; + +int RequestLog(lua_State* L) +{ + const auto rest = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + const auto olog = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL))); + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(THIRD_UPVAL))); + const auto op(reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FOURTH_UPVAL)))); + if (s) { + const auto rc = rgw_log_op(rest, s, op, olog); + lua_pushinteger(L, rc); + } else { + ldpp_dout(s, 1) << "Lua ERROR: missing request state, cannot use ops log" << dendl; + lua_pushinteger(L, -EINVAL); + } + + return ONE_RETURNVAL; +} + +int SetAttribute(lua_State* L) { + auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(1))); + + if (!s->trace || !s->trace->IsRecording()) { + return 0; + } + + auto key = luaL_checkstring(L, 1); + int value_type = lua_type(L, 2); + + switch (value_type) { + case LUA_TSTRING: + s->trace->SetAttribute(key, lua_tostring(L, 2)); + break; + + case LUA_TNUMBER: + if (lua_isinteger(L, 2)) { + s->trace->SetAttribute(key, static_cast(lua_tointeger(L, 2))); + } else { + s->trace->SetAttribute(key, static_cast(lua_tonumber(L, 2))); + } + break; + + default: + luaL_error(L, "unsupported value type for SetAttribute"); + } + return 0; +} + +int AddEvent(lua_State* L) { + auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(1))); + + if (!s->trace || !s->trace->IsRecording()) { + return 0; + } + + int args = lua_gettop(L); + if (args == 1) { + auto log = luaL_checkstring(L, 1); + s->trace->AddEvent(log); + } else if(args == 2) { + auto event_name = luaL_checkstring(L, 1); + std::unordered_map event_values; + lua_pushnil(L); + while (lua_next(L, 2) != 0) { + if (lua_type(L, -2) != LUA_TSTRING) { + // skip pair if key is not a string + lua_pop(L, 1); + continue; + } + + auto key = luaL_checkstring(L, -2); + int value_type = lua_type(L, -1); + switch (value_type) { + case 
LUA_TSTRING: + event_values.emplace(key, lua_tostring(L, -1)); + break; + + case LUA_TNUMBER: + if (lua_isinteger(L, -1)) { + event_values.emplace(key, static_cast(lua_tointeger(L, -1))); + } else { + event_values.emplace(key, static_cast(lua_tonumber(L, -1))); + } + break; + } + lua_pop(L, 1); + } + lua_pop(L, 1); + s->trace->AddEvent(event_name, event_values); + } + return 0; +} + +struct ResponseMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Response";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto err = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "HTTPStatusCode") == 0) { + lua_pushinteger(L, err->http_ret); + } else if (strcasecmp(index, "RGWCode") == 0) { + lua_pushinteger(L, err->ret); + } else if (strcasecmp(index, "HTTPStatus") == 0) { + pushstring(L, err->err_code); + } else if (strcasecmp(index, "Message") == 0) { + pushstring(L, err->message); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } + + static int NewIndexClosure(lua_State* L) { + auto err = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "HTTPStatusCode") == 0) { + err->http_ret = luaL_checkinteger(L, 3); + } else if (strcasecmp(index, "RGWCode") == 0) { + err->ret = luaL_checkinteger(L, 3); + } else if (strcasecmp(index, "HTTPStatus") == 0) { + err->err_code.assign(luaL_checkstring(L, 3)); + } else if (strcasecmp(index, "Message") == 0) { + err->message.assign(luaL_checkstring(L, 3)); + } else { + return error_unknown_field(L, index, TableName()); + } + return NO_RETURNVAL; + } +}; + +struct QuotaMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Quota";} + static std::string Name() {return TableName() + "Meta";} + + static int 
IndexClosure(lua_State* L) { + const auto info = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "MaxSize") == 0) { + lua_pushinteger(L, info->max_size); + } else if (strcasecmp(index, "MaxObjects") == 0) { + lua_pushinteger(L, info->max_objects); + } else if (strcasecmp(index, "Enabled") == 0) { + lua_pushboolean(L, info->enabled); + } else if (strcasecmp(index, "Rounded") == 0) { + lua_pushboolean(L, !info->check_on_raw); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct PlacementRuleMetaTable : public EmptyMetaTable { + static std::string TableName() {return "PlacementRule";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto rule = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Name") == 0) { + pushstring(L, rule->name); + } else if (strcasecmp(index, "StorageClass") == 0) { + pushstring(L, rule->storage_class); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct UserMetaTable : public EmptyMetaTable { + static std::string TableName() {return "User";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto user = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Tenant") == 0) { + pushstring(L, user->tenant); + } else if (strcasecmp(index, "Id") == 0) { + pushstring(L, user->id); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct TraceMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Trace";} + static std::string Name() {return TableName() + "Meta";} + + 
static int IndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Enable") == 0) { + lua_pushboolean(L, s->trace_enabled); + } else if(strcasecmp(index, "SetAttribute") == 0) { + lua_pushlightuserdata(L, s); + lua_pushcclosure(L, SetAttribute, ONE_UPVAL); + } else if(strcasecmp(index, "AddEvent") == 0) { + lua_pushlightuserdata(L, s); + lua_pushcclosure(L, AddEvent, ONE_UPVAL); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } + + static int NewIndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Enable") == 0) { + s->trace_enabled = lua_toboolean(L, 3); + } else { + return error_unknown_field(L, index, TableName()); + } + return NO_RETURNVAL; + } +}; + +struct OwnerMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Owner";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto owner = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "DisplayName") == 0) { + pushstring(L, owner->get_display_name()); + } else if (strcasecmp(index, "User") == 0) { + create_metatable(L, false, &(owner->get_id())); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct BucketMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Bucket";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + const auto bucket = s->bucket.get(); + + const char* index = luaL_checkstring(L, 2); + + if 
(rgw::sal::Bucket::empty(bucket)) { + if (strcasecmp(index, "Name") == 0) { + pushstring(L, s->init_state.url_bucket); + } else { + lua_pushnil(L); + } + } else if (strcasecmp(index, "Tenant") == 0) { + pushstring(L, bucket->get_tenant()); + } else if (strcasecmp(index, "Name") == 0) { + pushstring(L, bucket->get_name()); + } else if (strcasecmp(index, "Marker") == 0) { + pushstring(L, bucket->get_marker()); + } else if (strcasecmp(index, "Id") == 0) { + pushstring(L, bucket->get_bucket_id()); + } else if (strcasecmp(index, "Count") == 0) { + lua_pushinteger(L, bucket->get_count()); + } else if (strcasecmp(index, "Size") == 0) { + lua_pushinteger(L, bucket->get_size()); + } else if (strcasecmp(index, "ZoneGroupId") == 0) { + pushstring(L, bucket->get_info().zonegroup); + } else if (strcasecmp(index, "CreationTime") == 0) { + pushtime(L, bucket->get_creation_time()); + } else if (strcasecmp(index, "MTime") == 0) { + pushtime(L, bucket->get_modification_time()); + } else if (strcasecmp(index, "Quota") == 0) { + create_metatable(L, false, &(bucket->get_info().quota)); + } else if (strcasecmp(index, "PlacementRule") == 0) { + create_metatable(L, false, &(bucket->get_info().placement_rule)); + } else if (strcasecmp(index, "User") == 0) { + create_metatable(L, false, &(bucket->get_info().owner)); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } + + static int NewIndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + const auto bucket = s->bucket.get(); + + const char* index = luaL_checkstring(L, 2); + + if (rgw::sal::Bucket::empty(bucket)) { + if (strcasecmp(index, "Name") == 0) { + s->init_state.url_bucket = luaL_checkstring(L, 3); + return NO_RETURNVAL; + } + } + return error_unknown_field(L, index, TableName()); + } +}; + +struct ObjectMetaTable : public EmptyMetaTable { + static const std::string TableName() {return "Object";} + static std::string Name() 
{return TableName() + "Meta";} + + using Type = rgw::sal::Object; + + static int IndexClosure(lua_State* L) { + const auto obj = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Name") == 0) { + pushstring(L, obj->get_name()); + } else if (strcasecmp(index, "Instance") == 0) { + pushstring(L, obj->get_instance()); + } else if (strcasecmp(index, "Id") == 0) { + pushstring(L, obj->get_oid()); + } else if (strcasecmp(index, "Size") == 0) { + lua_pushinteger(L, obj->get_obj_size()); + } else if (strcasecmp(index, "MTime") == 0) { + pushtime(L, obj->get_mtime()); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct GrantMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Grant";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto grant = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Type") == 0) { + lua_pushinteger(L, grant->get_type().get_type()); + } else if (strcasecmp(index, "User") == 0) { + const auto id_ptr = grant->get_id(); + if (id_ptr) { + create_metatable(L, false, const_cast(id_ptr)); + } else { + lua_pushnil(L); + } + } else if (strcasecmp(index, "Permission") == 0) { + lua_pushinteger(L, grant->get_permission().get_permissions()); + } else if (strcasecmp(index, "GroupType") == 0) { + lua_pushinteger(L, grant->get_group()); + } else if (strcasecmp(index, "Referer") == 0) { + pushstring(L, grant->get_referer()); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct GrantsMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Grants";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const 
auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + const auto it = map->find(std::string(index)); + if (it == map->end()) { + lua_pushnil(L); + } else { + create_metatable(L, false, &(it->second)); + } + return ONE_RETURNVAL; + } + + static int PairsClosure(lua_State* L) { + auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + ceph_assert(map); + lua_pushlightuserdata(L, map); + lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function + lua_pushnil(L); // indicate this is the first call + // return stateless_iter, nil + + return TWO_RETURNVALS; + } + + // iterator step used by pairs(): takes the previous key (or nil on the first call) + // and returns the next non-empty key with its grant, or nil, nil when the iteration is over + static int stateless_iter(lua_State* L) { + // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs + auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + ACLGrantMap::iterator next_it; + if (lua_isnil(L, -1)) { + next_it = map->begin(); + } else { + const char* index = luaL_checkstring(L, 2); + const auto it = map->find(std::string(index)); + if (it == map->end()) { + // the key comes from the script and may not exist in the map: + // end the iteration instead of asserting + lua_pushnil(L); + lua_pushnil(L); + return TWO_RETURNVALS; + // return nil, nil + } + next_it = std::next(it); + } + + if (next_it == map->end()) { + // index of the last element was provided + lua_pushnil(L); + lua_pushnil(L); + return TWO_RETURNVALS; + // return nil, nil + } + + while (next_it->first.empty()) { + // this is a multimap and the next element does not have a unique key + ++next_it; + if (next_it == map->end()) { + // index of the last element was provided + lua_pushnil(L); + lua_pushnil(L); + return TWO_RETURNVALS; + // return nil, nil + } + } + + pushstring(L, next_it->first); + create_metatable(L, false, &(next_it->second)); + // return key, value + + return TWO_RETURNVALS; + } + + static int LenClosure(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + lua_pushinteger(L, map->size()); + + return ONE_RETURNVAL; + } +}; + +struct ACLMetaTable : public EmptyMetaTable { + static 
std::string TableName() {return "ACL";} + static std::string Name() {return TableName() + "Meta";} + + using Type = RGWAccessControlPolicy; + + static int IndexClosure(lua_State* L) { + const auto acl = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Owner") == 0) { + create_metatable(L, false, &(acl->get_owner())); + } else if (strcasecmp(index, "Grants") == 0) { + create_metatable(L, false, &(acl->get_acl().get_grant_map())); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct StatementsMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Statements";} + static std::string Name() {return TableName() + "Meta";} + + using Type = std::vector; + + static std::string statement_to_string(const rgw::IAM::Statement& statement) { + std::stringstream ss; + ss << statement; + return ss.str(); + } + + static int IndexClosure(lua_State* L) { + const auto statements = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const auto index = luaL_checkinteger(L, 2); + + if (index >= (int)statements->size() || index < 0) { + lua_pushnil(L); + } else { + // TODO: policy language could be interpreted to lua and executed as such + pushstring(L, statement_to_string((*statements)[index])); + } + return ONE_RETURNVAL; + } + + static int PairsClosure(lua_State* L) { + auto statements = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + ceph_assert(statements); + lua_pushlightuserdata(L, statements); + lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function + lua_pushnil(L); // indicate this is the first call + // return stateless_iter, nil + + return TWO_RETURNVALS; + } + + static int stateless_iter(lua_State* L) { + auto statements = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + size_t next_it; + if (lua_isnil(L, -1)) { 
+ next_it = 0; + } else { + const auto it = luaL_checkinteger(L, -1); + next_it = it+1; + } + + if (next_it >= statements->size()) { + // index of the last element was provided + lua_pushnil(L); + lua_pushnil(L); + // return nil, nil + } else { + lua_pushinteger(L, next_it); + pushstring(L, statement_to_string((*statements)[next_it])); + // return key, value + } + + return TWO_RETURNVALS; + } + + static int LenClosure(lua_State* L) { + const auto statements = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + lua_pushinteger(L, statements->size()); + + return ONE_RETURNVAL; + } +}; + +struct PolicyMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Policy";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto policy = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Text") == 0) { + pushstring(L, policy->text); + } else if (strcasecmp(index, "Id") == 0) { + // TODO create pushstring for std::unique_ptr + if (!policy->id) { + lua_pushnil(L); + } else { + pushstring(L, policy->id.get()); + } + } else if (strcasecmp(index, "Statements") == 0) { + create_metatable(L, false, &(policy->statements)); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct PoliciesMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Policies";} + static std::string Name() {return TableName() + "Meta";} + + using Type = std::vector; + + static int IndexClosure(lua_State* L) { + const auto policies = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const auto index = luaL_checkinteger(L, 2); + + if (index >= (int)policies->size() || index < 0) { + lua_pushnil(L); + } else { + create_metatable(L, false, &((*policies)[index])); + } + return ONE_RETURNVAL; + } + + static int 
PairsClosure(lua_State* L) { + auto policies = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + ceph_assert(policies); + lua_pushlightuserdata(L, policies); + lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function + lua_pushnil(L); // indicate this is the first call + // return stateless_iter, nil + + return TWO_RETURNVALS; + } + + static int stateless_iter(lua_State* L) { + auto policies = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + size_t next_it; + if (lua_isnil(L, -1)) { + next_it = 0; + } else { + ceph_assert(lua_isinteger(L, -1)); + const auto it = luaL_checkinteger(L, -1); + next_it = it+1; + } + + if (next_it >= policies->size()) { + // index of the last element was provided + lua_pushnil(L); + lua_pushnil(L); + // return nil, nil + } else { + lua_pushinteger(L, next_it); + create_metatable(L, false, &((*policies)[next_it])); + // return key, value + } + + return TWO_RETURNVALS; + } + + static int LenClosure(lua_State* L) { + const auto policies = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + lua_pushinteger(L, policies->size()); + + return ONE_RETURNVAL; + } +}; + +struct HTTPMetaTable : public EmptyMetaTable { + static std::string TableName() {return "HTTP";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto info = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Parameters") == 0) { + create_metatable>(L, false, &(info->args.get_params())); + } else if (strcasecmp(index, "Resources") == 0) { + // TODO: add non-const api to get resources + create_metatable>(L, false, + const_cast*>(&(info->args.get_sub_resources()))); + } else if (strcasecmp(index, "Metadata") == 0) { + create_metatable>>(L, false, &(info->x_meta_map)); + } else if (strcasecmp(index, "Host") == 0) { + pushstring(L, info->host); 
+ } else if (strcasecmp(index, "Method") == 0) { + pushstring(L, info->method); + } else if (strcasecmp(index, "URI") == 0) { + pushstring(L, info->request_uri); + } else if (strcasecmp(index, "QueryString") == 0) { + pushstring(L, info->request_params); + } else if (strcasecmp(index, "Domain") == 0) { + pushstring(L, info->domain); + } else if (strcasecmp(index, "StorageClass") == 0) { + pushstring(L, info->storage_class); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } + + static int NewIndexClosure(lua_State* L) { + auto info = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "StorageClass") == 0) { + info->storage_class = luaL_checkstring(L, 3); + } else { + return error_unknown_field(L, index, TableName()); + } + return NO_RETURNVAL; + } +}; + +struct CopyFromMetaTable : public EmptyMetaTable { + static std::string TableName() {return "CopyFrom";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Tenant") == 0) { + pushstring(L, s->src_tenant_name); + } else if (strcasecmp(index, "Bucket") == 0) { + pushstring(L, s->src_bucket_name); + } else if (strcasecmp(index, "Object") == 0) { + create_metatable(L, false, s->src_object); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct ZoneGroupMetaTable : public EmptyMetaTable { + static std::string TableName() {return "ZoneGroup";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "Name") == 0) { + 
pushstring(L, s->zonegroup_name); + } else if (strcasecmp(index, "Endpoint") == 0) { + pushstring(L, s->zonegroup_endpoint); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +struct RequestMetaTable : public EmptyMetaTable { + static std::string TableName() {return "Request";} + static std::string Name() {return TableName() + "Meta";} + + // __index closure that expect req_state to be captured + static int IndexClosure(lua_State* L) { + const auto s = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + const auto op_name = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (strcasecmp(index, "RGWOp") == 0) { + pushstring(L, op_name); + } else if (strcasecmp(index, "DecodedURI") == 0) { + pushstring(L, s->decoded_uri); + } else if (strcasecmp(index, "ContentLength") == 0) { + lua_pushinteger(L, s->content_length); + } else if (strcasecmp(index, "GenericAttributes") == 0) { + create_metatable>(L, false, &(s->generic_attrs)); + } else if (strcasecmp(index, "Response") == 0) { + create_metatable(L, false, &(s->err)); + } else if (strcasecmp(index, "SwiftAccountName") == 0) { + if (s->dialect == "swift") { + pushstring(L, s->account_name); + } else { + lua_pushnil(L); + } + } else if (strcasecmp(index, "Bucket") == 0) { + create_metatable(L, false, s); + } else if (strcasecmp(index, "Object") == 0) { + create_metatable(L, false, s->object); + } else if (strcasecmp(index, "CopyFrom") == 0) { + if (s->op_type == RGW_OP_COPY_OBJ) { + create_metatable(L, s); + } else { + lua_pushnil(L); + } + } else if (strcasecmp(index, "ObjectOwner") == 0) { + create_metatable(L, false, &(s->owner)); + } else if (strcasecmp(index, "ZoneGroup") == 0) { + create_metatable(L, false, s); + } else if (strcasecmp(index, "UserACL") == 0) { + create_metatable(L, false, s->user_acl); + } else if (strcasecmp(index, "BucketACL") == 0) { + create_metatable(L, 
false, s->bucket_acl); + } else if (strcasecmp(index, "ObjectACL") == 0) { + create_metatable(L, false, s->object_acl); + } else if (strcasecmp(index, "Environment") == 0) { + create_metatable>(L, false, &(s->env)); + } else if (strcasecmp(index, "Policy") == 0) { + // TODO: create a wrapper to std::optional + if (!s->iam_policy) { + lua_pushnil(L); + } else { + create_metatable(L, false, s->iam_policy.get_ptr()); + } + } else if (strcasecmp(index, "UserPolicies") == 0) { + create_metatable(L, false, &(s->iam_user_policies)); + } else if (strcasecmp(index, "RGWId") == 0) { + pushstring(L, s->host_id); + } else if (strcasecmp(index, "HTTP") == 0) { + create_metatable(L, false, &(s->info)); + } else if (strcasecmp(index, "Time") == 0) { + pushtime(L, s->time); + } else if (strcasecmp(index, "Dialect") == 0) { + pushstring(L, s->dialect); + } else if (strcasecmp(index, "Id") == 0) { + pushstring(L, s->req_id); + } else if (strcasecmp(index, "TransactionId") == 0) { + pushstring(L, s->trans_id); + } else if (strcasecmp(index, "Tags") == 0) { + create_metatable>(L, false, &(s->tagset.get_tags())); + } else if (strcasecmp(index, "User") == 0) { + if (!s->user) { + lua_pushnil(L); + } else { + create_metatable(L, false, const_cast(&(s->user->get_id()))); + } + } else if (strcasecmp(index, "Trace") == 0) { + create_metatable(L, false, s); + } else { + return error_unknown_field(L, index, TableName()); + } + return ONE_RETURNVAL; + } +}; + +void create_top_metatable(lua_State* L, req_state* s, const char* op_name) { + create_metatable(L, true, s, const_cast(op_name)); + lua_getglobal(L, RequestMetaTable::TableName().c_str()); + ceph_assert(lua_istable(L, -1)); +} + +int execute( + rgw::sal::Driver* driver, + RGWREST* rest, + OpsLogSink* olog, + req_state* s, + RGWOp* op, + const std::string& script) +{ + auto L = luaL_newstate(); + const char* op_name = op ? 
op->name() : "Unknown"; + lua_state_guard lguard(L); + + open_standard_libs(L); + set_package_path(L, s->penv.lua.luarocks_path); + + create_debug_action(L, s->cct); + + create_metatable(L, true, s, const_cast(op_name)); + + lua_getglobal(L, RequestMetaTable::TableName().c_str()); + ceph_assert(lua_istable(L, -1)); + + // add the ops log action + pushstring(L, RequestLogAction); + lua_pushlightuserdata(L, rest); + lua_pushlightuserdata(L, olog); + lua_pushlightuserdata(L, s); + lua_pushlightuserdata(L, op); + lua_pushcclosure(L, RequestLog, FOUR_UPVALS); + lua_rawset(L, -3); + + if (s->penv.lua.background) { + s->penv.lua.background->create_background_metatable(L); + lua_getglobal(L, rgw::lua::RGWTable::TableName().c_str()); + ceph_assert(lua_istable(L, -1)); + } + + int rc = 0; + try { + // execute the lua script + if (luaL_dostring(L, script.c_str()) != LUA_OK) { + const std::string err(lua_tostring(L, -1)); + ldpp_dout(s, 1) << "Lua ERROR: " << err << dendl; + rc = -1; + } + } catch (const std::runtime_error& e) { + ldpp_dout(s, 1) << "Lua ERROR: " << e.what() << dendl; + rc = -1; + } + if (perfcounter) { + perfcounter->inc((rc == -1 ? 
l_rgw_lua_script_fail : l_rgw_lua_script_ok), 1); + } + + return rc; +} + +} // namespace rgw::lua::request + diff --git a/src/rgw/rgw_lua_request.h b/src/rgw/rgw_lua_request.h new file mode 100644 index 000000000..7c85ac9cd --- /dev/null +++ b/src/rgw/rgw_lua_request.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "include/common_fwd.h" +#include "rgw_sal_fwd.h" + +struct lua_State; +class req_state; +class RGWREST; +class OpsLogSink; + +namespace rgw::lua::request { + +// create the request metatable +void create_top_metatable(lua_State* L, req_state* s, const char* op_name); + +// execute a lua script in the Request context +int execute( + rgw::sal::Driver* driver, + RGWREST* rest, + OpsLogSink* olog, + req_state *s, + RGWOp* op, + const std::string& script); +} // namespace rgw::lua::request + diff --git a/src/rgw/rgw_lua_utils.cc b/src/rgw/rgw_lua_utils.cc new file mode 100644 index 000000000..3ffe23662 --- /dev/null +++ b/src/rgw/rgw_lua_utils.cc @@ -0,0 +1,77 @@ +#include +#include +#include "common/ceph_context.h" +#include "common/dout.h" +#include "rgw_lua_utils.h" +#include "rgw_lua_version.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::lua { + +// TODO - add the folowing generic functions +// lua_push(lua_State* L, const std::string& str) +// template lua_push(lua_State* L, const std::optional& val) +// lua_push(lua_State* L, const ceph::real_time& tp) + +constexpr const char* RGWDebugLogAction{"RGWDebugLog"}; + +int RGWDebugLog(lua_State* L) +{ + auto cct = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + auto message = luaL_checkstring(L, 1); + ldout(cct, 20) << "Lua INFO: " << message << dendl; + return 0; +} + +void create_debug_action(lua_State* L, CephContext* cct) { + lua_pushlightuserdata(L, cct); + lua_pushcclosure(L, RGWDebugLog, ONE_UPVAL); + lua_setglobal(L, RGWDebugLogAction); +} + +void stack_dump(lua_State* L) { + int top = lua_gettop(L); + std::cout << std::endl << " ---------------- Stack Dump 
----------------" << std::endl; + std::cout << "Stack Size: " << top << std::endl; + for (int i = 1, j = -top; i <= top; i++, j++) { + std::cout << "[" << i << "," << j << "]: " << luaL_tolstring(L, i, NULL) << std::endl; + lua_pop(L, 1); + } + std::cout << "--------------- Stack Dump Finished ---------------" << std::endl; +} + +void set_package_path(lua_State* L, const std::string& install_dir) { + if (install_dir.empty()) { + return; + } + lua_getglobal(L, "package"); + if (!lua_istable(L, -1)) { + return; + } + const auto path = install_dir+"/share/lua/"+CEPH_LUA_VERSION+"/?.lua"; + pushstring(L, path); + lua_setfield(L, -2, "path"); + + const auto cpath = install_dir+"/lib/lua/"+CEPH_LUA_VERSION+"/?.so"; + pushstring(L, cpath); + lua_setfield(L, -2, "cpath"); +} + +void open_standard_libs(lua_State* L) { + luaL_openlibs(L); + unsetglobal(L, "load"); + unsetglobal(L, "loadfile"); + unsetglobal(L, "loadstring"); + unsetglobal(L, "dofile"); + unsetglobal(L, "debug"); + // remove os.exit() + lua_getglobal(L, "os"); + lua_pushstring(L, "exit"); + lua_pushnil(L); + lua_settable(L, -3); +} + +} // namespace rgw::lua + diff --git a/src/rgw/rgw_lua_utils.h b/src/rgw/rgw_lua_utils.h new file mode 100644 index 000000000..cc77dae7a --- /dev/null +++ b/src/rgw/rgw_lua_utils.h @@ -0,0 +1,315 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "include/common_fwd.h" +#include "rgw_perf_counters.h" + +namespace rgw::lua { + +// push ceph time in string format: "%Y-%m-%d %H:%M:%S" +template +void pushtime(lua_State* L, const CephTime& tp) +{ + const auto tt = CephTime::clock::to_time_t(tp); + const auto tm = *std::localtime(&tt); + char buff[64]; + std::strftime(buff, sizeof(buff), "%Y-%m-%d %H:%M:%S", &tm); + lua_pushstring(L, buff); +} + +static inline void pushstring(lua_State* L, std::string_view str) +{ + lua_pushlstring(L, str.data(), str.size()); +} + +static inline void unsetglobal(lua_State* L, const char* name) +{ + 
lua_pushnil(L); + lua_setglobal(L, name); +} + +// dump the lua stack to stdout +void stack_dump(lua_State* L); + +class lua_state_guard { + lua_State* l; +public: + lua_state_guard(lua_State* _l) : l(_l) { + if (perfcounter) { + perfcounter->inc(l_rgw_lua_current_vms, 1); + } + } + ~lua_state_guard() { + lua_close(l); + if (perfcounter) { + perfcounter->dec(l_rgw_lua_current_vms, 1); + } + } + void reset(lua_State* _l=nullptr) {l = _l;} +}; + +constexpr const int MAX_LUA_VALUE_SIZE = 1000; +constexpr const int MAX_LUA_KEY_ENTRIES = 100000; + +constexpr auto ONE_UPVAL = 1; +constexpr auto TWO_UPVALS = 2; +constexpr auto THREE_UPVALS = 3; +constexpr auto FOUR_UPVALS = 4; +constexpr auto FIVE_UPVALS = 5; + +constexpr auto FIRST_UPVAL = 1; +constexpr auto SECOND_UPVAL = 2; +constexpr auto THIRD_UPVAL = 3; +constexpr auto FOURTH_UPVAL = 4; +constexpr auto FIFTH_UPVAL = 5; + +constexpr auto NO_RETURNVAL = 0; +constexpr auto ONE_RETURNVAL = 1; +constexpr auto TWO_RETURNVALS = 2; +constexpr auto THREE_RETURNVALS = 3; +constexpr auto FOUR_RETURNVALS = 4; +// utility functions to create a metatable +// and tie it to an unnamed table +// +// add an __index method to it, to allow reading values +// if "readonly" parameter is set to "false", it will also add +// a __newindex method to it, to allow writing values +// if the "toplevel" parameter is set to "true", it will name the +// table as well as the metatable, this would allow direct access from +// the lua script. +// +// The MetaTable is expected to be a class with the following members: +// Name (static function returning the unique name of the metatable) +// TableName (static function returning the unique name of the table - needed only for "toplevel" tables) +// Type (typename) - the type of the "upvalue" (the type that the meta table represent) +// IndexClosure (static function return "int" and accept "lua_State*") +// NewIndexClosure (static function return "int" and accept "lua_State*") +// e.g. 
+// struct MyStructMetaTable { +// static std::string TableName() { +// return "MyStruct"; +// } +// +// using Type = MyStruct; +// +// static int IndexClosure(lua_State* L) { +// const auto value = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); +// ... +// } + +// static int NewIndexClosure(lua_State* L) { +// auto value = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); +// ... +// } +// }; +// + +template +void create_metatable(lua_State* L, bool toplevel, Upvalues... upvalues) +{ + constexpr auto upvals_size = sizeof...(upvalues); + const std::array upvalue_arr = {upvalues...}; + // create table + lua_newtable(L); + if (toplevel) { + // duplicate the table to make sure it remain in the stack + lua_pushvalue(L, -1); + // give table a name (in cae of "toplevel") + lua_setglobal(L, MetaTable::TableName().c_str()); + } + // create metatable + [[maybe_unused]] const auto rc = luaL_newmetatable(L, MetaTable::Name().c_str()); + lua_pushliteral(L, "__index"); + for (const auto upvalue : upvalue_arr) { + lua_pushlightuserdata(L, upvalue); + } + lua_pushcclosure(L, MetaTable::IndexClosure, upvals_size); + lua_rawset(L, -3); + lua_pushliteral(L, "__newindex"); + for (const auto upvalue : upvalue_arr) { + lua_pushlightuserdata(L, upvalue); + } + lua_pushcclosure(L, MetaTable::NewIndexClosure, upvals_size); + lua_rawset(L, -3); + lua_pushliteral(L, "__pairs"); + for (const auto upvalue : upvalue_arr) { + lua_pushlightuserdata(L, upvalue); + } + lua_pushcclosure(L, MetaTable::PairsClosure, upvals_size); + lua_rawset(L, -3); + lua_pushliteral(L, "__len"); + for (const auto upvalue : upvalue_arr) { + lua_pushlightuserdata(L, upvalue); + } + lua_pushcclosure(L, MetaTable::LenClosure, upvals_size); + lua_rawset(L, -3); + // tie metatable and table + lua_setmetatable(L, -2); +} + +template +void create_metatable(lua_State* L, bool toplevel, std::unique_ptr& ptr) +{ + if (ptr) { + create_metatable(L, toplevel, reinterpret_cast(ptr.get())); + 
} else { + lua_pushnil(L); + } +} + +// following struct may be used as a base class for other MetaTable classes +// note, however, this is not mandatory to use it as a base +struct EmptyMetaTable { + // by default everythinmg is "readonly" + // to change, overload this function in the derived + static int NewIndexClosure(lua_State* L) { + return luaL_error(L, "trying to write to readonly field"); + } + + // by default nothing is iterable + // to change, overload this function in the derived + static int PairsClosure(lua_State* L) { + return luaL_error(L, "trying to iterate over non-iterable field"); + } + + // by default nothing is iterable + // to change, overload this function in the derived + static int LenClosure(lua_State* L) { + return luaL_error(L, "trying to get length of non-iterable field"); + } + + static int error_unknown_field(lua_State* L, const std::string& index, const std::string& table) { + return luaL_error(L, "unknown field name: %s provided to: %s", + index.c_str(), table.c_str()); + } +}; + +// create a debug log action +// it expects CephContext to be captured +// it expects one string parameter, which is the message to log +// could be executed from any context that has CephContext +// e.g. +// RGWDebugLog("hello world from lua") +// +void create_debug_action(lua_State* L, CephContext* cct); + +// set the packages search path according to: +// package.path = "/share/lua/5.3/?.lua" │ LuaRocks. 
+// package.cpath= "/lib/lua/5.3/?.so" +void set_package_path(lua_State* L, const std::string& install_dir); + +// open standard lua libs and remove the following functions: +// os.exit() +// load() +// loadfile() +// loadstring() +// dofile() +// and the "debug" library +void open_standard_libs(lua_State* L); + +typedef int MetaTableClosure(lua_State* L); + +template> +int StringMapWriteableNewIndex(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + if (lua_isnil(L, 3) == 0) { + const char* value = luaL_checkstring(L, 3); + if (strnlen(value, MAX_LUA_VALUE_SIZE) + strnlen(index, MAX_LUA_VALUE_SIZE) + > MAX_LUA_VALUE_SIZE) { + return luaL_error(L, "Lua maximum size of entry limit exceeded"); + } else if (map->size() > MAX_LUA_KEY_ENTRIES) { + return luaL_error(L, "Lua max number of entries limit exceeded"); + } else { + map->insert_or_assign(index, value); + } + } else { + map->erase(std::string(index)); + } + + return NO_RETURNVAL; +} + +template, + MetaTableClosure NewIndex=EmptyMetaTable::NewIndexClosure> +struct StringMapMetaTable : public EmptyMetaTable { + + static std::string TableName() {return "StringMap";} + static std::string Name() {return TableName() + "Meta";} + + static int IndexClosure(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + const char* index = luaL_checkstring(L, 2); + + const auto it = map->find(std::string(index)); + if (it == map->end()) { + lua_pushnil(L); + } else { + pushstring(L, it->second); + } + return ONE_RETURNVAL; + } + + static int NewIndexClosure(lua_State* L) { + return NewIndex(L); + } + + static int PairsClosure(lua_State* L) { + auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + ceph_assert(map); + lua_pushlightuserdata(L, map); + lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function + 
lua_pushnil(L); // indicate this is the first call + // return stateless_iter, nil + + return TWO_RETURNVALS; + } + + static int stateless_iter(lua_State* L) { + // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs + auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + typename MapType::const_iterator next_it; + if (lua_isnil(L, -1)) { + next_it = map->begin(); + } else { + const char* index = luaL_checkstring(L, 2); + const auto it = map->find(std::string(index)); + ceph_assert(it != map->end()); + next_it = std::next(it); + } + + if (next_it == map->end()) { + // index of the last element was provided + lua_pushnil(L); + lua_pushnil(L); + // return nil, nil + } else { + pushstring(L, next_it->first); + pushstring(L, next_it->second); + // return key, value + } + + return TWO_RETURNVALS; + } + + static int LenClosure(lua_State* L) { + const auto map = reinterpret_cast(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL))); + + lua_pushinteger(L, map->size()); + + return ONE_RETURNVAL; + } +}; + +} // namespace rgw::lua + diff --git a/src/rgw/rgw_lua_version.h b/src/rgw/rgw_lua_version.h new file mode 100644 index 000000000..ff096334a --- /dev/null +++ b/src/rgw/rgw_lua_version.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace rgw::lua { + +const std::string CEPH_LUA_VERSION(LUA_VERSION_MAJOR "." 
LUA_VERSION_MINOR); + +} + diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc new file mode 100644 index 000000000..6d2630251 --- /dev/null +++ b/src/rgw/rgw_main.cc @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "global/signal_handler.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/TracepointProvider.h" +#include "rgw_main.h" +#include "rgw_signal.h" +#include "rgw_common.h" +#include "rgw_lib.h" +#include "rgw_log.h" + +#ifdef HAVE_SYS_PRCTL_H +#include +#endif + +using namespace std; + +static constexpr auto dout_subsys = ceph_subsys_rgw; + +static sig_t sighandler_alrm; + +static void godown_alarm(int signum) +{ + _exit(0); +} + +class C_InitTimeout : public Context { +public: + C_InitTimeout() {} + void finish(int r) override { + derr << "Initialization timeout, failed to initialize" << dendl; + exit(1); + } +}; + +static int usage() +{ + cout << "usage: radosgw [options...]" << std::endl; + cout << "options:\n"; + cout << " --rgw-region= region in which radosgw runs\n"; + cout << " --rgw-zone= zone in which radosgw runs\n"; + cout << " --rgw-socket-path= specify a unix domain socket path\n"; + cout << " -m monaddress[:port] connect to specified monitor\n"; + cout << " --keyring= path to radosgw keyring\n"; + cout << " --logfile= file to log debug output\n"; + cout << " --debug-rgw=/ set radosgw debug level\n"; + generic_server_usage(); + + return 0; +} + +/* + * start up the RADOS connection and then handle HTTP messages as they come in + */ +int main(int argc, char *argv[]) +{ + int r{0}; + + // dout() messages will be sent to stderr, but FCGX wants messages on stdout + // Redirect stderr to stdout. 
+ TEMP_FAILURE_RETRY(close(STDERR_FILENO)); + if (TEMP_FAILURE_RETRY(dup2(STDOUT_FILENO, STDERR_FILENO)) < 0) { + int err = errno; + cout << "failed to redirect stderr to stdout: " << cpp_strerror(err) + << std::endl; + return ENOSYS; + } + + /* alternative default for module */ + map defaults = { + { "debug_rgw", "1/5" }, + { "keyring", "$rgw_data/keyring" }, + { "objecter_inflight_ops", "24576" }, + // require a secure mon connection by default + { "ms_mon_client_mode", "secure" }, + { "auth_client_required", "cephx" } + }; + + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + int flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS; + // Prevent global_init() from dropping permissions until frontends can bind + // privileged ports + flags |= CINIT_FLAG_DEFER_DROP_PRIVILEGES; + + auto cct = rgw_global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, flags); + + DoutPrefix dp(cct.get(), dout_subsys, "rgw main: "); + rgw::AppMain main(&dp); + + main.init_frontends1(false /* nfs */); + main.init_numa(); + + if (g_conf()->daemonize) { + global_init_daemonize(g_ceph_context); + } + ceph::mutex mutex = ceph::make_mutex("main"); + SafeTimer init_timer(g_ceph_context, mutex); + init_timer.init(); + mutex.lock(); + init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout); + mutex.unlock(); + + common_init_finish(g_ceph_context); + init_async_signal_handler(); + + /* XXXX check locations thru sighandler_alrm */ + register_async_signal_handler(SIGHUP, rgw::signal::sighup_handler); + r = rgw::signal::signal_fd_init(); + if (r < 0) { + derr << "ERROR: unable to initialize signal fds" << dendl; + exit(1); + } + + register_async_signal_handler(SIGTERM, rgw::signal::handle_sigterm); + register_async_signal_handler(SIGINT, rgw::signal::handle_sigterm); + 
register_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm); + sighandler_alrm = signal(SIGALRM, godown_alarm); + + main.init_perfcounters(); + main.init_http_clients(); + + main.init_storage(); + if (! main.get_driver()) { + mutex.lock(); + init_timer.cancel_all_events(); + init_timer.shutdown(); + mutex.unlock(); + + derr << "Couldn't init storage provider (RADOS)" << dendl; + return EIO; + } + + main.cond_init_apis(); + + mutex.lock(); + init_timer.cancel_all_events(); + init_timer.shutdown(); + mutex.unlock(); + + main.init_ldap(); + main.init_opslog(); + main.init_tracepoints(); + main.init_lua(); + main.init_frontends2(nullptr /* RGWLib */); + main.init_notification_endpoints(); + +#if defined(HAVE_SYS_PRCTL_H) + if (prctl(PR_SET_DUMPABLE, 1) == -1) { + cerr << "warning: unable to set dumpable flag: " << cpp_strerror(errno) << std::endl; + } +#endif + + rgw::signal::wait_shutdown(); + + derr << "shutting down" << dendl; + + const auto finalize_async_signals = []() { + unregister_async_signal_handler(SIGHUP, rgw::signal::sighup_handler); + unregister_async_signal_handler(SIGTERM, rgw::signal::handle_sigterm); + unregister_async_signal_handler(SIGINT, rgw::signal::handle_sigterm); + unregister_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm); + shutdown_async_signal_handler(); + }; + + main.shutdown(finalize_async_signals); + + dout(1) << "final shutdown" << dendl; + + rgw::signal::signal_fd_finalize(); + + return 0; +} /* main(int argc, char* argv[]) */ diff --git a/src/rgw/rgw_main.h b/src/rgw/rgw_main.h new file mode 100644 index 000000000..bbe514351 --- /dev/null +++ b/src/rgw/rgw_main.h @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License 
version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include +#include "rgw_common.h" +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_period_pusher.h" +#include "rgw_realm_reloader.h" +#include "rgw_ldap.h" +#include "rgw_lua.h" +#include "rgw_dmclock_scheduler_ctx.h" +#include "rgw_ratelimit.h" + + +class RGWPauser : public RGWRealmReloader::Pauser { + std::vector pausers; + +public: + ~RGWPauser() override = default; + + void add_pauser(Pauser* pauser) { + pausers.push_back(pauser); + } + + void pause() override { + std::for_each(pausers.begin(), pausers.end(), [](Pauser* p){p->pause();}); + } + void resume(rgw::sal::Driver* driver) override { + std::for_each(pausers.begin(), pausers.end(), [driver](Pauser* p){p->resume(driver);}); + } + +}; + +namespace rgw { + +namespace lua { class Background; } + +class RGWLib; +class AppMain { + /* several components should be initalized only if librgw is + * also serving HTTP */ + bool have_http_frontend{false}; + bool nfs{false}; + + std::vector fes; + std::vector fe_configs; + std::multimap fe_map; + std::unique_ptr ldh; + OpsLogSink* olog; + RGWREST rest; + std::unique_ptr lua_background; + std::unique_ptr implicit_tenant_context; + std::unique_ptr sched_ctx; + std::unique_ptr ratelimiter; + std::map service_map_meta; + // wow, realm reloader has a lot of parts + std::unique_ptr reloader; + std::unique_ptr pusher; + std::unique_ptr fe_pauser; + std::unique_ptr realm_watcher; + std::unique_ptr rgw_pauser; + DoutPrefixProvider* dpp; + RGWProcessEnv env; + +public: + AppMain(DoutPrefixProvider* dpp) + : dpp(dpp) + {} + + void shutdown(std::function finalize_async_signals + = []() { /* nada */}); + + rgw::sal::Driver* get_driver() { + return env.driver; + } + + rgw::LDAPHelper* get_ldh() { + return ldh.get(); + } + + void init_frontends1(bool nfs = false); + void init_numa(); + void init_storage(); + void init_perfcounters(); + void 
init_http_clients(); + void cond_init_apis(); + void init_ldap(); + void init_opslog(); + int init_frontends2(RGWLib* rgwlib = nullptr); + void init_tracepoints(); + void init_notification_endpoints(); + void init_lua(); + + bool have_http() { + return have_http_frontend; + } + + static OpsLogFile* ops_log_file; +}; /* AppMain */ +} // namespace rgw + +static inline RGWRESTMgr *set_logging(RGWRESTMgr* mgr) +{ + mgr->set_logging(true); + return mgr; +} + +static inline RGWRESTMgr *rest_filter(rgw::sal::Driver* driver, int dialect, RGWRESTMgr* orig) +{ + RGWSyncModuleInstanceRef sync_module = driver->get_sync_module(); + if (sync_module) { + return sync_module->get_rest_filter(dialect, orig); + } else { + return orig; + } +} + diff --git a/src/rgw/rgw_mdlog.h b/src/rgw/rgw_mdlog.h new file mode 100644 index 000000000..179cc2aca --- /dev/null +++ b/src/rgw/rgw_mdlog.h @@ -0,0 +1,185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "common/RWLock.h" + +#include "rgw_metadata.h" +#include "rgw_mdlog_types.h" + +#include "services/svc_rados.h" + +#define META_LOG_OBJ_PREFIX "meta.log." 
+ +struct RGWMetadataLogInfo { + std::string marker; + real_time last_update; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +class RGWCompletionManager; + +class RGWMetadataLogInfoCompletion : public RefCountedObject { + public: + using info_callback_t = std::function; + private: + cls_log_header header; + RGWSI_RADOS::Obj io_obj; + librados::AioCompletion *completion; + std::mutex mutex; //< protects callback between cancel/complete + boost::optional callback; //< cleared on cancel + public: + explicit RGWMetadataLogInfoCompletion(info_callback_t callback); + ~RGWMetadataLogInfoCompletion() override; + + RGWSI_RADOS::Obj& get_io_obj() { return io_obj; } + cls_log_header& get_header() { return header; } + librados::AioCompletion* get_completion() { return completion; } + + void finish(librados::completion_t cb) { + std::lock_guard lock(mutex); + if (callback) { + (*callback)(completion->get_return_value(), header); + } + } + void cancel() { + std::lock_guard lock(mutex); + callback = boost::none; + } +}; + +class RGWMetadataLog { + CephContext *cct; + const std::string prefix; + + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_Cls *cls{nullptr}; + } svc; + + static std::string make_prefix(const std::string& period) { + if (period.empty()) + return META_LOG_OBJ_PREFIX; + return META_LOG_OBJ_PREFIX + period + "."; + } + + RWLock lock; + std::set modified_shards; + + void mark_modified(int shard_id); +public: + RGWMetadataLog(CephContext *_cct, + RGWSI_Zone *_zone_svc, + RGWSI_Cls *_cls_svc, + const std::string& period) + : cct(_cct), + prefix(make_prefix(period)), + lock("RGWMetaLog::lock") { + svc.zone = _zone_svc; + svc.cls = _cls_svc; + } + + + void get_shard_oid(int id, std::string& oid) const { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", id); + oid = prefix + buf; + } + + int add_entry(const DoutPrefixProvider *dpp, const std::string& hash_key, const std::string& section, const std::string& key, bufferlist& bl); + int 
get_shard_id(const std::string& hash_key, int *shard_id); + int store_entries_in_shard(const DoutPrefixProvider *dpp, std::list& entries, int shard_id, librados::AioCompletion *completion); + + struct LogListCtx { + int cur_shard; + std::string marker; + real_time from_time; + real_time end_time; + + std::string cur_oid; + + bool done; + + LogListCtx() : cur_shard(0), done(false) {} + }; + + void init_list_entries(int shard_id, const real_time& from_time, + const real_time& end_time, const std::string& marker, + void **handle); + void complete_list_entries(void *handle); + int list_entries(const DoutPrefixProvider *dpp, + void *handle, + int max_entries, + std::list& entries, + std::string *out_marker, + bool *truncated); + + int trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time, const std::string& start_marker, const std::string& end_marker); + int get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info); + int get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion); + int lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan duration, std::string&zone_id, std::string& owner_id); + int unlock(const DoutPrefixProvider *dpp, int shard_id, std::string& zone_id, std::string& owner_id); + + int update_shards(std::list& shards); + + void read_clear_modified(std::set &modified); +}; + +struct LogStatusDump { + RGWMDLogStatus status; + + explicit LogStatusDump(RGWMDLogStatus _status) : status(_status) {} + void dump(Formatter *f) const; +}; + +struct RGWMetadataLogData { + obj_version read_version; + obj_version write_version; + RGWMDLogStatus status; + + RGWMetadataLogData() : status(MDLOG_STATUS_UNKNOWN) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWMetadataLogData) + +struct RGWMetadataLogHistory { 
+ epoch_t oldest_realm_epoch; + std::string oldest_period_id; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(oldest_realm_epoch, bl); + encode(oldest_period_id, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(oldest_realm_epoch, p); + decode(oldest_period_id, p); + DECODE_FINISH(p); + } + + static const std::string oid; +}; +WRITE_CLASS_ENCODER(RGWMetadataLogHistory) + diff --git a/src/rgw/rgw_mdlog_types.h b/src/rgw/rgw_mdlog_types.h new file mode 100644 index 000000000..1862974d8 --- /dev/null +++ b/src/rgw/rgw_mdlog_types.h @@ -0,0 +1,35 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +enum RGWMDLogSyncType { + APPLY_ALWAYS, + APPLY_UPDATES, + APPLY_NEWER, + APPLY_EXCLUSIVE +}; + +enum RGWMDLogStatus { + MDLOG_STATUS_UNKNOWN, + MDLOG_STATUS_WRITE, + MDLOG_STATUS_SETATTRS, + MDLOG_STATUS_REMOVE, + MDLOG_STATUS_COMPLETE, + MDLOG_STATUS_ABORT, +}; + diff --git a/src/rgw/rgw_meta_sync_status.h b/src/rgw/rgw_meta_sync_status.h new file mode 100644 index 000000000..f8a2ae3ee --- /dev/null +++ b/src/rgw/rgw_meta_sync_status.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "common/ceph_time.h" + +struct rgw_meta_sync_info { + enum SyncState { + StateInit = 0, + StateBuildingFullSyncMaps = 1, + StateSync = 2, + }; + + uint16_t state; + uint32_t num_shards; + std::string period; //< period id of current metadata log + epoch_t realm_epoch = 0; //< realm epoch of period + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(num_shards, bl); + encode(period, bl); + encode(realm_epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(state, bl); + decode(num_shards, bl); + if (struct_v >= 2) { + decode(period, bl); + decode(realm_epoch, bl); + } + DECODE_FINISH(bl); + } + + void decode_json(JSONObj *obj); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); + + rgw_meta_sync_info() : state((int)StateInit), num_shards(0) {} +}; +WRITE_CLASS_ENCODER(rgw_meta_sync_info) + +struct rgw_meta_sync_marker { + enum SyncState { + FullSync = 0, + IncrementalSync = 1, + }; + uint16_t state; + std::string marker; + std::string next_step_marker; + uint64_t total_entries; + uint64_t pos; + real_time timestamp; + epoch_t realm_epoch{0}; //< realm_epoch of period marker + + rgw_meta_sync_marker() : state(FullSync), total_entries(0), pos(0) {} + + void encode(bufferlist& bl) const { + 
ENCODE_START(2, 1, bl); + encode(state, bl); + encode(marker, bl); + encode(next_step_marker, bl); + encode(total_entries, bl); + encode(pos, bl); + encode(timestamp, bl); + encode(realm_epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(state, bl); + decode(marker, bl); + decode(next_step_marker, bl); + decode(total_entries, bl); + decode(pos, bl); + decode(timestamp, bl); + if (struct_v >= 2) { + decode(realm_epoch, bl); + } + DECODE_FINISH(bl); + } + + void decode_json(JSONObj *obj); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); +}; +WRITE_CLASS_ENCODER(rgw_meta_sync_marker) + +struct rgw_meta_sync_status { + rgw_meta_sync_info sync_info; + std::map sync_markers; + + rgw_meta_sync_status() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(sync_info, bl); + encode(sync_markers, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(sync_info, bl); + decode(sync_markers, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& ls); +}; +WRITE_CLASS_ENCODER(rgw_meta_sync_status) diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc new file mode 100644 index 000000000..7fd25ae75 --- /dev/null +++ b/src/rgw/rgw_metadata.cc @@ -0,0 +1,683 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_metadata.h" + +#include "rgw_mdlog.h" + + +#include "services/svc_meta.h" +#include "services/svc_meta_be_sobj.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void LogStatusDump::dump(Formatter *f) const { + string s; + switch (status) { + case MDLOG_STATUS_WRITE: + s = "write"; + break; + case MDLOG_STATUS_SETATTRS: + s = "set_attrs"; + break; + case MDLOG_STATUS_REMOVE: + s = "remove"; + break; + 
case MDLOG_STATUS_COMPLETE: + s = "complete"; + break; + case MDLOG_STATUS_ABORT: + s = "abort"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); +} + +void encode_json(const char *name, const obj_version& v, Formatter *f) +{ + f->open_object_section(name); + f->dump_string("tag", v.tag); + f->dump_unsigned("ver", v.ver); + f->close_section(); +} + +void decode_json_obj(obj_version& v, JSONObj *obj) +{ + JSONDecoder::decode_json("tag", v.tag, obj); + JSONDecoder::decode_json("ver", v.ver, obj); +} + +void RGWMetadataLogData::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(read_version, bl); + encode(write_version, bl); + uint32_t s = (uint32_t)status; + encode(s, bl); + ENCODE_FINISH(bl); +} + +void RGWMetadataLogData::decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(read_version, bl); + decode(write_version, bl); + uint32_t s; + decode(s, bl); + status = (RGWMDLogStatus)s; + DECODE_FINISH(bl); +} + +void RGWMetadataLogData::dump(Formatter *f) const { + encode_json("read_version", read_version, f); + encode_json("write_version", write_version, f); + encode_json("status", LogStatusDump(status), f); +} + +void decode_json_obj(RGWMDLogStatus& status, JSONObj *obj) { + string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "complete") { + status = MDLOG_STATUS_COMPLETE; + } else if (s == "write") { + status = MDLOG_STATUS_WRITE; + } else if (s == "remove") { + status = MDLOG_STATUS_REMOVE; + } else if (s == "set_attrs") { + status = MDLOG_STATUS_SETATTRS; + } else if (s == "abort") { + status = MDLOG_STATUS_ABORT; + } else { + status = MDLOG_STATUS_UNKNOWN; + } +} + +void RGWMetadataLogData::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("read_version", read_version, obj); + JSONDecoder::decode_json("write_version", write_version, obj); + JSONDecoder::decode_json("status", status, obj); +} + +RGWMetadataHandler_GenericMetaBE::Put::Put(RGWMetadataHandler_GenericMetaBE 
*_handler, + RGWSI_MetaBackend_Handler::Op *_op, + string& _entry, RGWMetadataObject *_obj, + RGWObjVersionTracker& _objv_tracker, + optional_yield _y, + RGWMDLogSyncType _type, bool _from_remote_zone): + handler(_handler), op(_op), + entry(_entry), obj(_obj), + objv_tracker(_objv_tracker), + apply_type(_type), + y(_y), + from_remote_zone(_from_remote_zone) +{ +} + +int RGWMetadataHandler_GenericMetaBE::do_put_operate(Put *put_op, const DoutPrefixProvider *dpp) +{ + int r = put_op->put_pre(dpp); + if (r != 0) { /* r can also be STATUS_NO_APPLY */ + return r; + } + + r = put_op->put(dpp); + if (r != 0) { + return r; + } + + r = put_op->put_post(dpp); + if (r != 0) { /* e.g., -error or STATUS_APPLIED */ + return r; + } + + return 0; +} + +int RGWMetadataHandler_GenericMetaBE::get(string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return do_get(op, entry, obj, y, dpp); + }); +} + +int RGWMetadataHandler_GenericMetaBE::put(string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return do_put(op, entry, obj, objv_tracker, y, dpp, type, from_remote_zone); + }); +} + +int RGWMetadataHandler_GenericMetaBE::remove(string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return do_remove(op, entry, objv_tracker, y, dpp); + }); +} + +int RGWMetadataHandler_GenericMetaBE::mutate(const string& entry, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogStatus op_type, + std::function f) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + RGWSI_MetaBackend::MutateParams 
params(mtime, op_type); + return op->mutate(entry, + params, + objv_tracker, + y, + f, + dpp); + }); +} + +int RGWMetadataHandler_GenericMetaBE::get_shard_id(const string& entry, int *shard_id) +{ + return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) { + return op->get_shard_id(entry, shard_id); + }); +} + +int RGWMetadataHandler_GenericMetaBE::list_keys_init(const DoutPrefixProvider *dpp, const string& marker, void **phandle) +{ + auto op = std::make_unique(be_handler); + + int ret = op->list_init(dpp, marker); + if (ret < 0) { + return ret; + } + + *phandle = (void *)op.release(); + + return 0; +} + +int RGWMetadataHandler_GenericMetaBE::list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, list& keys, bool *truncated) +{ + auto op = static_cast(handle); + + int ret = op->list_next(dpp, max, &keys, truncated); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + if (ret == -ENOENT) { + if (truncated) { + *truncated = false; + } + return 0; + } + + return 0; +} + +void RGWMetadataHandler_GenericMetaBE::list_keys_complete(void *handle) +{ + auto op = static_cast(handle); + delete op; +} + +string RGWMetadataHandler_GenericMetaBE::get_marker(void *handle) +{ + auto op = static_cast(handle); + string marker; + int r = op->list_get_marker(&marker); + if (r < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): list_get_marker() returned: r=" << r << dendl; + /* not much else to do */ + } + + return marker; +} + +RGWMetadataHandlerPut_SObj::RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler, + RGWSI_MetaBackend_Handler::Op *op, + string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, + RGWMDLogSyncType type, bool from_remote_zone) : Put(handler, op, entry, obj, objv_tracker, y, type, from_remote_zone) { +} + +int RGWMetadataHandlerPut_SObj::put_pre(const DoutPrefixProvider *dpp) +{ + int ret = get(&old_obj, dpp); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + exists = (ret != 
-ENOENT); + + oo.reset(old_obj); + + auto old_ver = (!old_obj ? obj_version() : old_obj->get_version()); + auto old_mtime = (!old_obj ? ceph::real_time() : old_obj->get_mtime()); + + // are we actually going to perform this put, or is it too old? + if (!handler->check_versions(exists, old_ver, old_mtime, + objv_tracker.write_version, obj->get_mtime(), + apply_type)) { + return STATUS_NO_APPLY; + } + + objv_tracker.read_version = old_ver; /* maintain the obj version we just read */ + + return 0; +} + +int RGWMetadataHandlerPut_SObj::put(const DoutPrefixProvider *dpp) +{ + int ret = put_check(dpp); + if (ret != 0) { + return ret; + } + + return put_checked(dpp); +} + +int RGWMetadataHandlerPut_SObj::put_checked(const DoutPrefixProvider *dpp) +{ + RGWSI_MBSObj_PutParams params(obj->get_pattrs(), obj->get_mtime()); + + encode_obj(¶ms.bl); + + int ret = op->put(entry, params, &objv_tracker, y, dpp); + if (ret < 0) { + return ret; + } + + return 0; +} + +class RGWMetadataTopHandler : public RGWMetadataHandler { + struct iter_data { + set sections; + set::iterator iter; + }; + + struct Svc { + RGWSI_Meta *meta{nullptr}; + } svc; + + RGWMetadataManager *mgr; + +public: + RGWMetadataTopHandler(RGWSI_Meta *meta_svc, + RGWMetadataManager *_mgr) : mgr(_mgr) { + base_init(meta_svc->ctx()); + svc.meta = meta_svc; + } + + string get_type() override { return string(); } + + RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) { + return new RGWMetadataObject; + } + + int get(string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override { + return -ENOTSUP; + } + + int put(string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override { + return -ENOTSUP; + } + + int remove(string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override { + 
return -ENOTSUP; + } + + int mutate(const string& entry, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogStatus op_type, + std::function f) { + return -ENOTSUP; + } + + int list_keys_init(const DoutPrefixProvider *dpp, const string& marker, void **phandle) override { + iter_data *data = new iter_data; + list sections; + mgr->get_sections(sections); + for (auto& s : sections) { + data->sections.insert(s); + } + data->iter = data->sections.lower_bound(marker); + + *phandle = data; + + return 0; + } + int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, list& keys, bool *truncated) override { // appends up to max section names to keys, advancing the stored iterator + iter_data *data = static_cast(handle); + for (int i = 0; i < max && data->iter != data->sections.end(); ++i, ++(data->iter)) { + keys.push_back(*data->iter); + } + + if (truncated) { *truncated = (data->iter != data->sections.end()); } // truncated may be null, as RGWMetadataHandler_GenericMetaBE::list_keys_next allows + + return 0; + } + void list_keys_complete(void *handle) override { + iter_data *data = static_cast(handle); + + delete data; + } + + virtual string get_marker(void *handle) override { + iter_data *data = static_cast(handle); + + if (data->iter != data->sections.end()) { + return *(data->iter); + } + + return string(); + } +}; + +RGWMetadataHandlerPut_SObj::~RGWMetadataHandlerPut_SObj() {} + +int RGWMetadataHandler::attach(RGWMetadataManager *manager) +{ + return manager->register_handler(this); +} + +RGWMetadataHandler::~RGWMetadataHandler() {} + +obj_version& RGWMetadataObject::get_version() +{ + return objv; +} + +RGWMetadataManager::RGWMetadataManager(RGWSI_Meta *_meta_svc) + : cct(_meta_svc->ctx()), meta_svc(_meta_svc) +{ + md_top_handler.reset(new RGWMetadataTopHandler(meta_svc, this)); +} + +RGWMetadataManager::~RGWMetadataManager() +{ +} + +int RGWMetadataManager::register_handler(RGWMetadataHandler *handler) +{ + string type = handler->get_type(); + + if (handlers.find(type) != handlers.end()) + return -EEXIST; + + handlers[type] = handler; + + return 
0; +} + +RGWMetadataHandler *RGWMetadataManager::get_handler(const string& type) +{ + map::iterator iter = handlers.find(type); + if (iter == handlers.end()) + return NULL; + + return iter->second; +} + +void RGWMetadataManager::parse_metadata_key(const string& metadata_key, string& type, string& entry) +{ + auto pos = metadata_key.find(':'); + if (pos == string::npos) { + type = metadata_key; + } else { + type = metadata_key.substr(0, pos); + entry = metadata_key.substr(pos + 1); + } +} + +int RGWMetadataManager::find_handler(const string& metadata_key, RGWMetadataHandler **handler, string& entry) +{ + string type; + + parse_metadata_key(metadata_key, type, entry); + + if (type.empty()) { + *handler = md_top_handler.get(); + return 0; + } + + map::iterator iter = handlers.find(type); + if (iter == handlers.end()) + return -ENOENT; + + *handler = iter->second; + + return 0; + +} + +int RGWMetadataManager::get(string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWMetadataHandler *handler; + string entry; + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + RGWMetadataObject *obj; + + ret = handler->get(entry, &obj, y, dpp); + if (ret < 0) { + return ret; + } + + f->open_object_section("metadata_info"); + encode_json("key", metadata_key, f); + encode_json("ver", obj->get_version(), f); + real_time mtime = obj->get_mtime(); + if (!real_clock::is_zero(mtime)) { + utime_t ut(mtime); + encode_json("mtime", ut, f); + } + encode_json("data", *obj, f); + f->close_section(); + + delete obj; + + return 0; +} + +int RGWMetadataManager::put(string& metadata_key, bufferlist& bl, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType sync_type, + bool from_remote_zone, + obj_version *existing_version) +{ + RGWMetadataHandler *handler; + string entry; + + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + JSONParser parser; + if 
(!parser.parse(bl.c_str(), bl.length())) { + return -EINVAL; + } + + RGWObjVersionTracker objv_tracker; + + obj_version *objv = &objv_tracker.write_version; + + utime_t mtime; + + try { + JSONDecoder::decode_json("key", metadata_key, &parser); + JSONDecoder::decode_json("ver", *objv, &parser); + JSONDecoder::decode_json("mtime", mtime, &parser); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + JSONObj *jo = parser.find_obj("data"); + if (!jo) { + return -EINVAL; + } + RGWMetadataObject *obj = handler->get_meta_obj(jo, *objv, mtime.to_real_time()); + if (!obj) { + return -EINVAL; + } + + ret = handler->put(entry, obj, objv_tracker, y, dpp, sync_type, from_remote_zone); + if (existing_version) { + *existing_version = objv_tracker.read_version; + } + + delete obj; + + return ret; +} + +int RGWMetadataManager::remove(string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWMetadataHandler *handler; + string entry; + + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + RGWMetadataObject *obj; + ret = handler->get(entry, &obj, y, dpp); + if (ret < 0) { + return ret; + } + RGWObjVersionTracker objv_tracker; + objv_tracker.read_version = obj->get_version(); + delete obj; + + return handler->remove(entry, objv_tracker, y, dpp); +} + +int RGWMetadataManager::mutate(const string& metadata_key, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogStatus op_type, + std::function f) +{ + RGWMetadataHandler *handler; + string entry; + + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + return handler->mutate(entry, mtime, objv_tracker, y, dpp, op_type, f); +} + +int RGWMetadataManager::get_shard_id(const string& section, const string& entry, int *shard_id) +{ + RGWMetadataHandler *handler = get_handler(section); + if (!handler) { + return -EINVAL; + } + + return 
handler->get_shard_id(entry, shard_id); +} + +struct list_keys_handle { + void *handle; + RGWMetadataHandler *handler; +}; + +int RGWMetadataManager::list_keys_init(const DoutPrefixProvider *dpp, const string& section, void **handle) +{ + return list_keys_init(dpp, section, string(), handle); +} + +int RGWMetadataManager::list_keys_init(const DoutPrefixProvider *dpp, const string& section, + const string& marker, void **handle) +{ + string entry; + RGWMetadataHandler *handler; + + int ret; + + ret = find_handler(section, &handler, entry); + if (ret < 0) { + return -ENOENT; + } + + list_keys_handle *h = new list_keys_handle; + h->handler = handler; + ret = handler->list_keys_init(dpp, marker, &h->handle); + if (ret < 0) { + delete h; + return ret; + } + + *handle = (void *)h; + + return 0; +} + +int RGWMetadataManager::list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, list& keys, bool *truncated) +{ + list_keys_handle *h = static_cast(handle); + + RGWMetadataHandler *handler = h->handler; + + return handler->list_keys_next(dpp, h->handle, max, keys, truncated); +} + +void RGWMetadataManager::list_keys_complete(void *handle) +{ + list_keys_handle *h = static_cast(handle); + + RGWMetadataHandler *handler = h->handler; + + handler->list_keys_complete(h->handle); + delete h; +} + +string RGWMetadataManager::get_marker(void *handle) +{ + list_keys_handle *h = static_cast(handle); + + return h->handler->get_marker(h->handle); +} + +void RGWMetadataManager::dump_log_entry(cls_log_entry& entry, Formatter *f) +{ + f->open_object_section("entry"); + f->dump_string("id", entry.id); + f->dump_string("section", entry.section); + f->dump_string("name", entry.name); + entry.timestamp.gmtime_nsec(f->dump_stream("timestamp")); + + try { + RGWMetadataLogData log_data; + auto iter = entry.data.cbegin(); + decode(log_data, iter); + + encode_json("data", log_data, f); + } catch (buffer::error& err) { + lderr(cct) << "failed to decode log entry: " << entry.section << 
":" << entry.name<< " ts=" << entry.timestamp << dendl; + } + f->close_section(); +} + +void RGWMetadataManager::get_sections(list& sections) +{ + for (map::iterator iter = handlers.begin(); iter != handlers.end(); ++iter) { + sections.push_back(iter->first); + } +} + diff --git a/src/rgw/rgw_multi.cc b/src/rgw/rgw_multi.cc new file mode 100644 index 000000000..6e090d6b5 --- /dev/null +++ b/src/rgw/rgw_multi.cc @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_xml.h" +#include "rgw_multi.h" +#include "rgw_op.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_tier_rados.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +bool RGWMultiPart::xml_end(const char *el) +{ + RGWMultiPartNumber *num_obj = static_cast(find_first("PartNumber")); + RGWMultiETag *etag_obj = static_cast(find_first("ETag")); + + if (!num_obj || !etag_obj) + return false; + + string s = num_obj->get_data(); + if (s.empty()) + return false; + + num = atoi(s.c_str()); + + s = etag_obj->get_data(); + etag = s; + + return true; +} + +bool RGWMultiCompleteUpload::xml_end(const char *el) { + XMLObjIter iter = find("Part"); + RGWMultiPart *part = static_cast(iter.get_next()); + while (part) { + int num = part->get_num(); + string etag = part->get_etag(); + parts[num] = etag; + part = static_cast(iter.get_next()); + } + return true; +} + +RGWMultiXMLParser::~RGWMultiXMLParser() {} + +XMLObj *RGWMultiXMLParser::alloc_obj(const char *el) { + XMLObj *obj = NULL; + // CompletedMultipartUpload is incorrect but some versions of some libraries use it, see PR #41700 + if (strcmp(el, "CompleteMultipartUpload") == 0 || + strcmp(el, "CompletedMultipartUpload") == 0 || + strcmp(el, "MultipartUpload") == 0) { + obj = new RGWMultiCompleteUpload(); + } else if (strcmp(el, "Part") == 0) { 
+ obj = new RGWMultiPart(); + } else if (strcmp(el, "PartNumber") == 0) { + obj = new RGWMultiPartNumber(); + } else if (strcmp(el, "ETag") == 0) { + obj = new RGWMultiETag(); + } + + return obj; +} + +bool is_v2_upload_id(const string& upload_id) +{ + const char *uid = upload_id.c_str(); + + return (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX, sizeof(MULTIPART_UPLOAD_ID_PREFIX) - 1) == 0) || + (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX_LEGACY, sizeof(MULTIPART_UPLOAD_ID_PREFIX_LEGACY) - 1) == 0); +} + +void RGWUploadPartInfo::generate_test_instances(list& o) +{ + RGWUploadPartInfo *i = new RGWUploadPartInfo; + i->num = 1; + i->size = 10 * 1024 * 1024; + i->etag = "etag"; + o.push_back(i); + o.push_back(new RGWUploadPartInfo); +} + +void RGWUploadPartInfo::dump(Formatter *f) const +{ + encode_json("num", num, f); + encode_json("size", size, f); + encode_json("etag", etag, f); + utime_t ut(modified); + encode_json("modified", ut, f); + encode_json("past_prefixes", past_prefixes, f); +} + diff --git a/src/rgw/rgw_multi.h b/src/rgw/rgw_multi.h new file mode 100644 index 000000000..f57c90e74 --- /dev/null +++ b/src/rgw/rgw_multi.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_xml.h" +#include "rgw_obj_types.h" +#include "rgw_obj_manifest.h" +#include "rgw_compression_types.h" +#include "common/dout.h" +#include "rgw_sal_fwd.h" + +#define MULTIPART_UPLOAD_ID_PREFIX_LEGACY "2/" +#define MULTIPART_UPLOAD_ID_PREFIX "2~" // must contain a unique char that may not come up in gen_rand_alpha() + +class RGWMultiCompleteUpload : public XMLObj +{ +public: + RGWMultiCompleteUpload() {} + ~RGWMultiCompleteUpload() override {} + bool xml_end(const char *el) override; + + std::map parts; +}; + +class RGWMultiPart : public XMLObj +{ + std::string etag; + int num; +public: + RGWMultiPart() : num(0) {} + ~RGWMultiPart() override {} + bool xml_end(const char *el) override; + 
+ std::string& get_etag() { return etag; } + int get_num() { return num; } +}; + +class RGWMultiPartNumber : public XMLObj +{ +public: + RGWMultiPartNumber() {} + ~RGWMultiPartNumber() override {} +}; + +class RGWMultiETag : public XMLObj +{ +public: + RGWMultiETag() {} + ~RGWMultiETag() override {} +}; + +class RGWMultiXMLParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override; +public: + RGWMultiXMLParser() {} + virtual ~RGWMultiXMLParser() override; +}; + +extern bool is_v2_upload_id(const std::string& upload_id); diff --git a/src/rgw/rgw_multi_del.cc b/src/rgw/rgw_multi_del.cc new file mode 100644 index 000000000..443ffd60a --- /dev/null +++ b/src/rgw/rgw_multi_del.cc @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include + +#include "include/types.h" + +#include "rgw_xml.h" +#include "rgw_multi_del.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +bool RGWMultiDelObject::xml_end(const char *el) +{ + RGWMultiDelKey *key_obj = static_cast(find_first("Key")); + RGWMultiDelVersionId *vid = static_cast(find_first("VersionId")); + + if (!key_obj) + return false; + + string s = key_obj->get_data(); + if (s.empty()) + return false; + + key = s; + + if (vid) { + version_id = vid->get_data(); + } + + return true; +} + +bool RGWMultiDelDelete::xml_end(const char *el) { + RGWMultiDelQuiet *quiet_set = static_cast(find_first("Quiet")); + if (quiet_set) { + string quiet_val = quiet_set->get_data(); + quiet = (strcasecmp(quiet_val.c_str(), "true") == 0); + } + + XMLObjIter iter = find("Object"); + RGWMultiDelObject *object = static_cast(iter.get_next()); + while (object) { + const string& key = object->get_key(); + const string& instance = object->get_version_id(); + rgw_obj_key k(key, instance); + objects.push_back(k); + object = static_cast(iter.get_next()); + } + return true; +} + +XMLObj *RGWMultiDelXMLParser::alloc_obj(const char *el) { + 
XMLObj *obj = NULL; + if (strcmp(el, "Delete") == 0) { + obj = new RGWMultiDelDelete(); + } else if (strcmp(el, "Quiet") == 0) { + obj = new RGWMultiDelQuiet(); + } else if (strcmp(el, "Object") == 0) { + obj = new RGWMultiDelObject (); + } else if (strcmp(el, "Key") == 0) { + obj = new RGWMultiDelKey(); + } else if (strcmp(el, "VersionId") == 0) { + obj = new RGWMultiDelVersionId(); + } + + return obj; +} + diff --git a/src/rgw/rgw_multi_del.h b/src/rgw/rgw_multi_del.h new file mode 100644 index 000000000..b060decf4 --- /dev/null +++ b/src/rgw/rgw_multi_del.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_xml.h" +#include "rgw_common.h" + +class RGWMultiDelDelete : public XMLObj +{ +public: + RGWMultiDelDelete() :quiet(false) {} + ~RGWMultiDelDelete() override {} + bool xml_end(const char *el) override; + + std::vector objects; + bool quiet; + bool is_quiet() { return quiet; } +}; + +class RGWMultiDelQuiet : public XMLObj +{ +public: + RGWMultiDelQuiet() {} + ~RGWMultiDelQuiet() override {} +}; + +class RGWMultiDelObject : public XMLObj +{ + std::string key; + std::string version_id; +public: + RGWMultiDelObject() {} + ~RGWMultiDelObject() override {} + bool xml_end(const char *el) override; + + const std::string& get_key() { return key; } + const std::string& get_version_id() { return version_id; } +}; + +class RGWMultiDelKey : public XMLObj +{ +public: + RGWMultiDelKey() {} + ~RGWMultiDelKey() override {} +}; + +class RGWMultiDelVersionId : public XMLObj +{ +public: + RGWMultiDelVersionId() {} + ~RGWMultiDelVersionId() override {} +}; + +class RGWMultiDelXMLParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override; +public: + RGWMultiDelXMLParser() {} + ~RGWMultiDelXMLParser() override {} +}; diff --git a/src/rgw/rgw_multiparser.cc b/src/rgw/rgw_multiparser.cc new file mode 100644 index 000000000..a8778abd9 --- /dev/null +++ 
b/src/rgw/rgw_multiparser.cc @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_multi.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int main(int argc, char **argv) { + RGWMultiXMLParser parser; + + if (!parser.init()) + exit(1); + + char buf[1024]; + + for (;;) { + int done; + int len; + + len = fread(buf, 1, sizeof(buf), stdin); + if (ferror(stdin)) { + fprintf(stderr, "Read error\n"); + exit(-1); + } + done = feof(stdin); + + bool result = parser.parse(buf, len, done); + if (!result) { + cerr << "failed to parse!" << std::endl; + } + + if (done) + break; + } + + exit(0); +} + diff --git a/src/rgw/rgw_multipart_meta_filter.cc b/src/rgw/rgw_multipart_meta_filter.cc new file mode 100644 index 000000000..c616cd480 --- /dev/null +++ b/src/rgw/rgw_multipart_meta_filter.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_tier_rados.h" + +using namespace std; + +const std::string MP_META_SUFFIX = ".meta"; + +bool MultipartMetaFilter::filter(const string& name, string& key) { + // the length of the suffix so we can skip past it + static const size_t MP_META_SUFFIX_LEN = MP_META_SUFFIX.length(); + + size_t len = name.size(); + + // make sure there's room for suffix plus at least one more + // character + if (len <= MP_META_SUFFIX_LEN) + return false; + + size_t pos = name.find(MP_META_SUFFIX, len - MP_META_SUFFIX_LEN); + if (pos == string::npos) + return false; + + pos = name.rfind('.', pos - 1); + if (pos == string::npos) + return false; + + key = name.substr(0, pos); + + return true; +} diff --git a/src/rgw/rgw_notify_event_type.cc b/src/rgw/rgw_notify_event_type.cc new file mode 100644 index 000000000..7a0ef9568 --- /dev/null +++ b/src/rgw/rgw_notify_event_type.cc @@ -0,0 +1,119 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_notify_event_type.h" +#include "include/str_list.h" + +namespace rgw::notify { + + std::string to_string(EventType t) { // "s3:..." name of an event type; every string here must be one from_string() accepts + switch (t) { + case ObjectCreated: + return "s3:ObjectCreated:*"; + case ObjectCreatedPut: + return "s3:ObjectCreated:Put"; + case ObjectCreatedPost: + return "s3:ObjectCreated:Post"; + case ObjectCreatedCopy: + return "s3:ObjectCreated:Copy"; + case ObjectCreatedCompleteMultipartUpload: + return "s3:ObjectCreated:CompleteMultipartUpload"; + case ObjectRemoved: + return "s3:ObjectRemoved:*"; + case ObjectRemovedDelete: + return "s3:ObjectRemoved:Delete"; + case ObjectRemovedDeleteMarkerCreated: + return "s3:ObjectRemoved:DeleteMarkerCreated"; + case ObjectLifecycle: + return "s3:ObjectLifecycle:*"; + case ObjectExpiration: + return "s3:ObjectLifecycle:Expiration:*"; + case ObjectExpirationCurrent: + return "s3:ObjectLifecycle:Expiration:Current"; + case ObjectExpirationNoncurrent: + return "s3:ObjectLifecycle:Expiration:Noncurrent"; + case ObjectExpirationDeleteMarker: + return "s3:ObjectLifecycle:Expiration:DeleteMarker"; + case ObjectExpirationAbortMPU: + return "s3:ObjectLifecycle:Expiration:AbortMultipartUpload"; // the spelling from_string() parses; the former "...:AbortMPU" mapped back to UnknownEvent + case ObjectTransition: + return "s3:ObjectLifecycle:Transition:*"; + case ObjectTransitionCurrent: + return "s3:ObjectLifecycle:Transition:Current"; + case ObjectTransitionNoncurrent: + return "s3:ObjectLifecycle:Transition:Noncurrent"; + case ObjectSynced: + return "s3:ObjectSynced:*"; + case ObjectSyncedCreate: + return "s3:ObjectSynced:Create"; + case ObjectSyncedDelete: + return "s3:ObjectSynced:Delete"; + case ObjectSyncedDeletionMarkerCreated: + return "s3:ObjectSynced:DeletionMarkerCreated"; + case UnknownEvent: + return "s3:UnknownEvent"; + } + return "s3:UnknownEvent"; // values with no case above + } + + std::string to_event_string(EventType t) { // to_string() without its first three characters, i.e. the "s3:" prefix + return to_string(t).substr(3); + } + + EventType from_string(const std::string& s) { + if (s == "s3:ObjectCreated:*") 
+ return ObjectCreated; + if (s == "s3:ObjectCreated:Put") + return ObjectCreatedPut; + if (s == "s3:ObjectCreated:Post") + return ObjectCreatedPost; + if (s == "s3:ObjectCreated:Copy") + return ObjectCreatedCopy; + if (s == "s3:ObjectCreated:CompleteMultipartUpload") + return ObjectCreatedCompleteMultipartUpload; + if (s == "s3:ObjectRemoved:*") + return ObjectRemoved; + if (s == "s3:ObjectRemoved:Delete") + return ObjectRemovedDelete; + if (s == "s3:ObjectRemoved:DeleteMarkerCreated") + return ObjectRemovedDeleteMarkerCreated; + if (s == "s3:ObjectLifecycle:*") + return ObjectLifecycle; + if (s == "s3:ObjectLifecycle:Expiration:*") + return ObjectExpiration; + if (s == "s3:ObjectLifecycle:Expiration:Current") + return ObjectExpirationCurrent; + if (s == "s3:ObjectLifecycle:Expiration:Noncurrent") + return ObjectExpirationNoncurrent; + if (s == "s3:ObjectLifecycle:Expiration:DeleteMarker") + return ObjectExpirationDeleteMarker; + if (s == "s3:ObjectLifecycle:Expiration:AbortMultipartUpload") + return ObjectExpirationAbortMPU; + if (s == "s3:ObjectLifecycle:Transition:*") + return ObjectTransition; + if (s == "s3:ObjectLifecycle:Transition:Current") + return ObjectTransitionCurrent; + if (s == "s3:ObjectLifecycle:Transition:Noncurrent") + return ObjectTransitionNoncurrent; + if (s == "s3:ObjectSynced:*") + return ObjectSynced; + if (s == "s3:ObjectSynced:Create") + return ObjectSyncedCreate; + if (s == "s3:ObjectSynced:Delete") + return ObjectSyncedDelete; + if (s == "s3:ObjectSynced:DeletionMarkerCreated") + return ObjectSyncedDeletionMarkerCreated; + return UnknownEvent; + } + +bool operator==(EventType lhs, EventType rhs) { + return lhs & rhs; +} + +void from_string_list(const std::string& string_list, EventTypeList& event_list) { + event_list.clear(); + ceph::for_each_substr(string_list, ",", [&event_list] (auto token) { + event_list.push_back(rgw::notify::from_string(std::string(token.begin(), token.end()))); + }); +} +} diff --git 
a/src/rgw/rgw_notify_event_type.h b/src/rgw/rgw_notify_event_type.h new file mode 100644 index 000000000..4fe1b5c90 --- /dev/null +++ b/src/rgw/rgw_notify_event_type.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +#include +#include + +namespace rgw::notify { + enum EventType { + ObjectCreated = 0xF, + ObjectCreatedPut = 0x1, + ObjectCreatedPost = 0x2, + ObjectCreatedCopy = 0x4, + ObjectCreatedCompleteMultipartUpload = 0x8, + ObjectRemoved = 0xF0, + ObjectRemovedDelete = 0x10, + ObjectRemovedDeleteMarkerCreated = 0x20, + // lifecycle events (RGW extension) + ObjectLifecycle = 0xFF00, + ObjectExpiration = 0xF00, + ObjectExpirationCurrent = 0x100, + ObjectExpirationNoncurrent = 0x200, + ObjectExpirationDeleteMarker = 0x400, + ObjectExpirationAbortMPU = 0x800, + ObjectTransition = 0xF000, + ObjectTransitionCurrent = 0x1000, + ObjectTransitionNoncurrent = 0x2000, + ObjectSynced = 0xF0000, + ObjectSyncedCreate = 0x10000, + ObjectSyncedDelete = 0x20000, + ObjectSyncedDeletionMarkerCreated = 0x40000, + UnknownEvent = 0x100000 + }; + + using EventTypeList = std::vector; + + // two event types are considered equal if their bits intersect + bool operator==(EventType lhs, EventType rhs); + + std::string to_string(EventType t); + + std::string to_event_string(EventType t); + + EventType from_string(const std::string& s); + + // create a vector of event types from comma separated list of event types + void from_string_list(const std::string& string_list, EventTypeList& event_list); +} + diff --git a/src/rgw/rgw_obj_manifest.cc b/src/rgw/rgw_obj_manifest.cc new file mode 100644 index 000000000..1d1c3b5cf --- /dev/null +++ b/src/rgw/rgw_obj_manifest.cc @@ -0,0 +1,260 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_obj_manifest.h" + +#include "rgw_rados.h" // RGW_OBJ_NS_SHADOW and RGW_OBJ_NS_MULTIPART + +using 
namespace std; + +void RGWObjManifest::obj_iterator::operator++() +{ + if (manifest->explicit_objs) { + ++explicit_iter; + + if (explicit_iter == manifest->objs.end()) { + ofs = manifest->obj_size; + stripe_size = 0; + return; + } + + update_explicit_pos(); + + update_location(); + return; + } + + uint64_t obj_size = manifest->get_obj_size(); + uint64_t head_size = manifest->get_head_size(); + + if (ofs == obj_size) { + return; + } + + if (manifest->rules.empty()) { + return; + } + + /* are we still pointing at the head? */ + if (ofs < head_size) { + rule_iter = manifest->rules.begin(); + const RGWObjManifestRule *rule = &rule_iter->second; + ofs = std::min(head_size, obj_size); + stripe_ofs = ofs; + cur_stripe = 1; + stripe_size = std::min(obj_size - ofs, rule->stripe_max_size); + if (rule->part_size > 0) { + stripe_size = std::min(stripe_size, rule->part_size); + } + update_location(); + return; + } + + const RGWObjManifestRule *rule = &rule_iter->second; + + stripe_ofs += rule->stripe_max_size; + cur_stripe++; + ldpp_dout(dpp, 20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl; + + if (rule->part_size > 0) { + /* multi part, multi stripes object */ + + ldpp_dout(dpp, 20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl; + + if (stripe_ofs >= part_ofs + rule->part_size) { + /* moved to the next part */ + cur_stripe = 0; + part_ofs += rule->part_size; + stripe_ofs = part_ofs; + + bool last_rule = (next_rule_iter == manifest->rules.end()); + /* move to the next rule? 
*/ + if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) { + rule_iter = next_rule_iter; + last_rule = (next_rule_iter == manifest->rules.end()); + if (!last_rule) { + ++next_rule_iter; + } + cur_part_id = rule_iter->second.start_part_num; + } else { + cur_part_id++; + } + + rule = &rule_iter->second; + } + + stripe_size = std::min(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size); + } + + cur_override_prefix = rule->override_prefix; + + ofs = stripe_ofs; + if (ofs > obj_size) { + ofs = obj_size; + stripe_ofs = ofs; + stripe_size = 0; + } + + ldpp_dout(dpp, 20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl; + update_location(); +} + +void RGWObjManifest::obj_iterator::seek(uint64_t o) +{ + ofs = o; + if (manifest->explicit_objs) { + explicit_iter = manifest->objs.upper_bound(ofs); + if (explicit_iter != manifest->objs.begin()) { + --explicit_iter; + } + if (ofs < manifest->obj_size) { + update_explicit_pos(); + } else { + ofs = manifest->obj_size; + } + update_location(); + return; + } + if (o < manifest->get_head_size()) { + rule_iter = manifest->rules.begin(); + stripe_ofs = 0; + stripe_size = manifest->get_head_size(); + if (rule_iter != manifest->rules.end()) { + cur_part_id = rule_iter->second.start_part_num; + cur_override_prefix = rule_iter->second.override_prefix; + } + update_location(); + return; + } + + rule_iter = manifest->rules.upper_bound(ofs); + next_rule_iter = rule_iter; + if (rule_iter != manifest->rules.begin()) { + --rule_iter; + } + + if (rule_iter == manifest->rules.end()) { + update_location(); + return; + } + + const RGWObjManifestRule& rule = rule_iter->second; + + if (rule.part_size > 0) { + cur_part_id = rule.start_part_num + (ofs - rule.start_ofs) / rule.part_size; + } else { + cur_part_id = rule.start_part_num; + } + part_ofs = rule.start_ofs + (cur_part_id - rule.start_part_num) * 
rule.part_size; + + if (rule.stripe_max_size > 0) { + cur_stripe = (ofs - part_ofs) / rule.stripe_max_size; + + stripe_ofs = part_ofs + cur_stripe * rule.stripe_max_size; + if (!cur_part_id && manifest->get_head_size() > 0) { + cur_stripe++; + } + } else { + cur_stripe = 0; + stripe_ofs = part_ofs; + } + + if (!rule.part_size) { + stripe_size = rule.stripe_max_size; + stripe_size = std::min(manifest->get_obj_size() - stripe_ofs, stripe_size); + } else { + uint64_t next = std::min(stripe_ofs + rule.stripe_max_size, part_ofs + rule.part_size); + stripe_size = next - stripe_ofs; + } + + cur_override_prefix = rule.override_prefix; + + update_location(); +} + +void RGWObjManifest::obj_iterator::update_explicit_pos() +{ + ofs = explicit_iter->first; + stripe_ofs = ofs; + + auto next_iter = explicit_iter; + ++next_iter; + if (next_iter != manifest->objs.end()) { + stripe_size = next_iter->first - ofs; + } else { + stripe_size = manifest->obj_size - ofs; + } +} + +void RGWObjManifest::obj_iterator::update_location() +{ + if (manifest->explicit_objs) { + if (manifest->empty()) { + location = rgw_obj_select{}; + } else { + location = explicit_iter->second.loc; + } + return; + } + + if (ofs < manifest->get_head_size()) { + location = manifest->get_obj(); + location.set_placement_rule(manifest->get_head_placement_rule()); + return; + } + + manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, &cur_override_prefix, &location); +} + +void RGWObjManifest::get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, + uint64_t ofs, string *override_prefix, rgw_obj_select *location) const +{ + rgw_obj loc; + + string& oid = loc.key.name; + string& ns = loc.key.ns; + + if (!override_prefix || override_prefix->empty()) { + oid = prefix; + } else { + oid = *override_prefix; + } + + if (!cur_part_id) { + if (ofs < max_head_size) { + location->set_placement_rule(head_placement_rule); + *location = obj; + return; + } else { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", 
(int)cur_stripe); + oid += buf; + ns = RGW_OBJ_NS_SHADOW; + } + } else { + char buf[32]; + if (cur_stripe == 0) { + snprintf(buf, sizeof(buf), ".%d", (int)cur_part_id); + oid += buf; + ns= RGW_OBJ_NS_MULTIPART; + } else { + snprintf(buf, sizeof(buf), ".%d_%d", (int)cur_part_id, (int)cur_stripe); + oid += buf; + ns = RGW_OBJ_NS_SHADOW; + } + } + + if (!tail_placement.bucket.name.empty()) { + loc.bucket = tail_placement.bucket; + } else { + loc.bucket = obj.bucket; + } + + // Always overwrite instance with tail_instance + // to get the right shadow object location + loc.key.set_instance(tail_instance); + + location->set_placement_rule(tail_placement.placement_rule); + *location = loc; +} + diff --git a/src/rgw/rgw_obj_types.h b/src/rgw/rgw_obj_types.h new file mode 100644 index 000000000..1347a8ad0 --- /dev/null +++ b/src/rgw/rgw_obj_types.h @@ -0,0 +1,622 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. 
Do not + * include files which can only be compiled in radosgw or OSD + * contexts (e.g., rgw_sal.h, rgw_common.h) */ + +#pragma once + +#include + +#include "rgw_pool_types.h" +#include "rgw_bucket_types.h" +#include "rgw_user_types.h" + +#include "common/dout.h" +#include "common/Formatter.h" + +struct rgw_obj_index_key { // cls_rgw_obj_key now aliases this type + std::string name; + std::string instance; + + rgw_obj_index_key() {} + rgw_obj_index_key(const std::string &_name) : name(_name) {} + rgw_obj_index_key(const std::string& n, const std::string& i) : name(n), instance(i) {} + + std::string to_string() const { + return fmt::format("{}({})", name, instance); + } + + bool empty() const { + return name.empty(); + } + + void set(const std::string& _name) { + name = _name; + instance.clear(); + } + + bool operator==(const rgw_obj_index_key& k) const { + return (name.compare(k.name) == 0) && + (instance.compare(k.instance) == 0); + } + + bool operator!=(const rgw_obj_index_key& k) const { + return (name.compare(k.name) != 0) || + (instance.compare(k.instance) != 0); + } + + bool operator<(const rgw_obj_index_key& k) const { + int r = name.compare(k.name); + if (r == 0) { + r = instance.compare(k.instance); + } + return (r < 0); + } + + bool operator<=(const rgw_obj_index_key& k) const { + return !(k < *this); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(name, bl); + encode(instance, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(1, bl); + decode(name, bl); + decode(instance, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_string("name", name); + f->dump_string("instance", instance); + } + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& ls) { + ls.push_back(new rgw_obj_index_key); + ls.push_back(new rgw_obj_index_key); + ls.back()->name = "name"; + ls.back()->instance = "instance"; + } + + size_t 
estimate_encoded_size() const { + constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32); // version and length prefix + constexpr size_t string_overhead = sizeof(__u32); // strings are encoded with 32-bit length prefix + return start_overhead + + string_overhead + name.size() + + string_overhead + instance.size(); + } +}; +WRITE_CLASS_ENCODER(rgw_obj_index_key) + +struct rgw_obj_key { + std::string name; + std::string instance; + std::string ns; + + rgw_obj_key() {} + + // cppcheck-suppress noExplicitConstructor + rgw_obj_key(const std::string& n) : name(n) {} + rgw_obj_key(const std::string& n, const std::string& i) : name(n), instance(i) {} + rgw_obj_key(const std::string& n, const std::string& i, const std::string& _ns) : name(n), instance(i), ns(_ns) {} + + rgw_obj_key(const rgw_obj_index_key& k) { + parse_index_key(k.name, &name, &ns); + instance = k.instance; + } + + static void parse_index_key(const std::string& key, std::string *name, std::string *ns) { + if (key[0] != '_') { + *name = key; + ns->clear(); + return; + } + if (key[1] == '_') { + *name = key.substr(1); + ns->clear(); + return; + } + ssize_t pos = key.find('_', 1); + if (pos < 0) { + /* shouldn't happen, just use key */ + *name = key; + ns->clear(); + return; + } + + *name = key.substr(pos + 1); + *ns = key.substr(1, pos -1); + } + + void set(const std::string& n) { + name = n; + instance.clear(); + ns.clear(); + } + + void set(const std::string& n, const std::string& i) { + name = n; + instance = i; + ns.clear(); + } + + void set(const std::string& n, const std::string& i, const std::string& _ns) { + name = n; + instance = i; + ns = _ns; + } + + bool set(const rgw_obj_index_key& index_key) { + if (!parse_raw_oid(index_key.name, this)) { + return false; + } + instance = index_key.instance; + return true; + } + + void set_instance(const std::string& i) { + instance = i; + } + + const std::string& get_instance() const { + return instance; + } + + void set_ns(const 
std::string& _ns) {
+    ns = _ns;
+  }
+
+  const std::string& get_ns() const {
+    return ns;
+  }
+
+  /**
+   * Build the name under which this key is stored in the bucket index.
+   *
+   *  - empty namespace: the plain name, prefixed with one extra '_' when
+   *    the name itself begins with '_' (escaping);
+   *  - otherwise: "_<ns>_<name>".
+   *
+   * The instance is not part of the returned string.
+   */
+  std::string get_index_key_name() const {
+    if (ns.empty()) {
+      if (name.size() < 1 || name[0] != '_') {
+        return name;
+      }
+      return std::string("_") + name;
+    }
+
+    // Assemble "_<ns>_<name>" directly in a std::string. The previous
+    // implementation formatted the prefix into `char buf[ns.size() + 16]`,
+    // a variable-length array: not standard C++ and a runtime-sized stack
+    // allocation.
+    std::string index_name;
+    index_name.reserve(ns.size() + name.size() + 2);
+    index_name += '_';
+    index_name += ns;
+    index_name += '_';
+    index_name += name;
+    return index_name;
+  }
+
+  /// Fill `key` with the index name (see get_index_key_name()) and the instance.
+  void get_index_key(rgw_obj_index_key* key) const {
+    key->name = get_index_key_name();
+    key->instance = instance;
+  }
+
+  std::string get_loc() const {
+    /*
+     * For backward compatibility. Older versions used to have object locator on all objects,
+     * however, the name was the effective object locator. This had the same effect as not
+     * having object locator at all for most objects but the ones that started with underscore as
+     * these were escaped.
+     */
+    if (name[0] == '_' && ns.empty()) {
+      return name;
+    }
+
+    return {};
+  }
+
+  /// A key is empty when it has no name; instance and ns are not considered.
+  bool empty() const {
+    return name.empty();
+  }
+
+  /// True when the instance is the literal string "null".
+  bool have_null_instance() const {
+    return instance == "null";
+  }
+
+  bool have_instance() const {
+    return !instance.empty();
+  }
+
+  /// The instance is encoded into the oid only when it is set and is not "null".
+  bool need_to_encode_instance() const {
+    return have_instance() && !have_null_instance();
+  }
+
+  /**
+   * Build the object id for this key.
+   *
+   *  - no namespace and no instance to encode: the plain name, with one
+   *    extra leading '_' when the name begins with '_';
+   *  - otherwise: "_<ns>[:<instance>]_<name>".
+   */
+  std::string get_oid() const {
+    if (ns.empty() && !need_to_encode_instance()) {
+      if (name.size() < 1 || name[0] != '_') {
+        return name;
+      }
+      return std::string("_") + name;
+    }
+
+    std::string oid = "_";
+    oid.append(ns);
+    if (need_to_encode_instance()) {
+      oid.append(std::string(":") + instance);
+    }
+    oid.append("_");
+    oid.append(name);
+    return oid;
+  }
+
+  // NOTE: equality and ordering look at name and instance only; ns does
+  // not take part in either comparison.
+  bool operator==(const rgw_obj_key& k) const {
+    return (name.compare(k.name) == 0) &&
+           (instance.compare(k.instance) == 0);
+  }
+
+  bool operator<(const rgw_obj_key& k) const {
+    int r = name.compare(k.name);
+    if (r == 0) {
+      r = instance.compare(k.instance);
+    }
+    return (r < 0);
+  }
+
+  bool operator<=(const rgw_obj_key& k) const {
+    return !(k < *this);
+  }
+
+  static void parse_ns_field(std::string& ns,
std::string& instance) { + int pos = ns.find(':'); + if (pos >= 0) { + instance = ns.substr(pos + 1); + ns = ns.substr(0, pos); + } else { + instance.clear(); + } + } + + // takes an oid and parses out the namespace (ns), name, and + // instance + static bool parse_raw_oid(const std::string& oid, rgw_obj_key *key) { + key->instance.clear(); + key->ns.clear(); + if (oid[0] != '_') { + key->name = oid; + return true; + } + + if (oid.size() >= 2 && oid[1] == '_') { + key->name = oid.substr(1); + return true; + } + + if (oid.size() < 3) // for namespace, min size would be 3: _x_ + return false; + + size_t pos = oid.find('_', 2); // oid must match ^_[^_].+$ + if (pos == std::string::npos) + return false; + + key->ns = oid.substr(1, pos - 1); + parse_ns_field(key->ns, key->instance); + + key->name = oid.substr(pos + 1); + return true; + } + + /** + * Translate a namespace-mangled object name to the user-facing name + * existing in the given namespace. + * + * If the object is part of the given namespace, it returns true + * and cuts down the name to the unmangled version. If it is not + * part of the given namespace, it returns false. + */ + static bool oid_to_key_in_ns(const std::string& oid, rgw_obj_key *key, const std::string& ns) { + bool ret = parse_raw_oid(oid, key); + if (!ret) { + return ret; + } + + return (ns == key->ns); + } + + /** + * Given a mangled object name and an empty namespace std::string, this + * function extracts the namespace into the std::string and sets the object + * name to be the unmangled version. + * + * It returns true after successfully doing so, or + * false if it fails. 
+ */ + static bool strip_namespace_from_name(std::string& name, std::string& ns, std::string& instance) { + ns.clear(); + instance.clear(); + if (name[0] != '_') { + return true; + } + + size_t pos = name.find('_', 1); + if (pos == std::string::npos) { + return false; + } + + if (name[1] == '_') { + name = name.substr(1); + return true; + } + + size_t period_pos = name.find('.'); + if (period_pos < pos) { + return false; + } + + ns = name.substr(1, pos-1); + name = name.substr(pos+1, std::string::npos); + + parse_ns_field(ns, instance); + return true; + } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(name, bl); + encode(instance, bl); + encode(ns, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(name, bl); + decode(instance, bl); + if (struct_v >= 2) { + decode(ns, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_obj_key) + +#if FMT_VERSION >= 90000 +template<> struct fmt::formatter : fmt::formatter { + template + auto format(const rgw_obj_key& key, FormatContext& ctx) const { + if (key.instance.empty()) { + return formatter::format(key.name, ctx); + } else { + return fmt::format_to(ctx.out(), "{}[{}]", key.name, key.instance); + } + } +}; +#endif + +inline std::ostream& operator<<(std::ostream& out, const rgw_obj_key &key) { +#if FMT_VERSION >= 90000 + return out << fmt::format("{}", key); +#else + if (key.instance.empty()) { + return out << fmt::format("{}", key.name); + } else { + return out << fmt::format("{}[{}]", key.name, key.instance); + } +#endif +} + +struct rgw_raw_obj { + rgw_pool pool; + std::string oid; + std::string loc; + + rgw_raw_obj() {} + rgw_raw_obj(const rgw_pool& _pool, const std::string& _oid) { + init(_pool, _oid); + } + rgw_raw_obj(const rgw_pool& _pool, const std::string& _oid, const std::string& _loc) : loc(_loc) { + init(_pool, _oid); + } + + void init(const rgw_pool& 
_pool, const std::string& _oid) { + pool = _pool; + oid = _oid; + } + + bool empty() const { + return oid.empty(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(6, 6, bl); + encode(pool, bl); + encode(oid, bl); + encode(loc, bl); + ENCODE_FINISH(bl); + } + + void decode_from_rgw_obj(bufferlist::const_iterator& bl); + + void decode(bufferlist::const_iterator& bl) { + unsigned ofs = bl.get_off(); + DECODE_START(6, bl); + if (struct_v < 6) { + /* + * this object was encoded as rgw_obj, prior to rgw_raw_obj been split out of it, + * let's decode it as rgw_obj and convert it + */ + bl.seek(ofs); + decode_from_rgw_obj(bl); + return; + } + decode(pool, bl); + decode(oid, bl); + decode(loc, bl); + DECODE_FINISH(bl); + } + + bool operator<(const rgw_raw_obj& o) const { + int r = pool.compare(o.pool); + if (r == 0) { + r = oid.compare(o.oid); + if (r == 0) { + r = loc.compare(o.loc); + } + } + return (r < 0); + } + + bool operator==(const rgw_raw_obj& o) const { + return (pool == o.pool && oid == o.oid && loc == o.loc); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_raw_obj) + +inline std::ostream& operator<<(std::ostream& out, const rgw_raw_obj& o) { + out << o.pool << ":" << o.oid; + return out; +} + +struct rgw_obj { + rgw_bucket bucket; + rgw_obj_key key; + + bool in_extra_data{false}; /* in-memory only member, does not serialize */ + + // Represents the hash index source for this object once it is set (non-empty) + std::string index_hash_source; + + rgw_obj() {} + rgw_obj(const rgw_bucket& b, const std::string& name) : bucket(b), key(name) {} + rgw_obj(const rgw_bucket& b, const rgw_obj_key& k) : bucket(b), key(k) {} + rgw_obj(const rgw_bucket& b, const rgw_obj_index_key& k) : bucket(b), key(k) {} + + void init(const rgw_bucket& b, const rgw_obj_key& k) { + bucket = b; + key = k; + } + + void init(const rgw_bucket& b, const std::string& name) { + bucket = b; + key.set(name); + } + + void init(const 
rgw_bucket& b, const std::string& name, const std::string& i, const std::string& n) { + bucket = b; + key.set(name, i, n); + } + + void init_ns(const rgw_bucket& b, const std::string& name, const std::string& n) { + bucket = b; + key.name = name; + key.instance.clear(); + key.ns = n; + } + + bool empty() const { + return key.empty(); + } + + void set_key(const rgw_obj_key& k) { + key = k; + } + + std::string get_oid() const { + return key.get_oid(); + } + + const std::string& get_hash_object() const { + return index_hash_source.empty() ? key.name : index_hash_source; + } + + void set_in_extra_data(bool val) { + in_extra_data = val; + } + + bool is_in_extra_data() const { + return in_extra_data; + } + + void encode(bufferlist& bl) const { + ENCODE_START(6, 6, bl); + encode(bucket, bl); + encode(key.ns, bl); + encode(key.name, bl); + encode(key.instance, bl); +// encode(placement_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); + if (struct_v < 6) { + std::string s; + decode(bucket.name, bl); /* bucket.name */ + decode(s, bl); /* loc */ + decode(key.ns, bl); + decode(key.name, bl); + if (struct_v >= 2) + decode(bucket, bl); + if (struct_v >= 4) + decode(key.instance, bl); + if (key.ns.empty() && key.instance.empty()) { + if (key.name[0] == '_') { + key.name = key.name.substr(1); + } + } else { + if (struct_v >= 5) { + decode(key.name, bl); + } else { + ssize_t pos = key.name.find('_', 1); + if (pos < 0) { + throw buffer::malformed_input(); + } + key.name = key.name.substr(pos + 1); + } + } + } else { + decode(bucket, bl); + decode(key.ns, bl); + decode(key.name, bl); + decode(key.instance, bl); +// decode(placement_id, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); + + bool operator==(const rgw_obj& o) const { + return (key == o.key) && + (bucket == o.bucket); + } + bool operator<(const rgw_obj& o) const { + int r 
= key.name.compare(o.key.name); + if (r == 0) { + r = bucket.bucket_id.compare(o.bucket.bucket_id); /* not comparing bucket.name, if bucket_id is equal so will be bucket.name */ + if (r == 0) { + r = key.ns.compare(o.key.ns); + if (r == 0) { + r = key.instance.compare(o.key.instance); + } + } + } + + return (r < 0); + } + + const rgw_pool& get_explicit_data_pool() { + if (!in_extra_data || bucket.explicit_placement.data_extra_pool.empty()) { + return bucket.explicit_placement.data_pool; + } + return bucket.explicit_placement.data_extra_pool; + } +}; +WRITE_CLASS_ENCODER(rgw_obj) diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc new file mode 100644 index 000000000..fd36a49c6 --- /dev/null +++ b/src/rgw/rgw_object_expirer.cc @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_object_expirer_core.h" + +#define dout_subsys ceph_subsys_rgw + +static rgw::sal::Driver* driver = NULL; + +class StoreDestructor { + rgw::sal::Driver* driver; + +public: + explicit StoreDestructor(rgw::sal::Driver* _s) : driver(_s) {} + ~StoreDestructor() { + if (driver) { + DriverManager::close_storage(driver); + } + } +}; + +static void usage() +{ + generic_server_usage(); +} + +int main(const int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + std::cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if 
(ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } + } + + if (g_conf()->daemonize) { + global_init_daemonize(g_ceph_context); + } + + common_init_finish(g_ceph_context); + + const DoutPrefix dp(cct.get(), dout_subsys, "rgw object expirer: "); + DriverManager::Config cfg; + cfg.store_name = "rados"; + cfg.filter_name = "none"; + driver = DriverManager::get_storage(&dp, g_ceph_context, cfg, false, false, false, false, false); + if (!driver) { + std::cerr << "couldn't init storage provider" << std::endl; + return EIO; + } + + /* Guard to not forget about closing the rados driver. */ + StoreDestructor store_dtor(driver); + + RGWObjectExpirer objexp(driver); + objexp.start_processor(); + + const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0); + while (true) { + interval.sleep(); + } + + /* unreachable */ + + return EXIT_SUCCESS; +} diff --git a/src/rgw/rgw_object_lock.cc b/src/rgw/rgw_object_lock.cc new file mode 100644 index 000000000..1d44328fe --- /dev/null +++ b/src/rgw/rgw_object_lock.cc @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +// +#include "rgw_object_lock.h" + +using namespace std; + +void DefaultRetention::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Mode", mode, obj, true); + if (mode.compare("GOVERNANCE") != 0 && mode.compare("COMPLIANCE") != 0) { + throw RGWXMLDecoder::err("bad Mode in lock rule"); + } + bool days_exist = RGWXMLDecoder::decode_xml("Days", days, obj); + bool years_exist = RGWXMLDecoder::decode_xml("Years", years, obj); + if ((days_exist && years_exist) || (!days_exist && !years_exist)) { + throw RGWXMLDecoder::err("either Days or Years must be specified, but not 
both"); + } +} + +void DefaultRetention::dump_xml(Formatter *f) const { + encode_xml("Mode", mode, f); + if (days > 0) { + encode_xml("Days", days, f); + } else { + encode_xml("Years", years, f); + } +} + +void ObjectLockRule::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("DefaultRetention", defaultRetention, obj, true); +} + +void ObjectLockRule::dump_xml(Formatter *f) const { + encode_xml("DefaultRetention", defaultRetention, f); +} + +void RGWObjectLock::decode_xml(XMLObj *obj) { + string enabled_str; + RGWXMLDecoder::decode_xml("ObjectLockEnabled", enabled_str, obj, true); + if (enabled_str.compare("Enabled") != 0) { + throw RGWXMLDecoder::err("invalid ObjectLockEnabled value"); + } else { + enabled = true; + } + rule_exist = RGWXMLDecoder::decode_xml("Rule", rule, obj); +} + +void RGWObjectLock::dump_xml(Formatter *f) const { + if (enabled) { + encode_xml("ObjectLockEnabled", "Enabled", f); + } + if (rule_exist) { + encode_xml("Rule", rule, f); + } +} + +ceph::real_time RGWObjectLock::get_lock_until_date(const ceph::real_time& mtime) const { + if (!rule_exist) { + return ceph::real_time(); + } + if (int days = get_days(); days > 0) { + return mtime + std::chrono::days(days); + } + return mtime + std::chrono::years(get_years()); +} + +void RGWObjectRetention::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Mode", mode, obj, true); + if (mode.compare("GOVERNANCE") != 0 && mode.compare("COMPLIANCE") != 0) { + throw RGWXMLDecoder::err("bad Mode in retention"); + } + string date_str; + RGWXMLDecoder::decode_xml("RetainUntilDate", date_str, obj, true); + boost::optional date = ceph::from_iso_8601(date_str); + if (boost::none == date) { + throw RGWXMLDecoder::err("invalid RetainUntilDate value"); + } + retain_until_date = *date; +} + +void RGWObjectRetention::dump_xml(Formatter *f) const { + encode_xml("Mode", mode, f); + string date = ceph::to_iso_8601(retain_until_date); + encode_xml("RetainUntilDate", date, f); +} + +void 
RGWObjectLegalHold::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Status", status, obj, true); + if (status.compare("ON") != 0 && status.compare("OFF") != 0) { + throw RGWXMLDecoder::err("bad status in legal hold"); + } +} + +void RGWObjectLegalHold::dump_xml(Formatter *f) const { + encode_xml("Status", status, f); +} + +bool RGWObjectLegalHold::is_enabled() const { + return status.compare("ON") == 0; +} diff --git a/src/rgw/rgw_object_lock.h b/src/rgw/rgw_object_lock.h new file mode 100644 index 000000000..27c73feae --- /dev/null +++ b/src/rgw/rgw_object_lock.h @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "common/ceph_time.h" +#include "common/iso_8601.h" +#include "rgw_xml.h" + +class DefaultRetention +{ +protected: + std::string mode; + int days; + int years; + +public: + DefaultRetention(): days(0), years(0) {}; + + int get_days() const { + return days; + } + + int get_years() const { + return years; + } + + std::string get_mode() const { + return mode; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(mode, bl); + encode(days, bl); + encode(years, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(mode, bl); + decode(days, bl); + decode(years, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(DefaultRetention) + +class ObjectLockRule +{ +protected: + DefaultRetention defaultRetention; +public: + int get_days() const { + return defaultRetention.get_days(); + } + + int get_years() const { + return defaultRetention.get_years(); + } + + std::string get_mode() const { + return defaultRetention.get_mode(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(defaultRetention, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) 
{ + DECODE_START(1, bl); + decode(defaultRetention, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(ObjectLockRule) + +class RGWObjectLock +{ +protected: + bool enabled; + bool rule_exist; + ObjectLockRule rule; + +public: + RGWObjectLock():enabled(true), rule_exist(false) {} + + int get_days() const { + return rule.get_days(); + } + + int get_years() const { + return rule.get_years(); + } + + std::string get_mode() const { + return rule.get_mode(); + } + + bool retention_period_valid() const { + // DefaultRetention requires either Days or Years. + // You can't specify both at the same time. + // see https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTObjectLockConfiguration.html + return (get_years() > 0) != (get_days() > 0); + } + + bool has_rule() const { + return rule_exist; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(enabled, bl); + encode(rule_exist, bl); + if (rule_exist) { + encode(rule, bl); + } + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(enabled, bl); + decode(rule_exist, bl); + if (rule_exist) { + decode(rule, bl); + } + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + ceph::real_time get_lock_until_date(const ceph::real_time& mtime) const; +}; +WRITE_CLASS_ENCODER(RGWObjectLock) + +class RGWObjectRetention +{ +protected: + std::string mode; + ceph::real_time retain_until_date; +public: + RGWObjectRetention() {} + RGWObjectRetention(std::string _mode, ceph::real_time _date): mode(_mode), retain_until_date(_date) {} + + void set_mode(std::string _mode) { + mode = _mode; + } + + std::string get_mode() const { + return mode; + } + + void set_retain_until_date(ceph::real_time _retain_until_date) { + retain_until_date = _retain_until_date; + } + + ceph::real_time get_retain_until_date() const { + return retain_until_date; + } + + 
void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(mode, bl); + encode(retain_until_date, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(mode, bl); + decode(retain_until_date, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWObjectRetention) + +class RGWObjectLegalHold +{ +protected: + std::string status; +public: + RGWObjectLegalHold() {} + RGWObjectLegalHold(std::string _status): status(_status) {} + void set_status(std::string _status) { + status = _status; + } + + std::string get_status() const { + return status; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(status, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(status, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + bool is_enabled() const; +}; +WRITE_CLASS_ENCODER(RGWObjectLegalHold) diff --git a/src/rgw/rgw_oidc_provider.cc b/src/rgw/rgw_oidc_provider.cc new file mode 100644 index 000000000..da6d73e23 --- /dev/null +++ b/src/rgw/rgw_oidc_provider.cc @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" +#include "rgw_rados.h" +#include "rgw_zone.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_oidc_provider.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw { namespace sal { + +const string RGWOIDCProvider::oidc_url_oid_prefix = "oidc_url."; +const string RGWOIDCProvider::oidc_arn_prefix = 
"arn:aws:iam::"; + +int RGWOIDCProvider::get_tenant_url_from_arn(string& tenant, string& url) +{ + auto provider_arn = rgw::ARN::parse(arn); + if (!provider_arn) { + return -EINVAL; + } + url = provider_arn->resource; + tenant = provider_arn->account; + auto pos = url.find("oidc-provider/"); + if (pos != std::string::npos) { + url.erase(pos, 14); + } + return 0; +} + +int RGWOIDCProvider::create(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + int ret; + + if (! validate_input(dpp)) { + return -EINVAL; + } + + string idp_url = url_remove_prefix(provider_url); + + /* check to see the name is not used */ + ret = read_url(dpp, idp_url, tenant); + if (exclusive && ret == 0) { + ldpp_dout(dpp, 0) << "ERROR: url " << provider_url << " already in use" + << id << dendl; + return -EEXIST; + } else if ( ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading provider url " << provider_url << ": " + << cpp_strerror(-ret) << dendl; + return ret; + } + + //arn + arn = oidc_arn_prefix + tenant + ":oidc-provider/" + idp_url; + + // Creation time + real_clock::time_point t = real_clock::now(); + + struct timeval tv; + real_clock::to_timeval(t, tv); + + char buf[30]; + struct tm result; + gmtime_r(&tv.tv_sec, &result); + strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result); + sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000); + creation_date.assign(buf, strlen(buf)); + + ret = store_url(dpp, idp_url, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing role info in OIDC pool: " + << provider_url << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +int RGWOIDCProvider::get(const DoutPrefixProvider *dpp) +{ + string url, tenant; + auto ret = get_tenant_url_from_arn(tenant, url); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl; + return -EINVAL; + } + + if (this->tenant != tenant) { + ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", " + 
<< tenant << ": " << dendl; + return -EINVAL; + } + + ret = read_url(dpp, url, tenant); + if (ret < 0) { + return ret; + } + + return 0; +} + +void RGWOIDCProvider::dump(Formatter *f) const +{ + encode_json("OpenIDConnectProviderArn", arn, f); +} + +void RGWOIDCProvider::dump_all(Formatter *f) const +{ + f->open_object_section("ClientIDList"); + for (auto it : client_ids) { + encode_json("member", it, f); + } + f->close_section(); + encode_json("CreateDate", creation_date, f); + f->open_object_section("ThumbprintList"); + for (auto it : thumbprints) { + encode_json("member", it, f); + } + f->close_section(); + encode_json("Url", provider_url, f); +} + +void RGWOIDCProvider::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("OpenIDConnectProviderArn", arn, obj); +} + +bool RGWOIDCProvider::validate_input(const DoutPrefixProvider *dpp) +{ + if (provider_url.length() > MAX_OIDC_URL_LEN) { + ldpp_dout(dpp, 0) << "ERROR: Invalid length of url " << dendl; + return false; + } + if (client_ids.size() > MAX_OIDC_NUM_CLIENT_IDS) { + ldpp_dout(dpp, 0) << "ERROR: Invalid number of client ids " << dendl; + return false; + } + + for (auto& it : client_ids) { + if (it.length() > MAX_OIDC_CLIENT_ID_LEN) { + return false; + } + } + + if (thumbprints.size() > MAX_OIDC_NUM_THUMBPRINTS) { + ldpp_dout(dpp, 0) << "ERROR: Invalid number of thumbprints " << thumbprints.size() << dendl; + return false; + } + + for (auto& it : thumbprints) { + if (it.length() > MAX_OIDC_THUMBPRINT_LEN) { + return false; + } + } + + return true; +} + +const string& RGWOIDCProvider::get_url_oid_prefix() +{ + return oidc_url_oid_prefix; +} + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_oidc_provider.h b/src/rgw/rgw_oidc_provider.h new file mode 100644 index 000000000..581ee879a --- /dev/null +++ b/src/rgw/rgw_oidc_provider.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include 
"common/ceph_context.h" +#include "common/ceph_json.h" + +#include "rgw/rgw_sal.h" + +namespace rgw { namespace sal { + +class RGWOIDCProvider +{ +public: + static const std::string oidc_url_oid_prefix; + static const std::string oidc_arn_prefix; + static constexpr int MAX_OIDC_NUM_CLIENT_IDS = 100; + static constexpr int MAX_OIDC_CLIENT_ID_LEN = 255; + static constexpr int MAX_OIDC_NUM_THUMBPRINTS = 5; + static constexpr int MAX_OIDC_THUMBPRINT_LEN = 40; + static constexpr int MAX_OIDC_URL_LEN = 255; + +protected: + std::string id; + std::string provider_url; + std::string arn; + std::string creation_date; + std::string tenant; + std::vector client_ids; + std::vector thumbprints; + + int get_tenant_url_from_arn(std::string& tenant, std::string& url); + virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) = 0; + virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) = 0; + bool validate_input(const DoutPrefixProvider *dpp); + +public: + void set_arn(std::string _arn) { + arn = _arn; + } + void set_url(std::string _provider_url) { + provider_url = _provider_url; + } + void set_tenant(std::string _tenant) { + tenant = _tenant; + } + void set_client_ids(std::vector& _client_ids) { + client_ids = std::move(_client_ids); + } + void set_thumbprints(std::vector& _thumbprints) { + thumbprints = std::move(_thumbprints); + } + + RGWOIDCProvider(std::string provider_url, + std::string tenant, + std::vector client_ids, + std::vector thumbprints) + : provider_url(std::move(provider_url)), + tenant(std::move(tenant)), + client_ids(std::move(client_ids)), + thumbprints(std::move(thumbprints)) { + } + + RGWOIDCProvider( std::string arn, + std::string tenant) + : arn(std::move(arn)), + tenant(std::move(tenant)) { + } + + RGWOIDCProvider(std::string tenant) + : tenant(std::move(tenant)) {} + + RGWOIDCProvider() {} + + virtual ~RGWOIDCProvider() = default; + + void 
encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(id, bl); + encode(provider_url, bl); + encode(arn, bl); + encode(creation_date, bl); + encode(tenant, bl); + encode(client_ids, bl); + encode(thumbprints, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(id, bl); + decode(provider_url, bl); + decode(arn, bl); + decode(creation_date, bl); + decode(tenant, bl); + decode(client_ids, bl); + decode(thumbprints, bl); + DECODE_FINISH(bl); + } + + const std::string& get_provider_url() const { return provider_url; } + const std::string& get_arn() const { return arn; } + const std::string& get_create_date() const { return creation_date; } + const std::vector& get_client_ids() const { return client_ids;} + const std::vector& get_thumbprints() const { return thumbprints; } + + int create(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y); + virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) = 0; + int get(const DoutPrefixProvider *dpp); + void dump(Formatter *f) const; + void dump_all(Formatter *f) const; + void decode_json(JSONObj *obj); + + static const std::string& get_url_oid_prefix(); +}; +WRITE_CLASS_ENCODER(RGWOIDCProvider) + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc new file mode 100644 index 000000000..71fb198f3 --- /dev/null +++ b/src/rgw/rgw_op.cc @@ -0,0 +1,8958 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "include/scope_guard.h" +#include "common/Clock.h" +#include "common/armor.h" +#include "common/errno.h" +#include "common/mime.h" +#include "common/utf8.h" +#include "common/ceph_json.h" +#include "common/static_ptr.h" +#include "rgw_tracer.h" + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include 
"rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_acl_swift.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_log.h" +#include "rgw_multi.h" +#include "rgw_multi_del.h" +#include "rgw_cors.h" +#include "rgw_cors_s3.h" +#include "rgw_rest_conn.h" +#include "rgw_rest_s3.h" +#include "rgw_tar.h" +#include "rgw_client_io.h" +#include "rgw_compression.h" +#include "rgw_role.h" +#include "rgw_tag_s3.h" +#include "rgw_putobj_processor.h" +#include "rgw_crypt.h" +#include "rgw_perf_counters.h" +#include "rgw_process_env.h" +#include "rgw_notify.h" +#include "rgw_notify_event_type.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "rgw_lua_data_filter.h" +#include "rgw_lua.h" + +#include "services/svc_zone.h" +#include "services/svc_quota.h" +#include "services/svc_sys_obj.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rgw/cls_rgw_client.h" + + +#include "include/ceph_assert.h" + +#include "compressor/Compressor.h" + +#ifdef WITH_ARROW_FLIGHT +#include "rgw_flight.h" +#include "rgw_flight_frontend.h" +#endif + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/rgw_op.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) 
+#endif + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace librados; +using ceph::crypto::MD5; +using boost::optional; +using boost::none; + +using rgw::ARN; +using rgw::IAM::Effect; +using rgw::IAM::Policy; + +static string mp_ns = RGW_OBJ_NS_MULTIPART; +static string shadow_ns = RGW_OBJ_NS_SHADOW; + +static void forward_req_info(const DoutPrefixProvider *dpp, CephContext *cct, req_info& info, const std::string& bucket_name); + +static MultipartMetaFilter mp_filter; + +// this probably should belong in the rgw_iam_policy_keywords, I'll get it to it +// at some point +static constexpr auto S3_EXISTING_OBJTAG = "s3:ExistingObjectTag"; +static constexpr auto S3_RESOURCE_TAG = "s3:ResourceTag"; +static constexpr auto S3_RUNTIME_RESOURCE_VAL = "${s3:ResourceTag"; + +int RGWGetObj::parse_range(void) +{ + int r = -ERANGE; + string rs(range_str); + string ofs_str; + string end_str; + + ignore_invalid_range = s->cct->_conf->rgw_ignore_get_invalid_range; + partial_content = false; + + size_t pos = rs.find("bytes="); + if (pos == string::npos) { + pos = 0; + while (isspace(rs[pos])) + pos++; + int end = pos; + while (isalpha(rs[end])) + end++; + if (strncasecmp(rs.c_str(), "bytes", end - pos) != 0) + return 0; + while (isspace(rs[end])) + end++; + if (rs[end] != '=') + return 0; + rs = rs.substr(end + 1); + } else { + rs = rs.substr(pos + 6); /* size of("bytes=") */ + } + pos = rs.find('-'); + if (pos == string::npos) + goto done; + + partial_content = true; + + ofs_str = rs.substr(0, pos); + end_str = rs.substr(pos + 1); + if (end_str.length()) { + end = atoll(end_str.c_str()); + if (end < 0) + goto done; + } + + if (ofs_str.length()) { + ofs = atoll(ofs_str.c_str()); + } else { // RFC2616 suffix-byte-range-spec + ofs = -end; + end = -1; + } + + if (end >= 0 && end < ofs) + goto done; + + range_parsed = true; + return 0; + +done: + if (ignore_invalid_range) { + partial_content = false; + ofs = 0; + end = -1; 
+ range_parsed = false; // allow retry + r = 0; + } + + return r; +} + +static int decode_policy(const DoutPrefixProvider *dpp, + CephContext *cct, + bufferlist& bl, + RGWAccessControlPolicy *policy) +{ + auto iter = bl.cbegin(); + try { + policy->decode(iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (cct->_conf->subsys.should_gather()) { + ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy"; + RGWAccessControlPolicy_S3 *s3policy = static_cast(policy); + s3policy->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + + +static int get_user_policy_from_attr(const DoutPrefixProvider *dpp, + CephContext * const cct, + map& attrs, + RGWAccessControlPolicy& policy /* out */) +{ + auto aiter = attrs.find(RGW_ATTR_ACL); + if (aiter != attrs.end()) { + int ret = decode_policy(dpp, cct, aiter->second, &policy); + if (ret < 0) { + return ret; + } + } else { + return -ENOENT; + } + + return 0; +} + +/** + * Get the AccessControlPolicy for an object off of disk. + * policy: must point to a valid RGWACL, and will be filled upon return. + * bucket: name of the bucket containing the object. + * object: name of the object to get the ACL for. + * Returns: 0 on success, -ERR# otherwise. 
+ */ +int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider *dpp, + CephContext *cct, + rgw::sal::Driver* driver, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy, + optional_yield y) +{ + map::iterator aiter = bucket_attrs.find(RGW_ATTR_ACL); + + if (aiter != bucket_attrs.end()) { + int ret = decode_policy(dpp, cct, aiter->second, policy); + if (ret < 0) + return ret; + } else { + ldpp_dout(dpp, 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl; + std::unique_ptr user = driver->get_user(bucket_info.owner); + /* object exists, but policy is broken */ + int r = user->load_user(dpp, y); + if (r < 0) + return r; + + policy->create_default(bucket_info.owner, user->get_display_name()); + } + return 0; +} + +static int get_obj_policy_from_attr(const DoutPrefixProvider *dpp, + CephContext *cct, + rgw::sal::Driver* driver, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy, + string *storage_class, + rgw::sal::Object* obj, + optional_yield y) +{ + bufferlist bl; + int ret = 0; + + std::unique_ptr rop = obj->get_read_op(); + + ret = rop->get_attr(dpp, RGW_ATTR_ACL, bl, y); + if (ret >= 0) { + ret = decode_policy(dpp, cct, bl, policy); + if (ret < 0) + return ret; + } else if (ret == -ENODATA) { + /* object exists, but policy is broken */ + ldpp_dout(dpp, 0) << "WARNING: couldn't find acl header for object, generating default" << dendl; + std::unique_ptr user = driver->get_user(bucket_info.owner); + ret = user->load_user(dpp, y); + if (ret < 0) + return ret; + + policy->create_default(bucket_info.owner, user->get_display_name()); + } + + if (storage_class) { + bufferlist scbl; + int r = rop->get_attr(dpp, RGW_ATTR_STORAGE_CLASS, scbl, y); + if (r >= 0) { + *storage_class = scbl.to_str(); + } else { + storage_class->clear(); + } + } + + return ret; +} + + +static boost::optional get_iam_policy_from_attr(CephContext* cct, + map& attrs, + const string& tenant) { + auto 
i = attrs.find(RGW_ATTR_IAM_POLICY); + if (i != attrs.end()) { + return Policy(cct, tenant, i->second, false); + } else { + return none; + } +} + +static boost::optional +get_public_access_conf_from_attr(const map& attrs) +{ + if (auto aiter = attrs.find(RGW_ATTR_PUBLIC_ACCESS); + aiter != attrs.end()) { + bufferlist::const_iterator iter{&aiter->second}; + PublicAccessBlockConfiguration access_conf; + try { + access_conf.decode(iter); + } catch (const buffer::error& e) { + return boost::none; + } + return access_conf; + } + return boost::none; +} + +vector get_iam_user_policy_from_attr(CephContext* cct, + map& attrs, + const string& tenant) { + vector policies; + if (auto it = attrs.find(RGW_ATTR_USER_POLICY); it != attrs.end()) { + bufferlist out_bl = attrs[RGW_ATTR_USER_POLICY]; + map policy_map; + decode(policy_map, out_bl); + for (auto& it : policy_map) { + bufferlist bl = bufferlist::static_from_string(it.second); + Policy p(cct, tenant, bl, false); + policies.push_back(std::move(p)); + } + } + return policies; +} + +static int read_bucket_policy(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + req_state *s, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy, + rgw_bucket& bucket, + optional_yield y) +{ + if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) { + ldpp_dout(dpp, 0) << "NOTICE: bucket " << bucket_info.bucket.name + << " is suspended" << dendl; + return -ERR_USER_SUSPENDED; + } + + if (bucket.name.empty()) { + return 0; + } + + int ret = rgw_op_get_bucket_policy_from_attr(dpp, s->cct, driver, bucket_info, bucket_attrs, policy, y); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_BUCKET; + } + + return ret; +} + +static int read_obj_policy(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + req_state *s, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy* acl, + string *storage_class, + boost::optional& policy, + rgw::sal::Bucket* bucket, + rgw::sal::Object* 
object, + optional_yield y, + bool copy_src=false) +{ + string upload_id; + upload_id = s->info.args.get("uploadId"); + std::unique_ptr mpobj; + rgw_obj obj; + + if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) { + ldpp_dout(dpp, 0) << "NOTICE: bucket " << bucket_info.bucket.name + << " is suspended" << dendl; + return -ERR_USER_SUSPENDED; + } + + // when getting policy info for copy-source obj, upload_id makes no sense. + // 'copy_src' is used to make this function backward compatible. + if (!upload_id.empty() && !copy_src) { + /* multipart upload */ + std::unique_ptr upload; + upload = bucket->get_multipart_upload(object->get_name(), upload_id); + mpobj = upload->get_meta_obj(); + mpobj->set_in_extra_data(true); + object = mpobj.get(); + } + policy = get_iam_policy_from_attr(s->cct, bucket_attrs, bucket->get_tenant()); + + int ret = get_obj_policy_from_attr(dpp, s->cct, driver, bucket_info, + bucket_attrs, acl, storage_class, object, + s->yield); + if (ret == -ENOENT) { + /* object does not exist checking the bucket's ACL to make sure + that we send a proper error code */ + RGWAccessControlPolicy bucket_policy(s->cct); + ret = rgw_op_get_bucket_policy_from_attr(dpp, s->cct, driver, bucket_info, bucket_attrs, &bucket_policy, y); + if (ret < 0) { + return ret; + } + const rgw_user& bucket_owner = bucket_policy.get_owner().get_id(); + if (bucket_owner.compare(s->user->get_id()) != 0 && + ! s->auth.identity->is_admin_of(bucket_owner)) { + auto r = eval_identity_or_session_policies(dpp, s->iam_user_policies, s->env, + rgw::IAM::s3ListBucket, ARN(bucket->get_key())); + if (r == Effect::Allow) + return -ENOENT; + if (r == Effect::Deny) + return -EACCES; + if (policy) { + ARN b_arn(bucket->get_key()); + r = policy->eval(s->env, *s->auth.identity, rgw::IAM::s3ListBucket, b_arn); + if (r == Effect::Allow) + return -ENOENT; + if (r == Effect::Deny) + return -EACCES; + } + if (! 
s->session_policies.empty()) { + r = eval_identity_or_session_policies(dpp, s->session_policies, s->env, + rgw::IAM::s3ListBucket, ARN(bucket->get_key())); + if (r == Effect::Allow) + return -ENOENT; + if (r == Effect::Deny) + return -EACCES; + } + if (! bucket_policy.verify_permission(s, *s->auth.identity, s->perm_mask, RGW_PERM_READ)) + ret = -EACCES; + else + ret = -ENOENT; + } else { + ret = -ENOENT; + } + } + + return ret; +} + +/** + * Get the AccessControlPolicy for an user, bucket or object off of disk. + * s: The req_state to draw information from. + * only_bucket: If true, reads the user and bucket ACLs rather than the object ACL. + * Returns: 0 on success, -ERR# otherwise. + */ +int rgw_build_bucket_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, req_state* s, optional_yield y) +{ + int ret = 0; + + string bi = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance"); + if (!bi.empty()) { + // note: overwrites s->bucket_name, may include a tenant/ + ret = rgw_bucket_parse_bucket_instance(bi, &s->bucket_name, &s->bucket_instance_id, &s->bucket_instance_shard_id); + if (ret < 0) { + return ret; + } + } + + if(s->dialect.compare("s3") == 0) { + s->bucket_acl = std::make_unique(s->cct); + } else if(s->dialect.compare("swift") == 0) { + /* We aren't allocating the account policy for those operations using + * the Swift's infrastructure that don't really need req_state::user. + * Typical example here is the implementation of /info. 
*/ + if (!s->user->get_id().empty()) { + s->user_acl = std::make_unique(s->cct); + } + s->bucket_acl = std::make_unique(s->cct); + } else { + s->bucket_acl = std::make_unique(s->cct); + } + + /* check if copy source is within the current domain */ + if (!s->src_bucket_name.empty()) { + std::unique_ptr src_bucket; + ret = driver->get_bucket(dpp, nullptr, + rgw_bucket_key(s->src_tenant_name, + s->src_bucket_name), + &src_bucket, y); + if (ret == 0) { + string& zonegroup = src_bucket->get_info().zonegroup; + s->local_source = driver->get_zone()->get_zonegroup().equals(zonegroup); + } + } + + struct { + rgw_user uid; + std::string display_name; + } acct_acl_user = { + s->user->get_id(), + s->user->get_display_name(), + }; + + if (!s->bucket_name.empty()) { + s->bucket_exists = true; + + /* This is the only place that s->bucket is created. It should never be + * overwritten. */ + ret = driver->get_bucket(dpp, s->user.get(), rgw_bucket(s->bucket_tenant, s->bucket_name, s->bucket_instance_id), &s->bucket, y); + if (ret < 0) { + if (ret != -ENOENT) { + string bucket_log; + bucket_log = rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name); + ldpp_dout(dpp, 0) << "NOTICE: couldn't get bucket from bucket_name (name=" + << bucket_log << ")" << dendl; + return ret; + } + s->bucket_exists = false; + return -ERR_NO_SUCH_BUCKET; + } + if (!rgw::sal::Object::empty(s->object.get())) { + s->object->set_bucket(s->bucket.get()); + } + + s->bucket_mtime = s->bucket->get_modification_time(); + s->bucket_attrs = s->bucket->get_attrs(); + ret = read_bucket_policy(dpp, driver, s, s->bucket->get_info(), + s->bucket->get_attrs(), + s->bucket_acl.get(), s->bucket->get_key(), y); + acct_acl_user = { + s->bucket->get_info().owner, + s->bucket_acl->get_owner().get_display_name(), + }; + + s->bucket_owner = s->bucket_acl->get_owner(); + + std::unique_ptr zonegroup; + int r = driver->get_zonegroup(s->bucket->get_info().zonegroup, &zonegroup); + if (!r) { + s->zonegroup_endpoint = 
zonegroup->get_endpoint(); + s->zonegroup_name = zonegroup->get_name(); + } + if (r < 0 && ret == 0) { + ret = r; + } + + if (!driver->get_zone()->get_zonegroup().equals(s->bucket->get_info().zonegroup)) { + ldpp_dout(dpp, 0) << "NOTICE: request for data in a different zonegroup (" + << s->bucket->get_info().zonegroup << " != " + << driver->get_zone()->get_zonegroup().get_id() << ")" << dendl; + /* we now need to make sure that the operation actually requires copy source, that is + * it's a copy operation + */ + if (driver->get_zone()->get_zonegroup().is_master_zonegroup() && s->system_request) { + /*If this is the master, don't redirect*/ + } else if (s->op_type == RGW_OP_GET_BUCKET_LOCATION ) { + /* If op is get bucket location, don't redirect */ + } else if (!s->local_source || + (s->op != OP_PUT && s->op != OP_COPY) || + rgw::sal::Object::empty(s->object.get())) { + return -ERR_PERMANENT_REDIRECT; + } + } + + /* init dest placement */ + s->dest_placement.storage_class = s->info.storage_class; + s->dest_placement.inherit_from(s->bucket->get_placement_rule()); + + if (!driver->valid_placement(s->dest_placement)) { + ldpp_dout(dpp, 0) << "NOTICE: invalid dest placement: " << s->dest_placement.to_str() << dendl; + return -EINVAL; + } + + s->bucket_access_conf = get_public_access_conf_from_attr(s->bucket->get_attrs()); + } + + /* handle user ACL only for those APIs which support it */ + if (s->user_acl) { + std::unique_ptr acl_user = driver->get_user(acct_acl_user.uid); + + ret = acl_user->read_attrs(dpp, y); + if (!ret) { + ret = get_user_policy_from_attr(dpp, s->cct, acl_user->get_attrs(), *s->user_acl); + } + if (-ENOENT == ret) { + /* In already existing clusters users won't have ACL. In such case + * assuming that only account owner has the rights seems to be + * reasonable. That allows to have only one verification logic. + * NOTE: there is small compatibility kludge for global, empty tenant: + * 1. 
if we try to reach an existing bucket, its owner is considered + * as account owner. + * 2. otherwise account owner is identity stored in s->user->user_id. */ + s->user_acl->create_default(acct_acl_user.uid, + acct_acl_user.display_name); + ret = 0; + } else if (ret < 0) { + ldpp_dout(dpp, 0) << "NOTICE: couldn't get user attrs for handling ACL " + "(user_id=" << s->user->get_id() << ", ret=" << ret << ")" << dendl; + return ret; + } + } + // We don't need user policies in case of STS token returned by AssumeRole, + // hence the check for user type + if (! s->user->get_id().empty() && s->auth.identity->get_identity_type() != TYPE_ROLE) { + try { + ret = s->user->read_attrs(dpp, y); + if (ret == 0) { + auto user_policies = get_iam_user_policy_from_attr(s->cct, + s->user->get_attrs(), + s->user->get_tenant()); + s->iam_user_policies.insert(s->iam_user_policies.end(), + std::make_move_iterator(user_policies.begin()), + std::make_move_iterator(user_policies.end())); + } else { + if (ret == -ENOENT) + ret = 0; + else ret = -EACCES; + } + } catch (const std::exception& e) { + ldpp_dout(dpp, -1) << "Error reading IAM User Policy: " << e.what() << dendl; + ret = -EACCES; + } + } + + try { + s->iam_policy = get_iam_policy_from_attr(s->cct, s->bucket_attrs, s->bucket_tenant); + } catch (const std::exception& e) { + // Really this is a can't happen condition. We parse the policy + // when it's given to us, so perhaps we should abort or otherwise + // raise bloody murder. + ldpp_dout(dpp, 0) << "Error reading IAM Policy: " << e.what() << dendl; + ret = -EACCES; + } + + bool success = driver->get_zone()->get_redirect_endpoint(&s->redirect_zone_endpoint); + if (success) { + ldpp_dout(dpp, 20) << "redirect_zone_endpoint=" << s->redirect_zone_endpoint << dendl; + } + + return ret; +} + +/** + * Get the AccessControlPolicy for a bucket or object off of disk. + * s: The req_state to draw information from. + * only_bucket: If true, reads the bucket ACL rather than the object ACL. 
+ * Returns: 0 on success, -ERR# otherwise. + */ +int rgw_build_object_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + req_state *s, bool prefetch_data, optional_yield y) +{ + int ret = 0; + + if (!rgw::sal::Object::empty(s->object.get())) { + if (!s->bucket_exists) { + return -ERR_NO_SUCH_BUCKET; + } + s->object_acl = std::make_unique(s->cct); + + s->object->set_atomic(); + if (prefetch_data) { + s->object->set_prefetch_data(); + } + ret = read_obj_policy(dpp, driver, s, s->bucket->get_info(), s->bucket_attrs, + s->object_acl.get(), nullptr, s->iam_policy, s->bucket.get(), + s->object.get(), y); + } + + return ret; +} + +static int rgw_iam_remove_objtags(const DoutPrefixProvider *dpp, req_state* s, rgw::sal::Object* object, bool has_existing_obj_tag, bool has_resource_tag) { + object->set_atomic(); + int op_ret = object->get_obj_attrs(s->yield, dpp); + if (op_ret < 0) + return op_ret; + rgw::sal::Attrs attrs = object->get_attrs(); + auto tags = attrs.find(RGW_ATTR_TAGS); + if (tags != attrs.end()) { + RGWObjTags tagset; + try { + auto bliter = tags->second.cbegin(); + tagset.decode(bliter); + } catch (buffer::error& err) { + ldpp_dout(s, 0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + return -EIO; + } + for (auto& tag: tagset.get_tags()) { + if (has_existing_obj_tag) { + vector::iterator> iters; + string key = "s3:ExistingObjectTag/" + tag.first; + auto result = s->env.equal_range(key); + for (auto& it = result.first; it != result.second; ++it) + { + if (tag.second == it->second) { + iters.emplace_back(it); + } + } + for (auto& it : iters) { + s->env.erase(it); + } + }//end if has_existing_obj_tag + if (has_resource_tag) { + vector::iterator> iters; + string key = "s3:ResourceTag/" + tag.first; + auto result = s->env.equal_range(key); + for (auto& it = result.first; it != result.second; ++it) + { + if (tag.second == it->second) { + iters.emplace_back(it); + } + } + for (auto& it : iters) { + s->env.erase(it); + } + 
}//end if has_resource_tag + } + } + return 0; +} + +void rgw_add_to_iam_environment(rgw::IAM::Environment& e, std::string_view key, std::string_view val){ + // This variant just adds non empty key pairs to IAM env., values can be empty + // in certain cases like tagging + if (!key.empty()) + e.emplace(key,val); +} + +static int rgw_iam_add_tags_from_bl(req_state* s, bufferlist& bl, bool has_existing_obj_tag=false, bool has_resource_tag=false){ + RGWObjTags& tagset = s->tagset; + try { + auto bliter = bl.cbegin(); + tagset.decode(bliter); + } catch (buffer::error& err) { + ldpp_dout(s, 0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + return -EIO; + } + + for (const auto& tag: tagset.get_tags()){ + if (has_existing_obj_tag) + rgw_add_to_iam_environment(s->env, "s3:ExistingObjectTag/" + tag.first, tag.second); + if (has_resource_tag) + rgw_add_to_iam_environment(s->env, "s3:ResourceTag/" + tag.first, tag.second); + } + return 0; +} + +static int rgw_iam_add_objtags(const DoutPrefixProvider *dpp, req_state* s, rgw::sal::Object* object, bool has_existing_obj_tag, bool has_resource_tag) { + object->set_atomic(); + int op_ret = object->get_obj_attrs(s->yield, dpp); + if (op_ret < 0) + return op_ret; + rgw::sal::Attrs attrs = object->get_attrs(); + auto tags = attrs.find(RGW_ATTR_TAGS); + if (tags != attrs.end()){ + return rgw_iam_add_tags_from_bl(s, tags->second, has_existing_obj_tag, has_resource_tag); + } + return 0; +} + +static int rgw_iam_add_objtags(const DoutPrefixProvider *dpp, req_state* s, bool has_existing_obj_tag, bool has_resource_tag) { + if (!rgw::sal::Object::empty(s->object.get())) { + return rgw_iam_add_objtags(dpp, s, s->object.get(), has_existing_obj_tag, has_resource_tag); + } + return 0; +} + +static int rgw_iam_add_buckettags(const DoutPrefixProvider *dpp, req_state* s, rgw::sal::Bucket* bucket) { + rgw::sal::Attrs attrs = bucket->get_attrs(); + auto tags = attrs.find(RGW_ATTR_TAGS); + if (tags != attrs.end()) { + return 
rgw_iam_add_tags_from_bl(s, tags->second, false, true); + } + return 0; +} + +static int rgw_iam_add_buckettags(const DoutPrefixProvider *dpp, req_state* s) { + return rgw_iam_add_buckettags(dpp, s, s->bucket.get()); +} + +static void rgw_iam_add_crypt_attrs(rgw::IAM::Environment& e, + const meta_map_t& attrs) +{ + constexpr auto encrypt_attr = "x-amz-server-side-encryption"; + constexpr auto s3_encrypt_attr = "s3:x-amz-server-side-encryption"; + if (auto h = attrs.find(encrypt_attr); h != attrs.end()) { + rgw_add_to_iam_environment(e, s3_encrypt_attr, h->second); + } + + constexpr auto kms_attr = "x-amz-server-side-encryption-aws-kms-key-id"; + constexpr auto s3_kms_attr = "s3:x-amz-server-side-encryption-aws-kms-key-id"; + if (auto h = attrs.find(kms_attr); h != attrs.end()) { + rgw_add_to_iam_environment(e, s3_kms_attr, h->second); + } +} + +static std::tuple rgw_check_policy_condition(const DoutPrefixProvider *dpp, + boost::optional iam_policy, + boost::optional> identity_policies, + boost::optional> session_policies, + bool check_obj_exist_tag=true) { + bool has_existing_obj_tag = false, has_resource_tag = false; + bool iam_policy_s3_exist_tag = false, iam_policy_s3_resource_tag = false; + if (iam_policy) { + if (check_obj_exist_tag) { + iam_policy_s3_exist_tag = iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG); + } + iam_policy_s3_resource_tag = iam_policy->has_partial_conditional(S3_RESOURCE_TAG) || iam_policy->has_partial_conditional_value(S3_RUNTIME_RESOURCE_VAL); + } + + bool identity_policy_s3_exist_tag = false, identity_policy_s3_resource_tag = false; + if (identity_policies) { + for (auto& identity_policy : identity_policies.get()) { + if (check_obj_exist_tag) { + if (identity_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) + identity_policy_s3_exist_tag = true; + } + if (identity_policy.has_partial_conditional(S3_RESOURCE_TAG) || identity_policy.has_partial_conditional_value(S3_RUNTIME_RESOURCE_VAL)) + identity_policy_s3_resource_tag = 
true; + if (identity_policy_s3_exist_tag && identity_policy_s3_resource_tag) // check all policies till both are set to true + break; + } + } + + bool session_policy_s3_exist_tag = false, session_policy_s3_resource_flag = false; + if (session_policies) { + for (auto& session_policy : session_policies.get()) { + if (check_obj_exist_tag) { + if (session_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) + session_policy_s3_exist_tag = true; + } + if (session_policy.has_partial_conditional(S3_RESOURCE_TAG) || session_policy.has_partial_conditional_value(S3_RUNTIME_RESOURCE_VAL)) + session_policy_s3_resource_flag = true; + if (session_policy_s3_exist_tag && session_policy_s3_resource_flag) + break; + } + } + + has_existing_obj_tag = iam_policy_s3_exist_tag || identity_policy_s3_exist_tag || session_policy_s3_exist_tag; + has_resource_tag = iam_policy_s3_resource_tag || identity_policy_s3_resource_tag || session_policy_s3_resource_flag; + return make_tuple(has_existing_obj_tag, has_resource_tag); +} + +static std::tuple rgw_check_policy_condition(const DoutPrefixProvider *dpp, req_state* s, bool check_obj_exist_tag=true) { + return rgw_check_policy_condition(dpp, s->iam_policy, s->iam_user_policies, s->session_policies, check_obj_exist_tag); +} + +static void rgw_add_grant_to_iam_environment(rgw::IAM::Environment& e, req_state *s){ + + using header_pair_t = std::pair ; + static const std::initializer_list acl_header_conditionals { + {"HTTP_X_AMZ_GRANT_READ", "s3:x-amz-grant-read"}, + {"HTTP_X_AMZ_GRANT_WRITE", "s3:x-amz-grant-write"}, + {"HTTP_X_AMZ_GRANT_READ_ACP", "s3:x-amz-grant-read-acp"}, + {"HTTP_X_AMZ_GRANT_WRITE_ACP", "s3:x-amz-grant-write-acp"}, + {"HTTP_X_AMZ_GRANT_FULL_CONTROL", "s3:x-amz-grant-full-control"} + }; + + if (s->has_acl_header){ + for (const auto& c: acl_header_conditionals){ + auto hdr = s->info.env->get(c.first); + if(hdr) { + e.emplace(c.second, hdr); + } + } + } +} + +void rgw_build_iam_environment(rgw::sal::Driver* driver, + req_state* s) 
+{ + // Populate s->env with the request-derived condition keys (time, referer, + // transport, source address, user agent, user name, STS flag). + const auto& m = s->info.env->get_map(); + auto t = ceph::real_clock::now(); + // aws:CurrentTime is the ISO 8601 date-time form and aws:EpochTime is the + // number of seconds since the Unix epoch; keep each key paired with its format. + s->env.emplace("aws:CurrentTime", ceph::to_iso_8601(t)); + s->env.emplace("aws:EpochTime", std::to_string(ceph::real_clock::to_time_t(t))); + // TODO: This is fine for now, but once we have STS we'll need to + // look and see. Also this won't work with the IdentityApplier + // model, since we need to know the actual credential. + s->env.emplace("aws:PrincipalType", "User"); + + auto i = m.find("HTTP_REFERER"); + if (i != m.end()) { + s->env.emplace("aws:Referer", i->second); + } + + if (rgw_transport_is_secure(s->cct, *s->info.env)) { + s->env.emplace("aws:SecureTransport", "true"); + } + + const auto remote_addr_param = s->cct->_conf->rgw_remote_addr_param; + if (remote_addr_param.length()) { + i = m.find(remote_addr_param); + } else { + i = m.find("REMOTE_ADDR"); + } + if (i != m.end()) { + const string* ip = &(i->second); + string temp; + if (remote_addr_param == "HTTP_X_FORWARDED_FOR") { + const auto comma = ip->find(','); + if (comma != string::npos) { + temp.assign(*ip, 0, comma); + ip = &temp; + } + } + s->env.emplace("aws:SourceIp", *ip); + } + + i = m.find("HTTP_USER_AGENT"); { + if (i != m.end()) + s->env.emplace("aws:UserAgent", i->second); + } + + if (s->user) { + // What to do about aws::userid? One can have multiple access + // keys so that isn't really suitable. Do we have a durable + // identifier that can persist through name changes? + s->env.emplace("aws:username", s->user->get_id().id); + } + + i = m.find("HTTP_X_AMZ_SECURITY_TOKEN"); + if (i != m.end()) { + s->env.emplace("sts:authentication", "true"); + } else { + s->env.emplace("sts:authentication", "false"); + } +} + +/* + * GET on CloudTiered objects is processed only when sent from the sync client. + * In all other cases, fail with `ERR_INVALID_OBJECT_STATE`.
+ */ +int handle_cloudtier_obj(rgw::sal::Attrs& attrs, bool sync_cloudtiered) { + int op_ret = 0; + auto attr_iter = attrs.find(RGW_ATTR_MANIFEST); + if (attr_iter != attrs.end()) { + RGWObjManifest m; + try { + decode(m, attr_iter->second); + if (m.get_tier_type() == "cloud-s3") { + if (!sync_cloudtiered) { + /* XXX: Instead send presigned redirect or read-through */ + op_ret = -ERR_INVALID_OBJECT_STATE; + } else { // fetch object for sync and set cloud_tier attrs + bufferlist t, t_tier; + RGWObjTier tier_config; + m.get_tier_config(&tier_config); + + t.append("cloud-s3"); + attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t; + encode(tier_config, t_tier); + attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier; + } + } + } catch (const buffer::end_of_buffer&) { + // ignore empty manifest; it's not cloud-tiered + } catch (const std::exception& e) { + } + } + + return op_ret; +} + +void rgw_bucket_object_pre_exec(req_state *s) +{ + if (s->expect_cont) + dump_continue(s); + + dump_bucket_from_state(s); +} + +// So! Now and then when we try to update bucket information, the +// bucket has changed during the course of the operation. (Or we have +// a cache consistency problem that Watch/Notify isn't ruling out +// completely.) +// +// When this happens, we need to update the bucket info and try +// again. We have, however, to try the right *part* again. We can't +// simply re-send, since that will obliterate the previous update. +// +// Thus, callers of this function should include everything that +// merges information to be changed into the bucket information as +// well as the call to set it. +// +// The called function must return an integer, negative on error. In +// general, they should just return op_ret. 
+namespace { +template +int retry_raced_bucket_write(const DoutPrefixProvider *dpp, rgw::sal::Bucket* b, const F& f) { + auto r = f(); + for (auto i = 0u; i < 15u && r == -ECANCELED; ++i) { + r = b->try_refresh_info(dpp, nullptr); + if (r >= 0) { + r = f(); + } + } + return r; +} +} + + +int RGWGetObj::verify_permission(optional_yield y) +{ + s->object->set_atomic(); + + if (prefetch_data()) { + s->object->set_prefetch_data(); + } + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (torrent.get_flag()) { + if (s->object->get_instance().empty()) { + action = rgw::IAM::s3GetObjectTorrent; + } else { + action = rgw::IAM::s3GetObjectVersionTorrent; + } + } else { + if (s->object->get_instance().empty()) { + action = rgw::IAM::s3GetObject; + } else { + action = rgw::IAM::s3GetObjectVersion; + } + } + + if (!verify_object_permission(this, s, action)) { + return -EACCES; + } + + if (s->bucket->get_info().obj_lock_enabled()) { + get_retention = verify_object_permission(this, s, rgw::IAM::s3GetObjectRetention); + get_legal_hold = verify_object_permission(this, s, rgw::IAM::s3GetObjectLegalHold); + } + + return 0; +} + +RGWOp::~RGWOp(){}; + +int RGWOp::verify_op_mask() +{ + uint32_t required_mask = op_mask(); + + ldpp_dout(this, 20) << "required_mask= " << required_mask + << " user.op_mask=" << s->user->get_info().op_mask << dendl; + + if ((s->user->get_info().op_mask & required_mask) != required_mask) { + return -EPERM; + } + + if (!s->system_request && (required_mask & RGW_OP_TYPE_MODIFY) && !driver->get_zone()->is_writeable()) { + ldpp_dout(this, 5) << "NOTICE: modify request to a read-only zone by a " + "non-system user, permission denied" << dendl; + return -EPERM; + } + + return 0; +} + +int RGWGetObjTags::verify_permission(optional_yield y) +{ + auto iam_action = s->object->get_instance().empty()? 
+ rgw::IAM::s3GetObjectTagging: + rgw::IAM::s3GetObjectVersionTagging; + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + if (!verify_object_permission(this, s,iam_action)) + return -EACCES; + + return 0; +} + +void RGWGetObjTags::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjTags::execute(optional_yield y) +{ + rgw::sal::Attrs attrs; + + s->object->set_atomic(); + + op_ret = s->object->get_obj_attrs(y, this); + + if (op_ret == 0){ + attrs = s->object->get_attrs(); + auto tags = attrs.find(RGW_ATTR_TAGS); + if(tags != attrs.end()){ + has_tags = true; + tags_bl.append(tags->second); + } + } + send_response_data(tags_bl); +} + +int RGWPutObjTags::verify_permission(optional_yield y) +{ + auto iam_action = s->object->get_instance().empty() ? + rgw::IAM::s3PutObjectTagging: + rgw::IAM::s3PutObjectVersionTagging; + + //Using buckets tags for authorization makes more sense. 
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, true); + if (has_s3_existing_tag) + rgw_iam_add_objtags(this, s, true, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + if (!verify_object_permission(this, s,iam_action)) + return -EACCES; + return 0; +} + +void RGWPutObjTags::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (rgw::sal::Object::empty(s->object.get())){ + op_ret= -EINVAL; // we only support tagging on existing objects + return; + } + + s->object->set_atomic(); + op_ret = s->object->modify_obj_attrs(RGW_ATTR_TAGS, tags_bl, y, this); + if (op_ret == -ECANCELED){ + op_ret = -ERR_TAG_CONFLICT; + } +} + +void RGWDeleteObjTags::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + + +int RGWDeleteObjTags::verify_permission(optional_yield y) +{ + if (!rgw::sal::Object::empty(s->object.get())) { + auto iam_action = s->object->get_instance().empty() ? + rgw::IAM::s3DeleteObjectTagging: + rgw::IAM::s3DeleteObjectVersionTagging; + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + if (!verify_object_permission(this, s, iam_action)) + return -EACCES; + } + return 0; +} + +void RGWDeleteObjTags::execute(optional_yield y) +{ + if (rgw::sal::Object::empty(s->object.get())) + return; + + op_ret = s->object->delete_obj_attrs(this, RGW_ATTR_TAGS, y); +} + +int RGWGetBucketTags::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketTagging)) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketTags::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetBucketTags::execute(optional_yield y) 
+{ + auto iter = s->bucket_attrs.find(RGW_ATTR_TAGS); + if (iter != s->bucket_attrs.end()) { + has_tags = true; + tags_bl.append(iter->second); + } else { + op_ret = -ERR_NO_SUCH_TAG_SET; + } + send_response_data(tags_bl); +} + +int RGWPutBucketTags::verify_permission(optional_yield y) { + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketTagging); +} + +void RGWPutBucketTags::execute(optional_yield y) +{ + + op_ret = get_params(this, y); + if (op_ret < 0) + return; + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y] { + rgw::sal::Attrs attrs = s->bucket->get_attrs(); + attrs[RGW_ATTR_TAGS] = tags_bl; + return s->bucket->merge_and_store_attrs(this, attrs, y); + }); + +} + +void RGWDeleteBucketTags::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWDeleteBucketTags::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketTagging); +} + +void RGWDeleteBucketTags::execute(optional_yield y) +{ + bufferlist in_data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y] { + rgw::sal::Attrs attrs = s->bucket->get_attrs(); + attrs.erase(RGW_ATTR_TAGS); + op_ret = s->bucket->merge_and_store_attrs(this, attrs, y); + if 
(op_ret < 0) { + ldpp_dout(this, 0) << "RGWDeleteBucketTags() failed to remove RGW_ATTR_TAGS on bucket=" + << s->bucket->get_name() + << " returned err= " << op_ret << dendl; + } + return op_ret; + }); +} + +int RGWGetBucketReplication::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetReplicationConfiguration)) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketReplication::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetBucketReplication::execute(optional_yield y) +{ + send_response_data(); +} + +int RGWPutBucketReplication::verify_permission(optional_yield y) { + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutReplicationConfiguration); +} + +void RGWPutBucketReplication::execute(optional_yield y) { + + op_ret = get_params(y); + if (op_ret < 0) + return; + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + auto sync_policy = (s->bucket->get_info().sync_policy ? 
*s->bucket->get_info().sync_policy : rgw_sync_policy_info()); + + for (auto& group : sync_policy_groups) { + sync_policy.groups[group.id] = group; + } + + s->bucket->get_info().set_sync_policy(std::move(sync_policy)); + + int ret = s->bucket->put_info(this, false, real_time()); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: put_bucket_instance_info (bucket=" << s->bucket << ") returned ret=" << ret << dendl; + return ret; + } + + return 0; + }); +} + +void RGWDeleteBucketReplication::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWDeleteBucketReplication::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3DeleteReplicationConfiguration); +} + +void RGWDeleteBucketReplication::execute(optional_yield y) +{ + bufferlist in_data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + if (!s->bucket->get_info().sync_policy) { + return 0; + } + + rgw_sync_policy_info sync_policy = *s->bucket->get_info().sync_policy; + + update_sync_policy(&sync_policy); + + s->bucket->get_info().set_sync_policy(std::move(sync_policy)); + + int ret = s->bucket->put_info(this, false, real_time()); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: put_bucket_instance_info (bucket=" << s->bucket << ") returned ret=" << ret << dendl; + return ret; + } + + return 0; + }); +} + +int RGWOp::do_aws4_auth_completion() +{ + ldpp_dout(this, 5) << "NOTICE: call to do_aws4_auth_completion" << dendl; + if (s->auth.completer) { + if (!s->auth.completer->complete()) { + return -ERR_AMZ_CONTENT_SHA256_MISMATCH; + } else { + ldpp_dout(this, 10) << "v4 auth 
ok -- do_aws4_auth_completion" << dendl; + } + + /* TODO(rzarzynski): yes, we're really called twice on PUTs. Only first + * call passes, so we disable second one. This is old behaviour, sorry! + * Plan for tomorrow: seek and destroy. */ + s->auth.completer = nullptr; + } + + return 0; +} + +int RGWOp::init_quota() +{ + /* no quota enforcement for system requests */ + if (s->system_request) + return 0; + + /* init quota related stuff */ + if (!(s->user->get_info().op_mask & RGW_OP_TYPE_MODIFY)) { + return 0; + } + + /* Need a bucket to get quota */ + if (rgw::sal::Bucket::empty(s->bucket.get())) { + return 0; + } + + std::unique_ptr owner_user = + driver->get_user(s->bucket->get_info().owner); + rgw::sal::User* user; + + if (s->user->get_id() == s->bucket_owner.get_id()) { + user = s->user.get(); + } else { + int r = owner_user->load_user(this, s->yield); + if (r < 0) + return r; + user = owner_user.get(); + + } + + driver->get_quota(quota); + + if (s->bucket->get_info().quota.enabled) { + quota.bucket_quota = s->bucket->get_info().quota; + } else if (user->get_info().quota.bucket_quota.enabled) { + quota.bucket_quota = user->get_info().quota.bucket_quota; + } + + if (user->get_info().quota.user_quota.enabled) { + quota.user_quota = user->get_info().quota.user_quota; + } + + return 0; +} + +static bool validate_cors_rule_method(const DoutPrefixProvider *dpp, RGWCORSRule *rule, const char *req_meth) { + if (!req_meth) { + ldpp_dout(dpp, 5) << "req_meth is null" << dendl; + return false; + } + + uint8_t flags = get_cors_method_flags(req_meth); + + if (rule->get_allowed_methods() & flags) { + ldpp_dout(dpp, 10) << "Method " << req_meth << " is supported" << dendl; + } else { + ldpp_dout(dpp, 5) << "Method " << req_meth << " is not supported" << dendl; + return false; + } + + return true; +} + +static bool validate_cors_rule_header(const DoutPrefixProvider *dpp, RGWCORSRule *rule, const char *req_hdrs) { + if (req_hdrs) { + vector hdrs; + get_str_vec(req_hdrs, hdrs); + 
for (const auto& hdr : hdrs) { + if (!rule->is_header_allowed(hdr.c_str(), hdr.length())) { + ldpp_dout(dpp, 5) << "Header " << hdr << " is not registered in this rule" << dendl; + return false; + } + } + } + return true; +} + +int RGWOp::read_bucket_cors() +{ + bufferlist bl; + + map::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS); + if (aiter == s->bucket_attrs.end()) { + ldpp_dout(this, 20) << "no CORS configuration attr found" << dendl; + cors_exist = false; + return 0; /* no CORS configuration found */ + } + + cors_exist = true; + + bl = aiter->second; + + auto iter = bl.cbegin(); + try { + bucket_cors.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: could not decode CORS, caught buffer::error" << dendl; + return -EIO; + } + if (s->cct->_conf->subsys.should_gather()) { + RGWCORSConfiguration_S3 *s3cors = static_cast(&bucket_cors); + ldpp_dout(this, 15) << "Read RGWCORSConfiguration"; + s3cors->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + +/** CORS 6.2.6. + * If any of the header field-names is not a ASCII case-insensitive match for + * any of the values in list of headers do not set any additional headers and + * terminate this set of steps. + * */ +static void get_cors_response_headers(const DoutPrefixProvider *dpp, RGWCORSRule *rule, const char *req_hdrs, string& hdrs, string& exp_hdrs, unsigned *max_age) { + if (req_hdrs) { + list hl; + get_str_list(req_hdrs, hl); + for(list::iterator it = hl.begin(); it != hl.end(); ++it) { + if (!rule->is_header_allowed((*it).c_str(), (*it).length())) { + ldpp_dout(dpp, 5) << "Header " << (*it) << " is not registered in this rule" << dendl; + } else { + if (hdrs.length() > 0) hdrs.append(","); + hdrs.append((*it)); + } + } + } + rule->format_exp_headers(exp_hdrs); + *max_age = rule->get_max_age(); +} + +/** + * Generate the CORS header response + * + * This is described in the CORS standard, section 6.2. 
+ */ +bool RGWOp::generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age) +{ + /* CORS 6.2.1. */ + const char *orig = s->info.env->get("HTTP_ORIGIN"); + if (!orig) { + return false; + } + + /* Custom: */ + origin = orig; + int temp_op_ret = read_bucket_cors(); + if (temp_op_ret < 0) { + op_ret = temp_op_ret; + return false; + } + + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + return false; + } + + /* CORS 6.2.2. */ + RGWCORSRule *rule = bucket_cors.host_name_rule(orig); + if (!rule) + return false; + + /* + * Set the Allowed-Origin header to a asterisk if this is allowed in the rule + * and no Authorization was send by the client + * + * The origin parameter specifies a URI that may access the resource. The browser must enforce this. + * For requests without credentials, the server may specify "*" as a wildcard, + * thereby allowing any origin to access the resource. + */ + const char *authorization = s->info.env->get("HTTP_AUTHORIZATION"); + if (!authorization && rule->has_wildcard_origin()) + origin = "*"; + + /* CORS 6.2.3. */ + const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + req_meth = s->info.method; + } + + if (req_meth) { + method = req_meth; + /* CORS 6.2.5. */ + if (!validate_cors_rule_method(this, rule, req_meth)) { + return false; + } + } + + /* CORS 6.2.4. */ + const char *req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS"); + + /* CORS 6.2.6. 
*/ + get_cors_response_headers(this, rule, req_hdrs, headers, exp_headers, max_age); + + return true; +} + +int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, map& attrset, RGWAccessControlPolicy *policy) +{ + map::iterator aiter = attrset.find(RGW_ATTR_ACL); + if (aiter == attrset.end()) + return -EIO; + + bufferlist& bl = aiter->second; + auto iter = bl.cbegin(); + try { + policy->decode(iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (cct->_conf->subsys.should_gather()) { + RGWAccessControlPolicy_S3 *s3policy = static_cast(policy); + ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy"; + s3policy->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + +int RGWGetObj::read_user_manifest_part(rgw::sal::Bucket* bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const off_t start_ofs, + const off_t end_ofs, + bool swift_slo) +{ + ldpp_dout(this, 20) << "user manifest obj=" << ent.key.name + << "[" << ent.key.instance << "]" << dendl; + RGWGetObj_CB cb(this); + RGWGetObj_Filter* filter = &cb; + boost::optional decompress; + + int64_t cur_ofs = start_ofs; + int64_t cur_end = end_ofs; + + std::unique_ptr part = bucket->get_object(ent.key); + + RGWAccessControlPolicy obj_policy(s->cct); + + ldpp_dout(this, 20) << "reading obj=" << part << " ofs=" << cur_ofs + << " end=" << cur_end << dendl; + + part->set_atomic(); + part->set_prefetch_data(); + + std::unique_ptr read_op = part->get_read_op(); + + if (!swift_slo) { + /* SLO etag is optional */ + read_op->params.if_match = ent.meta.etag.c_str(); + } + + op_ret = read_op->prepare(s->yield, this); + if (op_ret < 0) + return op_ret; + op_ret = part->range_to_ofs(ent.meta.accounted_size, cur_ofs, cur_end); + if (op_ret < 0) + return op_ret; + bool need_decompress; + op_ret = 
rgw_compression_info_from_attrset(part->get_attrs(), need_decompress, cs_info); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to decode compression info" << dendl; + return -EIO; + } + + if (need_decompress) + { + if (cs_info.orig_size != ent.meta.accounted_size) { + // hmm.. something wrong, object not as expected, abort! + ldpp_dout(this, 0) << "ERROR: expected cs_info.orig_size=" << cs_info.orig_size + << ", actual read size=" << ent.meta.size << dendl; + return -EIO; + } + decompress.emplace(s->cct, &cs_info, partial_content, filter); + filter = &*decompress; + } + else + { + if (part->get_obj_size() != ent.meta.size) { + // hmm.. something wrong, object not as expected, abort! + ldpp_dout(this, 0) << "ERROR: expected obj_size=" << part->get_obj_size() + << ", actual read size=" << ent.meta.size << dendl; + return -EIO; + } + } + + op_ret = rgw_policy_from_attrset(s, s->cct, part->get_attrs(), &obj_policy); + if (op_ret < 0) + return op_ret; + + /* We can use global user_acl because LOs cannot have segments + * stored inside different accounts. 
*/ + if (s->system_request) { + ldpp_dout(this, 2) << "overriding permissions due to system operation" << dendl; + } else if (s->auth.identity->is_admin_of(s->user->get_id())) { + ldpp_dout(this, 2) << "overriding permissions due to admin operation" << dendl; + } else if (!verify_object_permission(this, s, part->get_obj(), s->user_acl.get(), + bucket_acl, &obj_policy, bucket_policy, + s->iam_user_policies, s->session_policies, action)) { + return -EPERM; + } + if (ent.meta.size == 0) { + return 0; + } + + perfcounter->inc(l_rgw_get_b, cur_end - cur_ofs); + filter->fixup_range(cur_ofs, cur_end); + op_ret = read_op->iterate(this, cur_ofs, cur_end, filter, s->yield); + if (op_ret >= 0) + op_ret = filter->flush(); + return op_ret; +} + +static int iterate_user_manifest_parts(const DoutPrefixProvider *dpp, + CephContext * const cct, + rgw::sal::Driver* const driver, + const off_t ofs, + const off_t end, + rgw::sal::Bucket* bucket, + const string& obj_prefix, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + uint64_t * const ptotal_len, + uint64_t * const pobj_size, + string * const pobj_sum, + int (*cb)(rgw::sal::Bucket* bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + off_t start_ofs, + off_t end_ofs, + void *param, + bool swift_slo), + void * const cb_param, + optional_yield y) +{ + uint64_t obj_ofs = 0, len_count = 0; + bool found_start = false, found_end = false, handled_end = false; + string delim; + + utime_t start_time = ceph_clock_now(); + + rgw::sal::Bucket::ListParams params; + params.prefix = obj_prefix; + params.delim = delim; + + rgw::sal::Bucket::ListResults results; + MD5 etag_sum; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + etag_sum.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + do { + static constexpr auto MAX_LIST_OBJS = 100u; + int r = bucket->list(dpp, params, MAX_LIST_OBJS, results, y); + if (r < 0) { + 
return r; + } + + for (rgw_bucket_dir_entry& ent : results.objs) { + const uint64_t cur_total_len = obj_ofs; + const uint64_t obj_size = ent.meta.accounted_size; + uint64_t start_ofs = 0, end_ofs = obj_size; + + if ((ptotal_len || cb) && !found_start && cur_total_len + obj_size > (uint64_t)ofs) { + start_ofs = ofs - obj_ofs; + found_start = true; + } + + obj_ofs += obj_size; + if (pobj_sum) { + etag_sum.Update((const unsigned char *)ent.meta.etag.c_str(), + ent.meta.etag.length()); + } + + if ((ptotal_len || cb) && !found_end && obj_ofs > (uint64_t)end) { + end_ofs = end - cur_total_len + 1; + found_end = true; + } + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now() - start_time)); + + if (found_start && !handled_end) { + len_count += end_ofs - start_ofs; + + if (cb) { + r = cb(bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs, + cb_param, false /* swift_slo */); + if (r < 0) { + return r; + } + } + } + + handled_end = found_end; + start_time = ceph_clock_now(); + } + } while (results.is_truncated); + + if (ptotal_len) { + *ptotal_len = len_count; + } + if (pobj_size) { + *pobj_size = obj_ofs; + } + if (pobj_sum) { + complete_etag(etag_sum, pobj_sum); + } + + return 0; +} + +struct rgw_slo_part { + RGWAccessControlPolicy *bucket_acl = nullptr; + Policy* bucket_policy = nullptr; + rgw::sal::Bucket* bucket; + string obj_name; + uint64_t size = 0; + string etag; +}; + +static int iterate_slo_parts(const DoutPrefixProvider *dpp, + CephContext *cct, + rgw::sal::Driver* driver, + off_t ofs, + off_t end, + map& slo_parts, + int (*cb)(rgw::sal::Bucket* bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy *bucket_acl, + const boost::optional& bucket_policy, + off_t start_ofs, + off_t end_ofs, + void *param, + bool swift_slo), + void *cb_param) +{ + bool found_start = false, found_end = false; + + if (slo_parts.empty()) { + return 0; + } + + utime_t start_time = ceph_clock_now(); + + map::iterator iter = slo_parts.upper_bound(ofs); + if (iter != 
slo_parts.begin()) { + --iter; + } + + uint64_t obj_ofs = iter->first; + + for (; iter != slo_parts.end() && !found_end; ++iter) { + rgw_slo_part& part = iter->second; + rgw_bucket_dir_entry ent; + + ent.key.name = part.obj_name; + ent.meta.accounted_size = ent.meta.size = part.size; + ent.meta.etag = part.etag; + + uint64_t cur_total_len = obj_ofs; + uint64_t start_ofs = 0, end_ofs = ent.meta.size - 1; + + if (!found_start && cur_total_len + ent.meta.size > (uint64_t)ofs) { + start_ofs = ofs - obj_ofs; + found_start = true; + } + + obj_ofs += ent.meta.size; + + if (!found_end && obj_ofs > (uint64_t)end) { + end_ofs = end - cur_total_len; + found_end = true; + } + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now() - start_time)); + + if (found_start) { + if (cb) { + ldpp_dout(dpp, 20) << "iterate_slo_parts()" + << " obj=" << part.obj_name + << " start_ofs=" << start_ofs + << " end_ofs=" << end_ofs + << dendl; + + // SLO is a Swift thing, and Swift has no knowledge of S3 Policies. + int r = cb(part.bucket, ent, part.bucket_acl, + (part.bucket_policy ? 
+ boost::optional(*part.bucket_policy) : none), + start_ofs, end_ofs, cb_param, true /* swift_slo */); + if (r < 0) + return r; + } + } + + start_time = ceph_clock_now(); + } + + return 0; +} + +static int get_obj_user_manifest_iterate_cb(rgw::sal::Bucket* bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const off_t start_ofs, + const off_t end_ofs, + void * const param, + bool swift_slo = false) +{ + RGWGetObj *op = static_cast(param); + return op->read_user_manifest_part( + bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs, swift_slo); +} + +int RGWGetObj::handle_user_manifest(const char *prefix, optional_yield y) +{ + const std::string_view prefix_view(prefix); + ldpp_dout(this, 2) << "RGWGetObj::handle_user_manifest() prefix=" + << prefix_view << dendl; + + const size_t pos = prefix_view.find('/'); + if (pos == string::npos) { + return -EINVAL; + } + + const std::string bucket_name = url_decode(prefix_view.substr(0, pos)); + const std::string obj_prefix = url_decode(prefix_view.substr(pos + 1)); + + RGWAccessControlPolicy _bucket_acl(s->cct); + RGWAccessControlPolicy *bucket_acl; + boost::optional _bucket_policy; + boost::optional* bucket_policy; + RGWBucketInfo bucket_info; + std::unique_ptr ubucket; + rgw::sal::Bucket* pbucket = NULL; + int r = 0; + + if (bucket_name.compare(s->bucket->get_name()) != 0) { + map bucket_attrs; + r = driver->get_bucket(this, s->user.get(), s->user->get_tenant(), bucket_name, &ubucket, y); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + bucket_acl = &_bucket_acl; + r = read_bucket_policy(this, driver, s, ubucket->get_info(), bucket_attrs, bucket_acl, ubucket->get_key(), y); + if (r < 0) { + ldpp_dout(this, 0) << "failed to read bucket policy" << dendl; + return r; + } + _bucket_policy = get_iam_policy_from_attr(s->cct, bucket_attrs, s->user->get_tenant()); + 
bucket_policy = &_bucket_policy; + pbucket = ubucket.get(); + } else { + pbucket = s->bucket.get(); + bucket_acl = s->bucket_acl.get(); + bucket_policy = &s->iam_policy; + } + + /* dry run to find out: + * - total length (of the parts we are going to send to client), + * - overall DLO's content size, + * - md5 sum of overall DLO's content (for etag of Swift API). */ + r = iterate_user_manifest_parts(this, s->cct, driver, ofs, end, + pbucket, obj_prefix, bucket_acl, *bucket_policy, + nullptr, &s->obj_size, &lo_etag, + nullptr /* cb */, nullptr /* cb arg */, y); + if (r < 0) { + return r; + } + s->object->set_obj_size(s->obj_size); + + r = s->object->range_to_ofs(s->obj_size, ofs, end); + if (r < 0) { + return r; + } + + r = iterate_user_manifest_parts(this, s->cct, driver, ofs, end, + pbucket, obj_prefix, bucket_acl, *bucket_policy, + &total_len, nullptr, nullptr, + nullptr, nullptr, y); + if (r < 0) { + return r; + } + + if (!get_data) { + bufferlist bl; + send_response_data(bl, 0, 0); + return 0; + } + + r = iterate_user_manifest_parts(this, s->cct, driver, ofs, end, + pbucket, obj_prefix, bucket_acl, *bucket_policy, + nullptr, nullptr, nullptr, + get_obj_user_manifest_iterate_cb, (void *)this, y); + if (r < 0) { + return r; + } + + if (!total_len) { + bufferlist bl; + send_response_data(bl, 0, 0); + } + + return r; +} + +int RGWGetObj::handle_slo_manifest(bufferlist& bl, optional_yield y) +{ + RGWSLOInfo slo_info; + auto bliter = bl.cbegin(); + try { + decode(slo_info, bliter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode slo manifest" << dendl; + return -EIO; + } + ldpp_dout(this, 2) << "RGWGetObj::handle_slo_manifest()" << dendl; + + vector allocated_acls; + map>> policies; + map> buckets; + + map slo_parts; + + MD5 etag_sum; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + etag_sum.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + total_len = 0; + + for (const auto& entry : slo_info.entries) { + const 
string& path = entry.path; + + /* If the path starts with slashes, strip them all. */ + const size_t pos_init = path.find_first_not_of('/'); + /* According to the documentation of std::string::find following check + * is not necessary as we should get the std::string::npos propagation + * here. This might be true with the accuracy to implementation's bugs. + * See following question on SO: + * http://stackoverflow.com/questions/1011790/why-does-stdstring-findtext-stdstringnpos-not-return-npos + */ + if (pos_init == string::npos) { + return -EINVAL; + } + + const size_t pos_sep = path.find('/', pos_init); + if (pos_sep == string::npos) { + return -EINVAL; + } + + string bucket_name = path.substr(pos_init, pos_sep - pos_init); + string obj_name = path.substr(pos_sep + 1); + + rgw::sal::Bucket* bucket; + RGWAccessControlPolicy *bucket_acl; + Policy* bucket_policy; + + if (bucket_name.compare(s->bucket->get_name()) != 0) { + const auto& piter = policies.find(bucket_name); + if (piter != policies.end()) { + bucket_acl = piter->second.first; + bucket_policy = piter->second.second.get_ptr(); + bucket = buckets[bucket_name].get(); + } else { + allocated_acls.push_back(RGWAccessControlPolicy(s->cct)); + RGWAccessControlPolicy& _bucket_acl = allocated_acls.back(); + + std::unique_ptr tmp_bucket; + int r = driver->get_bucket(this, s->user.get(), s->user->get_tenant(), bucket_name, &tmp_bucket, y); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + bucket = tmp_bucket.get(); + bucket_acl = &_bucket_acl; + r = read_bucket_policy(this, driver, s, tmp_bucket->get_info(), tmp_bucket->get_attrs(), bucket_acl, + tmp_bucket->get_key(), y); + if (r < 0) { + ldpp_dout(this, 0) << "failed to read bucket ACL for bucket " + << bucket << dendl; + return r; + } + auto _bucket_policy = get_iam_policy_from_attr( + s->cct, tmp_bucket->get_attrs(), tmp_bucket->get_tenant()); + bucket_policy = _bucket_policy.get_ptr(); + 
buckets[bucket_name].swap(tmp_bucket); + policies[bucket_name] = make_pair(bucket_acl, _bucket_policy); + } + } else { + bucket = s->bucket.get(); + bucket_acl = s->bucket_acl.get(); + bucket_policy = s->iam_policy.get_ptr(); + } + + rgw_slo_part part; + part.bucket_acl = bucket_acl; + part.bucket_policy = bucket_policy; + part.bucket = bucket; + part.obj_name = obj_name; + part.size = entry.size_bytes; + part.etag = entry.etag; + ldpp_dout(this, 20) << "slo_part: bucket=" << part.bucket + << " obj=" << part.obj_name + << " size=" << part.size + << " etag=" << part.etag + << dendl; + + etag_sum.Update((const unsigned char *)entry.etag.c_str(), + entry.etag.length()); + + slo_parts[total_len] = part; + total_len += part.size; + } /* foreach entry */ + + complete_etag(etag_sum, &lo_etag); + + s->obj_size = slo_info.total_size; + s->object->set_obj_size(slo_info.total_size); + ldpp_dout(this, 20) << "s->obj_size=" << s->obj_size << dendl; + + int r = s->object->range_to_ofs(total_len, ofs, end); + if (r < 0) { + return r; + } + + total_len = end - ofs + 1; + ldpp_dout(this, 20) << "Requested: ofs=" << ofs + << " end=" << end + << " total=" << total_len + << dendl; + + r = iterate_slo_parts(this, s->cct, driver, ofs, end, slo_parts, + get_obj_user_manifest_iterate_cb, (void *)this); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + /* garbage collection related handling: + * defer_gc disabled for https://tracker.ceph.com/issues/47866 */ + return send_response_data(bl, bl_ofs, bl_len); +} + +int RGWGetObj::get_lua_filter(std::unique_ptr* filter, RGWGetObj_Filter* cb) { + std::string script; + const auto rc = rgw::lua::read_script(s, s->penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::getData, script); + if (rc == -ENOENT) { + // no script, nothing to do + return 0; + } else if (rc < 0) { + ldpp_dout(this, 5) << "WARNING: failed to read data script. 
error: " << rc << dendl; + return rc; + } + filter->reset(new rgw::lua::RGWGetObjFilter(s, script, cb)); + return 0; +} + +bool RGWGetObj::prefetch_data() +{ + /* HEAD request, stop prefetch*/ + if (!get_data || s->info.env->exists("HTTP_X_RGW_AUTH")) { + return false; + } + + range_str = s->info.env->get("HTTP_RANGE"); + // TODO: add range prefetch + if (range_str) { + parse_range(); + return false; + } + + return get_data; +} + +void RGWGetObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +static inline void rgw_cond_decode_objtags( + req_state *s, + const std::map &attrs) +{ + const auto& tags = attrs.find(RGW_ATTR_TAGS); + if (tags != attrs.end()) { + try { + bufferlist::const_iterator iter{&tags->second}; + s->tagset.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(s, 0) + << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + } + } +} + +void RGWGetObj::execute(optional_yield y) +{ + bufferlist bl; + gc_invalidate_time = ceph_clock_now(); + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + + bool need_decompress = false; + int64_t ofs_x = 0, end_x = 0; + bool encrypted = false; + + RGWGetObj_CB cb(this); + RGWGetObj_Filter* filter = (RGWGetObj_Filter *)&cb; + boost::optional decompress; +#ifdef WITH_ARROW_FLIGHT + boost::optional flight_filter; +#endif + std::unique_ptr decrypt; + std::unique_ptr run_lua; + map::iterator attr_iter; + + perfcounter->inc(l_rgw_get); + + std::unique_ptr read_op(s->object->get_read_op()); + + op_ret = get_params(y); + if (op_ret < 0) + goto done_err; + + op_ret = init_common(); + if (op_ret < 0) + goto done_err; + + read_op->params.mod_ptr = mod_ptr; + read_op->params.unmod_ptr = unmod_ptr; + read_op->params.high_precision_time = s->system_request; /* system request need to use high precision time */ + read_op->params.mod_zone_id = mod_zone_id; + read_op->params.mod_pg_ver = mod_pg_ver; + read_op->params.if_match = if_match; + read_op->params.if_nomatch = if_nomatch; + 
read_op->params.lastmod = &lastmod; + + op_ret = read_op->prepare(s->yield, this); + if (op_ret < 0) + goto done_err; + version_id = s->object->get_instance(); + s->obj_size = s->object->get_obj_size(); + attrs = s->object->get_attrs(); + + /* STAT ops don't need data, and do no i/o */ + if (get_type() == RGW_OP_STAT_OBJ) { + return; + } + if (s->info.env->exists("HTTP_X_RGW_AUTH")) { + op_ret = 0; + goto done_err; + } + /* start gettorrent */ + if (torrent.get_flag()) + { + attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE); + if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") { + ldpp_dout(this, 0) << "ERROR: torrents are not supported for objects " + "encrypted with SSE-C" << dendl; + op_ret = -EINVAL; + goto done_err; + } + torrent.init(s, driver); + rgw_obj obj = s->object->get_obj(); + op_ret = torrent.get_torrent_file(s->object.get(), total_len, bl, obj); + if (op_ret < 0) + { + ldpp_dout(this, 0) << "ERROR: failed to get_torrent_file ret= " << op_ret + << dendl; + goto done_err; + } + op_ret = send_response_data(bl, 0, total_len); + if (op_ret < 0) + { + ldpp_dout(this, 0) << "ERROR: failed to send_response_data ret= " << op_ret << dendl; + goto done_err; + } + return; + } + /* end gettorrent */ + + // run lua script on decompressed and decrypted data - first filter runs last + op_ret = get_lua_filter(&run_lua, filter); + if (run_lua != nullptr) { + filter = run_lua.get(); + } + if (op_ret < 0) { + goto done_err; + } + +#ifdef WITH_ARROW_FLIGHT + if (s->penv.flight_store) { + if (ofs == 0) { + // insert a GetObj_Filter to monitor and create flight + flight_filter.emplace(s, filter); + filter = &*flight_filter; + } + } else { + ldpp_dout(this, 0) << "ERROR: flight_store not created in " << __func__ << dendl; + } +#endif + + op_ret = rgw_compression_info_from_attrset(attrs, need_decompress, cs_info); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to decode compression info, cannot decompress" << dendl; + goto done_err; + } + + 
// where encryption and compression are combined, compression was applied to + // the data before encryption. if the system header rgwx-skip-decrypt is + // present, we have to skip the decompression filter too + encrypted = attrs.count(RGW_ATTR_CRYPT_MODE); + + if (need_decompress && (!encrypted || !skip_decrypt)) { + s->obj_size = cs_info.orig_size; + s->object->set_obj_size(cs_info.orig_size); + decompress.emplace(s->cct, &cs_info, partial_content, filter); + filter = &*decompress; + } + + attr_iter = attrs.find(RGW_ATTR_OBJ_REPLICATION_TRACE); + if (attr_iter != attrs.end()) { + try { + std::vector zones; + auto p = attr_iter->second.cbegin(); + decode(zones, p); + for (const auto& zone: zones) { + if (zone == dst_zone_trace) { + op_ret = -ERR_NOT_MODIFIED; + ldpp_dout(this, 4) << "Object already has been copied to this destination. Returning " + << op_ret << dendl; + goto done_err; + } + } + } catch (const buffer::error&) {} + } + + if (get_type() == RGW_OP_GET_OBJ && get_data) { + op_ret = handle_cloudtier_obj(attrs, sync_cloudtiered); + if (op_ret < 0) { + ldpp_dout(this, 4) << "Cannot get cloud tiered object: " << *s->object + <<". 
Failing with " << op_ret << dendl; + if (op_ret == -ERR_INVALID_OBJECT_STATE) { + s->err.message = "This object was transitioned to cloud-s3"; + } + goto done_err; + } + } + + attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST); + if (attr_iter != attrs.end() && !skip_manifest) { + op_ret = handle_user_manifest(attr_iter->second.c_str(), y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to handle user manifest ret=" + << op_ret << dendl; + goto done_err; + } + return; + } + + attr_iter = attrs.find(RGW_ATTR_SLO_MANIFEST); + if (attr_iter != attrs.end() && !skip_manifest) { + is_slo = true; + op_ret = handle_slo_manifest(attr_iter->second, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to handle slo manifest ret=" << op_ret + << dendl; + goto done_err; + } + return; + } + + // for range requests with obj size 0 + if (range_str && !(s->obj_size)) { + total_len = 0; + op_ret = -ERANGE; + goto done_err; + } + + op_ret = s->object->range_to_ofs(s->obj_size, ofs, end); + if (op_ret < 0) + goto done_err; + total_len = (ofs <= end ? end + 1 - ofs : 0); + + ofs_x = ofs; + end_x = end; + filter->fixup_range(ofs_x, end_x); + + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && s->object->is_expired()) { + op_ret = -ENOENT; + goto done_err; + } + + /* Decode S3 objtags, if any */ + rgw_cond_decode_objtags(s, attrs); + + start = ofs; + + attr_iter = attrs.find(RGW_ATTR_MANIFEST); + op_ret = this->get_decrypt_filter(&decrypt, filter, + attr_iter != attrs.end() ? 
&(attr_iter->second) : nullptr); + if (decrypt != nullptr) { + filter = decrypt.get(); + filter->fixup_range(ofs_x, end_x); + } + if (op_ret < 0) { + goto done_err; + } + + + if (!get_data || ofs > end) { + send_response_data(bl, 0, 0); + return; + } + + perfcounter->inc(l_rgw_get_b, end - ofs); + + op_ret = read_op->iterate(this, ofs_x, end_x, filter, s->yield); + + if (op_ret >= 0) + op_ret = filter->flush(); + + perfcounter->tinc(l_rgw_get_lat, s->time_elapsed()); + if (op_ret < 0) { + goto done_err; + } + + op_ret = send_response_data(bl, 0, 0); + if (op_ret < 0) { + goto done_err; + } + return; + +done_err: + send_response_data_error(y); +} + +int RGWGetObj::init_common() +{ + if (range_str) { + /* range parsed error when prefetch */ + if (!range_parsed) { + int r = parse_range(); + if (r < 0) + return r; + } + } + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) + return -EINVAL; + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) + return -EINVAL; + unmod_ptr = &unmod_time; + } + + return 0; +} + +int RGWListBuckets::verify_permission(optional_yield y) +{ + rgw::Partition partition = rgw::Partition::aws; + rgw::Service service = rgw::Service::s3; + + string tenant; + if (s->auth.identity->get_identity_type() == TYPE_ROLE) { + tenant = s->auth.identity->get_role_tenant(); + } else { + tenant = s->user->get_tenant(); + } + + if (!verify_user_permission(this, s, ARN(partition, service, "", tenant, "*"), rgw::IAM::s3ListAllMyBuckets, false)) { + return -EACCES; + } + + return 0; +} + +int RGWGetUsage::verify_permission(optional_yield y) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + return 0; +} + +void RGWListBuckets::execute(optional_yield y) +{ + bool done; + bool started = false; + uint64_t total_count = 0; + + const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + + op_ret = get_params(y); + if (op_ret < 0) { + goto send_end; + } + + if (supports_account_metadata()) { + 
op_ret = s->user->read_attrs(this, s->yield); + if (op_ret < 0) { + goto send_end; + } + } + + is_truncated = false; + do { + rgw::sal::BucketList buckets; + uint64_t read_count; + if (limit >= 0) { + read_count = min(limit - total_count, max_buckets); + } else { + read_count = max_buckets; + } + + op_ret = s->user->list_buckets(this, marker, end_marker, read_count, should_get_stats(), buckets, y); + + if (op_ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldpp_dout(this, 10) << "WARNING: failed on rgw_get_user_buckets uid=" + << s->user->get_id() << dendl; + break; + } + + is_truncated = buckets.is_truncated(); + + /* We need to have stats for all our policies - even if a given policy + * isn't actually used in a given account. In such situation its usage + * stats would be simply full of zeros. */ + std::set targets; + driver->get_zone()->get_zonegroup().get_placement_target_names(targets); + for (const auto& policy : targets) { + policies_stats.emplace(policy, decltype(policies_stats)::mapped_type()); + } + + std::map>& m = buckets.get_buckets(); + for (const auto& kv : m) { + const auto& bucket = kv.second; + + global_stats.bytes_used += bucket->get_size(); + global_stats.bytes_used_rounded += bucket->get_size_rounded(); + global_stats.objects_count += bucket->get_count(); + + /* operator[] still can create a new entry for storage policy seen + * for first time. 
*/ + auto& policy_stats = policies_stats[bucket->get_placement_rule().to_str()]; + policy_stats.bytes_used += bucket->get_size(); + policy_stats.bytes_used_rounded += bucket->get_size_rounded(); + policy_stats.buckets_count++; + policy_stats.objects_count += bucket->get_count(); + } + global_stats.buckets_count += m.size(); + total_count += m.size(); + + done = (m.size() < read_count || (limit >= 0 && total_count >= (uint64_t)limit)); + + if (!started) { + send_response_begin(buckets.count() > 0); + started = true; + } + + if (read_count > 0 && + !m.empty()) { + auto riter = m.rbegin(); + marker = riter->first; + + handle_listing_chunk(std::move(buckets)); + } + } while (is_truncated && !done); + +send_end: + if (!started) { + send_response_begin(false); + } + send_response_end(); +} + +void RGWGetUsage::execute(optional_yield y) +{ + uint64_t start_epoch = 0; + uint64_t end_epoch = (uint64_t)-1; + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (!start_date.empty()) { + op_ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse start date" << dendl; + return; + } + } + + if (!end_date.empty()) { + op_ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse end date" << dendl; + return; + } + } + + uint32_t max_entries = 1000; + + bool is_truncated = true; + + RGWUsageIter usage_iter; + + while (s->bucket && is_truncated) { + op_ret = s->bucket->read_usage(this, start_epoch, end_epoch, max_entries, &is_truncated, + usage_iter, usage); + if (op_ret == -ENOENT) { + op_ret = 0; + is_truncated = false; + } + + if (op_ret < 0) { + return; + } + } + + op_ret = rgw_user_sync_all_stats(this, driver, s->user.get(), y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to sync user stats" << dendl; + return; + } + + op_ret = rgw_user_get_all_buckets_stats(this, driver, s->user.get(), buckets_usage, y); + if (op_ret < 0) 
{ + ldpp_dout(this, 0) << "ERROR: failed to get user's buckets stats" << dendl; + return; + } + + op_ret = s->user->read_stats(this, y, &stats); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: can't read user header" << dendl; + return; + } + + return; +} + +int RGWStatAccount::verify_permission(optional_yield y) +{ + if (!verify_user_permission_no_policy(this, s, RGW_PERM_READ)) { + return -EACCES; + } + + return 0; +} + +void RGWStatAccount::execute(optional_yield y) +{ + string marker; + rgw::sal::BucketList buckets; + uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + const string *lastmarker; + + do { + + lastmarker = nullptr; + op_ret = s->user->list_buckets(this, marker, string(), max_buckets, true, buckets, y); + if (op_ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldpp_dout(this, 10) << "WARNING: failed on list_buckets uid=" + << s->user->get_id() << " ret=" << op_ret << dendl; + break; + } else { + /* We need to have stats for all our policies - even if a given policy + * isn't actually used in a given account. In such situation its usage + * stats would be simply full of zeros. */ + std::set names; + driver->get_zone()->get_zonegroup().get_placement_target_names(names); + for (const auto& policy : names) { + policies_stats.emplace(policy, decltype(policies_stats)::mapped_type()); + } + + std::map>& m = buckets.get_buckets(); + for (const auto& kv : m) { + const auto& bucket = kv.second; + lastmarker = &kv.first; + + global_stats.bytes_used += bucket->get_size(); + global_stats.bytes_used_rounded += bucket->get_size_rounded(); + global_stats.objects_count += bucket->get_count(); + + /* operator[] still can create a new entry for storage policy seen + * for first time. 
*/ + auto& policy_stats = policies_stats[bucket->get_placement_rule().to_str()]; + policy_stats.bytes_used += bucket->get_size(); + policy_stats.bytes_used_rounded += bucket->get_size_rounded(); + policy_stats.buckets_count++; + policy_stats.objects_count += bucket->get_count(); + } + global_stats.buckets_count += m.size(); + + } + if (!lastmarker) { + ldpp_dout(this, -1) << "ERROR: rgw_read_user_buckets, stasis at marker=" + << marker << " uid=" << s->user->get_id() << dendl; + break; + } + marker = *lastmarker; + } while (buckets.is_truncated()); +} + +int RGWGetBucketVersioning::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketVersioning); +} + +void RGWGetBucketVersioning::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetBucketVersioning::execute(optional_yield y) +{ + if (! s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + versioned = s->bucket->versioned(); + versioning_enabled = s->bucket->versioning_enabled(); + mfa_enabled = s->bucket->get_info().mfa_enabled(); +} + +int RGWSetBucketVersioning::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketVersioning); +} + +void RGWSetBucketVersioning::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetBucketVersioning::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (! 
s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + if (s->bucket->get_info().obj_lock_enabled() && versioning_status != VersioningEnabled) { + s->err.message = "bucket versioning cannot be disabled on buckets with object lock enabled"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_BUCKET_STATE; + return; + } + + bool cur_mfa_status = s->bucket->get_info().mfa_enabled(); + + mfa_set_status &= (mfa_status != cur_mfa_status); + + if (mfa_set_status && + !s->mfa_verified) { + op_ret = -ERR_MFA_REQUIRED; + return; + } + //if mfa is enabled for bucket, make sure mfa code is validated in case versioned status gets changed + if (cur_mfa_status) { + bool req_versioning_status = false; + //if requested versioning status is not the same as the one set for the bucket, return error + if (versioning_status == VersioningEnabled) { + req_versioning_status = (s->bucket->get_info().flags & BUCKET_VERSIONS_SUSPENDED) != 0; + } else if (versioning_status == VersioningSuspended) { + req_versioning_status = (s->bucket->get_info().flags & BUCKET_VERSIONS_SUSPENDED) == 0; + } + if (req_versioning_status && !s->mfa_verified) { + op_ret = -ERR_MFA_REQUIRED; + return; + } + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + bool modified = mfa_set_status; + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [&] { + if (mfa_set_status) { + if (mfa_status) { + s->bucket->get_info().flags |= BUCKET_MFA_ENABLED; + } else { + s->bucket->get_info().flags &= ~BUCKET_MFA_ENABLED; + } + } + + if (versioning_status == VersioningEnabled) { + s->bucket->get_info().flags |= BUCKET_VERSIONED; + s->bucket->get_info().flags &= ~BUCKET_VERSIONS_SUSPENDED; + modified = true; + } else if (versioning_status == VersioningSuspended) { + s->bucket->get_info().flags |= 
(BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); + modified = true; + } else { + return op_ret; + } + s->bucket->set_attrs(rgw::sal::Attrs(s->bucket_attrs)); + return s->bucket->put_info(this, false, real_time()); + }); + + if (!modified) { + return; + } + + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name() + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWGetBucketWebsite::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketWebsite); +} + +void RGWGetBucketWebsite::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetBucketWebsite::execute(optional_yield y) +{ + if (!s->bucket->get_info().has_website) { + op_ret = -ERR_NO_SUCH_WEBSITE_CONFIGURATION; + } +} + +int RGWSetBucketWebsite::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketWebsite); +} + +void RGWSetBucketWebsite::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetBucketWebsite::execute(optional_yield y) +{ + op_ret = get_params(y); + + if (op_ret < 0) + return; + + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << " forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + s->bucket->get_info().has_website = true; + s->bucket->get_info().website_conf = website_conf; + op_ret = s->bucket->put_info(this, false, real_time()); + return 
op_ret; + }); + + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name() + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWDeleteBucketWebsite::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3DeleteBucketWebsite); +} + +void RGWDeleteBucketWebsite::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteBucketWebsite::execute(optional_yield y) +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + bufferlist in_data; + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: forward_to_master failed on bucket=" << s->bucket->get_name() + << "returned err=" << op_ret << dendl; + return; + } + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + s->bucket->get_info().has_website = false; + s->bucket->get_info().website_conf = RGWBucketWebsiteConf(); + op_ret = s->bucket->put_info(this, false, real_time()); + return op_ret; + }); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWStatBucket::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + // This (a HEAD request on a bucket) is governed by the s3:ListBucket permission. 
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3ListBucket)) { + return -EACCES; + } + + return 0; +} + +void RGWStatBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWStatBucket::execute(optional_yield y) +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + op_ret = driver->get_bucket(this, s->user.get(), s->bucket->get_key(), &bucket, y); + if (op_ret) { + return; + } + op_ret = bucket->update_container_stats(s); +} + +int RGWListBucket::verify_permission(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) { + return op_ret; + } + if (!prefix.empty()) + s->env.emplace("s3:prefix", prefix); + + if (!delimiter.empty()) + s->env.emplace("s3:delimiter", delimiter); + + s->env.emplace("s3:max-keys", std::to_string(max)); + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, + s, + list_versions ? + rgw::IAM::s3ListBucketVersions : + rgw::IAM::s3ListBucket)) { + return -EACCES; + } + + return 0; +} + +int RGWListBucket::parse_max_keys() +{ + // Bound max value of max-keys to configured value for security + // Bound min value of max-keys to '0' + // Some S3 clients explicitly send max-keys=0 to detect if the bucket is + // empty without listing any items. 
+ return parse_value_and_bound(max_keys, max, 0, + g_conf().get_val("rgw_max_listing_results"), + default_max); +} + +void RGWListBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWListBucket::execute(optional_yield y) +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + if (allow_unordered && !delimiter.empty()) { + ldpp_dout(this, 0) << + "ERROR: unordered bucket listing requested with a delimiter" << dendl; + op_ret = -EINVAL; + return; + } + + if (need_container_stats()) { + op_ret = s->bucket->update_container_stats(s); + } + + rgw::sal::Bucket::ListParams params; + params.prefix = prefix; + params.delim = delimiter; + params.marker = marker; + params.end_marker = end_marker; + params.list_versions = list_versions; + params.allow_unordered = allow_unordered; + params.shard_id = shard_id; + + rgw::sal::Bucket::ListResults results; + + op_ret = s->bucket->list(this, params, max, results, y); + if (op_ret >= 0) { + next_marker = results.next_marker; + is_truncated = results.is_truncated; + objs = std::move(results.objs); + common_prefixes = std::move(results.common_prefixes); + } +} + +int RGWGetBucketLogging::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketLogging); +} + +int RGWGetBucketLocation::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketLocation); +} + +int RGWCreateBucket::verify_permission(optional_yield y) +{ + /* This check is mostly needed for S3 that doesn't support account ACL. 
+ * Swift doesn't allow to delegate any permission to an anonymous user, + * so it will become an early exit in such case. */ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + rgw_bucket bucket; + bucket.name = s->bucket_name; + bucket.tenant = s->bucket_tenant; + ARN arn = ARN(bucket); + if (!verify_user_permission(this, s, arn, rgw::IAM::s3CreateBucket, false)) { + return -EACCES; + } + + if (s->user->get_tenant() != s->bucket_tenant) { + //AssumeRole is meant for cross account access + if (s->auth.identity->get_identity_type() != TYPE_ROLE) { + ldpp_dout(this, 10) << "user cannot create a bucket in a different tenant" + << " (user_id.tenant=" << s->user->get_tenant() + << " requested=" << s->bucket_tenant << ")" + << dendl; + return -EACCES; + } + } + + if (s->user->get_max_buckets() < 0) { + return -EPERM; + } + + if (s->user->get_max_buckets()) { + rgw::sal::BucketList buckets; + string marker; + op_ret = s->user->list_buckets(this, marker, string(), s->user->get_max_buckets(), + false, buckets, y); + if (op_ret < 0) { + return op_ret; + } + + if ((int)buckets.count() >= s->user->get_max_buckets()) { + return -ERR_TOO_MANY_BUCKETS; + } + } + + return 0; +} + +void RGWCreateBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +static void prepare_add_del_attrs(const map& orig_attrs, + map& out_attrs, + map& out_rmattrs) +{ + for (const auto& kv : orig_attrs) { + const string& name = kv.first; + + /* Check if the attr is user-defined metadata item. */ + if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, + RGW_ATTR_META_PREFIX) == 0) { + /* For the objects all existing meta attrs have to be removed. */ + out_rmattrs[name] = kv.second; + } else if (out_attrs.find(name) == std::end(out_attrs)) { + out_attrs[name] = kv.second; + } + } +} + +/* Fuse resource metadata basing on original attributes in @orig_attrs, set + * of _custom_ attribute names to remove in @rmattr_names and attributes in + * @out_attrs. Place results in @out_attrs. 
+ * + * NOTE: it's supposed that all special attrs already present in @out_attrs + * will be preserved without any change. Special attributes are those which + * names start with RGW_ATTR_META_PREFIX. They're complement to custom ones + * used for X-Account-Meta-*, X-Container-Meta-*, X-Amz-Meta and so on. */ +static void prepare_add_del_attrs(const map& orig_attrs, + const set& rmattr_names, + map& out_attrs) +{ + for (const auto& kv : orig_attrs) { + const string& name = kv.first; + + /* Check if the attr is user-defined metadata item. */ + if (name.compare(0, strlen(RGW_ATTR_META_PREFIX), + RGW_ATTR_META_PREFIX) == 0) { + /* For the buckets all existing meta attrs are preserved, + except those that are listed in rmattr_names. */ + if (rmattr_names.find(name) != std::end(rmattr_names)) { + const auto aiter = out_attrs.find(name); + + if (aiter != std::end(out_attrs)) { + out_attrs.erase(aiter); + } + } else { + /* emplace() won't alter the map if the key is already present. + * This behaviour is fully intensional here. */ + out_attrs.emplace(kv); + } + } else if (out_attrs.find(name) == std::end(out_attrs)) { + out_attrs[name] = kv.second; + } + } +} + + +static void populate_with_generic_attrs(const req_state * const s, + map& out_attrs) +{ + for (const auto& kv : s->generic_attrs) { + bufferlist& attrbl = out_attrs[kv.first]; + const string& val = kv.second; + attrbl.clear(); + attrbl.append(val.c_str(), val.size() + 1); + } +} + + +static int filter_out_quota_info(std::map& add_attrs, + const std::set& rmattr_names, + RGWQuotaInfo& quota, + bool * quota_extracted = nullptr) +{ + bool extracted = false; + + /* Put new limit on max objects. 
*/ + auto iter = add_attrs.find(RGW_ATTR_QUOTA_NOBJS); + std::string err; + if (std::end(add_attrs) != iter) { + quota.max_objects = + static_cast(strict_strtoll(iter->second.c_str(), 10, &err)); + if (!err.empty()) { + return -EINVAL; + } + add_attrs.erase(iter); + extracted = true; + } + + /* Put new limit on bucket (container) size. */ + iter = add_attrs.find(RGW_ATTR_QUOTA_MSIZE); + if (iter != add_attrs.end()) { + quota.max_size = + static_cast(strict_strtoll(iter->second.c_str(), 10, &err)); + if (!err.empty()) { + return -EINVAL; + } + add_attrs.erase(iter); + extracted = true; + } + + for (const auto& name : rmattr_names) { + /* Remove limit on max objects. */ + if (name.compare(RGW_ATTR_QUOTA_NOBJS) == 0) { + quota.max_objects = -1; + extracted = true; + } + + /* Remove limit on max bucket size. */ + if (name.compare(RGW_ATTR_QUOTA_MSIZE) == 0) { + quota.max_size = -1; + extracted = true; + } + } + + /* Swift requires checking on raw usage instead of the 4 KiB rounded one. */ + quota.check_on_raw = true; + quota.enabled = quota.max_size > 0 || quota.max_objects > 0; + + if (quota_extracted) { + *quota_extracted = extracted; + } + + return 0; +} + + +/* Move the website attributes (RGW_ATTR_WEB_INDEX, RGW_ATTR_WEB_ERROR, + * RGW_ATTR_WEB_LISTINGS, RGW_ATTR_WEB_LIST_CSS, RGW_ATTR_SUBDIR_MARKER) out of + * @add_attrs into @ws_conf. For a name listed in @rmattr_names the mapped + * target string is cleared. */ +static void filter_out_website(std::map& add_attrs, + const std::set& rmattr_names, + RGWBucketWebsiteConf& ws_conf) +{ + std::string lstval; + + /* Let's define a mapping between each custom attribute and the memory where + * attribute's value should be stored. The memory location is expressed by + * a non-const reference. 
*/ + const auto mapping = { + std::make_pair(RGW_ATTR_WEB_INDEX, std::ref(ws_conf.index_doc_suffix)), + std::make_pair(RGW_ATTR_WEB_ERROR, std::ref(ws_conf.error_doc)), + std::make_pair(RGW_ATTR_WEB_LISTINGS, std::ref(lstval)), + std::make_pair(RGW_ATTR_WEB_LIST_CSS, std::ref(ws_conf.listing_css_doc)), + std::make_pair(RGW_ATTR_SUBDIR_MARKER, std::ref(ws_conf.subdir_marker)) + }; + + for (const auto& kv : mapping) { + const char * const key = kv.first; + auto& target = kv.second; + + auto iter = add_attrs.find(key); + + if (std::end(add_attrs) != iter) { + /* The "target" is a reference to ws_conf. */ + target = iter->second.c_str(); + add_attrs.erase(iter); + } + + if (rmattr_names.count(key)) { + target = std::string(); + } + } + + if (! lstval.empty()) { + ws_conf.listing_enabled = boost::algorithm::iequals(lstval, "true"); + } +} + + +void RGWCreateBucket::execute(optional_yield y) +{ + buffer::list aclbl; + buffer::list corsbl; + string bucket_name = rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name); + + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (!relaxed_region_enforcement && + !location_constraint.empty() && + !driver->get_zone()->has_zonegroup_api(location_constraint)) { + ldpp_dout(this, 0) << "location constraint (" << location_constraint << ")" + << " can't be found." 
<< dendl; + op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; + s->err.message = "The specified location-constraint is not valid"; + return; + } + + if (!relaxed_region_enforcement && !driver->get_zone()->get_zonegroup().is_master_zonegroup() && !location_constraint.empty() && + driver->get_zone()->get_zonegroup().get_api_name() != location_constraint) { + ldpp_dout(this, 0) << "location constraint (" << location_constraint << ")" + << " doesn't match zonegroup" << " (" << driver->get_zone()->get_zonegroup().get_api_name() << ")" + << dendl; + op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; + s->err.message = "The specified location-constraint is not valid"; + return; + } + + std::set names; + driver->get_zone()->get_zonegroup().get_placement_target_names(names); + if (!placement_rule.name.empty() && + !names.count(placement_rule.name)) { + ldpp_dout(this, 0) << "placement target (" << placement_rule.name << ")" + << " doesn't exist in the placement targets of zonegroup" + << " (" << driver->get_zone()->get_zonegroup().get_api_name() << ")" << dendl; + op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; + s->err.message = "The specified placement target does not exist"; + return; + } + + /* we need to make sure we read bucket info, it's not read before for this + * specific request */ + { + std::unique_ptr tmp_bucket; + op_ret = driver->get_bucket(this, s->user.get(), s->bucket_tenant, + s->bucket_name, &tmp_bucket, y); + if (op_ret < 0 && op_ret != -ENOENT) + return; + s->bucket_exists = (op_ret != -ENOENT); + + if (s->bucket_exists) { + if (!s->system_request && + driver->get_zone()->get_zonegroup().get_id() != + tmp_bucket->get_info().zonegroup) { + op_ret = -EEXIST; + return; + } + /* Initialize info from req_state */ + info = tmp_bucket->get_info(); + } + } + + s->bucket_owner.set_id(s->user->get_id()); /* XXX dang use s->bucket->owner */ + s->bucket_owner.set_name(s->user->get_display_name()); + + string zonegroup_id; + + if (s->system_request) { + zonegroup_id = 
s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup"); + if (zonegroup_id.empty()) { + zonegroup_id = driver->get_zone()->get_zonegroup().get_id(); + } + } else { + zonegroup_id = driver->get_zone()->get_zonegroup().get_id(); + } + + /* Encode special metadata first as we're using std::map::emplace under + * the hood. This method will add the new items only if the map doesn't + * contain such keys yet. */ + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + if (has_cors) { + cors_config.encode(corsbl); + emplace_attr(RGW_ATTR_CORS, std::move(corsbl)); + } + + RGWQuotaInfo quota_info; + const RGWQuotaInfo * pquota_info = nullptr; + if (need_metadata_upload()) { + /* It's supposed that following functions WILL NOT change any special + * attributes (like RGW_ATTR_ACL) if they are already present in attrs. */ + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false); + if (op_ret < 0) { + return; + } + prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + + op_ret = filter_out_quota_info(attrs, rmattr_names, quota_info); + if (op_ret < 0) { + return; + } else { + pquota_info = "a_info; + } + + /* Web site of Swift API. */ + filter_out_website(attrs, rmattr_names, info.website_conf); + info.has_website = !info.website_conf.is_empty(); + } + + rgw_bucket tmp_bucket; + tmp_bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */ + tmp_bucket.name = s->bucket_name; + + /* Handle updates of the metadata for Swift's object versioning. */ + if (swift_ver_location) { + info.swift_ver_location = *swift_ver_location; + info.swift_versioning = (! 
swift_ver_location->empty()); + } + + /* We're replacing bucket with the newly created one */ + ldpp_dout(this, 10) << "user=" << s->user << " bucket=" << tmp_bucket << dendl; + op_ret = s->user->create_bucket(this, tmp_bucket, zonegroup_id, + placement_rule, + info.swift_ver_location, + pquota_info, policy, attrs, info, ep_objv, + true, obj_lock_enabled, &s->bucket_exists, s->info, + &s->bucket, y); + + /* continue if EEXIST and create_bucket will fail below. this way we can + * recover from a partial create by retrying it. */ + ldpp_dout(this, 20) << "rgw_create_bucket returned ret=" << op_ret << " bucket=" << s->bucket.get() << dendl; + + if (op_ret) + return; + + const bool existed = s->bucket_exists; + if (need_metadata_upload() && existed) { + /* OK, it looks we lost race with another request. As it's required to + * handle metadata fusion and upload, the whole operation becomes very + * similar in nature to PutMetadataBucket. However, as the attrs may + * changed in the meantime, we have to refresh. */ + short tries = 0; + do { + map battrs; + + op_ret = s->bucket->load_bucket(this, y); + if (op_ret < 0) { + return; + } else if (!s->bucket->is_owner(s->user.get())) { + /* New bucket doesn't belong to the account we're operating on. */ + op_ret = -EEXIST; + return; + } else { + s->bucket_attrs = s->bucket->get_attrs(); + } + + attrs.clear(); + + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false); + if (op_ret < 0) { + return; + } + prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket->get_info().quota); + if (op_ret < 0) { + return; + } + + /* Handle updates of the metadata for Swift's object versioning. */ + if (swift_ver_location) { + s->bucket->get_info().swift_ver_location = *swift_ver_location; + s->bucket->get_info().swift_versioning = (! swift_ver_location->empty()); + } + + /* Web site of Swift API. 
*/ + filter_out_website(attrs, rmattr_names, s->bucket->get_info().website_conf); + s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty(); + + /* This will also set the quota on the bucket. */ + op_ret = s->bucket->merge_and_store_attrs(this, attrs, y); + } while (op_ret == -ECANCELED && tries++ < 20); + + /* Restore the proper return code. */ + if (op_ret >= 0) { + op_ret = -ERR_BUCKET_EXISTS; + } + } +} + +int RGWDeleteBucket::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3DeleteBucket)) { + return -EACCES; + } + + return 0; +} + +void RGWDeleteBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteBucket::execute(optional_yield y) +{ + if (s->bucket_name.empty()) { + op_ret = -EINVAL; + return; + } + + if (!s->bucket_exists) { + ldpp_dout(this, 0) << "ERROR: bucket " << s->bucket_name << " not found" << dendl; + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + RGWObjVersionTracker ot; + ot.read_version = s->bucket->get_version(); + + if (s->system_request) { + string tag = s->info.args.get(RGW_SYS_PARAM_PREFIX "tag"); + string ver_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "ver"); + if (!tag.empty()) { + ot.read_version.tag = tag; + uint64_t ver; + string err; + ver = strict_strtol(ver_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 0) << "failed to parse ver param" << dendl; + op_ret = -EINVAL; + return; + } + ot.read_version.ver = ver; + } + } + + op_ret = s->bucket->sync_user_stats(this, y); + if ( op_ret < 0) { + ldpp_dout(this, 1) << "WARNING: failed to sync user stats before bucket delete: op_ret= " << op_ret << dendl; + } + + op_ret = s->bucket->check_empty(this, y); + if (op_ret < 0) { + return; + } + + bufferlist in_data; + op_ret = driver->forward_request_to_master(this, s->user.get(), 
&ot.read_version, in_data, nullptr, s->info, y); + if (op_ret < 0) { + if (op_ret == -ENOENT) { + /* adjust error, we want to return with NoSuchBucket and not + * NoSuchKey */ + op_ret = -ERR_NO_SUCH_BUCKET; + } + return; + } + + op_ret = rgw_remove_sse_s3_bucket_key(s); + if (op_ret != 0) { + // do nothing; it will already have been logged + } + + op_ret = s->bucket->remove_bucket(this, false, false, nullptr, y); + if (op_ret < 0 && op_ret == -ECANCELED) { + // lost a race, either with mdlog sync or another delete bucket operation. + // in either case, we've already called ctl.bucket->unlink_bucket() + op_ret = 0; + } + + return; +} + +int RGWPutObj::init_processing(optional_yield y) { + copy_source = url_decode(s->info.env->get("HTTP_X_AMZ_COPY_SOURCE", "")); + copy_source_range = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE"); + size_t pos; + int ret; + + /* handle x-amz-copy-source */ + std::string_view cs_view(copy_source); + if (! cs_view.empty()) { + if (cs_view[0] == '/') + cs_view.remove_prefix(1); + copy_source_bucket_name = std::string(cs_view); + pos = copy_source_bucket_name.find("/"); + if (pos == std::string::npos) { + ret = -EINVAL; + ldpp_dout(this, 5) << "x-amz-copy-source bad format" << dendl; + return ret; + } + copy_source_object_name = + copy_source_bucket_name.substr(pos + 1, copy_source_bucket_name.size()); + copy_source_bucket_name = copy_source_bucket_name.substr(0, pos); +#define VERSION_ID_STR "?versionId=" + pos = copy_source_object_name.find(VERSION_ID_STR); + if (pos == std::string::npos) { + copy_source_object_name = url_decode(copy_source_object_name); + } else { + copy_source_version_id = + copy_source_object_name.substr(pos + sizeof(VERSION_ID_STR) - 1); + copy_source_object_name = + url_decode(copy_source_object_name.substr(0, pos)); + } + pos = copy_source_bucket_name.find(":"); + if (pos == std::string::npos) { + // if tenant is not specified in x-amz-copy-source, use tenant of the requester + copy_source_tenant_name = 
s->user->get_tenant(); + } else { + copy_source_tenant_name = copy_source_bucket_name.substr(0, pos); + copy_source_bucket_name = copy_source_bucket_name.substr(pos + 1, copy_source_bucket_name.size()); + if (copy_source_bucket_name.empty()) { + ret = -EINVAL; + ldpp_dout(this, 5) << "source bucket name is empty" << dendl; + return ret; + } + } + std::unique_ptr bucket; + ret = driver->get_bucket(this, s->user.get(), copy_source_tenant_name, copy_source_bucket_name, + &bucket, y); + if (ret < 0) { + ldpp_dout(this, 5) << __func__ << "(): get_bucket() returned ret=" << ret << dendl; + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_BUCKET; + } + return ret; + } + + ret = bucket->load_bucket(this, y); + if (ret < 0) { + ldpp_dout(this, 5) << __func__ << "(): load_bucket() returned ret=" << ret << dendl; + return ret; + } + copy_source_bucket_info = bucket->get_info(); + + /* handle x-amz-copy-source-range */ + if (copy_source_range) { + string range = copy_source_range; + pos = range.find("bytes="); + if (pos == std::string::npos || pos != 0) { + ret = -EINVAL; + ldpp_dout(this, 5) << "x-amz-copy-source-range bad format" << dendl; + return ret; + } + /* 6 is the length of "bytes=" */ + range = range.substr(pos + 6); + pos = range.find("-"); + if (pos == std::string::npos) { + ret = -EINVAL; + ldpp_dout(this, 5) << "x-amz-copy-source-range bad format" << dendl; + return ret; + } + string first = range.substr(0, pos); + string last = range.substr(pos + 1); + if (first.find_first_not_of("0123456789") != std::string::npos || + last.find_first_not_of("0123456789") != std::string::npos) { + ldpp_dout(this, 5) << "x-amz-copy-source-range bad format not an integer" << dendl; + ret = -EINVAL; + return ret; + } + copy_source_range_fst = strtoull(first.c_str(), NULL, 10); + copy_source_range_lst = strtoull(last.c_str(), NULL, 10); + if (copy_source_range_fst > copy_source_range_lst) { + ret = -ERANGE; + ldpp_dout(this, 5) << "x-amz-copy-source-range bad format first number bigger 
than second" << dendl; + return ret; + } + } + + } /* copy_source */ + return RGWOp::init_processing(y); +} + +int RGWPutObj::verify_permission(optional_yield y) +{ + if (! copy_source.empty()) { + + RGWAccessControlPolicy cs_acl(s->cct); + boost::optional policy; + map cs_attrs; + std::unique_ptr cs_bucket; + int ret = driver->get_bucket(NULL, copy_source_bucket_info, &cs_bucket); + if (ret < 0) + return ret; + + std::unique_ptr cs_object = + cs_bucket->get_object(rgw_obj_key(copy_source_object_name, copy_source_version_id)); + + cs_object->set_atomic(); + cs_object->set_prefetch_data(); + + /* check source object permissions */ + if (ret = read_obj_policy(this, driver, s, copy_source_bucket_info, cs_attrs, &cs_acl, nullptr, + policy, cs_bucket.get(), cs_object.get(), y, true); ret < 0) { + return ret; + } + + /* admin request overrides permission checks */ + if (! s->auth.identity->is_admin_of(cs_acl.get_owner().get_id())) { + if (policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + //add source object tags for permission evaluation + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, policy, s->iam_user_policies, s->session_policies); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, cs_object.get(), has_s3_existing_tag, has_s3_resource_tag); + auto usr_policy_res = Effect::Pass; + rgw::ARN obj_arn(cs_object->get_obj()); + for (auto& user_policy : s->iam_user_policies) { + if (usr_policy_res = user_policy.eval(s->env, *s->auth.identity, + cs_object->get_instance().empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + obj_arn); usr_policy_res == Effect::Deny) + return -EACCES; + else if (usr_policy_res == Effect::Allow) + break; + } + rgw::IAM::Effect e = Effect::Pass; + if (policy) { + rgw::ARN obj_arn(cs_object->get_obj()); + e = policy->eval(s->env, *s->auth.identity, + cs_object->get_instance().empty() ? 
+ rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + obj_arn); + } + if (e == Effect::Deny) { + return -EACCES; + } else if (usr_policy_res == Effect::Pass && e == Effect::Pass && + !cs_acl.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + rgw_iam_remove_objtags(this, s, cs_object.get(), has_s3_existing_tag, has_s3_resource_tag); + } else if (!cs_acl.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + } + } + + /* With block_public_acls() in effect, refuse only the requests that ask for + * one of the public canned ACLs. compare() returns 0 on a match, so each + * result has to be tested against 0; OR-ing the raw results is true for + * every possible value of canned_acl. */ + if (s->bucket_access_conf && s->bucket_access_conf->block_public_acls()) { + if (s->canned_acl.compare("public-read") == 0 || + s->canned_acl.compare("public-read-write") == 0 || + s->canned_acl.compare("authenticated-read") == 0) + return -EACCES; + } + + auto op_ret = get_params(y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "get_params() returned ret=" << op_ret << dendl; + return op_ret; + } + + if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + rgw_add_grant_to_iam_environment(s->env, s); + + rgw_add_to_iam_environment(s->env, "s3:x-amz-acl", s->canned_acl); + + if (obj_tags != nullptr && obj_tags->count() > 0){ + auto tags = obj_tags->get_tags(); + for (const auto& kv: tags){ + rgw_add_to_iam_environment(s->env, "s3:RequestObjectTag/"+kv.first, kv.second); + } + } + + // add server-side encryption headers + rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map); + + // Add bucket tags for authorization + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (identity_policy_res == Effect::Deny) + return -EACCES; + + rgw::IAM::Effect e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + if (s->iam_policy) { + ARN 
obj_arn(s->object->get_obj()); + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + return -EACCES; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) + return 0; + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) + return 0; + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) + return 0; + } + return -EACCES; + } + if (e == Effect::Allow || identity_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + + +void RGWPutObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +class RGWPutObj_CB : public RGWGetObj_Filter +{ + RGWPutObj *op; +public: + explicit RGWPutObj_CB(RGWPutObj *_op) : op(_op) {} + ~RGWPutObj_CB() override {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + +int RGWPutObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + bufferlist bl_tmp; + bl.begin(bl_ofs).copy(bl_len, bl_tmp); + + bl_aux.append(bl_tmp); + + return bl_len; +} + 
+int RGWPutObj::get_data(const off_t fst, const off_t lst, bufferlist& bl) +{ + RGWPutObj_CB cb(this); + RGWGetObj_Filter* filter = &cb; + boost::optional decompress; + std::unique_ptr decrypt; + RGWCompressionInfo cs_info; + map attrs; + int ret = 0; + + uint64_t obj_size; + int64_t new_ofs, new_end; + + new_ofs = fst; + new_end = lst; + + std::unique_ptr bucket; + ret = driver->get_bucket(nullptr, copy_source_bucket_info, &bucket); + if (ret < 0) + return ret; + + std::unique_ptr obj = bucket->get_object(rgw_obj_key(copy_source_object_name, copy_source_version_id)); + std::unique_ptr read_op(obj->get_read_op()); + + ret = read_op->prepare(s->yield, this); + if (ret < 0) + return ret; + + obj_size = obj->get_obj_size(); + + bool need_decompress; + op_ret = rgw_compression_info_from_attrset(obj->get_attrs(), need_decompress, cs_info); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to decode compression info" << dendl; + return -EIO; + } + + bool partial_content = true; + if (need_decompress) + { + obj_size = cs_info.orig_size; + decompress.emplace(s->cct, &cs_info, partial_content, filter); + filter = &*decompress; + } + + auto attr_iter = obj->get_attrs().find(RGW_ATTR_MANIFEST); + op_ret = this->get_decrypt_filter(&decrypt, + filter, + obj->get_attrs(), + attr_iter != obj->get_attrs().end() ? 
&(attr_iter->second) : nullptr); + if (decrypt != nullptr) { + filter = decrypt.get(); + } + if (op_ret < 0) { + return op_ret; + } + + ret = obj->range_to_ofs(obj_size, new_ofs, new_end); + if (ret < 0) + return ret; + + filter->fixup_range(new_ofs, new_end); + ret = read_op->iterate(this, new_ofs, new_end, filter, s->yield); + + if (ret >= 0) + ret = filter->flush(); + + bl.claim_append(bl_aux); + + return ret; +} + +// special handling for compression type = "random" with multipart uploads +static CompressorRef get_compressor_plugin(const req_state *s, + const std::string& compression_type) +{ + if (compression_type != "random") { + return Compressor::create(s->cct, compression_type); + } + + bool is_multipart{false}; + const auto& upload_id = s->info.args.get("uploadId", &is_multipart); + + if (!is_multipart) { + return Compressor::create(s->cct, compression_type); + } + + // use a hash of the multipart upload id so all parts use the same plugin + const auto alg = std::hash{}(upload_id) % Compressor::COMP_ALG_LAST; + if (alg == Compressor::COMP_ALG_NONE) { + return nullptr; + } + return Compressor::create(s->cct, alg); +} + +int RGWPutObj::get_lua_filter(std::unique_ptr* filter, rgw::sal::DataProcessor* cb) { + std::string script; + const auto rc = rgw::lua::read_script(s, s->penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::putData, script); + if (rc == -ENOENT) { + // no script, nothing to do + return 0; + } else if (rc < 0) { + ldpp_dout(this, 5) << "WARNING: failed to read data script. 
error: " << rc << dendl; + return rc; + } + filter->reset(new rgw::lua::RGWPutObjFilter(s, script, cb)); + return 0; +} + +void RGWPutObj::execute(optional_yield y) +{ + char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1]; + char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + bufferlist bl, aclbl, bs; + int len; + + off_t fst; + off_t lst; + + bool need_calc_md5 = (dlo_manifest == NULL) && (slo_info == NULL); + perfcounter->inc(l_rgw_put); + // report latency on return + auto put_lat = make_scope_guard([&] { + perfcounter->tinc(l_rgw_put_lat, s->time_elapsed()); + }); + + op_ret = -EINVAL; + if (rgw::sal::Object::empty(s->object.get())) { + return; + } + + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + op_ret = get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + ldpp_dout(this, 20) << "get_system_versioning_params() returned ret=" + << op_ret << dendl; + return; + } + + if (supplied_md5_b64) { + need_calc_md5 = true; + + ldpp_dout(this, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl; + op_ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1], + supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64)); + ldpp_dout(this, 15) << "ceph_armor ret=" << op_ret << dendl; + if (op_ret != CEPH_CRYPTO_MD5_DIGESTSIZE) { + op_ret = -ERR_INVALID_DIGEST; + return; + } + + buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5); + ldpp_dout(this, 15) << "supplied_md5=" << supplied_md5 << dendl; + } + + if (!chunked_upload) { /* with chunked upload we don't know how big is the upload. 
+ we also check sizes at the end anyway */ + op_ret = s->bucket->check_quota(this, quota, s->content_length, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "check_quota() returned ret=" << op_ret << dendl; + return; + } + } + + if (supplied_etag) { + strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1); + supplied_md5[sizeof(supplied_md5) - 1] = '\0'; + } + + const bool multipart = !multipart_upload_id.empty(); + + /* Handle object versioning of Swift API. */ + if (! multipart) { + op_ret = s->object->swift_versioning_copy(this, s->yield); + if (op_ret < 0) { + return; + } + } + + // make reservation for notification if needed + std::unique_ptr res + = driver->get_notification( + s->object.get(), s->src_object.get(), s, + rgw::notify::ObjectCreatedPut, y); + if(!multipart) { + op_ret = res->publish_reserve(this, obj_tags.get()); + if (op_ret < 0) { + return; + } + } + + // create the object processor + std::unique_ptr processor; + + rgw_placement_rule *pdest_placement = &s->dest_placement; + + if (multipart) { + std::unique_ptr upload; + upload = s->bucket->get_multipart_upload(s->object->get_name(), + multipart_upload_id); + op_ret = upload->get_info(this, s->yield, &pdest_placement); + + s->trace->SetAttribute(tracing::rgw::UPLOAD_ID, multipart_upload_id); + multipart_trace = tracing::rgw::tracer.add_span(name(), upload->get_trace()); + + if (op_ret < 0) { + if (op_ret != -ENOENT) { + ldpp_dout(this, 0) << "ERROR: get_multipart_info returned " << op_ret << ": " << cpp_strerror(-op_ret) << dendl; + } else {// -ENOENT: raced with upload complete/cancel, no need to spam log + ldpp_dout(this, 20) << "failed to get multipart info (returned " << op_ret << ": " << cpp_strerror(-op_ret) << "): probably raced with upload complete / cancel" << dendl; + } + return; + } + /* upload will go out of scope, so copy the dest placement for later use */ + s->dest_placement = *pdest_placement; + pdest_placement = &s->dest_placement; + ldpp_dout(this, 20) << 
"dest_placement for part=" << *pdest_placement << dendl; + processor = upload->get_writer(this, s->yield, s->object.get(), + s->user->get_id(), pdest_placement, + multipart_part_num, multipart_part_str); + } else if(append) { + if (s->bucket->versioned()) { + op_ret = -ERR_INVALID_BUCKET_STATE; + return; + } + processor = driver->get_append_writer(this, s->yield, s->object.get(), + s->bucket_owner.get_id(), + pdest_placement, s->req_id, position, + &cur_accounted_size); + } else { + if (s->bucket->versioning_enabled()) { + if (!version_id.empty()) { + s->object->set_instance(version_id); + } else { + s->object->gen_rand_obj_instance_name(); + version_id = s->object->get_instance(); + } + } + processor = driver->get_atomic_writer(this, s->yield, s->object.get(), + s->bucket_owner.get_id(), + pdest_placement, olh_epoch, s->req_id); + } + + op_ret = processor->prepare(s->yield); + if (op_ret < 0) { + ldpp_dout(this, 20) << "processor->prepare() returned ret=" << op_ret + << dendl; + return; + } + if ((! copy_source.empty()) && !copy_source_range) { + std::unique_ptr bucket; + op_ret = driver->get_bucket(nullptr, copy_source_bucket_info, &bucket); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get bucket with error" << op_ret << dendl; + return; + } + std::unique_ptr obj = + bucket->get_object(rgw_obj_key(copy_source_object_name, copy_source_version_id)); + + RGWObjState *astate; + op_ret = obj->get_obj_state(this, &astate, s->yield); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: get copy source obj state returned with error" << op_ret << dendl; + return; + } + bufferlist bl; + if (astate->get_attr(RGW_ATTR_MANIFEST, bl)) { + RGWObjManifest m; + try{ + decode(m, bl); + if (m.get_tier_type() == "cloud-s3") { + op_ret = -ERR_INVALID_OBJECT_STATE; + s->err.message = "This object was transitioned to cloud-s3"; + ldpp_dout(this, 4) << "Cannot copy cloud tiered object. 
Failing with " + << op_ret << dendl; + return; + } + } catch (const buffer::end_of_buffer&) { + // ignore empty manifest; it's not cloud-tiered + } catch (const std::exception& e) { + ldpp_dout(this, 1) << "WARNING: failed to decode object manifest for " + << *s->object << ": " << e.what() << dendl; + } + } + + if (!astate->exists){ + op_ret = -ENOENT; + return; + } + lst = astate->accounted_size - 1; + } else { + lst = copy_source_range_lst; + } + fst = copy_source_range_fst; + + // no filters by default + rgw::sal::DataProcessor *filter = processor.get(); + + const auto& compression_type = driver->get_compression_type(*pdest_placement); + CompressorRef plugin; + boost::optional compressor; + + std::unique_ptr encrypt; + std::unique_ptr run_lua; + + if (!append) { // compression and encryption only apply to full object uploads + op_ret = get_encrypt_filter(&encrypt, filter); + if (op_ret < 0) { + return; + } + if (encrypt != nullptr) { + filter = &*encrypt; + } + // a zonegroup feature is required to combine compression and encryption + const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup(); + const bool compress_encrypted = zonegroup.supports(rgw::zone_features::compress_encrypted); + if (compression_type != "none" && + (encrypt == nullptr || compress_encrypted)) { + plugin = get_compressor_plugin(s, compression_type); + if (!plugin) { + ldpp_dout(this, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + // always send incompressible hint when rgw is itself doing compression + s->object->set_compressed(); + } + } + // run lua script before data is compressed and encrypted - last filter runs first + op_ret = get_lua_filter(&run_lua, filter); + if (op_ret < 0) { + return; + } + if (run_lua) { + filter = &*run_lua; + } + } + tracepoint(rgw_op, before_data_transfer, s->req_id.c_str()); + do { + bufferlist data; + if (fst > lst) + break; 
+ if (copy_source.empty()) { + len = get_data(data); + } else { + off_t cur_lst = min(fst + s->cct->_conf->rgw_max_chunk_size - 1, lst); + op_ret = get_data(fst, cur_lst, data); + if (op_ret < 0) + return; + len = data.length(); + s->content_length += len; + fst += len; + } + if (len < 0) { + op_ret = len; + ldpp_dout(this, 20) << "get_data() returned ret=" << op_ret << dendl; + return; + } else if (len == 0) { + break; + } + + if (need_calc_md5) { + hash.Update((const unsigned char *)data.c_str(), data.length()); + } + + /* update torrrent */ + torrent.update(data); + + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + ldpp_dout(this, 20) << "processor->process() returned ret=" + << op_ret << dendl; + return; + } + + ofs += len; + } while (len > 0); + tracepoint(rgw_op, after_data_transfer, s->req_id.c_str(), ofs); + + // flush any data in filters + op_ret = filter->process({}, ofs); + if (op_ret < 0) { + return; + } + + if (!chunked_upload && ofs != s->content_length) { + op_ret = -ERR_REQUEST_TIMEOUT; + return; + } + s->obj_size = ofs; + s->object->set_obj_size(ofs); + + perfcounter->inc(l_rgw_put_b, s->obj_size); + + op_ret = do_aws4_auth_completion(); + if (op_ret < 0) { + return; + } + + op_ret = s->bucket->check_quota(this, quota, s->obj_size, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "second check_quota() returned op_ret=" << op_ret << dendl; + return; + } + + hash.Final(m); + + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = s->obj_size; + cs_info.compressor_message = compressor->get_compressor_message(); + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + ldpp_dout(this, 20) << "storing " << RGW_ATTR_COMPRESSION + << " with type=" << cs_info.compression_type + << ", orig_size=" << cs_info.orig_size + << ", blocks=" << 
cs_info.blocks.size() << dendl; + } + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + etag = calc_md5; + + if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) { + op_ret = -ERR_BAD_DIGEST; + return; + } + + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + if (dlo_manifest) { + op_ret = encode_dlo_manifest_attr(dlo_manifest, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "bad user manifest: " << dlo_manifest << dendl; + return; + } + } + + if (slo_info) { + bufferlist manifest_bl; + encode(*slo_info, manifest_bl); + emplace_attr(RGW_ATTR_SLO_MANIFEST, std::move(manifest_bl)); + } + + if (supplied_etag && etag.compare(supplied_etag) != 0) { + op_ret = -ERR_UNPROCESSABLE_ENTITY; + return; + } + bl.append(etag.c_str(), etag.size()); + emplace_attr(RGW_ATTR_ETAG, std::move(bl)); + + populate_with_generic_attrs(s, attrs); + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs); + if (op_ret < 0) { + return; + } + encode_delete_at_attr(delete_at, attrs); + encode_obj_tags_attr(obj_tags.get(), attrs); + rgw_cond_decode_objtags(s, attrs); + + /* Add a custom metadata to expose the information whether an object + * is an SLO or not. Appending the attribute must be performed AFTER + * processing any input from user in order to prohibit overwriting. 
*/ + if (slo_info) { + bufferlist slo_userindicator_bl; + slo_userindicator_bl.append("True", 4); + emplace_attr(RGW_ATTR_SLO_UINDICATOR, std::move(slo_userindicator_bl)); + } + if (obj_legal_hold) { + bufferlist obj_legal_hold_bl; + obj_legal_hold->encode(obj_legal_hold_bl); + emplace_attr(RGW_ATTR_OBJECT_LEGAL_HOLD, std::move(obj_legal_hold_bl)); + } + if (obj_retention) { + bufferlist obj_retention_bl; + obj_retention->encode(obj_retention_bl); + emplace_attr(RGW_ATTR_OBJECT_RETENTION, std::move(obj_retention_bl)); + } + + tracepoint(rgw_op, processor_complete_enter, s->req_id.c_str()); + op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs, + (delete_at ? *delete_at : real_time()), if_match, if_nomatch, + (user_data.empty() ? nullptr : &user_data), nullptr, nullptr, + s->yield); + tracepoint(rgw_op, processor_complete_exit, s->req_id.c_str()); + + /* produce torrent */ + if (s->cct->_conf->rgw_torrent_flag && (ofs == torrent.get_data_len())) + { + torrent.init(s, driver); + torrent.set_create_date(mtime); + op_ret = torrent.complete(y); + if (0 != op_ret) + { + ldpp_dout(this, 0) << "ERROR: torrent.handle_data() returned " << op_ret << dendl; + return; + } + } + + // send request to notification manager + int ret = res->publish_commit(this, s->obj_size, mtime, etag, s->object->get_instance()); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + // too late to rollback operation, hence op_ret is not set here + } +} + +int RGWPostObj::verify_permission(optional_yield y) +{ + return 0; +} + +void RGWPostObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPostObj::execute(optional_yield y) +{ + boost::optional compressor; + CompressorRef plugin; + char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + + /* Read in the data from the POST form. 
*/ + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + + op_ret = verify_params(); + if (op_ret < 0) { + return; + } + + // add server-side encryption headers + rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map); + + if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (identity_policy_res == Effect::Deny) { + op_ret = -EACCES; + return; + } + + rgw::IAM::Effect e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + if (s->iam_policy) { + ARN obj_arn(s->object->get_obj()); + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + op_ret = -EACCES; + return; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (session_policy_res == Effect::Deny) { + op_ret = -EACCES; + return; + } + // Only record whether the session policy admits the request. This is + // execute(), not verify_permission(): returning on "allow" would leave + // the operation with op_ret == 0 before any data has been written. + bool session_allowed = false; + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) { + session_allowed = true; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) { + session_allowed = true; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + session_allowed = true; + } +
} + if (!session_allowed) { + op_ret = -EACCES; + return; + } + // allowed by the session policy: fall through and perform the upload + } + if (identity_policy_res == Effect::Pass && e == Effect::Pass && !verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + op_ret = -EACCES; + return; + } + } else if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + op_ret = -EACCES; + return; + } + + // make reservation for notification if needed + std::unique_ptr res + = driver->get_notification(s->object.get(), s->src_object.get(), s, rgw::notify::ObjectCreatedPost, y); + op_ret = res->publish_reserve(this); + if (op_ret < 0) { + return; + } + + /* Start iteration over data fields. It's necessary as Swift's FormPost + * is capable to handle multiple files in single form. */ + do { + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + ceph::buffer::list bl, aclbl; + + op_ret = s->bucket->check_quota(this, quota, s->content_length, y); + if (op_ret < 0) { + return; + } + + if (supplied_md5_b64) { + char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1]; + ldpp_dout(this, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl; + op_ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1], + supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64)); + ldpp_dout(this, 15) << "ceph_armor ret=" << op_ret << dendl; + if (op_ret != CEPH_CRYPTO_MD5_DIGESTSIZE) { + op_ret = -ERR_INVALID_DIGEST; + return; + } + + buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5); + ldpp_dout(this, 15) << "supplied_md5=" << supplied_md5 << dendl; + } + + std::unique_ptr obj = + s->bucket->get_object(rgw_obj_key(get_current_filename())); + if (s->bucket->versioning_enabled()) { + obj->gen_rand_obj_instance_name(); + } + + std::unique_ptr processor; + processor = driver->get_atomic_writer(this, s->yield, obj.get(), +
s->bucket_owner.get_id(), + &s->dest_placement, 0, s->req_id); + op_ret = processor->prepare(s->yield); + if (op_ret < 0) { + return; + } + + /* No filters by default. */ + rgw::sal::DataProcessor *filter = processor.get(); + + std::unique_ptr encrypt; + op_ret = get_encrypt_filter(&encrypt, filter); + if (op_ret < 0) { + return; + } + if (encrypt != nullptr) { + filter = encrypt.get(); + } else { + const auto& compression_type = driver->get_compression_type(s->dest_placement); + if (compression_type != "none") { + plugin = Compressor::create(s->cct, compression_type); + if (!plugin) { + ldpp_dout(this, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + } + } + } + + bool again; + do { + ceph::bufferlist data; + int len = get_data(data, again); + + if (len < 0) { + op_ret = len; + return; + } + + if (!len) { + break; + } + + hash.Update((const unsigned char *)data.c_str(), data.length()); + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + return; + } + + ofs += len; + + if (ofs > max_len) { + op_ret = -ERR_TOO_LARGE; + return; + } + } while (again); + + // flush + op_ret = filter->process({}, ofs); + if (op_ret < 0) { + return; + } + + if (ofs < min_len) { + op_ret = -ERR_TOO_SMALL; + return; + } + + s->obj_size = ofs; + s->object->set_obj_size(ofs); + + + op_ret = s->bucket->check_quota(this, quota, s->obj_size, y); + if (op_ret < 0) { + return; + } + + hash.Final(m); + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + etag = calc_md5; + + if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) { + op_ret = -ERR_BAD_DIGEST; + return; + } + + bl.append(etag.c_str(), etag.size()); + emplace_attr(RGW_ATTR_ETAG, std::move(bl)); + + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + const std::string content_type = get_current_content_type(); + if (! 
content_type.empty()) { + ceph::bufferlist ct_bl; + ct_bl.append(content_type.c_str(), content_type.size() + 1); + emplace_attr(RGW_ATTR_CONTENT_TYPE, std::move(ct_bl)); + } + + if (compressor && compressor->is_compressed()) { + ceph::bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = s->obj_size; + cs_info.compressor_message = compressor->get_compressor_message(); + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + emplace_attr(RGW_ATTR_COMPRESSION, std::move(tmp)); + } + + op_ret = processor->complete(s->obj_size, etag, nullptr, real_time(), attrs, + (delete_at ? *delete_at : real_time()), + nullptr, nullptr, nullptr, nullptr, nullptr, + s->yield); + if (op_ret < 0) { + return; + } + } while (is_next_file_to_upload()); + + // send request to notification manager + int ret = res->publish_commit(this, ofs, s->object->get_mtime(), etag, s->object->get_instance()); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + // too late to rollback operation, hence op_ret is not set here + } +} + + +void RGWPutMetadataAccount::filter_out_temp_url(map& add_attrs, + const set& rmattr_names, + map& temp_url_keys) +{ + map::iterator iter; + + iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY1); + if (iter != add_attrs.end()) { + temp_url_keys[0] = iter->second.c_str(); + add_attrs.erase(iter); + } + + iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY2); + if (iter != add_attrs.end()) { + temp_url_keys[1] = iter->second.c_str(); + add_attrs.erase(iter); + } + + for (const string& name : rmattr_names) { + if (name.compare(RGW_ATTR_TEMPURL_KEY1) == 0) { + temp_url_keys[0] = string(); + } + if (name.compare(RGW_ATTR_TEMPURL_KEY2) == 0) { + temp_url_keys[1] = string(); + } + } +} + +int RGWPutMetadataAccount::init_processing(optional_yield y) +{ + /* First, go to the base class. 
At the time of writing the method was + * responsible only for initializing the quota. This isn't necessary + * here as we are touching metadata only. I'm putting this call only + * for the future. */ + op_ret = RGWOp::init_processing(y); + if (op_ret < 0) { + return op_ret; + } + + op_ret = get_params(y); + if (op_ret < 0) { + return op_ret; + } + + op_ret = s->user->read_attrs(this, y); + if (op_ret < 0) { + return op_ret; + } + orig_attrs = s->user->get_attrs(); + + if (has_policy) { + bufferlist acl_bl; + policy.encode(acl_bl); + attrs.emplace(RGW_ATTR_ACL, std::move(acl_bl)); + } + + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false); + if (op_ret < 0) { + return op_ret; + } + prepare_add_del_attrs(orig_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + + /* Try extract the TempURL-related stuff now to allow verify_permission + * evaluate whether we need FULL_CONTROL or not. */ + filter_out_temp_url(attrs, rmattr_names, temp_url_keys); + + /* The same with quota except a client needs to be reseller admin. */ + op_ret = filter_out_quota_info(attrs, rmattr_names, new_quota, + &new_quota_extracted); + if (op_ret < 0) { + return op_ret; + } + + return 0; +} + +int RGWPutMetadataAccount::verify_permission(optional_yield y) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (!verify_user_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + /* Altering TempURL keys requires FULL_CONTROL. */ + if (!temp_url_keys.empty() && s->perm_mask != RGW_PERM_FULL_CONTROL) { + return -EPERM; + } + + /* We are failing this intentionally to allow system user/reseller admin + * override in rgw_process.cc. This is the way to specify a given RGWOp + * expects extra privileges. */ + if (new_quota_extracted) { + return -EACCES; + } + + return 0; +} + +void RGWPutMetadataAccount::execute(optional_yield y) +{ + /* Params have been extracted earlier. See init_processing().
*/ + op_ret = s->user->load_user(this, y); + if (op_ret < 0) { + return; + } + + /* Handle the TempURL-related stuff. */ + if (!temp_url_keys.empty()) { + for (auto& pair : temp_url_keys) { + s->user->get_info().temp_url_keys[pair.first] = std::move(pair.second); + } + } + + /* Handle the quota extracted in init_processing(). */ + if (new_quota_extracted) { + s->user->get_info().quota.user_quota = std::move(new_quota); + } + + /* We are passing here the current (old) user info to allow the function + * to optimize out some operations. */ + s->user->set_attrs(attrs); + op_ret = s->user->store_user(this, y, false, &s->user->get_info()); +} + +int RGWPutMetadataBucket::verify_permission(optional_yield y) +{ + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWPutMetadataBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutMetadataBucket::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false); + if (op_ret < 0) { + return; + } + + if (!placement_rule.empty() && + placement_rule != s->bucket->get_placement_rule()) { + op_ret = -EEXIST; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + /* Encode special metadata first as we're using std::map::emplace under + * the hood. This method will add the new items only if the map doesn't + * contain such keys yet.
*/ + if (has_policy) { + if (s->dialect.compare("swift") == 0) { + auto old_policy = \ + static_cast(s->bucket_acl.get()); + auto new_policy = static_cast(&policy); + new_policy->filter_merge(policy_rw_mask, old_policy); + policy = *new_policy; + } + buffer::list bl; + policy.encode(bl); + emplace_attr(RGW_ATTR_ACL, std::move(bl)); + } + + if (has_cors) { + buffer::list bl; + cors_config.encode(bl); + emplace_attr(RGW_ATTR_CORS, std::move(bl)); + } + + /* It's supposed that following functions WILL NOT change any + * special attributes (like RGW_ATTR_ACL) if they are already + * present in attrs. */ + prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + + /* According to the Swift's behaviour and its container_quota + * WSGI middleware implementation: anyone with write permissions + * is able to set the bucket quota. This stays in contrast to + * account quotas that can be set only by clients holding + * reseller admin privileges. */ + op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket->get_info().quota); + if (op_ret < 0) { + return op_ret; + } + + if (swift_ver_location) { + s->bucket->get_info().swift_ver_location = *swift_ver_location; + s->bucket->get_info().swift_versioning = (!swift_ver_location->empty()); + } + + /* Web site of Swift API. */ + filter_out_website(attrs, rmattr_names, s->bucket->get_info().website_conf); + s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty(); + + /* Setting attributes also stores the provided bucket info. Due + * to this fact, the new quota settings can be serialized with + * the same call. */ + op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + return op_ret; + }); +} + +int RGWPutMetadataObject::verify_permission(optional_yield y) +{ + // This looks to be something specific to Swift. We could add + // operations like swift:PutMetadataObject to the Policy Engine. 
+ if (!verify_object_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWPutMetadataObject::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutMetadataObject::execute(optional_yield y) +{ + rgw_obj target_obj; + rgw::sal::Attrs attrs, rmattrs; + + s->object->set_atomic(); + + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs); + if (op_ret < 0) { + return; + } + + /* check if obj exists, read orig attrs */ + op_ret = s->object->get_obj_attrs(s->yield, s, &target_obj); + if (op_ret < 0) { + return; + } + + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && s->object->is_expired()) { + op_ret = -ENOENT; + return; + } + + /* Filter currently existing attributes. */ + prepare_add_del_attrs(s->object->get_attrs(), attrs, rmattrs); + populate_with_generic_attrs(s, attrs); + encode_delete_at_attr(delete_at, attrs); + + if (dlo_manifest) { + op_ret = encode_dlo_manifest_attr(dlo_manifest, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "bad user manifest: " << dlo_manifest << dendl; + return; + } + } + + op_ret = s->object->set_obj_attrs(this, &attrs, &rmattrs, s->yield); +} + +int RGWDeleteObj::handle_slo_manifest(bufferlist& bl, optional_yield y) +{ + RGWSLOInfo slo_info; + auto bliter = bl.cbegin(); + try { + decode(slo_info, bliter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode slo manifest" << dendl; + return -EIO; + } + + try { + deleter = std::unique_ptr(\ + new RGWBulkDelete::Deleter(this, driver, s)); + } catch (const std::bad_alloc&) { + return -ENOMEM; + } + + list items; + for (const auto& iter : slo_info.entries) { + const string& path_str = iter.path; + + const size_t pos_init = path_str.find_first_not_of('/'); + if (std::string_view::npos == pos_init) { + return -EINVAL; + } 
+ + const size_t sep_pos = path_str.find('/', pos_init); + if (std::string_view::npos == sep_pos) { + return -EINVAL; + } + + RGWBulkDelete::acct_path_t path; + + path.bucket_name = url_decode(path_str.substr(pos_init, sep_pos - pos_init)); + path.obj_key = url_decode(path_str.substr(sep_pos + 1)); + + items.push_back(path); + } + + /* Request removal of the manifest object itself. */ + RGWBulkDelete::acct_path_t path; + path.bucket_name = s->bucket_name; + path.obj_key = s->object->get_key(); + items.push_back(path); + + int ret = deleter->delete_chunk(items, y); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWDeleteObj::verify_permission(optional_yield y) +{ + int op_ret = get_params(y); + if (op_ret) { + return op_ret; + } + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (s->iam_policy || ! s->iam_user_policies.empty() || ! 
s->session_policies.empty()) { + if (s->bucket->get_info().obj_lock_enabled() && bypass_governance_mode) { + auto r = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key(), s->object->get_name())); + if (r == Effect::Deny) { + bypass_perm = false; + } else if (r == Effect::Pass && s->iam_policy) { + ARN obj_arn(ARN(s->bucket->get_key(), s->object->get_name())); + r = s->iam_policy->eval(s->env, *s->auth.identity, rgw::IAM::s3BypassGovernanceRetention, obj_arn); + if (r == Effect::Deny) { + bypass_perm = false; + } + } else if (r == Effect::Pass && !s->session_policies.empty()) { + r = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key(), s->object->get_name())); + if (r == Effect::Deny) { + bypass_perm = false; + } + } + } + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + s->object->get_instance().empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket->get_key(), s->object->get_name())); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect r = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + ARN obj_arn(ARN(s->bucket->get_key(), s->object->get_name())); + if (s->iam_policy) { + r = s->iam_policy->eval(s->env, *s->auth.identity, + s->object->get_instance().empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + obj_arn, + princ_type); + } + if (r == Effect::Deny) + return -EACCES; + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + s->object->get_instance().empty() ? 
+ rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + obj_arn); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && r == Effect::Allow)) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + return 0; + } + } + return -EACCES; + } + if (r == Effect::Allow || identity_policy_res == Effect::Allow) + return 0; + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + if (s->bucket->get_info().mfa_enabled() && + !s->object->get_instance().empty() && + !s->mfa_verified) { + ldpp_dout(this, 5) << "NOTICE: object delete request with a versioned object, mfa auth not provided" << dendl; + return -ERR_MFA_REQUIRED; + } + + return 0; +} + +void RGWDeleteObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteObj::execute(optional_yield y) +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + if (!rgw::sal::Object::empty(s->object.get())) { + uint64_t obj_size = 0; + std::string etag; + { + RGWObjState* astate = nullptr; + bool check_obj_lock = s->object->have_instance() && s->bucket->get_info().obj_lock_enabled(); + + op_ret = s->object->get_obj_state(this, &astate, s->yield, true); + if (op_ret < 0) { + if (need_object_expiration() || multipart_delete) { + return; + 
} + + if (check_obj_lock) { + /* check if obj exists, read orig attrs */ + if (op_ret == -ENOENT) { + /* object maybe delete_marker, skip check_obj_lock*/ + check_obj_lock = false; + } else { + return; + } + } + } else { + obj_size = astate->size; + etag = astate->attrset[RGW_ATTR_ETAG].to_str(); + } + + // ignore return value from get_obj_attrs in all other cases + op_ret = 0; + + if (check_obj_lock) { + ceph_assert(astate); + int object_lock_response = verify_object_lock(this, astate->attrset, bypass_perm, bypass_governance_mode); + if (object_lock_response != 0) { + op_ret = object_lock_response; + if (op_ret == -EACCES) { + s->err.message = "forbidden by object lock"; + } + return; + } + } + + if (multipart_delete) { + if (!astate) { + op_ret = -ERR_NOT_SLO_MANIFEST; + return; + } + + const auto slo_attr = astate->attrset.find(RGW_ATTR_SLO_MANIFEST); + + if (slo_attr != astate->attrset.end()) { + op_ret = handle_slo_manifest(slo_attr->second, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to handle slo manifest ret=" << op_ret << dendl; + } + } else { + op_ret = -ERR_NOT_SLO_MANIFEST; + } + + return; + } + } + + // make reservation for notification if needed + const auto versioned_object = s->bucket->versioning_enabled(); + const auto event_type = versioned_object && + s->object->get_instance().empty() ? + rgw::notify::ObjectRemovedDeleteMarkerCreated : + rgw::notify::ObjectRemovedDelete; + std::unique_ptr res + = driver->get_notification(s->object.get(), s->src_object.get(), s, + event_type, y); + op_ret = res->publish_reserve(this); + if (op_ret < 0) { + return; + } + + s->object->set_atomic(); + + bool ver_restored = false; + op_ret = s->object->swift_versioning_restore(ver_restored, this); + if (op_ret < 0) { + return; + } + + if (!ver_restored) { + uint64_t epoch = 0; + + /* Swift's versioning mechanism hasn't found any previous version of + * the object that could be restored. This means we should proceed + * with the regular delete path. 
*/ + op_ret = get_system_versioning_params(s, &epoch, &version_id); + if (op_ret < 0) { + return; + } + + std::unique_ptr del_op = s->object->get_delete_op(); + del_op->params.obj_owner = s->owner; + del_op->params.bucket_owner = s->bucket_owner; + del_op->params.versioning_status = s->bucket->get_info().versioning_status(); + del_op->params.unmod_since = unmod_since; + del_op->params.high_precision_time = s->system_request; + del_op->params.olh_epoch = epoch; + del_op->params.marker_version_id = version_id; + + op_ret = del_op->delete_obj(this, y); + if (op_ret >= 0) { + delete_marker = del_op->result.delete_marker; + version_id = del_op->result.version_id; + } + + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && s->object->is_expired()) { + op_ret = -ENOENT; + return; + } + } + + if (op_ret == -ECANCELED) { + op_ret = 0; + } + if (op_ret == -ERR_PRECONDITION_FAILED && no_precondition_error) { + op_ret = 0; + } + + // send request to notification manager + int ret = res->publish_commit(this, obj_size, ceph::real_clock::now(), etag, version_id); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + // too late to rollback operation, hence op_ret is not set here + } + } else { + op_ret = -EINVAL; + } +} + +bool RGWCopyObj::parse_copy_location(const std::string_view& url_src, + string& bucket_name, + rgw_obj_key& key, + req_state* s) +{ + std::string_view name_str; + std::string_view params_str; + + // search for ? 
before url-decoding so we don't accidentally match %3F + size_t pos = url_src.find('?'); + if (pos == string::npos) { + name_str = url_src; + } else { + name_str = url_src.substr(0, pos); + params_str = url_src.substr(pos + 1); + } + + if (name_str[0] == '/') // trim leading slash + name_str.remove_prefix(1); + + std::string dec_src = url_decode(name_str); + + pos = dec_src.find('/'); + if (pos == string::npos) + return false; + + bucket_name = dec_src.substr(0, pos); + key.name = dec_src.substr(pos + 1); + + if (key.name.empty()) { + return false; + } + + if (! params_str.empty()) { + RGWHTTPArgs args; + args.set(std::string(params_str)); + args.parse(s); + + key.instance = args.get("versionId", NULL); + } + + return true; +} + +int RGWCopyObj::init_processing(optional_yield y) +{ + op_ret = RGWOp::init_processing(y); + if (op_ret < 0) { + return op_ret; + } + + op_ret = get_params(y); + if (op_ret < 0) + return op_ret; + + op_ret = get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + return op_ret; + } + + op_ret = driver->get_bucket(this, s->user.get(), + rgw_bucket_key(s->src_tenant_name, + s->src_bucket_name), + &src_bucket, y); + if (op_ret < 0) { + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_BUCKET; + } + return op_ret; + } + + /* This is the only place the bucket is set on src_object */ + s->src_object->set_bucket(src_bucket.get()); + return 0; +} + +int RGWCopyObj::verify_permission(optional_yield y) +{ + RGWAccessControlPolicy src_acl(s->cct); + boost::optional src_policy; + + /* get buckets info (source and dest) */ + if (s->local_source && source_zone.empty()) { + s->src_object->set_atomic(); + s->src_object->set_prefetch_data(); + + rgw_placement_rule src_placement; + + /* check source object permissions */ + op_ret = read_obj_policy(this, driver, s, src_bucket->get_info(), src_bucket->get_attrs(), &src_acl, &src_placement.storage_class, + src_policy, src_bucket.get(), s->src_object.get(), y); + if (op_ret < 0) { + 
return op_ret; + } + + /* follow up on previous checks that required reading source object head */ + if (need_to_check_storage_class) { + src_placement.inherit_from(src_bucket->get_placement_rule()); + + op_ret = check_storage_class(src_placement); + if (op_ret < 0) { + return op_ret; + } + } + + /* admin request overrides permission checks */ + if (!s->auth.identity->is_admin_of(src_acl.get_owner().get_id())) { + if (src_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, src_policy, s->iam_user_policies, s->session_policies); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, s->src_object.get(), has_s3_existing_tag, has_s3_resource_tag); + + ARN obj_arn(s->src_object->get_obj()); + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + s->src_object->get_instance().empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + obj_arn); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + auto e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + if (src_policy) { + e = src_policy->eval(s->env, *s->auth.identity, + s->src_object->get_instance().empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + return -EACCES; + } + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + s->src_object->get_instance().empty() ? 
+ rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + obj_arn); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && + (session_policy_res != Effect::Allow || e != Effect::Allow)) { + return -EACCES; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && e != Effect::Allow) { + return -EACCES; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) { + return -EACCES; + } + } + } + if (identity_policy_res == Effect::Pass && e == Effect::Pass && + !src_acl.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + //remove src object tags as it may interfere with policy evaluation of destination obj + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_remove_objtags(this, s, s->src_object.get(), has_s3_existing_tag, has_s3_resource_tag); + + } else if (!src_acl.verify_permission(this, *s->auth.identity, + s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + } + } + + RGWAccessControlPolicy dest_bucket_policy(s->cct); + + s->object->set_atomic(); + + /* check dest bucket permissions */ + op_ret = read_bucket_policy(this, driver, s, s->bucket->get_info(), + s->bucket->get_attrs(), + &dest_bucket_policy, s->bucket->get_key(), y); + if (op_ret < 0) { + return op_ret; + } + auto dest_iam_policy = get_iam_policy_from_attr(s->cct, s->bucket->get_attrs(), s->bucket->get_tenant()); + /* admin request overrides permission checks */ + if (! 
s->auth.identity->is_admin_of(dest_policy.get_owner().get_id())){ + if (dest_iam_policy != boost::none || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + //Add destination bucket tags for authorization + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, dest_iam_policy, s->iam_user_policies, s->session_policies); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s, s->bucket.get()); + + rgw_add_to_iam_environment(s->env, "s3:x-amz-copy-source", copy_source); + if (md_directive) + rgw_add_to_iam_environment(s->env, "s3:x-amz-metadata-directive", + *md_directive); + + ARN obj_arn(s->object->get_obj()); + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, + s->env, + rgw::IAM::s3PutObject, + obj_arn); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + auto e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + if (dest_iam_policy) { + e = dest_iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + return -EACCES; + } + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, obj_arn); + if (session_policy_res == Effect::Deny) { + return false; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && + (session_policy_res != Effect::Allow || e == Effect::Allow)) { + return -EACCES; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && e != Effect::Allow) { + return -EACCES; + } 
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) { + return -EACCES; + } + } + } + if (identity_policy_res == Effect::Pass && e == Effect::Pass && + ! dest_bucket_policy.verify_permission(this, + *s->auth.identity, + s->perm_mask, + RGW_PERM_WRITE)){ + return -EACCES; + } + } else if (! dest_bucket_policy.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_WRITE)) { + return -EACCES; + } + + } + + op_ret = init_dest_policy(); + if (op_ret < 0) { + return op_ret; + } + + return 0; +} + + +int RGWCopyObj::init_common() +{ + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) { + op_ret = -EINVAL; + return op_ret; + } + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) { + op_ret = -EINVAL; + return op_ret; + } + unmod_ptr = &unmod_time; + } + + bufferlist aclbl; + dest_policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs); + if (op_ret < 0) { + return op_ret; + } + populate_with_generic_attrs(s, attrs); + + return 0; +} + +static void copy_obj_progress_cb(off_t ofs, void *param) +{ + RGWCopyObj *op = static_cast(param); + op->progress_cb(ofs); +} + +void RGWCopyObj::progress_cb(off_t ofs) +{ + if (!s->cct->_conf->rgw_copy_obj_progress) + return; + + if (ofs - last_ofs < + static_cast(s->cct->_conf->rgw_copy_obj_progress_every_bytes)) { + return; + } + + send_partial_response(ofs); + + last_ofs = ofs; +} + +void RGWCopyObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWCopyObj::execute(optional_yield y) +{ + if (init_common() < 0) + return; + + // make reservation for notification if needed + std::unique_ptr res + = driver->get_notification( + s->object.get(), s->src_object.get(), + s, rgw::notify::ObjectCreatedCopy, y); + op_ret = res->publish_reserve(this); + if (op_ret < 0) 
{ + return; + } + + if ( ! version_id.empty()) { + s->object->set_instance(version_id); + } else if (s->bucket->versioning_enabled()) { + s->object->gen_rand_obj_instance_name(); + } + + s->src_object->set_atomic(); + s->object->set_atomic(); + + encode_delete_at_attr(delete_at, attrs); + + if (obj_retention) { + bufferlist obj_retention_bl; + obj_retention->encode(obj_retention_bl); + emplace_attr(RGW_ATTR_OBJECT_RETENTION, std::move(obj_retention_bl)); + } + if (obj_legal_hold) { + bufferlist obj_legal_hold_bl; + obj_legal_hold->encode(obj_legal_hold_bl); + emplace_attr(RGW_ATTR_OBJECT_LEGAL_HOLD, std::move(obj_legal_hold_bl)); + } + + uint64_t obj_size = 0; + { + // get src object size (cached in obj_ctx from verify_permission()) + RGWObjState* astate = nullptr; + op_ret = s->src_object->get_obj_state(this, &astate, s->yield, true); + if (op_ret < 0) { + return; + } + + /* Check if the src object is cloud-tiered */ + bufferlist bl; + if (astate->get_attr(RGW_ATTR_MANIFEST, bl)) { + RGWObjManifest m; + try{ + decode(m, bl); + if (m.get_tier_type() == "cloud-s3") { + op_ret = -ERR_INVALID_OBJECT_STATE; + s->err.message = "This object was transitioned to cloud-s3"; + ldpp_dout(this, 4) << "Cannot copy cloud tiered object. 
Failing with " + << op_ret << dendl; + return; + } + } catch (const buffer::end_of_buffer&) { + // ignore empty manifest; it's not cloud-tiered + } catch (const std::exception& e) { + ldpp_dout(this, 1) << "WARNING: failed to decode object manifest for " + << *s->object << ": " << e.what() << dendl; + } + } + + obj_size = astate->size; + + if (!s->system_request) { // no quota enforcement for system requests + if (astate->accounted_size > static_cast(s->cct->_conf->rgw_max_put_size)) { + op_ret = -ERR_TOO_LARGE; + return; + } + // enforce quota against the destination bucket owner + op_ret = s->bucket->check_quota(this, quota, astate->accounted_size, y); + if (op_ret < 0) { + return; + } + } + } + + bool high_precision_time = (s->system_request); + + /* Handle object versioning of Swift API. In case of copying to remote this + * should fail gently (op_ret == 0) as the dst_obj will not exist here. */ + op_ret = s->object->swift_versioning_copy(this, s->yield); + if (op_ret < 0) { + return; + } + + op_ret = s->src_object->copy_object(s->user.get(), + &s->info, + source_zone, + s->object.get(), + s->bucket.get(), + src_bucket.get(), + s->dest_placement, + &src_mtime, + &mtime, + mod_ptr, + unmod_ptr, + high_precision_time, + if_match, + if_nomatch, + attrs_mod, + copy_if_newer, + attrs, + RGWObjCategory::Main, + olh_epoch, + delete_at, + (version_id.empty() ? 
NULL : &version_id), + &s->req_id, /* use req_id as tag */ + &etag, + copy_obj_progress_cb, (void *)this, + this, + s->yield); + + // send request to notification manager + int ret = res->publish_commit(this, obj_size, mtime, etag, s->object->get_instance()); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + // too late to rollback operation, hence op_ret is not set here + } +} + +int RGWGetACLs::verify_permission(optional_yield y) +{ + bool perm; + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (!rgw::sal::Object::empty(s->object.get())) { + auto iam_action = s->object->get_instance().empty() ? + rgw::IAM::s3GetObjectAcl : + rgw::IAM::s3GetObjectVersionAcl; + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + perm = verify_object_permission(this, s, iam_action); + } else { + if (!s->bucket_exists) { + return -ERR_NO_SUCH_BUCKET; + } + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + perm = verify_bucket_permission(this, s, rgw::IAM::s3GetBucketAcl); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWGetACLs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetACLs::execute(optional_yield y) +{ + stringstream ss; + RGWAccessControlPolicy* const acl = \ + (!rgw::sal::Object::empty(s->object.get()) ? s->object_acl.get() : s->bucket_acl.get()); + RGWAccessControlPolicy_S3* const s3policy = \ + static_cast(acl); + s3policy->to_xml(ss); + acls = ss.str(); +} + + + +int RGWPutACLs::verify_permission(optional_yield y) +{ + bool perm; + + rgw_add_to_iam_environment(s->env, "s3:x-amz-acl", s->canned_acl); + + rgw_add_grant_to_iam_environment(s->env, s); + if (!rgw::sal::Object::empty(s->object.get())) { + auto iam_action = s->object->get_instance().empty() ? 
rgw::IAM::s3PutObjectAcl : rgw::IAM::s3PutObjectVersionAcl; + op_ret = rgw_iam_add_objtags(this, s, true, true); + perm = verify_object_permission(this, s, iam_action); + } else { + op_ret = rgw_iam_add_buckettags(this, s); + perm = verify_bucket_permission(this, s, rgw::IAM::s3PutBucketAcl); + } + if (!perm) + return -EACCES; + + return 0; +} + +int RGWGetLC::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + bool perm; + perm = verify_bucket_permission(this, s, rgw::IAM::s3GetLifecycleConfiguration); + if (!perm) + return -EACCES; + + return 0; +} + +int RGWPutLC::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + bool perm; + perm = verify_bucket_permission(this, s, rgw::IAM::s3PutLifecycleConfiguration); + if (!perm) + return -EACCES; + + return 0; +} + +int RGWDeleteLC::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + bool perm; + perm = verify_bucket_permission(this, s, rgw::IAM::s3PutLifecycleConfiguration); + if (!perm) + return -EACCES; + + return 0; +} + +void RGWPutACLs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetLC::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutLC::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteLC::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutACLs::execute(optional_yield y) +{ + bufferlist bl; + + RGWAccessControlPolicy_S3 *policy = NULL; + RGWACLXMLParser_S3 parser(s->cct); + RGWAccessControlPolicy_S3 new_policy(s->cct); + stringstream ss; + + op_ret = 0; /* XXX redundant? 
*/ + + if (!parser.init()) { + op_ret = -EINVAL; + return; + } + + + RGWAccessControlPolicy* const existing_policy = \ + (rgw::sal::Object::empty(s->object.get()) ? s->bucket_acl.get() : s->object_acl.get()); + + owner = existing_policy->get_owner(); + + op_ret = get_params(y); + if (op_ret < 0) { + if (op_ret == -ERANGE) { + ldpp_dout(this, 4) << "The size of request xml data is larger than the max limitation, data size = " + << s->length << dendl; + op_ret = -ERR_MALFORMED_XML; + s->err.message = "The XML you provided was larger than the maximum " + + std::to_string(s->cct->_conf->rgw_max_put_param_size) + + " bytes allowed."; + } + return; + } + + char* buf = data.c_str(); + ldpp_dout(this, 15) << "read len=" << data.length() << " data=" << (buf ? buf : "") << dendl; + + if (!s->canned_acl.empty() && data.length() > 0) { + op_ret = -EINVAL; + return; + } + + if (!s->canned_acl.empty() || s->has_acl_header) { + op_ret = get_policy_from_state(driver, s, ss); + if (op_ret < 0) + return; + + data.clear(); + data.append(ss.str()); + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -EINVAL; + return; + } + policy = static_cast(parser.find_first("AccessControlPolicy")); + if (!policy) { + op_ret = -EINVAL; + return; + } + + const RGWAccessControlList& req_acl = policy->get_acl(); + const multimap& req_grant_map = req_acl.get_grant_map(); +#define ACL_GRANTS_MAX_NUM 100 + int max_num = s->cct->_conf->rgw_acl_grants_max_num; + if (max_num < 0) { + max_num = ACL_GRANTS_MAX_NUM; + } + + int grants_num = req_grant_map.size(); + if (grants_num > max_num) { + ldpp_dout(this, 4) << "An acl can have up to " << max_num + << " grants, request acl grants num: " << grants_num << dendl; + op_ret = -ERR_LIMIT_EXCEEDED; + s->err.message = "The request is rejected, because the acl grants number you requested is larger than the maximum " + + std::to_string(max_num) + + " grants allowed in an acl."; + return; + } + + // forward bucket acl requests to meta master zone 
+ if ((rgw::sal::Object::empty(s->object.get()))) { + bufferlist in_data; + // include acl data unless it was generated from a canned_acl + if (s->canned_acl.empty()) { + in_data.append(data); + } + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 15) << "Old AccessControlPolicy"; + policy->to_xml(*_dout); + *_dout << dendl; + } + + op_ret = policy->rebuild(this, driver, &owner, new_policy, s->err.message); + if (op_ret < 0) + return; + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 15) << "New AccessControlPolicy:"; + new_policy.to_xml(*_dout); + *_dout << dendl; + } + + if (s->bucket_access_conf && + s->bucket_access_conf->block_public_acls() && + new_policy.is_public(this)) { + op_ret = -EACCES; + return; + } + new_policy.encode(bl); + map attrs; + + if (!rgw::sal::Object::empty(s->object.get())) { + s->object->set_atomic(); + //if instance is empty, we should modify the latest object + op_ret = s->object->modify_obj_attrs(RGW_ATTR_ACL, bl, s->yield, this); + } else { + map attrs = s->bucket_attrs; + attrs[RGW_ATTR_ACL] = bl; + op_ret = s->bucket->merge_and_store_attrs(this, attrs, y); + } + if (op_ret == -ECANCELED) { + op_ret = 0; /* lost a race, but it's ok because acls are immutable */ + } +} + +void RGWPutLC::execute(optional_yield y) +{ + bufferlist bl; + + RGWLifecycleConfiguration_S3 config(s->cct); + RGWXMLParser parser; + RGWLifecycleConfiguration_S3 new_config(s->cct); + + // amazon says that Content-MD5 is required for this op specifically, but MD5 + // is not a security primitive and FIPS mode makes it difficult to use. 
if the + // client provides the header we'll try to verify its checksum, but the header + // itself is no longer required + std::optional content_md5_bin; + + content_md5 = s->info.env->get("HTTP_CONTENT_MD5"); + if (content_md5 != nullptr) { + try { + content_md5_bin = rgw::from_base64(std::string_view(content_md5)); + } catch (...) { + s->err.message = "Request header Content-MD5 contains character " + "that is not base64 encoded."; + ldpp_dout(this, 5) << s->err.message << dendl; + op_ret = -ERR_BAD_DIGEST; + return; + } + } + + if (!parser.init()) { + op_ret = -EINVAL; + return; + } + + op_ret = get_params(y); + if (op_ret < 0) + return; + + char* buf = data.c_str(); + ldpp_dout(this, 15) << "read len=" << data.length() << " data=" << (buf ? buf : "") << dendl; + + if (content_md5_bin) { + MD5 data_hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + data_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + unsigned char data_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE]; + data_hash.Update(reinterpret_cast(buf), data.length()); + data_hash.Final(data_hash_res); + + if (memcmp(data_hash_res, content_md5_bin->c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) { + op_ret = -ERR_BAD_DIGEST; + s->err.message = "The Content-MD5 you specified did not match what we received."; + ldpp_dout(this, 5) << s->err.message + << " Specified content md5: " << content_md5 + << ", calculated content md5: " << data_hash_res + << dendl; + return; + } + } + + if (!parser.parse(buf, data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("LifecycleConfiguration", config, &parser); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "Bad lifecycle configuration: " << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + op_ret = config.rebuild(new_config); + if (op_ret < 0) + return; + + if (s->cct->_conf->subsys.should_gather()) { + XMLFormatter xf; + new_config.dump_xml(&xf); + stringstream ss; + xf.flush(ss); 
+ ldpp_dout(this, 15) << "New LifecycleConfiguration:" << ss.str() << dendl; + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = driver->get_rgwlc()->set_bucket_config(s->bucket.get(), s->bucket_attrs, &new_config); + if (op_ret < 0) { + return; + } + return; +} + +void RGWDeleteLC::execute(optional_yield y) +{ + bufferlist data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = driver->get_rgwlc()->remove_bucket_config(s->bucket.get(), s->bucket_attrs); + if (op_ret < 0) { + return; + } + return; +} + +int RGWGetCORS::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketCORS); +} + +void RGWGetCORS::execute(optional_yield y) +{ + op_ret = read_bucket_cors(); + if (op_ret < 0) + return ; + + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + op_ret = -ERR_NO_CORS_FOUND; + return; + } +} + +int RGWPutCORS::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketCORS); +} + +void RGWPutCORS::execute(optional_yield y) +{ + rgw_raw_obj obj; + + op_ret = get_params(y); + if (op_ret < 0) + return; + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) 
<< "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + rgw::sal::Attrs attrs(s->bucket_attrs); + attrs[RGW_ATTR_CORS] = cors_bl; + return s->bucket->merge_and_store_attrs(this, attrs, s->yield); + }); +} + +int RGWDeleteCORS::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + // No separate delete permission + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketCORS); +} + +void RGWDeleteCORS::execute(optional_yield y) +{ + bufferlist data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + op_ret = read_bucket_cors(); + if (op_ret < 0) + return op_ret; + + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + op_ret = -ENOENT; + return op_ret; + } + + rgw::sal::Attrs attrs(s->bucket_attrs); + attrs.erase(RGW_ATTR_CORS); + op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + if (op_ret < 0) { + ldpp_dout(this, 0) << "RGWLC::RGWDeleteCORS() failed to set attrs on bucket=" << s->bucket->get_name() + << " returned err=" << op_ret << dendl; + } + return op_ret; + }); +} + +void RGWOptionsCORS::get_response_params(string& hdrs, string& exp_hdrs, unsigned *max_age) { + get_cors_response_headers(this, rule, req_hdrs, hdrs, exp_hdrs, max_age); +} + +int RGWOptionsCORS::validate_cors_request(RGWCORSConfiguration *cc) { + rule = cc->host_name_rule(origin); + if (!rule) { + ldpp_dout(this, 10) << "There is no cors rule present for " << origin << dendl; + return -ENOENT; + } + + if (!validate_cors_rule_method(this, rule, 
req_meth)) { + return -ENOENT; + } + + if (!validate_cors_rule_header(this, rule, req_hdrs)) { + return -ENOENT; + } + + return 0; +} + +void RGWOptionsCORS::execute(optional_yield y) +{ + op_ret = read_bucket_cors(); + if (op_ret < 0) + return; + + origin = s->info.env->get("HTTP_ORIGIN"); + if (!origin) { + ldpp_dout(this, 0) << "Missing mandatory Origin header" << dendl; + op_ret = -EINVAL; + return; + } + req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + ldpp_dout(this, 0) << "Missing mandatory Access-control-request-method header" << dendl; + op_ret = -EINVAL; + return; + } + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + op_ret = -ENOENT; + return; + } + req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS"); + op_ret = validate_cors_request(&bucket_cors); + if (!rule) { + origin = req_meth = NULL; + return; + } + return; +} + +int RGWGetRequestPayment::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketRequestPayment); +} + +void RGWGetRequestPayment::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetRequestPayment::execute(optional_yield y) +{ + requester_pays = s->bucket->get_info().requester_pays; +} + +int RGWSetRequestPayment::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketRequestPayment); +} + +void RGWSetRequestPayment::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetRequestPayment::execute(optional_yield y) +{ + + op_ret = get_params(y); + if (op_ret < 0) + return; + + op_ret = 
driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + s->bucket->get_info().requester_pays = requester_pays; + op_ret = s->bucket->put_info(this, false, real_time()); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name() + << " returned err=" << op_ret << dendl; + return; + } + s->bucket_attrs = s->bucket->get_attrs(); +} + +int RGWInitMultipart::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + // add server-side encryption headers + rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map); + + if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + ARN obj_arn(s->object->get_obj()); + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + return -EACCES; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if 
((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + return 0; + } + } + return -EACCES; + } + if (e == Effect::Allow || identity_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWInitMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWInitMultipart::execute(optional_yield y) +{ + multipart_trace = tracing::rgw::tracer.start_trace(tracing::rgw::MULTIPART, s->trace_enabled); + bufferlist aclbl, tracebl; + rgw::sal::Attrs attrs; + + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + + if (rgw::sal::Object::empty(s->object.get())) + return; + + if (multipart_trace) { + tracing::encode(multipart_trace->GetContext(), tracebl); + attrs[RGW_ATTR_TRACE] = tracebl; + } + + policy.encode(aclbl); + attrs[RGW_ATTR_ACL] = aclbl; + + populate_with_generic_attrs(s, attrs); + + /* select encryption mode */ + op_ret = prepare_encryption(attrs); + if (op_ret != 0) + return; + + op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs); + if (op_ret < 0) { + return; + } + + std::unique_ptr upload; + upload = s->bucket->get_multipart_upload(s->object->get_name(), + upload_id); + op_ret = upload->init(this, s->yield, s->owner, s->dest_placement, attrs); + + if (op_ret == 0) { + upload_id = upload->get_upload_id(); + } + s->trace->SetAttribute(tracing::rgw::UPLOAD_ID, 
upload_id); + multipart_trace->UpdateName(tracing::rgw::MULTIPART + upload_id); + +} + +int RGWCompleteMultipart::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + // add server-side encryption headers + rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map); + + if (s->iam_policy || ! s->iam_user_policies.empty() || ! s->session_policies.empty()) { + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + rgw::ARN obj_arn(s->object->get_obj()); + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + return -EACCES; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) { + return 0; + } + } else if (princ_type == 
rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + return 0; + } + } + return -EACCES; + } + if (e == Effect::Allow || identity_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWCompleteMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWCompleteMultipart::execute(optional_yield y) +{ + RGWMultiCompleteUpload *parts; + RGWMultiXMLParser parser; + std::unique_ptr upload; + off_t ofs = 0; + std::unique_ptr meta_obj; + std::unique_ptr target_obj; + uint64_t olh_epoch = 0; + + op_ret = get_params(y); + if (op_ret < 0) + return; + op_ret = get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + return; + } + + if (!data.length()) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + if (!parser.init()) { + op_ret = -EIO; + return; + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + parts = static_cast(parser.find_first("CompleteMultipartUpload")); + if (!parts || parts->parts.empty()) { + // CompletedMultipartUpload is incorrect but some versions of some libraries use it, see PR #41700 + parts = static_cast(parser.find_first("CompletedMultipartUpload")); + } + + if (!parts || parts->parts.empty()) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + + if ((int)parts->parts.size() > + s->cct->_conf->rgw_multipart_part_upload_limit) { + op_ret = -ERANGE; + return; + } + + upload = s->bucket->get_multipart_upload(s->object->get_name(), upload_id); + + RGWCompressionInfo cs_info; + bool compressed = false; + uint64_t accounted_size = 0; + + list remove_objs; /* objects to be removed from index listing */ + + meta_obj = upload->get_meta_obj(); + meta_obj->set_in_extra_data(true); + meta_obj->set_hash_source(s->object->get_name()); + + /*take a cls 
lock on meta_obj to prevent racing completions (or retries) + from deleting the parts*/ + int max_lock_secs_mp = + s->cct->_conf.get_val("rgw_mp_lock_max_time"); + utime_t dur(max_lock_secs_mp, 0); + + serializer = meta_obj->get_serializer(this, "RGWCompleteMultipart"); + op_ret = serializer->try_lock(this, dur, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "failed to acquire lock" << dendl; + if (op_ret == -ENOENT && check_previously_completed(parts)) { + ldpp_dout(this, 1) << "NOTICE: This multipart completion is already completed" << dendl; + op_ret = 0; + return; + } + op_ret = -ERR_INTERNAL_ERROR; + s->err.message = "This multipart completion is already in progress"; + return; + } + + op_ret = meta_obj->get_obj_attrs(s->yield, this); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << meta_obj + << " ret=" << op_ret << dendl; + return; + } + s->trace->SetAttribute(tracing::rgw::UPLOAD_ID, upload_id); + jspan_context trace_ctx(false, false); + extract_span_context(meta_obj->get_attrs(), trace_ctx); + multipart_trace = tracing::rgw::tracer.add_span(name(), trace_ctx); + + + // make reservation for notification if needed + std::unique_ptr res + = driver->get_notification(meta_obj.get(), nullptr, s, rgw::notify::ObjectCreatedCompleteMultipartUpload, y, &s->object->get_name()); + op_ret = res->publish_reserve(this); + if (op_ret < 0) { + return; + } + + target_obj = s->bucket->get_object(rgw_obj_key(s->object->get_name())); + if (s->bucket->versioning_enabled()) { + if (!version_id.empty()) { + target_obj->set_instance(version_id); + } else { + target_obj->gen_rand_obj_instance_name(); + version_id = target_obj->get_instance(); + } + } + target_obj->set_attrs(meta_obj->get_attrs()); + + op_ret = upload->complete(this, y, s->cct, parts->parts, remove_objs, accounted_size, compressed, cs_info, ofs, s->req_id, s->owner, olh_epoch, target_obj.get()); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: upload complete failed ret=" << 
op_ret << dendl; + return; + } + + // remove the upload meta object ; the meta object is not versioned + // when the bucket is, as that would add an unneeded delete marker + int r = meta_obj->delete_object(this, y, true /* prevent versioning */); + if (r >= 0) { + /* serializer's exclusive lock is released */ + serializer->clear_locked(); + } else { + ldpp_dout(this, 0) << "WARNING: failed to remove object " << meta_obj << dendl; + } + + // send request to notification manager + int ret = res->publish_commit(this, ofs, upload->get_mtime(), etag, target_obj->get_instance()); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + // too late to rollback operation, hence op_ret is not set here + } +} // RGWCompleteMultipart::execute + +bool RGWCompleteMultipart::check_previously_completed(const RGWMultiCompleteUpload* parts) +{ + // re-calculate the etag from the parts and compare to the existing object + int ret = s->object->get_obj_attrs(s->yield, this); + if (ret < 0) { + ldpp_dout(this, 0) << __func__ << "() ERROR: get_obj_attrs() returned ret=" << ret << dendl; + return false; + } + rgw::sal::Attrs sattrs = s->object->get_attrs(); + string oetag = sattrs[RGW_ATTR_ETAG].to_str(); + + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + for (const auto& [index, part] : parts->parts) { + std::string partetag = rgw_string_unquote(part); + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + hex_to_buf(partetag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + ldpp_dout(this, 20) << __func__ << "() re-calculating multipart etag: part: " + << index << ", etag: " << partetag << dendl; + } + + unsigned char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + hash.Final(final_etag); + buf_to_hex(final_etag, CEPH_CRYPTO_MD5_DIGESTSIZE, 
final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)parts->parts.size()); + + if (oetag.compare(final_etag_str) != 0) { + ldpp_dout(this, 1) << __func__ << "() NOTICE: etag mismatch: object etag:" + << oetag << ", re-calculated etag:" << final_etag_str << dendl; + return false; + } + ldpp_dout(this, 5) << __func__ << "() object etag and re-calculated etag match, etag: " << oetag << dendl; + return true; +} + +void RGWCompleteMultipart::complete() +{ + /* release exclusive lock iff not already */ + if (unlikely(serializer.get() && serializer->is_locked())) { + int r = serializer->unlock(); + if (r < 0) { + ldpp_dout(this, 0) << "WARNING: failed to unlock " << *serializer.get() << dendl; + } + } + + etag = s->object->get_attrs()[RGW_ATTR_ETAG].to_str(); + + send_response(); +} + +int RGWAbortMultipart::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (s->iam_policy || ! 
s->iam_user_policies.empty() || !s->session_policies.empty()) { + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3AbortMultipartUpload, + s->object->get_obj()); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + ARN obj_arn(s->object->get_obj()); + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3AbortMultipartUpload, + obj_arn, princ_type); + } + + if (e == Effect::Deny) { + return -EACCES; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, + s->object->get_obj()); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + return 0; + } + } + return -EACCES; + } + if (e == Effect::Allow || identity_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWAbortMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void 
RGWAbortMultipart::execute(optional_yield y) +{ + op_ret = -EINVAL; + string upload_id; + upload_id = s->info.args.get("uploadId"); + std::unique_ptr meta_obj; + std::unique_ptr upload; + + if (upload_id.empty() || rgw::sal::Object::empty(s->object.get())) + return; + + upload = s->bucket->get_multipart_upload(s->object->get_name(), upload_id); + jspan_context trace_ctx(false, false); + if (tracing::rgw::tracer.is_enabled()) { + // read meta object attributes for trace info + meta_obj = upload->get_meta_obj(); + meta_obj->set_in_extra_data(true); + meta_obj->get_obj_attrs(s->yield, this); + extract_span_context(meta_obj->get_attrs(), trace_ctx); + } + multipart_trace = tracing::rgw::tracer.add_span(name(), trace_ctx); + + op_ret = upload->abort(this, s->cct); +} + +int RGWListMultipart::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (!verify_object_permission(this, s, rgw::IAM::s3ListMultipartUploadParts)) + return -EACCES; + + return 0; +} + +void RGWListMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWListMultipart::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) + return; + + upload = s->bucket->get_multipart_upload(s->object->get_name(), upload_id); + + rgw::sal::Attrs attrs; + op_ret = upload->get_info(this, s->yield, &placement, &attrs); + /* decode policy */ + map::iterator iter = attrs.find(RGW_ATTR_ACL); + if (iter != attrs.end()) { + auto bliter = iter->second.cbegin(); + try { + policy.decode(bliter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + op_ret = -EIO; + } + } + if (op_ret < 0) + return; + + op_ret = upload->list_parts(this, s->cct, max_parts, marker, NULL, &truncated); +} + +int 
RGWListBucketMultiparts::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, + s, + rgw::IAM::s3ListBucketMultipartUploads)) + return -EACCES; + + return 0; +} + +void RGWListBucketMultiparts::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWListBucketMultiparts::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (s->prot_flags & RGW_REST_SWIFT) { + string path_args; + path_args = s->info.args.get("path"); + if (!path_args.empty()) { + if (!delimiter.empty() || !prefix.empty()) { + op_ret = -EINVAL; + return; + } + prefix = path_args; + delimiter="/"; + } + } + + op_ret = s->bucket->list_multiparts(this, prefix, marker_meta, + delimiter, max_uploads, uploads, + &common_prefixes, &is_truncated); + if (op_ret < 0) { + return; + } + + if (!uploads.empty()) { + next_marker_key = uploads.back()->get_key(); + next_marker_upload_id = uploads.back()->get_upload_id(); + } +} + +void RGWGetHealthCheck::execute(optional_yield y) +{ + if (!g_conf()->rgw_healthcheck_disabling_path.empty() && + (::access(g_conf()->rgw_healthcheck_disabling_path.c_str(), F_OK) == 0)) { + /* Disabling path specified & existent in the filesystem. */ + op_ret = -ERR_SERVICE_UNAVAILABLE; /* 503 */ + } else { + op_ret = 0; /* 200 OK */ + } +} + +int RGWDeleteMultiObj::verify_permission(optional_yield y) +{ + int op_ret = get_params(y); + if (op_ret) { + return op_ret; + } + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (s->iam_policy || ! s->iam_user_policies.empty() || ! 
s->session_policies.empty()) { + if (s->bucket->get_info().obj_lock_enabled() && bypass_governance_mode) { + ARN bucket_arn(s->bucket->get_key()); + auto r = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key())); + if (r == Effect::Deny) { + bypass_perm = false; + } else if (r == Effect::Pass && s->iam_policy) { + r = s->iam_policy->eval(s->env, *s->auth.identity, rgw::IAM::s3BypassGovernanceRetention, + bucket_arn); + if (r == Effect::Deny) { + bypass_perm = false; + } + } else if (r == Effect::Pass && !s->session_policies.empty()) { + r = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key())); + if (r == Effect::Deny) { + bypass_perm = false; + } + } + } + + bool not_versioned = rgw::sal::Object::empty(s->object.get()) || s->object->get_instance().empty(); + + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + not_versioned ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket->get_key())); + if (identity_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect r = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + rgw::ARN bucket_arn(s->bucket->get_key()); + if (s->iam_policy) { + r = s->iam_policy->eval(s->env, *s->auth.identity, + not_versioned ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + bucket_arn, + princ_type); + } + if (r == Effect::Deny) + return -EACCES; + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + not_versioned ? 
+ rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket->get_key())); + if (session_policy_res == Effect::Deny) { + return -EACCES; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && r == Effect::Allow)) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow) { + return 0; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + return 0; + } + } + return -EACCES; + } + if (r == Effect::Allow || identity_policy_res == Effect::Allow) + return 0; + } + + acl_allowed = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE); + if (!acl_allowed) + return -EACCES; + + return 0; +} + +void RGWDeleteMultiObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteMultiObj::write_ops_log_entry(rgw_log_entry& entry) const { + int num_err = 0; + int num_ok = 0; + for (auto iter = ops_log_entries.begin(); + iter != ops_log_entries.end(); + ++iter) { + if (iter->error) { + num_err++; + } else { + num_ok++; + } + } + entry.delete_multi_obj_meta.num_err = num_err; + entry.delete_multi_obj_meta.num_ok = num_ok; + entry.delete_multi_obj_meta.objects = std::move(ops_log_entries); +} + +void RGWDeleteMultiObj::wait_flush(optional_yield y, + boost::asio::deadline_timer *formatter_flush_cond, + std::function predicate) +{ + if (y && formatter_flush_cond) { + auto yc = y.get_yield_context(); + while (!predicate()) { + boost::system::error_code error; + 
formatter_flush_cond->async_wait(yc[error]); + rgw_flush_formatter(s, s->formatter); + } + } +} + +void RGWDeleteMultiObj::handle_individual_object(const rgw_obj_key& o, optional_yield y, + boost::asio::deadline_timer *formatter_flush_cond) +{ + std::string version_id; + std::unique_ptr obj = bucket->get_object(o); + if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) { + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + o.instance.empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(obj->get_obj())); + if (identity_policy_res == Effect::Deny) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + + rgw::IAM::Effect e = Effect::Pass; + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + if (s->iam_policy) { + ARN obj_arn(obj->get_obj()); + e = s->iam_policy->eval(s->env, + *s->auth.identity, + o.instance.empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + obj_arn, + princ_type); + } + if (e == Effect::Deny) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + o.instance.empty() ? 
+ rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(obj->get_obj())); + if (session_policy_res == Effect::Deny) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && + (session_policy_res != Effect::Allow || e != Effect::Allow)) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && e != Effect::Allow) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + } + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + + if ((identity_policy_res == Effect::Pass && e == Effect::Pass && !acl_allowed)) { + send_partial_response(o, false, "", -EACCES, formatter_flush_cond); + return; + } + } + + uint64_t obj_size = 0; + std::string etag; + + if (!rgw::sal::Object::empty(obj.get())) { + RGWObjState* astate = nullptr; + bool check_obj_lock = obj->have_instance() && bucket->get_info().obj_lock_enabled(); + const auto ret = obj->get_obj_state(this, &astate, y, true); + + if (ret < 0) { + if (ret == -ENOENT) { + // object maybe delete_marker, skip check_obj_lock + check_obj_lock = false; + } else { + // Something went wrong. 
+ send_partial_response(o, false, "", ret, formatter_flush_cond); + return; + } + } else { + obj_size = astate->size; + etag = astate->attrset[RGW_ATTR_ETAG].to_str(); + } + + if (check_obj_lock) { + ceph_assert(astate); + int object_lock_response = verify_object_lock(this, astate->attrset, bypass_perm, bypass_governance_mode); + if (object_lock_response != 0) { + send_partial_response(o, false, "", object_lock_response, formatter_flush_cond); + return; + } + } + } + + // make reservation for notification if needed + const auto versioned_object = s->bucket->versioning_enabled(); + const auto event_type = versioned_object && obj->get_instance().empty() ? + rgw::notify::ObjectRemovedDeleteMarkerCreated : + rgw::notify::ObjectRemovedDelete; + std::unique_ptr res + = driver->get_notification(obj.get(), s->src_object.get(), s, event_type, y); + op_ret = res->publish_reserve(this); + if (op_ret < 0) { + send_partial_response(o, false, "", op_ret, formatter_flush_cond); + return; + } + + obj->set_atomic(); + + std::unique_ptr del_op = obj->get_delete_op(); + del_op->params.versioning_status = obj->get_bucket()->get_info().versioning_status(); + del_op->params.obj_owner = s->owner; + del_op->params.bucket_owner = s->bucket_owner; + del_op->params.marker_version_id = version_id; + + op_ret = del_op->delete_obj(this, y); + if (op_ret == -ENOENT) { + op_ret = 0; + } + + send_partial_response(o, del_op->result.delete_marker, del_op->result.version_id, op_ret, formatter_flush_cond); + + // send request to notification manager + int ret = res->publish_commit(this, obj_size, ceph::real_clock::now(), etag, version_id); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; + // too late to rollback operation, hence op_ret is not set here + } +} + +void RGWDeleteMultiObj::execute(optional_yield y) +{ + RGWMultiDelDelete *multi_delete; + vector::iterator iter; + RGWMultiDelXMLParser parser; + uint32_t aio_count = 0; + const 
uint32_t max_aio = std::max(1, s->cct->_conf->rgw_multi_obj_del_max_aio); + char* buf; + std::optional formatter_flush_cond; + if (y) { + formatter_flush_cond = std::make_optional(y.get_io_context()); + } + + buf = data.c_str(); + if (!buf) { + op_ret = -EINVAL; + goto error; + } + + if (!parser.init()) { + op_ret = -EINVAL; + goto error; + } + + if (!parser.parse(buf, data.length(), 1)) { + op_ret = -EINVAL; + goto error; + } + + multi_delete = static_cast(parser.find_first("Delete")); + if (!multi_delete) { + op_ret = -EINVAL; + goto error; + } else { +#define DELETE_MULTI_OBJ_MAX_NUM 1000 + int max_num = s->cct->_conf->rgw_delete_multi_obj_max_num; + if (max_num < 0) { + max_num = DELETE_MULTI_OBJ_MAX_NUM; + } + int multi_delete_object_num = multi_delete->objects.size(); + if (multi_delete_object_num > max_num) { + op_ret = -ERR_MALFORMED_XML; + goto error; + } + } + + if (multi_delete->is_quiet()) + quiet = true; + + if (s->bucket->get_info().mfa_enabled()) { + bool has_versioned = false; + for (auto i : multi_delete->objects) { + if (!i.instance.empty()) { + has_versioned = true; + break; + } + } + if (has_versioned && !s->mfa_verified) { + ldpp_dout(this, 5) << "NOTICE: multi-object delete request with a versioned object, mfa auth not provided" << dendl; + op_ret = -ERR_MFA_REQUIRED; + goto error; + } + } + + begin_response(); + if (multi_delete->objects.empty()) { + goto done; + } + + for (iter = multi_delete->objects.begin(); + iter != multi_delete->objects.end(); + ++iter) { + rgw_obj_key obj_key = *iter; + if (y) { + wait_flush(y, &*formatter_flush_cond, [&aio_count, max_aio] { + return aio_count < max_aio; + }); + aio_count++; + spawn::spawn(y.get_yield_context(), [this, &y, &aio_count, obj_key, &formatter_flush_cond] (yield_context yield) { + handle_individual_object(obj_key, optional_yield { y.get_io_context(), yield }, &*formatter_flush_cond); + aio_count--; + }); + } else { + handle_individual_object(obj_key, y, nullptr); + } + } + if 
(formatter_flush_cond) { + wait_flush(y, &*formatter_flush_cond, [this, n=multi_delete->objects.size()] { + return n == ops_log_entries.size(); + }); + } + + /* set the return code to zero, errors at this point will be + dumped to the response */ + op_ret = 0; + +done: + // will likely segfault if begin_response() has not been called + end_response(); + return; + +error: + send_status(); + return; + +} + +bool RGWBulkDelete::Deleter::verify_permission(RGWBucketInfo& binfo, + map& battrs, + ACLOwner& bucket_owner /* out */, + optional_yield y) +{ + RGWAccessControlPolicy bacl(driver->ctx()); + int ret = read_bucket_policy(dpp, driver, s, binfo, battrs, &bacl, binfo.bucket, y); + if (ret < 0) { + return false; + } + + auto policy = get_iam_policy_from_attr(s->cct, battrs, binfo.bucket.tenant); + + bucket_owner = bacl.get_owner(); + + /* We can use global user_acl because each BulkDelete request is allowed + * to work on entities from a single account only. */ + return verify_bucket_permission(dpp, s, binfo.bucket, s->user_acl.get(), + &bacl, policy, s->iam_user_policies, s->session_policies, rgw::IAM::s3DeleteBucket); +} + +bool RGWBulkDelete::Deleter::delete_single(const acct_path_t& path, optional_yield y) +{ + std::unique_ptr bucket; + ACLOwner bowner; + RGWObjVersionTracker ot; + + int ret = driver->get_bucket(dpp, s->user.get(), s->user->get_tenant(), path.bucket_name, &bucket, y); + if (ret < 0) { + goto binfo_fail; + } + + ret = bucket->load_bucket(dpp, s->yield); + if (ret < 0) { + goto binfo_fail; + } + + if (!verify_permission(bucket->get_info(), bucket->get_attrs(), bowner, y)) { + ret = -EACCES; + goto auth_fail; + } + + if (!path.obj_key.empty()) { + ACLOwner bucket_owner; + + bucket_owner.set_id(bucket->get_info().owner); + std::unique_ptr obj = bucket->get_object(path.obj_key); + obj->set_atomic(); + + std::unique_ptr del_op = obj->get_delete_op(); + del_op->params.versioning_status = obj->get_bucket()->get_info().versioning_status(); + 
del_op->params.obj_owner = bowner; + del_op->params.bucket_owner = bucket_owner; + + ret = del_op->delete_obj(dpp, y); + if (ret < 0) { + goto delop_fail; + } + } else { + ret = bucket->remove_bucket(dpp, false, true, &s->info, s->yield); + if (ret < 0) { + goto delop_fail; + } + } + + num_deleted++; + return true; + +binfo_fail: + if (-ENOENT == ret) { + ldpp_dout(dpp, 20) << "cannot find bucket = " << path.bucket_name << dendl; + num_unfound++; + } else { + ldpp_dout(dpp, 20) << "cannot get bucket info, ret = " << ret << dendl; + + fail_desc_t failed_item = { + .err = ret, + .path = path + }; + failures.push_back(failed_item); + } + return false; + +auth_fail: + ldpp_dout(dpp, 20) << "wrong auth for " << path << dendl; + { + fail_desc_t failed_item = { + .err = ret, + .path = path + }; + failures.push_back(failed_item); + } + return false; + +delop_fail: + if (-ENOENT == ret) { + ldpp_dout(dpp, 20) << "cannot find entry " << path << dendl; + num_unfound++; + } else { + fail_desc_t failed_item = { + .err = ret, + .path = path + }; + failures.push_back(failed_item); + } + return false; +} + +bool RGWBulkDelete::Deleter::delete_chunk(const std::list& paths, optional_yield y) +{ + ldpp_dout(dpp, 20) << "in delete_chunk" << dendl; + for (auto path : paths) { + ldpp_dout(dpp, 20) << "bulk deleting path: " << path << dendl; + delete_single(path, y); + } + + return true; +} + +int RGWBulkDelete::verify_permission(optional_yield y) +{ + return 0; +} + +void RGWBulkDelete::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWBulkDelete::execute(optional_yield y) +{ + deleter = std::unique_ptr(new Deleter(this, driver, s)); + + bool is_truncated = false; + do { + list items; + + int ret = get_data(items, &is_truncated); + if (ret < 0) { + return; + } + + ret = deleter->delete_chunk(items, y); + } while (!op_ret && is_truncated); + + return; +} + + +constexpr std::array RGWBulkUploadOp::terminal_errors; + +int RGWBulkUploadOp::verify_permission(optional_yield y) +{ + 
if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (! verify_user_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + if (s->user->get_tenant() != s->bucket_tenant) { + ldpp_dout(this, 10) << "user cannot create a bucket in a different tenant" + << " (user_id.tenant=" << s->user->get_tenant() + << " requested=" << s->bucket_tenant << ")" << dendl; + return -EACCES; + } + + if (s->user->get_max_buckets() < 0) { + return -EPERM; + } + + return 0; +} + +void RGWBulkUploadOp::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +boost::optional> +RGWBulkUploadOp::parse_path(const std::string_view& path) +{ + /* We need to skip all slashes at the beginning in order to preserve + * compliance with Swift. */ + const size_t start_pos = path.find_first_not_of('/'); + + if (std::string_view::npos != start_pos) { + /* Seperator is the first slash after the leading ones. */ + const size_t sep_pos = path.substr(start_pos).find('/'); + + if (std::string_view::npos != sep_pos) { + const auto bucket_name = path.substr(start_pos, sep_pos - start_pos); + const auto obj_name = path.substr(sep_pos + 1); + + return std::make_pair(std::string(bucket_name), + rgw_obj_key(std::string(obj_name))); + } else { + /* It's guaranteed here that bucket name is at least one character + * long and is different than slash. */ + return std::make_pair(std::string(path.substr(start_pos)), + rgw_obj_key()); + } + } + + return none; +} + +std::pair +RGWBulkUploadOp::handle_upload_path(req_state *s) +{ + std::string bucket_path, file_prefix; + if (! s->init_state.url_bucket.empty()) { + file_prefix = bucket_path = s->init_state.url_bucket + "/"; + if (!rgw::sal::Object::empty(s->object.get())) { + const std::string& object_name = s->object->get_name(); + + /* As rgw_obj_key::empty() already verified emptiness of s->object->get_name(), + * we can safely examine its last element. 
*/ + if (object_name.back() == '/') { + file_prefix.append(object_name); + } else { + file_prefix.append(object_name).append("/"); + } + } + } + return std::make_pair(bucket_path, file_prefix); +} + +int RGWBulkUploadOp::handle_dir_verify_permission(optional_yield y) +{ + if (s->user->get_max_buckets() > 0) { + rgw::sal::BucketList buckets; + std::string marker; + op_ret = s->user->list_buckets(this, marker, std::string(), s->user->get_max_buckets(), + false, buckets, y); + if (op_ret < 0) { + return op_ret; + } + + if (buckets.count() >= static_cast(s->user->get_max_buckets())) { + return -ERR_TOO_MANY_BUCKETS; + } + } + + return 0; +} + +static void forward_req_info(const DoutPrefixProvider *dpp, CephContext *cct, req_info& info, const std::string& bucket_name) +{ + /* the request of container or object level will contain bucket name. + * only at account level need to append the bucket name */ + if (info.script_uri.find(bucket_name) != std::string::npos) { + return; + } + + ldpp_dout(dpp, 20) << "append the bucket: "<< bucket_name << " to req_info" << dendl; + info.script_uri.append("/").append(bucket_name); + info.request_uri_aws4 = info.request_uri = info.script_uri; + info.effective_uri = "/" + bucket_name; +} + +void RGWBulkUploadOp::init(rgw::sal::Driver* const driver, + req_state* const s, + RGWHandler* const h) +{ + RGWOp::init(driver, s, h); +} + +int RGWBulkUploadOp::handle_dir(const std::string_view path, optional_yield y) +{ + ldpp_dout(this, 20) << "got directory=" << path << dendl; + + op_ret = handle_dir_verify_permission(y); + if (op_ret < 0) { + return op_ret; + } + + std::string bucket_name; + rgw_obj_key object_junk; + std::tie(bucket_name, object_junk) = *parse_path(path); + + /* we need to make sure we read bucket info, it's not read before for this + * specific request */ + std::unique_ptr bucket; + + /* Create metadata: ACLs. 
*/ + std::map attrs; + RGWAccessControlPolicy policy; + policy.create_default(s->user->get_id(), s->user->get_display_name()); + ceph::bufferlist aclbl; + policy.encode(aclbl); + attrs.emplace(RGW_ATTR_ACL, std::move(aclbl)); + + obj_version objv, ep_objv; + bool bucket_exists; + RGWQuotaInfo quota_info; + const RGWQuotaInfo* pquota_info = nullptr; + RGWBucketInfo out_info; + string swift_ver_location; + rgw_bucket new_bucket; + req_info info = s->info; + new_bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */ + new_bucket.name = bucket_name; + rgw_placement_rule placement_rule; + placement_rule.storage_class = s->info.storage_class; + forward_req_info(this, s->cct, info, bucket_name); + + op_ret = s->user->create_bucket(this, new_bucket, + driver->get_zone()->get_zonegroup().get_id(), + placement_rule, swift_ver_location, + pquota_info, policy, attrs, + out_info, ep_objv, + true, false, &bucket_exists, + info, &bucket, y); + /* continue if EEXIST and create_bucket will fail below. this way we can + * recover from a partial create by retrying it. */ + ldpp_dout(this, 20) << "rgw_create_bucket returned ret=" << op_ret + << ", bucket=" << bucket << dendl; + + return op_ret; +} + + +bool RGWBulkUploadOp::handle_file_verify_permission(RGWBucketInfo& binfo, + const rgw_obj& obj, + std::map& battrs, + ACLOwner& bucket_owner /* out */, + optional_yield y) +{ + RGWAccessControlPolicy bacl(driver->ctx()); + op_ret = read_bucket_policy(this, driver, s, binfo, battrs, &bacl, binfo.bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "cannot read_policy() for bucket" << dendl; + return false; + } + + auto policy = get_iam_policy_from_attr(s->cct, battrs, binfo.bucket.tenant); + + bucket_owner = bacl.get_owner(); + if (policy || ! 
s->iam_user_policies.empty() || !s->session_policies.empty()) { + auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env, + rgw::IAM::s3PutObject, obj); + if (identity_policy_res == Effect::Deny) { + return false; + } + + rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other; + ARN obj_arn(obj); + auto e = policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, obj_arn, princ_type); + if (e == Effect::Deny) { + return false; + } + + if (!s->session_policies.empty()) { + auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env, + rgw::IAM::s3PutObject, obj); + if (session_policy_res == Effect::Deny) { + return false; + } + if (princ_type == rgw::IAM::PolicyPrincipal::Role) { + //Intersection of session policy and identity policy plus intersection of session policy and bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || + (session_policy_res == Effect::Allow && e == Effect::Allow)) { + return true; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) { + //Intersection of session policy and identity policy plus bucket policy + if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) { + return true; + } + } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy + if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) { + return true; + } + } + return false; + } + if (e == Effect::Allow || identity_policy_res == Effect::Allow) { + return true; + } + } + + return verify_bucket_permission_no_policy(this, s, s->user_acl.get(), + &bacl, RGW_PERM_WRITE); +} + +int RGWBulkUploadOp::handle_file(const std::string_view path, + const size_t size, + AlignedStreamGetter& body, optional_yield y) +{ + + ldpp_dout(this, 20) << "got file=" << path << ", size=" << size << dendl; + + if (size > 
static_cast(s->cct->_conf->rgw_max_put_size)) { + op_ret = -ERR_TOO_LARGE; + return op_ret; + } + + std::string bucket_name; + rgw_obj_key object; + std::tie(bucket_name, object) = *parse_path(path); + + std::unique_ptr bucket; + ACLOwner bowner; + + op_ret = driver->get_bucket(this, s->user.get(), rgw_bucket(rgw_bucket_key(s->user->get_tenant(), bucket_name)), &bucket, y); + if (op_ret < 0) { + if (op_ret == -ENOENT) { + ldpp_dout(this, 20) << "non existent directory=" << bucket_name << dendl; + } + return op_ret; + } + + std::unique_ptr obj = bucket->get_object(object); + + if (! handle_file_verify_permission(bucket->get_info(), + obj->get_obj(), + bucket->get_attrs(), bowner, y)) { + ldpp_dout(this, 20) << "object creation unauthorized" << dendl; + op_ret = -EACCES; + return op_ret; + } + + op_ret = bucket->check_quota(this, quota, size, y); + if (op_ret < 0) { + return op_ret; + } + + if (bucket->versioning_enabled()) { + obj->gen_rand_obj_instance_name(); + } + + rgw_placement_rule dest_placement = s->dest_placement; + dest_placement.inherit_from(bucket->get_placement_rule()); + + std::unique_ptr processor; + processor = driver->get_atomic_writer(this, s->yield, obj.get(), + bowner.get_id(), + &s->dest_placement, 0, s->req_id); + op_ret = processor->prepare(s->yield); + if (op_ret < 0) { + ldpp_dout(this, 20) << "cannot prepare processor due to ret=" << op_ret << dendl; + return op_ret; + } + + /* No filters by default. */ + rgw::sal::DataProcessor *filter = processor.get(); + + const auto& compression_type = driver->get_compression_type(dest_placement); + CompressorRef plugin; + boost::optional compressor; + if (compression_type != "none") { + plugin = Compressor::create(s->cct, compression_type); + if (! plugin) { + ldpp_dout(this, 1) << "Cannot load plugin for rgw_compression_type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + } + } + + /* Upload file content. 
*/ + ssize_t len = 0; + size_t ofs = 0; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + do { + ceph::bufferlist data; + len = body.get_at_most(s->cct->_conf->rgw_max_chunk_size, data); + + ldpp_dout(this, 20) << "body=" << data.c_str() << dendl; + if (len < 0) { + op_ret = len; + return op_ret; + } else if (len > 0) { + hash.Update((const unsigned char *)data.c_str(), data.length()); + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + ldpp_dout(this, 20) << "filter->process() returned ret=" << op_ret << dendl; + return op_ret; + } + + ofs += len; + } + + } while (len > 0); + + // flush + op_ret = filter->process({}, ofs); + if (op_ret < 0) { + return op_ret; + } + + if (ofs != size) { + ldpp_dout(this, 10) << "real file size different from declared" << dendl; + op_ret = -EINVAL; + return op_ret; + } + + op_ret = bucket->check_quota(this, quota, size, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "quota exceeded for path=" << path << dendl; + return op_ret; + } + + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + hash.Final(m); + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + /* Create metadata: ETAG. */ + std::map attrs; + std::string etag = calc_md5; + ceph::bufferlist etag_bl; + etag_bl.append(etag.c_str(), etag.size() + 1); + attrs.emplace(RGW_ATTR_ETAG, std::move(etag_bl)); + + /* Create metadata: ACLs. */ + RGWAccessControlPolicy policy; + policy.create_default(s->user->get_id(), s->user->get_display_name()); + ceph::bufferlist aclbl; + policy.encode(aclbl); + attrs.emplace(RGW_ATTR_ACL, std::move(aclbl)); + + /* Create metadata: compression info. 
*/ + if (compressor && compressor->is_compressed()) { + ceph::bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = size; + cs_info.compressor_message = compressor->get_compressor_message(); + cs_info.blocks = std::move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + attrs.emplace(RGW_ATTR_COMPRESSION, std::move(tmp)); + } + + /* Complete the transaction. */ + op_ret = processor->complete(size, etag, nullptr, ceph::real_time(), + attrs, ceph::real_time() /* delete_at */, + nullptr, nullptr, nullptr, nullptr, nullptr, + s->yield); + if (op_ret < 0) { + ldpp_dout(this, 20) << "processor::complete returned op_ret=" << op_ret << dendl; + } + + return op_ret; +} + +void RGWBulkUploadOp::execute(optional_yield y) +{ + ceph::bufferlist buffer(64 * 1024); + + ldpp_dout(this, 20) << "start" << dendl; + + /* Create an instance of stream-abstracting class. Having this indirection + * allows for easy introduction of decompressors like gzip and bzip2. */ + auto stream = create_stream(); + if (! stream) { + return; + } + + /* Handling the $UPLOAD_PATH accordingly to the Swift's Bulk middleware. See: + * https://github.com/openstack/swift/blob/2.13.0/swift/common/middleware/bulk.py#L31-L41 */ + std::string bucket_path, file_prefix; + std::tie(bucket_path, file_prefix) = handle_upload_path(s); + + auto status = rgw::tar::StatusIndicator::create(); + do { + op_ret = stream->get_exactly(rgw::tar::BLOCK_SIZE, buffer); + if (op_ret < 0) { + ldpp_dout(this, 2) << "cannot read header" << dendl; + return; + } + + /* We need to re-interpret the buffer as a TAR block. Exactly two blocks + * must be tracked to detect out end-of-archive. It occurs when both of + * them are empty (zeroed). Tracing this particular inter-block dependency + * is responsibility of the rgw::tar::StatusIndicator class. */ + boost::optional header; + std::tie(status, header) = rgw::tar::interpret_block(status, buffer); + + if (! 
status.empty() && header) { + /* This specific block isn't empty (entirely zeroed), so we can parse + * it as a TAR header and dispatch. At the moment we do support only + * regular files and directories. Everything else (symlinks, devices) + * will be ignored but won't cease the whole upload. */ + switch (header->get_filetype()) { + case rgw::tar::FileType::NORMAL_FILE: { + ldpp_dout(this, 2) << "handling regular file" << dendl; + + std::string filename; + if (bucket_path.empty()) + filename = header->get_filename(); + else + filename = file_prefix + std::string(header->get_filename()); + auto body = AlignedStreamGetter(0, header->get_filesize(), + rgw::tar::BLOCK_SIZE, *stream); + op_ret = handle_file(filename, + header->get_filesize(), + body, y); + if (! op_ret) { + /* Only regular files counts. */ + num_created++; + } else { + failures.emplace_back(op_ret, std::string(filename)); + } + break; + } + case rgw::tar::FileType::DIRECTORY: { + ldpp_dout(this, 2) << "handling regular directory" << dendl; + + std::string_view dirname = bucket_path.empty() ? header->get_filename() : bucket_path; + op_ret = handle_dir(dirname, y); + if (op_ret < 0 && op_ret != -ERR_BUCKET_EXISTS) { + failures.emplace_back(op_ret, std::string(dirname)); + } + break; + } + default: { + /* Not recognized. Skip. */ + op_ret = 0; + break; + } + } + + /* In case of any problems with sub-request authorization Swift simply + * terminates whole upload immediately. */ + if (boost::algorithm::contains(std::initializer_list{ op_ret }, + terminal_errors)) { + ldpp_dout(this, 2) << "terminating due to ret=" << op_ret << dendl; + break; + } + } else { + ldpp_dout(this, 2) << "an empty block" << dendl; + op_ret = 0; + } + + buffer.clear(); + } while (! 
status.eof()); + + return; +} + +RGWBulkUploadOp::AlignedStreamGetter::~AlignedStreamGetter() +{ + const size_t aligned_legnth = length + (-length % alignment); + ceph::bufferlist junk; + + DecoratedStreamGetter::get_exactly(aligned_legnth - position, junk); +} + +ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_at_most(const size_t want, + ceph::bufferlist& dst) +{ + const size_t max_to_read = std::min(want, length - position); + const auto len = DecoratedStreamGetter::get_at_most(max_to_read, dst); + if (len > 0) { + position += len; + } + return len; +} + +ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_exactly(const size_t want, + ceph::bufferlist& dst) +{ + const auto len = DecoratedStreamGetter::get_exactly(want, dst); + if (len > 0) { + position += len; + } + return len; +} + +int RGWGetAttrs::verify_permission(optional_yield y) +{ + s->object->set_atomic(); + + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + auto iam_action = s->object->get_instance().empty() ? 
+ rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion; + + if (!verify_object_permission(this, s, iam_action)) { + return -EACCES; + } + + return 0; +} + +void RGWGetAttrs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetAttrs::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) + return; + + s->object->set_atomic(); + + op_ret = s->object->get_obj_attrs(s->yield, this); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << s->object + << " ret=" << op_ret << dendl; + return; + } + + /* XXX RGWObject::get_obj_attrs() does not support filtering (yet) */ + auto& obj_attrs = s->object->get_attrs(); + if (attrs.size() != 0) { + /* return only attrs requested */ + for (auto& att : attrs) { + auto iter = obj_attrs.find(att.first); + if (iter != obj_attrs.end()) { + att.second = iter->second; + } + } + } else { + /* return all attrs */ + for (auto& att : obj_attrs) { + attrs.insert(get_attrs_t::value_type(att.first, att.second));; + } + } + + return; + } + +int RGWRMAttrs::verify_permission(optional_yield y) +{ + // This looks to be part of the RGW-NFS machinery and has no S3 or + // Swift equivalent. 
+ bool perm; + if (!rgw::sal::Object::empty(s->object.get())) { + perm = verify_object_permission_no_policy(this, s, RGW_PERM_WRITE); + } else { + perm = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWRMAttrs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWRMAttrs::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) + return; + + s->object->set_atomic(); + + op_ret = s->object->set_obj_attrs(this, nullptr, &attrs, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to delete obj attrs, obj=" << s->object + << " ret=" << op_ret << dendl; + } + return; +} + +int RGWSetAttrs::verify_permission(optional_yield y) +{ + // This looks to be part of the RGW-NFS machinery and has no S3 or + // Swift equivalent. + bool perm; + if (!rgw::sal::Object::empty(s->object.get())) { + perm = verify_object_permission_no_policy(this, s, RGW_PERM_WRITE); + } else { + perm = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWSetAttrs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetAttrs::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (!rgw::sal::Object::empty(s->object.get())) { + rgw::sal::Attrs a(attrs); + op_ret = s->object->set_obj_attrs(this, &a, nullptr, y); + } else { + op_ret = s->bucket->merge_and_store_attrs(this, attrs, y); + } + +} /* RGWSetAttrs::execute() */ + +void RGWGetObjLayout::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjLayout::execute(optional_yield y) +{ +} + + +int RGWConfigBucketMetaSearch::verify_permission(optional_yield y) +{ + if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) { + return -EACCES; + } + + return 0; +} + +void RGWConfigBucketMetaSearch::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWConfigBucketMetaSearch::execute(optional_yield y) +{ + op_ret = 
get_params(y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "NOTICE: get_params() returned ret=" << op_ret << dendl; + return; + } + + s->bucket->get_info().mdsearch_config = mdsearch_config; + + op_ret = s->bucket->put_info(this, false, real_time()); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name() + << " returned err=" << op_ret << dendl; + return; + } + s->bucket_attrs = s->bucket->get_attrs(); +} + +int RGWGetBucketMetaSearch::verify_permission(optional_yield y) +{ + if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketMetaSearch::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWDelBucketMetaSearch::verify_permission(optional_yield y) +{ + if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) { + return -EACCES; + } + + return 0; +} + +void RGWDelBucketMetaSearch::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDelBucketMetaSearch::execute(optional_yield y) +{ + s->bucket->get_info().mdsearch_config.clear(); + + op_ret = s->bucket->put_info(this, false, real_time()); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name() + << " returned err=" << op_ret << dendl; + return; + } + s->bucket_attrs = s->bucket->get_attrs(); +} + + +RGWHandler::~RGWHandler() +{ +} + +int RGWHandler::init(rgw::sal::Driver* _driver, + req_state *_s, + rgw::io::BasicClient *cio) +{ + driver = _driver; + s = _s; + + return 0; +} + +int RGWHandler::do_init_permissions(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = rgw_build_bucket_policies(dpp, driver, s, y); + if (ret < 0) { + ldpp_dout(dpp, 10) << "init_permissions on " << s->bucket + << " failed, ret=" << ret << dendl; + return ret==-ENODATA ? 
-EACCES : ret; + } + + rgw_build_iam_environment(driver, s); + return ret; +} + +int RGWHandler::do_read_permissions(RGWOp *op, bool only_bucket, optional_yield y) +{ + if (only_bucket) { + /* already read bucket info */ + return 0; + } + int ret = rgw_build_object_policies(op, driver, s, op->prefetch_data(), y); + + if (ret < 0) { + ldpp_dout(op, 10) << "read_permissions on " << s->bucket << ":" + << s->object << " only_bucket=" << only_bucket + << " ret=" << ret << dendl; + if (ret == -ENODATA) + ret = -EACCES; + if (s->auth.identity->is_anonymous() && ret == -EACCES) + ret = -EPERM; + } + + return ret; +} + +int RGWOp::error_handler(int err_no, string *error_content, optional_yield y) { + return dialect_handler->error_handler(err_no, error_content, y); +} + +int RGWHandler::error_handler(int err_no, string *error_content, optional_yield) { + // This is the do-nothing error handler + return err_no; +} + +std::ostream& RGWOp::gen_prefix(std::ostream& out) const +{ + // append : to the prefix + return s->gen_prefix(out) << s->dialect << ':' << name() << ' '; +} + +void RGWDefaultResponseOp::send_response() { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWPutBucketPolicy::send_response() +{ + if (!op_ret) { + /* A successful Put Bucket Policy should return a 204 on success */ + op_ret = STATUS_NO_CONTENT; + } + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWPutBucketPolicy::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPolicy)) { + return -EACCES; + } + + return 0; +} + +int RGWPutBucketPolicy::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + // At some point when I have more time I want to make a version 
of + // rgw_rest_read_all_input that doesn't use malloc. + std::tie(op_ret, data) = read_all_input(s, max_size, false); + + // And throws exceptions. + return op_ret; +} + +void RGWPutBucketPolicy::execute(optional_yield y) +{ + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + try { + const Policy p( + s->cct, s->bucket_tenant, data, + s->cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + rgw::sal::Attrs attrs(s->bucket_attrs); + if (s->bucket_access_conf && + s->bucket_access_conf->block_public_policy() && + rgw::IAM::is_public(p)) { + op_ret = -EACCES; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [&p, this, &attrs] { + attrs[RGW_ATTR_IAM_POLICY].clear(); + attrs[RGW_ATTR_IAM_POLICY].append(p.text); + op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + return op_ret; + }); + } catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << dendl; + op_ret = -EINVAL; + s->err.message = e.what(); + } +} + +void RGWGetBucketPolicy::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + dump_body(s, policy); +} + +int RGWGetBucketPolicy::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicy)) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketPolicy::execute(optional_yield y) +{ + rgw::sal::Attrs attrs(s->bucket_attrs); + auto aiter = attrs.find(RGW_ATTR_IAM_POLICY); + if (aiter == attrs.end()) { + ldpp_dout(this, 0) << "can't find 
bucket IAM POLICY attr bucket_name = " + << s->bucket_name << dendl; + op_ret = -ERR_NO_SUCH_BUCKET_POLICY; + s->err.message = "The bucket policy does not exist"; + return; + } else { + policy = attrs[RGW_ATTR_IAM_POLICY]; + + if (policy.length() == 0) { + ldpp_dout(this, 10) << "The bucket policy does not exist, bucket: " + << s->bucket_name << dendl; + op_ret = -ERR_NO_SUCH_BUCKET_POLICY; + s->err.message = "The bucket policy does not exist"; + return; + } + } +} + +void RGWDeleteBucketPolicy::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWDeleteBucketPolicy::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3DeleteBucketPolicy)) { + return -EACCES; + } + + return 0; +} + +void RGWDeleteBucketPolicy::execute(optional_yield y) +{ + bufferlist data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + rgw::sal::Attrs attrs(s->bucket_attrs); + attrs.erase(RGW_ATTR_IAM_POLICY); + op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + return op_ret; + }); +} + +void RGWPutBucketObjectLock::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWPutBucketObjectLock::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketObjectLockConfiguration); +} + +void RGWPutBucketObjectLock::execute(optional_yield y) +{ + if 
(!s->bucket->get_info().obj_lock_enabled()) { + s->err.message = "object lock configuration can't be set if bucket object lock not enabled"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_BUCKET_STATE; + return; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("ObjectLockConfiguration", obj_lock, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + if (obj_lock.has_rule() && !obj_lock.retention_period_valid()) { + s->err.message = "retention period must be a positive integer value"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_RETENTION_PERIOD; + return; + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << __func__ << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + s->bucket->get_info().obj_lock = obj_lock; + op_ret = s->bucket->put_info(this, false, real_time()); + return op_ret; + }); + return; +} + +void RGWGetBucketObjectLock::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWGetBucketObjectLock::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketObjectLockConfiguration); +} + +void RGWGetBucketObjectLock::execute(optional_yield y) +{ + if 
(!s->bucket->get_info().obj_lock_enabled()) { + op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION; + return; + } +} + +int RGWPutObjRetention::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (!verify_object_permission(this, s, rgw::IAM::s3PutObjectRetention)) { + return -EACCES; + } + op_ret = get_params(y); + if (op_ret) { + return op_ret; + } + if (bypass_governance_mode) { + bypass_perm = verify_object_permission(this, s, rgw::IAM::s3BypassGovernanceRetention); + } + return 0; +} + +void RGWPutObjRetention::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutObjRetention::execute(optional_yield y) +{ + if (!s->bucket->get_info().obj_lock_enabled()) { + s->err.message = "object retention can't be set if bucket object lock not configured"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("Retention", obj_retention, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) < ceph_clock_now()) { + s->err.message = "the retain-until date must be in the future"; + ldpp_dout(this, 0) << "ERROR: " << s->err.message << dendl; + op_ret = -EINVAL; + return; + } + bufferlist bl; + obj_retention.encode(bl); + + //check old retention + op_ret = s->object->get_obj_attrs(s->yield, this); + if (op_ret < 0) { + ldpp_dout(this, 0) << 
"ERROR: get obj attr error"<< dendl; + return; + } + rgw::sal::Attrs attrs = s->object->get_attrs(); + auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (aiter != attrs.end()) { + RGWObjectRetention old_obj_retention; + try { + decode(old_obj_retention, aiter->second); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + op_ret = -EIO; + return; + } + if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) < ceph::real_clock::to_time_t(old_obj_retention.get_retain_until_date())) { + if (old_obj_retention.get_mode().compare("GOVERNANCE") != 0 || !bypass_perm || !bypass_governance_mode) { + s->err.message = "proposed retain-until date shortens an existing retention period and governance bypass check failed"; + op_ret = -EACCES; + return; + } + } else if (old_obj_retention.get_mode() == obj_retention.get_mode()) { + // ok if retention mode doesn't change + } else if (obj_retention.get_mode() == "GOVERNANCE") { + s->err.message = "can't change retention mode from COMPLIANCE to GOVERNANCE"; + op_ret = -EACCES; + return; + } else if (!bypass_perm || !bypass_governance_mode) { + s->err.message = "can't change retention mode from GOVERNANCE without governance bypass"; + op_ret = -EACCES; + return; + } + } + + op_ret = s->object->modify_obj_attrs(RGW_ATTR_OBJECT_RETENTION, bl, s->yield, this); + + return; +} + +int RGWGetObjRetention::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (!verify_object_permission(this, s, rgw::IAM::s3GetObjectRetention)) { + return -EACCES; + } + return 0; +} + +void RGWGetObjRetention::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjRetention::execute(optional_yield y) +{ + if (!s->bucket->get_info().obj_lock_enabled()) { + 
s->err.message = "bucket object lock not configured"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + op_ret = s->object->get_obj_attrs(s->yield, this); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << s->object + << " ret=" << op_ret << dendl; + return; + } + rgw::sal::Attrs attrs = s->object->get_attrs(); + auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (aiter == attrs.end()) { + op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION; + return; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + obj_retention.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 0) << __func__ << "decode object retention config failed" << dendl; + op_ret = -EIO; + return; + } + return; +} + +int RGWPutObjLegalHold::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (!verify_object_permission(this, s, rgw::IAM::s3PutObjectLegalHold)) { + return -EACCES; + } + return 0; +} + +void RGWPutObjLegalHold::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutObjLegalHold::execute(optional_yield y) { + if (!s->bucket->get_info().obj_lock_enabled()) { + s->err.message = "object legal hold can't be set if bucket object lock not enabled"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("LegalHold", obj_legal_hold, &parser, true); + 
} catch (RGWXMLDecoder::err &err) { + ldpp_dout(this, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + bufferlist bl; + obj_legal_hold.encode(bl); + //if instance is empty, we should modify the latest object + op_ret = s->object->modify_obj_attrs(RGW_ATTR_OBJECT_LEGAL_HOLD, bl, s->yield, this); + return; +} + +int RGWGetObjLegalHold::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s); + if (has_s3_existing_tag || has_s3_resource_tag) + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + + if (!verify_object_permission(this, s, rgw::IAM::s3GetObjectLegalHold)) { + return -EACCES; + } + return 0; +} + +void RGWGetObjLegalHold::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjLegalHold::execute(optional_yield y) +{ + if (!s->bucket->get_info().obj_lock_enabled()) { + s->err.message = "bucket object lock not configured"; + ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + map attrs; + op_ret = s->object->get_obj_attrs(s->yield, this); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << s->object + << " ret=" << op_ret << dendl; + return; + } + auto aiter = s->object->get_attrs().find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (aiter == s->object->get_attrs().end()) { + op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION; + return; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + obj_legal_hold.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 0) << __func__ << "decode object legal hold config failed" << dendl; + op_ret = -EIO; + return; + } + return; +} + +void RGWGetClusterStat::execute(optional_yield y) +{ + op_ret = driver->cluster_stat(stats_op); +} + +int RGWGetBucketPolicyStatus::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = 
rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicyStatus)) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketPolicyStatus::execute(optional_yield y) +{ + isPublic = (s->iam_policy && rgw::IAM::is_public(*s->iam_policy)) || s->bucket_acl->is_public(this); +} + +int RGWPutBucketPublicAccessBlock::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPublicAccessBlock)) { + return -EACCES; + } + + return 0; +} + +int RGWPutBucketPublicAccessBlock::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + +void RGWPutBucketPublicAccessBlock::execute(optional_yield y) +{ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + + op_ret = get_params(y); + if (op_ret < 0) + return; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + ldpp_dout(this, 0) << "ERROR: malformed XML" << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("PublicAccessBlockConfiguration", access_conf, &parser, true); + } catch (RGWXMLDecoder::err &err) { + ldpp_dout(this, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + bufferlist bl; + access_conf.encode(bl); + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, 
&bl] { + rgw::sal::Attrs attrs(s->bucket_attrs); + attrs[RGW_ATTR_PUBLIC_ACCESS] = bl; + return s->bucket->merge_and_store_attrs(this, attrs, s->yield); + }); + +} + +int RGWGetBucketPublicAccessBlock::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicy)) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketPublicAccessBlock::execute(optional_yield y) +{ + auto attrs = s->bucket_attrs; + if (auto aiter = attrs.find(RGW_ATTR_PUBLIC_ACCESS); + aiter == attrs.end()) { + ldpp_dout(this, 0) << "can't find bucket IAM POLICY attr bucket_name = " + << s->bucket_name << dendl; + // return the default; + return; + } else { + bufferlist::const_iterator iter{&aiter->second}; + try { + access_conf.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 0) << __func__ << "decode access_conf failed" << dendl; + op_ret = -EIO; + return; + } + } +} + + +void RGWDeleteBucketPublicAccessBlock::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWDeleteBucketPublicAccessBlock::verify_permission(optional_yield y) +{ + auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); + if (has_s3_resource_tag) + rgw_iam_add_buckettags(this, s); + + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPublicAccessBlock)) { + return -EACCES; + } + + return 0; +} + +void RGWDeleteBucketPublicAccessBlock::execute(optional_yield y) +{ + bufferlist data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] { + rgw::sal::Attrs 
attrs(s->bucket_attrs); + attrs.erase(RGW_ATTR_PUBLIC_ACCESS); + op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + return op_ret; + }); +} + +int RGWPutBucketEncryption::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + +int RGWPutBucketEncryption::verify_permission(optional_yield y) +{ + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketEncryption)) { + return -EACCES; + } + return 0; +} + +void RGWPutBucketEncryption::execute(optional_yield y) +{ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + op_ret = get_params(y); + if (op_ret < 0) { + return; + } + if (!parser.parse(data.c_str(), data.length(), 1)) { + ldpp_dout(this, 0) << "ERROR: malformed XML" << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("ServerSideEncryptionConfiguration", bucket_encryption_conf, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + bufferlist conf_bl; + bucket_encryption_conf.encode(conf_bl); + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y, &conf_bl] { + rgw::sal::Attrs attrs = s->bucket->get_attrs(); + attrs[RGW_ATTR_BUCKET_ENCRYPTION_POLICY] = conf_bl; + return s->bucket->merge_and_store_attrs(this, attrs, y); + }); +} + +int RGWGetBucketEncryption::verify_permission(optional_yield y) +{ + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketEncryption)) { + return -EACCES; + } + return 0; +} + +void 
RGWGetBucketEncryption::execute(optional_yield y) +{ + const auto& attrs = s->bucket_attrs; + if (auto aiter = attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_POLICY); + aiter == attrs.end()) { + ldpp_dout(this, 0) << "can't find BUCKET ENCRYPTION attr for bucket_name = " << s->bucket_name << dendl; + op_ret = -ENOENT; + s->err.message = "The server side encryption configuration was not found"; + return; + } else { + bufferlist::const_iterator iter{&aiter->second}; + try { + bucket_encryption_conf.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 0) << __func__ << "decode bucket_encryption_conf failed" << dendl; + op_ret = -EIO; + return; + } + } +} + +int RGWDeleteBucketEncryption::verify_permission(optional_yield y) +{ + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketEncryption)) { + return -EACCES; + } + return 0; +} + +void RGWDeleteBucketEncryption::execute(optional_yield y) +{ + bufferlist data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y] { + rgw::sal::Attrs attrs = s->bucket->get_attrs(); + attrs.erase(RGW_ATTR_BUCKET_ENCRYPTION_POLICY); + attrs.erase(RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID); + op_ret = s->bucket->merge_and_store_attrs(this, attrs, y); + return op_ret; + }); +} + +void rgw_slo_entry::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("path", path, obj); + JSONDecoder::decode_json("etag", etag, obj); + JSONDecoder::decode_json("size_bytes", size_bytes, obj); +}; + diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h new file mode 100644 index 000000000..f398b5b15 --- /dev/null +++ b/src/rgw/rgw_op.h @@ -0,0 +1,2672 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/** + * All operations via the rados gateway are 
carried out by + * small classes known as RGWOps. This class contains a req_state + * and each possible command is a subclass of this with a defined + * execute() method that does whatever the subclass name implies. + * These subclasses must be further subclassed (by interface type) + * to provide additional virtual methods such as send_response or get_params. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "common/armor.h" +#include "common/mime.h" +#include "common/utf8.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" + +#include "rgw_common.h" +#include "rgw_dmclock.h" +#include "rgw_sal.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_acl.h" +#include "rgw_cors.h" +#include "rgw_quota.h" +#include "rgw_putobj.h" +#include "rgw_sal.h" +#include "rgw_compression_types.h" +#include "rgw_log.h" + +#include "rgw_lc.h" +#include "rgw_torrent.h" +#include "rgw_tag.h" +#include "rgw_object_lock.h" +#include "cls/rgw/cls_rgw_client.h" +#include "rgw_public_access.h" +#include "rgw_bucket_encryption.h" +#include "rgw_tracer.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_tier_rados.h" + +#include "include/ceph_assert.h" + +using ceph::crypto::SHA1; + +struct req_state; +class RGWOp; +class RGWRados; +class RGWMultiCompleteUpload; + + +namespace rgw { +namespace auth { +namespace registry { + +class StrategyRegistry; + +} +} +} + +int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider *dpp, + CephContext *cct, + rgw::sal::Driver* driver, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + RGWAccessControlPolicy *policy, + optional_yield y); + +class RGWHandler { +protected: + rgw::sal::Driver* driver{nullptr}; + req_state *s{nullptr}; + + int do_init_permissions(const DoutPrefixProvider *dpp, optional_yield y); + int do_read_permissions(RGWOp* op, bool only_bucket, optional_yield y); + +public: + 
RGWHandler() {} + virtual ~RGWHandler(); + + virtual int init(rgw::sal::Driver* driver, + req_state* _s, + rgw::io::BasicClient* cio); + + virtual int init_permissions(RGWOp*, optional_yield y) { + return 0; + } + + virtual int retarget(RGWOp* op, RGWOp** new_op, optional_yield) { + *new_op = op; + return 0; + } + + virtual int read_permissions(RGWOp* op, optional_yield y) = 0; + virtual int authorize(const DoutPrefixProvider* dpp, optional_yield y) = 0; + virtual int postauth_init(optional_yield y) = 0; + virtual int error_handler(int err_no, std::string* error_content, optional_yield y); + virtual void dump(const std::string& code, const std::string& message) const {} + + virtual bool supports_quota() { + return true; + } +}; + + + +void rgw_bucket_object_pre_exec(req_state *s); + +namespace dmc = rgw::dmclock; + +std::tuple rgw_rest_read_all_input(req_state *s, + const uint64_t max_len, + const bool allow_chunked=true); + +template +int rgw_rest_get_json_input(CephContext *cct, req_state *s, T& out, + uint64_t max_len, bool *empty) +{ + if (empty) + *empty = false; + + int rv = 0; + bufferlist data; + std::tie(rv, data) = rgw_rest_read_all_input(s, max_len); + if (rv < 0) { + return rv; + } + + if (!data.length()) { + if (empty) { + *empty = true; + } + + return -EINVAL; + } + + JSONParser parser; + + if (!parser.parse(data.c_str(), data.length())) { + return -EINVAL; + } + + try { + decode_json_obj(out, &parser); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + return 0; +} + +/** + * Provide the base class for all ops. 
+ */ +class RGWOp : public DoutPrefixProvider { +protected: + req_state *s; + RGWHandler *dialect_handler; + rgw::sal::Driver* driver; + RGWCORSConfiguration bucket_cors; + bool cors_exist; + RGWQuota quota; + int op_ret; + int do_aws4_auth_completion(); + bool init_called = false; + + virtual int init_quota(); + + std::tuple read_all_input(req_state *s, + const uint64_t max_len, + const bool allow_chunked=true) { + + int rv = 0; + bufferlist data; + std::tie(rv, data) = rgw_rest_read_all_input(s, max_len); + if (rv >= 0) { + do_aws4_auth_completion(); + } + + return std::make_tuple(rv, std::move(data)); + } + + template + int get_json_input(CephContext *cct, req_state *s, T& out, + uint64_t max_len, bool *empty) { + int r = rgw_rest_get_json_input(cct, s, out, max_len, empty); + if (r >= 0) { + do_aws4_auth_completion(); + } + return r; + } + +public: + RGWOp() + : s(nullptr), + dialect_handler(nullptr), + driver(nullptr), + cors_exist(false), + op_ret(0) { + } + + virtual ~RGWOp() override; + + int get_ret() const { return op_ret; } + + virtual int init_processing(optional_yield y) { + if (dialect_handler->supports_quota()) { + op_ret = init_quota(); + if (op_ret < 0) + return op_ret; + } + + return 0; + } + + virtual void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *dialect_handler) { + if (init_called) return; + this->driver = driver; + init_called = true; + this->s = s; + this->dialect_handler = dialect_handler; + } + int read_bucket_cors(); + bool generate_cors_headers(std::string& origin, std::string& method, std::string& headers, std::string& exp_headers, unsigned *max_age); + + virtual int verify_params() { return 0; } + virtual bool prefetch_data() { return false; } + + /* Authenticate requester -- verify its identity. + * + * NOTE: typically the procedure is common across all operations of the same + * dialect (S3, Swift API). However, there are significant exceptions in + * both APIs: browser uploads, /info and OPTIONS handlers. 
All of them use + * different, specific authentication schema driving the need for per-op + * authentication. The alternative is to duplicate parts of the method- + * dispatch logic in RGWHandler::authorize() and pollute it with a lot + * of special cases. */ + virtual int verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) { + /* TODO(rzarzynski): rename RGWHandler::authorize to generic_authenticate. */ + return dialect_handler->authorize(this, y); + } + virtual int verify_permission(optional_yield y) = 0; + virtual int verify_op_mask(); + virtual void pre_exec() {} + virtual void execute(optional_yield y) = 0; + virtual void send_response() {} + virtual void complete() { + send_response(); + } + virtual const char* name() const = 0; + virtual RGWOpType get_type() { return RGW_OP_UNKNOWN; } + + virtual uint32_t op_mask() { return 0; } + + virtual int error_handler(int err_no, std::string *error_content, optional_yield y); + + // implements DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const override; + CephContext* get_cct() const override { return s->cct; } + unsigned get_subsys() const override { return ceph_subsys_rgw; } + + virtual dmc::client_id dmclock_client() { return dmc::client_id::metadata; } + virtual dmc::Cost dmclock_cost() { return 1; } + virtual void write_ops_log_entry(rgw_log_entry& entry) const {}; +}; + +class RGWDefaultResponseOp : public RGWOp { +public: + void send_response() override; +}; + +class RGWGetObj_Filter : public RGWGetDataCB +{ +protected: + RGWGetObj_Filter *next{nullptr}; +public: + RGWGetObj_Filter() {} + explicit RGWGetObj_Filter(RGWGetObj_Filter *next): next(next) {} + ~RGWGetObj_Filter() override {} + /** + * Passes data through filter. + * Filter can modify content of bl. 
+ * When bl_len == 0 , it means 'flush + */ + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override { + if (next) { + return next->handle_data(bl, bl_ofs, bl_len); + } + return 0; + } + /** + * Flushes any cached data. Used by RGWGetObjFilter. + * Return logic same as handle_data. + */ + virtual int flush() { + if (next) { + return next->flush(); + } + return 0; + } + /** + * Allows filter to extend range required for successful filtering + */ + virtual int fixup_range(off_t& ofs, off_t& end) { + if (next) { + return next->fixup_range(ofs, end); + } + return 0; + } +}; + +class RGWGetObj : public RGWOp { +protected: + seed torrent; // get torrent + const char *range_str; + const char *if_mod; + const char *if_unmod; + const char *if_match; + const char *if_nomatch; + uint32_t mod_zone_id; + uint64_t mod_pg_ver; + off_t ofs; + uint64_t total_len; + off_t start; + off_t end; + ceph::real_time mod_time; + ceph::real_time lastmod; + ceph::real_time unmod_time; + ceph::real_time *mod_ptr; + ceph::real_time *unmod_ptr; + rgw::sal::Attrs attrs; + bool get_data; + bool partial_content; + bool ignore_invalid_range; + bool range_parsed; + bool skip_manifest; + bool skip_decrypt{false}; + bool sync_cloudtiered{false}; + utime_t gc_invalidate_time; + bool is_slo; + std::string lo_etag; + bool rgwx_stat; /* extended rgw stat operation */ + std::string version_id; + rgw_zone_set_entry dst_zone_trace; + + // compression attrs + RGWCompressionInfo cs_info; + off_t first_block, last_block; + off_t q_ofs, q_len; + bool first_data; + uint64_t cur_ofs; + bufferlist waiting; + uint64_t action = 0; + + bool get_retention; + bool get_legal_hold; + + int init_common(); +public: + RGWGetObj() { + range_str = NULL; + if_mod = NULL; + if_unmod = NULL; + if_match = NULL; + if_nomatch = NULL; + mod_zone_id = 0; + mod_pg_ver = 0; + start = 0; + ofs = 0; + total_len = 0; + end = -1; + mod_ptr = NULL; + unmod_ptr = NULL; + get_data = false; + partial_content = false; + range_parsed 
= false; + skip_manifest = false; + is_slo = false; + first_block = 0; + last_block = 0; + q_ofs = 0; + q_len = 0; + first_data = true; + cur_ofs = 0; + get_retention = false; + get_legal_hold = false; + } + + bool prefetch_data() override; + + void set_get_data(bool get_data) { + this->get_data = get_data; + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + int parse_range(); + int read_user_manifest_part( + rgw::sal::Bucket* bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const off_t start_ofs, + const off_t end_ofs, + bool swift_slo); + int handle_user_manifest(const char *prefix, optional_yield y); + int handle_slo_manifest(bufferlist& bl, optional_yield y); + + int get_data_cb(bufferlist& bl, off_t ofs, off_t len); + + virtual int get_params(optional_yield y) = 0; + virtual int send_response_data_error(optional_yield y) = 0; + virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) = 0; + + const char* name() const override { return "get_obj"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + virtual bool need_object_expiration() { return false; } + /** + * calculates filter used to decrypt RGW objects data + */ + virtual int get_decrypt_filter(std::unique_ptr* filter, RGWGetObj_Filter* cb, bufferlist* manifest_bl) { + *filter = nullptr; + return 0; + } + + // get lua script to run as a "get object" filter + int get_lua_filter(std::unique_ptr* filter, + RGWGetObj_Filter* cb); + + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +class RGWGetObj_CB : public RGWGetObj_Filter +{ + RGWGetObj *op; +public: + explicit RGWGetObj_CB(RGWGetObj *_op) : op(_op) {} + ~RGWGetObj_CB() override {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override { + return op->get_data_cb(bl, 
bl_ofs, bl_len); + } +}; + +class RGWGetObjTags : public RGWOp { + protected: + bufferlist tags_bl; + bool has_tags{false}; + public: + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + void pre_exec() override; + + virtual void send_response_data(bufferlist& bl) = 0; + const char* name() const override { return "get_obj_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_TAGGING; } + +}; + +class RGWPutObjTags : public RGWOp { + protected: + bufferlist tags_bl; + public: + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + virtual void send_response() override = 0; + virtual int get_params(optional_yield y) = 0; + const char* name() const override { return "put_obj_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ_TAGGING; } + +}; + +class RGWDeleteObjTags: public RGWOp { + public: + void pre_exec() override; + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + const char* name() const override { return "delete_obj_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + RGWOpType get_type() override { return RGW_OP_DELETE_OBJ_TAGGING;} +}; + +class RGWGetBucketTags : public RGWOp { +protected: + bufferlist tags_bl; + bool has_tags{false}; +public: + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + void pre_exec() override; + + virtual void send_response_data(bufferlist& bl) = 0; + const char* name() const override { return "get_bucket_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_TAGGING; } +}; + +class RGWPutBucketTags : public RGWOp { +protected: + bufferlist tags_bl; + bufferlist in_data; +public: + int 
verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + virtual void send_response() override = 0; + virtual int get_params(const DoutPrefixProvider *dpp, optional_yield y) = 0; + const char* name() const override { return "put_bucket_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_TAGGING; } +}; + +class RGWDeleteBucketTags : public RGWOp { +public: + void pre_exec() override; + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + const char* name() const override { return "delete_bucket_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_TAGGING;} +}; + +struct rgw_sync_policy_group; + +class RGWGetBucketReplication : public RGWOp { +public: + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + void pre_exec() override; + + virtual void send_response_data() = 0; + const char* name() const override { return "get_bucket_replication"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_REPLICATION; } +}; + +class RGWPutBucketReplication : public RGWOp { +protected: + bufferlist in_data; + std::vector sync_policy_groups; +public: + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + virtual void send_response() override = 0; + virtual int get_params(optional_yield y) = 0; + const char* name() const override { return "put_bucket_replication"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_REPLICATION; } +}; + +class RGWDeleteBucketReplication : public RGWOp { +protected: + virtual void update_sync_policy(rgw_sync_policy_info *policy) = 0; +public: + void pre_exec() override; 
+ int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + const char* name() const override { return "delete_bucket_replication"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_REPLICATION;} +}; + +class RGWBulkDelete : public RGWOp { +public: + struct acct_path_t { + std::string bucket_name; + rgw_obj_key obj_key; + }; + + struct fail_desc_t { + int err; + acct_path_t path; + }; + + class Deleter { + protected: + const DoutPrefixProvider * dpp; + unsigned int num_deleted; + unsigned int num_unfound; + std::list failures; + + rgw::sal::Driver* const driver; + req_state * const s; + + public: + Deleter(const DoutPrefixProvider* dpp, rgw::sal::Driver* const str, req_state * const s) + : dpp(dpp), + num_deleted(0), + num_unfound(0), + driver(str), + s(s) { + } + + unsigned int get_num_deleted() const { + return num_deleted; + } + + unsigned int get_num_unfound() const { + return num_unfound; + } + + const std::list get_failures() const { + return failures; + } + + bool verify_permission(RGWBucketInfo& binfo, + std::map& battrs, + ACLOwner& bucket_owner /* out */, + optional_yield y); + bool delete_single(const acct_path_t& path, optional_yield y); + bool delete_chunk(const std::list& paths, optional_yield y); + }; + /* End of Deleter subclass */ + + static const size_t MAX_CHUNK_ENTRIES = 1024; + +protected: + std::unique_ptr deleter; + +public: + RGWBulkDelete() + : deleter(nullptr) { + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_data(std::list& items, + bool * is_truncated) = 0; + void send_response() override = 0; + + const char* name() const override { return "bulk_delete"; } + RGWOpType get_type() override { return RGW_OP_BULK_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + dmc::client_id dmclock_client() 
override { return dmc::client_id::data; } +}; + +inline std::ostream& operator<<(std::ostream& out, const RGWBulkDelete::acct_path_t &o) { + return out << o.bucket_name << "/" << o.obj_key; +} + + +class RGWBulkUploadOp : public RGWOp { +protected: + class fail_desc_t { + public: + fail_desc_t(const int err, std::string path) + : err(err), + path(std::move(path)) { + } + + const int err; + const std::string path; + }; + + static constexpr std::array terminal_errors = { + { -EACCES, -EPERM } + }; + + /* FIXME: boost::container::small_vector failures; */ + std::vector failures; + size_t num_created; + + class StreamGetter; + class DecoratedStreamGetter; + class AlignedStreamGetter; + + virtual std::unique_ptr create_stream() = 0; + virtual void send_response() override = 0; + + boost::optional> + parse_path(const std::string_view& path); + + std::pair + handle_upload_path(req_state *s); + + bool handle_file_verify_permission(RGWBucketInfo& binfo, + const rgw_obj& obj, + std::map& battrs, + ACLOwner& bucket_owner /* out */, + optional_yield y); + int handle_file(std::string_view path, + size_t size, + AlignedStreamGetter& body, + optional_yield y); + + int handle_dir_verify_permission(optional_yield y); + int handle_dir(std::string_view path, optional_yield y); + +public: + RGWBulkUploadOp() + : num_created(0) { + } + + void init(rgw::sal::Driver* const driver, + req_state* const s, + RGWHandler* const h) override; + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + const char* name() const override { return "bulk_upload"; } + + RGWOpType get_type() override { + return RGW_OP_BULK_UPLOAD; + } + + uint32_t op_mask() override { + return RGW_OP_TYPE_WRITE; + } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; /* RGWBulkUploadOp */ + + +class RGWBulkUploadOp::StreamGetter { +public: + StreamGetter() = default; + virtual ~StreamGetter() = default; + + virtual ssize_t 
get_at_most(size_t want, ceph::bufferlist& dst) = 0; + virtual ssize_t get_exactly(size_t want, ceph::bufferlist& dst) = 0; +}; /* End of nested subclass StreamGetter */ + + +class RGWBulkUploadOp::DecoratedStreamGetter : public StreamGetter { + StreamGetter& decoratee; + +protected: + StreamGetter& get_decoratee() { + return decoratee; + } + +public: + explicit DecoratedStreamGetter(StreamGetter& decoratee) + : decoratee(decoratee) { + } + virtual ~DecoratedStreamGetter() = default; + + ssize_t get_at_most(const size_t want, ceph::bufferlist& dst) override { + return get_decoratee().get_at_most(want, dst); + } + + ssize_t get_exactly(const size_t want, ceph::bufferlist& dst) override { + return get_decoratee().get_exactly(want, dst); + } +}; /* RGWBulkUploadOp::DecoratedStreamGetter */ + + +class RGWBulkUploadOp::AlignedStreamGetter + : public RGWBulkUploadOp::DecoratedStreamGetter { + size_t position; + size_t length; + size_t alignment; + +public: + template + AlignedStreamGetter(const size_t position, + const size_t length, + const size_t alignment, + U&& decoratee) + : DecoratedStreamGetter(std::forward(decoratee)), + position(position), + length(length), + alignment(alignment) { + } + virtual ~AlignedStreamGetter(); + ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override; + ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override; +}; /* RGWBulkUploadOp::AlignedStreamGetter */ + + +struct RGWUsageStats { + uint64_t bytes_used = 0; + uint64_t bytes_used_rounded = 0; + uint64_t buckets_count = 0; + uint64_t objects_count = 0; +}; + +#define RGW_LIST_BUCKETS_LIMIT_MAX 10000 + +class RGWListBuckets : public RGWOp { +protected: + bool sent_data; + std::string marker; + std::string end_marker; + int64_t limit; + uint64_t limit_max; + bool is_truncated; + + RGWUsageStats global_stats; + std::map policies_stats; + + virtual uint64_t get_default_max() const { + return 1000; + } + +public: + RGWListBuckets() + : sent_data(false), + 
limit(RGW_LIST_BUCKETS_LIMIT_MAX), + limit_max(RGW_LIST_BUCKETS_LIMIT_MAX), + is_truncated(false) { + } + + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + virtual void handle_listing_chunk(rgw::sal::BucketList&& buckets) { + /* The default implementation, used by e.g. S3, just generates a new + * part of listing and sends it client immediately. Swift can behave + * differently: when the reverse option is requested, all incoming + * instances of RGWBucketList are buffered and finally reversed. */ + return send_response_data(buckets); + } + virtual void send_response_begin(bool has_buckets) = 0; + virtual void send_response_data(rgw::sal::BucketList& buckets) = 0; + virtual void send_response_end() = 0; + void send_response() override {} + + virtual bool should_get_stats() { return false; } + virtual bool supports_account_metadata() { return false; } + + const char* name() const override { return "list_buckets"; } + RGWOpType get_type() override { return RGW_OP_LIST_BUCKETS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; // class RGWListBuckets + +class RGWGetUsage : public RGWOp { +protected: + bool sent_data; + std::string start_date; + std::string end_date; + int show_log_entries; + int show_log_sum; + std::map categories; + std::map usage; + std::map summary_map; + std::map buckets_usage; + cls_user_header header; + RGWStorageStats stats; +public: + RGWGetUsage() : sent_data(false), show_log_entries(true), show_log_sum(true){ + } + + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override {} + + virtual bool should_get_stats() { return false; } + + const char* name() const override { return "get_self_usage"; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWStatAccount : public RGWOp { +protected: + 
RGWUsageStats global_stats; + std::map policies_stats; + +public: + RGWStatAccount() = default; + + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "stat_account"; } + RGWOpType get_type() override { return RGW_OP_STAT_ACCOUNT; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWListBucket : public RGWOp { +protected: + std::string prefix; + rgw_obj_key marker; + rgw_obj_key next_marker; + rgw_obj_key end_marker; + std::string max_keys; + std::string delimiter; + std::string encoding_type; + bool list_versions; + int max; + std::vector objs; + std::map common_prefixes; + + int default_max; + bool is_truncated; + bool allow_unordered; + + int shard_id; + + int parse_max_keys(); + +public: + RGWListBucket() : list_versions(false), max(0), + default_max(0), is_truncated(false), + allow_unordered(false), shard_id(-1) {} + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + } + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "list_bucket"; } + RGWOpType get_type() override { return RGW_OP_LIST_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + virtual bool need_container_stats() { return false; } +}; + +class RGWGetBucketLogging : public RGWOp { +public: + RGWGetBucketLogging() {} + int verify_permission(optional_yield y) override; + void execute(optional_yield) override { } + + void send_response() override = 0; + const char* name() const override { return "get_bucket_logging"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOGGING; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWGetBucketLocation : 
public RGWOp { +public: + RGWGetBucketLocation() {} + ~RGWGetBucketLocation() override {} + int verify_permission(optional_yield y) override; + void execute(optional_yield) override { } + + void send_response() override = 0; + const char* name() const override { return "get_bucket_location"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOCATION; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWGetBucketVersioning : public RGWOp { +protected: + bool versioned{false}; + bool versioning_enabled{false}; + bool mfa_enabled{false}; +public: + RGWGetBucketVersioning() = default; + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "get_bucket_versioning"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_VERSIONING; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +enum BucketVersionStatus { + VersioningStatusInvalid = -1, + VersioningNotChanged = 0, + VersioningEnabled = 1, + VersioningSuspended =2, +}; + +class RGWSetBucketVersioning : public RGWOp { +protected: + int versioning_status; + bool mfa_set_status{false}; + bool mfa_status{false}; + bufferlist in_data; +public: + RGWSetBucketVersioning() : versioning_status(VersioningNotChanged) {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) { return 0; } + + void send_response() override = 0; + const char* name() const override { return "set_bucket_versioning"; } + RGWOpType get_type() override { return RGW_OP_SET_BUCKET_VERSIONING; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetBucketWebsite : public RGWOp { +public: + RGWGetBucketWebsite() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void 
execute(optional_yield y) override;
+
+  void send_response() override = 0;
+  const char* name() const override { return "get_bucket_website"; }
+  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_WEBSITE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+/* Abstract op (send_response() is pure virtual) reported under the name
+ * "set_bucket_website" and classified as a write by op_mask().
+ * get_params() is an overridable hook; the default does nothing and
+ * returns 0.
+ */
+class RGWSetBucketWebsite : public RGWOp {
+protected:
+  bufferlist in_data;
+  RGWBucketWebsiteConf website_conf;
+public:
+  RGWSetBucketWebsite() {}
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  virtual int get_params(optional_yield y) { return 0; }
+
+  void send_response() override = 0;
+  const char* name() const override { return "set_bucket_website"; }
+  RGWOpType get_type() override { return RGW_OP_SET_BUCKET_WEBSITE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+/* Abstract op (send_response() is pure virtual) reported under the name
+ * "delete_bucket_website" and classified as a write by op_mask().
+ * NOTE(review): get_type() returns RGW_OP_SET_BUCKET_WEBSITE, the same
+ * value as RGWSetBucketWebsite, so the two ops cannot be told apart by
+ * type. Confirm whether this is intentional or a copy/paste leftover.
+ */
+class RGWDeleteBucketWebsite : public RGWOp {
+public:
+  RGWDeleteBucketWebsite() {}
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  void send_response() override = 0;
+  const char* name() const override { return "delete_bucket_website"; }
+  RGWOpType get_type() override { return RGW_OP_SET_BUCKET_WEBSITE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+/* Abstract op (send_response() is pure virtual) reported under the name
+ * "stat_bucket" and classified as a read by op_mask(). The `bucket`
+ * member is held through a std::unique_ptr.
+ */
+class RGWStatBucket : public RGWOp {
+protected:
+  std::unique_ptr bucket;
+
+public:
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  void send_response() override = 0;
+  const char* name() const override { return "stat_bucket"; }
+  RGWOpType get_type() override { return RGW_OP_STAT_BUCKET; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+class RGWCreateBucket : public RGWOp {
+protected:
+  RGWAccessControlPolicy policy;
+  std::string location_constraint;
+  rgw_placement_rule placement_rule;
+  RGWBucketInfo info;
+  obj_version ep_objv;
+  bool has_cors;
+  bool
relaxed_region_enforcement; + bool obj_lock_enabled; + RGWCORSConfiguration cors_config; + boost::optional swift_ver_location; + std::map attrs; + std::set rmattr_names; + + bufferlist in_data; + + virtual bool need_metadata_upload() const { return false; } + +public: + RGWCreateBucket() : has_cors(false), relaxed_region_enforcement(false), obj_lock_enabled(false) {} + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy.set_ctx(s->cct); + relaxed_region_enforcement = + s->cct->_conf.get_val("rgw_relaxed_region_enforcement"); + } + virtual int get_params(optional_yield y) { return 0; } + void send_response() override = 0; + const char* name() const override { return "create_bucket"; } + RGWOpType get_type() override { return RGW_OP_CREATE_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWDeleteBucket : public RGWOp { +protected: + RGWObjVersionTracker objv_tracker; + +public: + RGWDeleteBucket() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "delete_bucket"; } + RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +struct rgw_slo_entry { + std::string path; + std::string etag; + uint64_t size_bytes; + + rgw_slo_entry() : size_bytes(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(path, bl); + encode(etag, bl); + encode(size_bytes, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + 
decode(path, bl); + decode(etag, bl); + decode(size_bytes, bl); + DECODE_FINISH(bl); + } + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_slo_entry) + +struct RGWSLOInfo { + std::vector entries; + uint64_t total_size; + + /* in memory only */ + bufferlist raw_data; + + RGWSLOInfo() : total_size(0) {} + ~RGWSLOInfo() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(entries, bl); + encode(total_size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(entries, bl); + decode(total_size, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(RGWSLOInfo) + +class RGWPutObj : public RGWOp { +protected: + seed torrent; + off_t ofs; + const char *supplied_md5_b64; + const char *supplied_etag; + const char *if_match; + const char *if_nomatch; + std::string copy_source; + const char *copy_source_range; + RGWBucketInfo copy_source_bucket_info; + std::string copy_source_tenant_name; + std::string copy_source_bucket_name; + std::string copy_source_object_name; + std::string copy_source_version_id; + off_t copy_source_range_fst; + off_t copy_source_range_lst; + std::string etag; + bool chunked_upload; + RGWAccessControlPolicy policy; + std::unique_ptr obj_tags; + const char *dlo_manifest; + RGWSLOInfo *slo_info; + rgw::sal::Attrs attrs; + ceph::real_time mtime; + uint64_t olh_epoch; + std::string version_id; + bufferlist bl_aux; + std::map crypt_http_responses; + std::string user_data; + + std::string multipart_upload_id; + std::string multipart_part_str; + int multipart_part_num = 0; + jspan multipart_trace; + + boost::optional delete_at; + //append obj + bool append; + uint64_t position; + uint64_t cur_accounted_size; + + //object lock + RGWObjectRetention *obj_retention; + RGWObjectLegalHold *obj_legal_hold; + +public: + RGWPutObj() : ofs(0), + supplied_md5_b64(NULL), + supplied_etag(NULL), + if_match(NULL), + if_nomatch(NULL), + copy_source_range(NULL), + 
copy_source_range_fst(0), + copy_source_range_lst(0), + chunked_upload(0), + dlo_manifest(NULL), + slo_info(NULL), + olh_epoch(0), + append(false), + position(0), + cur_accounted_size(0), + obj_retention(nullptr), + obj_legal_hold(nullptr) {} + + ~RGWPutObj() override { + delete slo_info; + delete obj_retention; + delete obj_legal_hold; + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy.set_ctx(s->cct); + } + + virtual int init_processing(optional_yield y) override; + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + /* this is for cases when copying data from other object */ + virtual int get_decrypt_filter(std::unique_ptr* filter, + RGWGetObj_Filter* cb, + std::map& attrs, + bufferlist* manifest_bl) { + *filter = nullptr; + return 0; + } + virtual int get_encrypt_filter(std::unique_ptr *filter, + rgw::sal::DataProcessor *cb) { + return 0; + } + + // get lua script to run as a "put object" filter + int get_lua_filter(std::unique_ptr* filter, + rgw::sal::DataProcessor* cb); + + int get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len); + int get_data(const off_t fst, const off_t lst, bufferlist& bl); + + virtual int get_params(optional_yield y) = 0; + virtual int get_data(bufferlist& bl) = 0; + void send_response() override = 0; + const char* name() const override { return "put_obj"; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +class RGWPostObj : public RGWOp { +protected: + off_t min_len; + off_t max_len; + int len; + off_t ofs; + const char *supplied_md5_b64; + const char *supplied_etag; + std::string etag; + 
RGWAccessControlPolicy policy; + std::map attrs; + boost::optional delete_at; + + /* Must be called after get_data() or the result is undefined. */ + virtual std::string get_current_filename() const = 0; + virtual std::string get_current_content_type() const = 0; + virtual bool is_next_file_to_upload() { + return false; + } +public: + RGWPostObj() : min_len(0), + max_len(LLONG_MAX), + len(0), + ofs(0), + supplied_md5_b64(nullptr), + supplied_etag(nullptr) { + } + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy.set_ctx(s->cct); + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_encrypt_filter(std::unique_ptr *filter, + rgw::sal::DataProcessor *cb) { + return 0; + } + virtual int get_params(optional_yield y) = 0; + virtual int get_data(ceph::bufferlist& bl, bool& again) = 0; + void send_response() override = 0; + const char* name() const override { return "post_obj"; } + RGWOpType get_type() override { return RGW_OP_POST_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +class RGWPutMetadataAccount : public RGWOp { +protected: + std::set rmattr_names; + std::map attrs, orig_attrs; + std::map temp_url_keys; + RGWQuotaInfo new_quota; + bool new_quota_extracted; + + RGWAccessControlPolicy policy; + bool has_policy; + +public: + RGWPutMetadataAccount() + : new_quota_extracted(false), + has_policy(false) { + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy.set_ctx(s->cct); + } + int init_processing(optional_yield y) override; + int verify_permission(optional_yield y) override; + void pre_exec() 
override { } + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + virtual void filter_out_temp_url(std::map& add_attrs, + const std::set& rmattr_names, + std::map& temp_url_keys); + const char* name() const override { return "put_account_metadata"; } + RGWOpType get_type() override { return RGW_OP_PUT_METADATA_ACCOUNT; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWPutMetadataBucket : public RGWOp { +protected: + rgw::sal::Attrs attrs; + std::set rmattr_names; + bool has_policy, has_cors; + uint32_t policy_rw_mask; + RGWAccessControlPolicy policy; + RGWCORSConfiguration cors_config; + rgw_placement_rule placement_rule; + boost::optional swift_ver_location; + +public: + RGWPutMetadataBucket() + : has_policy(false), has_cors(false), policy_rw_mask(0) + {} + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy.set_ctx(s->cct); + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "put_bucket_metadata"; } + RGWOpType get_type() override { return RGW_OP_PUT_METADATA_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWPutMetadataObject : public RGWOp { +protected: + RGWAccessControlPolicy policy; + boost::optional delete_at; + const char *dlo_manifest; + +public: + RGWPutMetadataObject() + : dlo_manifest(NULL) + {} + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy.set_ctx(s->cct); + } + int verify_permission(optional_yield y) override; + void pre_exec() 
override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "put_obj_metadata"; } + RGWOpType get_type() override { return RGW_OP_PUT_METADATA_OBJECT; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + virtual bool need_object_expiration() { return false; } +}; + +class RGWDeleteObj : public RGWOp { +protected: + bool delete_marker; + bool multipart_delete; + std::string version_id; + ceph::real_time unmod_since; /* if unmodified since */ + bool no_precondition_error; + std::unique_ptr deleter; + bool bypass_perm; + bool bypass_governance_mode; + +public: + RGWDeleteObj() + : delete_marker(false), + multipart_delete(false), + no_precondition_error(false), + deleter(nullptr), + bypass_perm(true), + bypass_governance_mode(false) { + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + int handle_slo_manifest(bufferlist& bl, optional_yield y); + + virtual int get_params(optional_yield y) { return 0; } + void send_response() override = 0; + const char* name() const override { return "delete_obj"; } + RGWOpType get_type() override { return RGW_OP_DELETE_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + virtual bool need_object_expiration() { return false; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +class RGWCopyObj : public RGWOp { +protected: + RGWAccessControlPolicy dest_policy; + const char *if_mod; + const char *if_unmod; + const char *if_match; + const char *if_nomatch; + // Required or it is not a copy operation + std::string_view copy_source; + // Not actually required + std::optional md_directive; + + off_t ofs; + off_t len; + off_t end; + ceph::real_time mod_time; + ceph::real_time unmod_time; + ceph::real_time *mod_ptr; + ceph::real_time *unmod_ptr; + rgw::sal::Attrs attrs; + 
std::unique_ptr src_bucket; + ceph::real_time src_mtime; + ceph::real_time mtime; + rgw::sal::AttrsMod attrs_mod; + std::string source_zone; + std::string etag; + + off_t last_ofs; + + std::string version_id; + uint64_t olh_epoch; + + boost::optional delete_at; + bool copy_if_newer; + + bool need_to_check_storage_class = false; + + //object lock + RGWObjectRetention *obj_retention; + RGWObjectLegalHold *obj_legal_hold; + + int init_common(); + +public: + RGWCopyObj() { + if_mod = NULL; + if_unmod = NULL; + if_match = NULL; + if_nomatch = NULL; + ofs = 0; + len = 0; + end = -1; + mod_ptr = NULL; + unmod_ptr = NULL; + attrs_mod = rgw::sal::ATTRSMOD_NONE; + last_ofs = 0; + olh_epoch = 0; + copy_if_newer = false; + obj_retention = nullptr; + obj_legal_hold = nullptr; + } + + ~RGWCopyObj() override { + delete obj_retention; + delete obj_legal_hold; + } + + static bool parse_copy_location(const std::string_view& src, + std::string& bucket_name, + rgw_obj_key& object, + req_state *s); + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + dest_policy.set_ctx(s->cct); + } + int init_processing(optional_yield y) override; + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + void progress_cb(off_t ofs); + + virtual int check_storage_class(const rgw_placement_rule& src_placement) { + return 0; + } + + virtual int init_dest_policy() { return 0; } + virtual int get_params(optional_yield y) = 0; + virtual void send_partial_response(off_t ofs) {} + void send_response() override = 0; + const char* name() const override { return "copy_obj"; } + RGWOpType get_type() override { return RGW_OP_COPY_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + 
+class RGWGetACLs : public RGWOp { +protected: + std::string acls; + +public: + RGWGetACLs() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "get_acls"; } + RGWOpType get_type() override { return RGW_OP_GET_ACLS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutACLs : public RGWOp { +protected: + bufferlist data; + ACLOwner owner; + +public: + RGWPutACLs() {} + ~RGWPutACLs() override {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_policy_from_state(rgw::sal::Driver* driver, req_state *s, std::stringstream& ss) { return 0; } + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "put_acls"; } + RGWOpType get_type() override { return RGW_OP_PUT_ACLS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetLC : public RGWOp { +protected: + +public: + RGWGetLC() { } + ~RGWGetLC() override { } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield) override = 0; + + void send_response() override = 0; + const char* name() const override { return "get_lifecycle"; } + RGWOpType get_type() override { return RGW_OP_GET_LC; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutLC : public RGWOp { +protected: + bufferlist data; + const char *content_md5; + std::string cookie; + +public: + RGWPutLC() { + content_md5 = nullptr; + } + ~RGWPutLC() override {} + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *dialect_handler) override { + static constexpr std::size_t COOKIE_LEN = 16; + char buf[COOKIE_LEN + 1]; + + RGWOp::init(driver, s, dialect_handler); + gen_rand_alphanumeric(s->cct, buf, 
sizeof(buf) - 1); + cookie = buf; + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + +// virtual int get_policy_from_state(RGWRados* driver, req_state *s, std::stringstream& ss) { return 0; } + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "put_lifecycle"; } + RGWOpType get_type() override { return RGW_OP_PUT_LC; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWDeleteLC : public RGWOp { +public: + RGWDeleteLC() = default; + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "delete_lifecycle"; } + RGWOpType get_type() override { return RGW_OP_DELETE_LC; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetCORS : public RGWOp { +protected: + +public: + RGWGetCORS() {} + + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "get_cors"; } + RGWOpType get_type() override { return RGW_OP_GET_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutCORS : public RGWOp { +protected: + bufferlist cors_bl; + bufferlist in_data; + +public: + RGWPutCORS() {} + ~RGWPutCORS() override {} + + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "put_cors"; } + RGWOpType get_type() override { return RGW_OP_PUT_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWDeleteCORS : public RGWOp { +protected: + +public: + RGWDeleteCORS() {} + + int 
verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "delete_cors"; } + RGWOpType get_type() override { return RGW_OP_DELETE_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWOptionsCORS : public RGWOp { +protected: + RGWCORSRule *rule; + const char *origin, *req_hdrs, *req_meth; + +public: + RGWOptionsCORS() : rule(NULL), origin(NULL), + req_hdrs(NULL), req_meth(NULL) { + } + + int verify_permission(optional_yield y) override {return 0;} + int validate_cors_request(RGWCORSConfiguration *cc); + void execute(optional_yield y) override; + void get_response_params(std::string& allowed_hdrs, std::string& exp_hdrs, unsigned *max_age); + void send_response() override = 0; + const char* name() const override { return "options_cors"; } + RGWOpType get_type() override { return RGW_OP_OPTIONS_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutBucketEncryption : public RGWOp { +protected: + RGWBucketEncryptionConfig bucket_encryption_conf; + bufferlist data; +public: + RGWPutBucketEncryption() = default; + ~RGWPutBucketEncryption() {} + + int get_params(optional_yield y); + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + const char* name() const override { return "put_bucket_encryption"; } + RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_ENCRYPTION; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetBucketEncryption : public RGWOp { +protected: + RGWBucketEncryptionConfig bucket_encryption_conf; +public: + RGWGetBucketEncryption() {} + + int get_params(optional_yield y); + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + const char* name() const override { return "get_bucket_encryption"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_ENCRYPTION; 
} + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWDeleteBucketEncryption : public RGWOp { +protected: + RGWBucketEncryptionConfig bucket_encryption_conf; +public: + RGWDeleteBucketEncryption() {} + + int get_params(optional_yield y); + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + const char* name() const override { return "delete_bucket_encryption"; } + RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_ENCRYPTION; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetRequestPayment : public RGWOp { +protected: + bool requester_pays; + +public: + RGWGetRequestPayment() : requester_pays(0) {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "get_request_payment"; } + RGWOpType get_type() override { return RGW_OP_GET_REQUEST_PAYMENT; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWSetRequestPayment : public RGWOp { +protected: + bool requester_pays; + bufferlist in_data; +public: + RGWSetRequestPayment() : requester_pays(false) {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) { return 0; } + + void send_response() override = 0; + const char* name() const override { return "set_request_payment"; } + RGWOpType get_type() override { return RGW_OP_SET_REQUEST_PAYMENT; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWInitMultipart : public RGWOp { +protected: + std::string upload_id; + RGWAccessControlPolicy policy; + ceph::real_time mtime; + jspan multipart_trace; + +public: + RGWInitMultipart() {} + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + 
policy.set_ctx(s->cct); + } + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "init_multipart"; } + RGWOpType get_type() override { return RGW_OP_INIT_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + virtual int prepare_encryption(std::map& attrs) { return 0; } +}; + +class RGWCompleteMultipart : public RGWOp { +protected: + std::string upload_id; + std::string etag; + std::string version_id; + bufferlist data; + std::unique_ptr serializer; + jspan multipart_trace; + +public: + RGWCompleteMultipart() {} + ~RGWCompleteMultipart() = default; + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + bool check_previously_completed(const RGWMultiCompleteUpload* parts); + void complete() override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "complete_multipart"; } + RGWOpType get_type() override { return RGW_OP_COMPLETE_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWAbortMultipart : public RGWOp { +protected: + jspan multipart_trace; +public: + RGWAbortMultipart() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + void send_response() override = 0; + const char* name() const override { return "abort_multipart"; } + RGWOpType get_type() override { return RGW_OP_ABORT_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +class RGWListMultipart : public RGWOp { +protected: + std::string upload_id; + std::unique_ptr upload; + int max_parts; + int marker; + RGWAccessControlPolicy policy; + bool truncated; + rgw_placement_rule* placement; + 
+public: + RGWListMultipart() { + max_parts = 1000; + marker = 0; + truncated = false; + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + policy = RGWAccessControlPolicy(s->cct); + } + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "list_multipart"; } + RGWOpType get_type() override { return RGW_OP_LIST_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWListBucketMultiparts : public RGWOp { +protected: + std::string prefix; + std::string marker_meta; + std::string marker_key; + std::string marker_upload_id; + std::string next_marker_key; + std::string next_marker_upload_id; + int max_uploads; + std::string delimiter; + std::vector> uploads; + std::map common_prefixes; + bool is_truncated; + int default_max; + bool encode_url {false}; + +public: + RGWListBucketMultiparts() { + max_uploads = 0; + is_truncated = false; + default_max = 0; + } + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + max_uploads = default_max; + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "list_bucket_multiparts"; } + RGWOpType get_type() override { return RGW_OP_LIST_BUCKET_MULTIPARTS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + + +class RGWGetCrossDomainPolicy : public RGWOp { +public: + RGWGetCrossDomainPolicy() = default; + ~RGWGetCrossDomainPolicy() override = default; + + int verify_permission(optional_yield) override { + return 0; + } + + void execute(optional_yield) override { + op_ret = 0; + 
}
+
+  const char* name() const override { return "get_crossdomain_policy"; }
+
+  RGWOpType get_type() override {
+    return RGW_OP_GET_CROSS_DOMAIN_POLICY;
+  }
+
+  uint32_t op_mask() override {
+    return RGW_OP_TYPE_READ;
+  }
+};
+
+
+/* Read-type op reported under the name "get_health_check".
+ * verify_permission() unconditionally returns 0; execute() is only
+ * declared here. */
+class RGWGetHealthCheck : public RGWOp {
+public:
+  RGWGetHealthCheck() = default;
+  ~RGWGetHealthCheck() override = default;
+
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_health_check"; }
+
+  RGWOpType get_type() override {
+    return RGW_OP_GET_HEALTH_CHECK;
+  }
+
+  uint32_t op_mask() override {
+    return RGW_OP_TYPE_READ;
+  }
+};
+
+
+/* Abstract op reported under the name "multi_object_delete" and classified
+ * as a delete by op_mask(). Subclasses supply get_params(), send_status(),
+ * begin_response(), send_partial_response() and end_response(). */
+class RGWDeleteMultiObj : public RGWOp {
+  /**
+   * Handles the deletion of an individual object and uses
+   * set_partial_response to record the outcome.
+   */
+  void handle_individual_object(const rgw_obj_key& o,
+				optional_yield y,
+                                boost::asio::deadline_timer *formatter_flush_cond);
+
+  /**
+   * When the request is being executed in a coroutine, performs
+   * the actual formatter flushing and is responsible for the
+   * termination condition (when all partial object responses
+   * have been sent). Note that the formatter flushing must be handled
+   * on the coroutine that invokes the execute method vs. the
+   * coroutines that are spawned to handle individual objects because
+   * the flush logic uses a yield context that was captured
+   * and saved on the req_state vs. one that is passed on the stack.
+   * This is a no-op in the case where we're not executing as a coroutine.
+   */
+  void wait_flush(optional_yield y,
+                  boost::asio::deadline_timer *formatter_flush_cond,
+                  std::function predicate);
+
+protected:
+  std::vector ops_log_entries;
+  bufferlist data;
+  /* Raw pointer; the constructor does not assign it, so it is given an
+   * in-class default to avoid an indeterminate value. */
+  rgw::sal::Bucket* bucket = nullptr;
+  bool quiet;
+  bool status_dumped;
+  bool acl_allowed = false;
+  bool bypass_perm;
+  bool bypass_governance_mode;
+
+public:
+  RGWDeleteMultiObj() {
+    quiet = false;
+    status_dumped = false;
+    bypass_perm = true;
+    bypass_governance_mode = false;
+  }
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  virtual int get_params(optional_yield y) = 0;
+  virtual void send_status() = 0;
+  virtual void begin_response() = 0;
+  virtual void send_partial_response(const rgw_obj_key& key, bool delete_marker,
+                                     const std::string& marker_version_id, int ret,
+                                     boost::asio::deadline_timer *formatter_flush_cond) = 0;
+  virtual void end_response() = 0;
+  const char* name() const override { return "multi_object_delete"; }
+  RGWOpType get_type() override { return RGW_OP_DELETE_MULTI_OBJ; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+
+  void write_ops_log_entry(rgw_log_entry& entry) const override;
+};
+
+/* Read-type op reported under the name "get info"; verify_permission()
+ * unconditionally returns 0. */
+class RGWInfo: public RGWOp {
+public:
+  RGWInfo() = default;
+  ~RGWInfo() override = default;
+
+  int verify_permission(optional_yield) override { return 0; }
+  const char* name() const override { return "get info"; }
+  RGWOpType get_type() override { return RGW_OP_GET_INFO; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+extern int rgw_build_bucket_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+				     req_state* s, optional_yield y);
+extern int rgw_build_object_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+				     req_state *s, bool prefetch_data, optional_yield y);
+extern void rgw_build_iam_environment(rgw::sal::Driver* driver,
+				      req_state* s);
+extern std::vector get_iam_user_policy_from_attr(CephContext* cct,
+                         std::map&
attrs, + const std::string& tenant); + +inline int get_system_versioning_params(req_state *s, + uint64_t *olh_epoch, + std::string *version_id) +{ + if (!s->system_request) { + return 0; + } + + if (olh_epoch) { + std::string epoch_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "versioned-epoch"); + if (!epoch_str.empty()) { + std::string err; + *olh_epoch = strict_strtol(epoch_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_subdout(s, rgw, 0) << "failed to parse versioned-epoch param" + << dendl; + return -EINVAL; + } + } + } + + if (version_id) { + *version_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "version-id"); + } + + return 0; +} /* get_system_versioning_params */ + +static inline void format_xattr(std::string &xattr) +{ + /* If the extended attribute is not valid UTF-8, we encode it using + * quoted-printable encoding. + */ + if ((check_utf8(xattr.c_str(), xattr.length()) != 0) || + (check_for_control_characters(xattr.c_str(), xattr.length()) != 0)) { + static const char MIME_PREFIX_STR[] = "=?UTF-8?Q?"; + static const int MIME_PREFIX_LEN = sizeof(MIME_PREFIX_STR) - 1; + static const char MIME_SUFFIX_STR[] = "?="; + static const int MIME_SUFFIX_LEN = sizeof(MIME_SUFFIX_STR) - 1; + int mlen = mime_encode_as_qp(xattr.c_str(), NULL, 0); + char *mime = new char[MIME_PREFIX_LEN + mlen + MIME_SUFFIX_LEN + 1]; + strcpy(mime, MIME_PREFIX_STR); + mime_encode_as_qp(xattr.c_str(), mime + MIME_PREFIX_LEN, mlen); + strcpy(mime + MIME_PREFIX_LEN + (mlen - 1), MIME_SUFFIX_STR); + xattr.assign(mime); + delete [] mime; + } +} /* format_xattr */ + +/** + * Get the HTTP request metadata out of the req_state as a + * map(, where attr_name is RGW_ATTR_PREFIX.HTTP_NAME) + * s: The request state + * attrs: will be filled up with attrs mapped as + * On success returns 0. + * On failure returns a negative error code. 
+ * + */ +inline int rgw_get_request_metadata(const DoutPrefixProvider *dpp, + CephContext* const cct, + struct req_info& info, + std::map& attrs, + const bool allow_empty_attrs = true) +{ + static const std::set blocklisted_headers = { + "x-amz-server-side-encryption-customer-algorithm", + "x-amz-server-side-encryption-customer-key", + "x-amz-server-side-encryption-customer-key-md5", + "x-amz-storage-class" + }; + + size_t valid_meta_count = 0; + for (auto& kv : info.x_meta_map) { + const std::string& name = kv.first; + std::string& xattr = kv.second; + + if (blocklisted_headers.count(name) == 1) { + ldpp_subdout(dpp, rgw, 10) << "skipping x>> " << name << dendl; + continue; + } else if (allow_empty_attrs || !xattr.empty()) { + ldpp_subdout(dpp, rgw, 10) << "x>> " << name << ":" << xattr << dendl; + format_xattr(xattr); + + std::string attr_name(RGW_ATTR_PREFIX); + attr_name.append(name); + + /* Check roughly whether we aren't going behind the limit on attribute + * name. Passing here doesn't guarantee that an OSD will accept that + * as ObjectStore::get_max_attr_name_length() can set the limit even + * lower than the "osd_max_attr_name_len" configurable. */ + const auto max_attr_name_len = cct->_conf->rgw_max_attr_name_len; + if (max_attr_name_len && attr_name.length() > max_attr_name_len) { + return -ENAMETOOLONG; + } + + /* Similar remarks apply to the check for value size. We're veryfing + * it early at the RGW's side as it's being claimed in /info. */ + const auto max_attr_size = cct->_conf->rgw_max_attr_size; + if (max_attr_size && xattr.length() > max_attr_size) { + return -EFBIG; + } + + /* Swift allows administrators to limit the number of metadats items + * send _in a single request_. 
*/ + const auto max_attrs_num_in_req = cct->_conf->rgw_max_attrs_num_in_req; + if (max_attrs_num_in_req && + ++valid_meta_count > max_attrs_num_in_req) { + return -E2BIG; + } + + auto rval = attrs.emplace(std::move(attr_name), ceph::bufferlist()); + /* At the moment the value of the freshly created attribute key-value + * pair is an empty bufferlist. */ + + ceph::bufferlist& bl = rval.first->second; + bl.append(xattr.c_str(), xattr.size() + 1); + } + } + + return 0; +} /* rgw_get_request_metadata */ + +inline void encode_delete_at_attr(boost::optional delete_at, + std::map& attrs) +{ + if (delete_at == boost::none) { + return; + } + + bufferlist delatbl; + encode(*delete_at, delatbl); + attrs[RGW_ATTR_DELETE_AT] = delatbl; +} /* encode_delete_at_attr */ + +inline void encode_obj_tags_attr(RGWObjTags* obj_tags, std::map& attrs) +{ + if (obj_tags == nullptr){ + // we assume the user submitted a tag format which we couldn't parse since + // this wouldn't be parsed later by get/put obj tags, lets delete if the + // attr was populated + return; + } + + bufferlist tagsbl; + obj_tags->encode(tagsbl); + attrs[RGW_ATTR_TAGS] = tagsbl; +} + +inline int encode_dlo_manifest_attr(const char * const dlo_manifest, + std::map& attrs) +{ + std::string dm = dlo_manifest; + + if (dm.find('/') == std::string::npos) { + return -EINVAL; + } + + bufferlist manifest_bl; + manifest_bl.append(dlo_manifest, strlen(dlo_manifest) + 1); + attrs[RGW_ATTR_USER_MANIFEST] = manifest_bl; + + return 0; +} /* encode_dlo_manifest_attr */ + +inline void complete_etag(MD5& hash, std::string *etag) +{ + char etag_buf[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char etag_buf_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + + hash.Final((unsigned char *)etag_buf); + buf_to_hex((const unsigned char *)etag_buf, CEPH_CRYPTO_MD5_DIGESTSIZE, + etag_buf_str); + + *etag = etag_buf_str; +} /* complete_etag */ + +using boost::container::flat_map; + +class RGWGetAttrs : public RGWOp { +public: + using get_attrs_t = flat_map>; 
+protected: + get_attrs_t attrs; + +public: + RGWGetAttrs() + {} + + virtual ~RGWGetAttrs() {} + + void emplace_key(std::string&& key) { + attrs.emplace(std::move(key), std::nullopt); + } + + int verify_permission(optional_yield y); + void pre_exec(); + void execute(optional_yield y); + + virtual int get_params() = 0; + virtual void send_response() = 0; + virtual const char* name() const { return "get_attrs"; } + virtual RGWOpType get_type() { return RGW_OP_GET_ATTRS; } + virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; } +}; /* RGWGetAttrs */ + +class RGWSetAttrs : public RGWOp { +protected: + std::map attrs; + +public: + RGWSetAttrs() {} + ~RGWSetAttrs() override {} + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + void send_response() override = 0; + const char* name() const override { return "set_attrs"; } + RGWOpType get_type() override { return RGW_OP_SET_ATTRS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWRMAttrs : public RGWOp { +protected: + rgw::sal::Attrs attrs; + +public: + RGWRMAttrs() + {} + + virtual ~RGWRMAttrs() {} + + void emplace_key(std::string&& key) { + attrs.emplace(std::move(key), buffer::list()); + } + + int verify_permission(optional_yield y); + void pre_exec(); + void execute(optional_yield y); + + virtual int get_params() = 0; + virtual void send_response() = 0; + virtual const char* name() const { return "rm_attrs"; } + virtual RGWOpType get_type() { return RGW_OP_DELETE_ATTRS; } + virtual uint32_t op_mask() { return RGW_OP_TYPE_DELETE; } +}; /* RGWRMAttrs */ + +class RGWGetObjLayout : public RGWOp { +public: + RGWGetObjLayout() { + } + + int check_caps(RGWUserCaps& caps) { + return caps.check_cap("admin", RGW_CAP_READ); + } + int 
verify_permission(optional_yield) override { + return check_caps(s->user->get_info().caps); + } + void pre_exec() override; + void execute(optional_yield y) override; + + const char* name() const override { return "get_obj_layout"; } + virtual RGWOpType get_type() override { return RGW_OP_GET_OBJ_LAYOUT; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutBucketPolicy : public RGWOp { + bufferlist data; +public: + RGWPutBucketPolicy() = default; + ~RGWPutBucketPolicy() { + } + void send_response() override; + int verify_permission(optional_yield y) override; + uint32_t op_mask() override { + return RGW_OP_TYPE_WRITE; + } + void execute(optional_yield y) override; + int get_params(optional_yield y); + const char* name() const override { return "put_bucket_policy"; } + RGWOpType get_type() override { + return RGW_OP_PUT_BUCKET_POLICY; + } +}; + +class RGWGetBucketPolicy : public RGWOp { + buffer::list policy; +public: + RGWGetBucketPolicy() = default; + void send_response() override; + int verify_permission(optional_yield y) override; + uint32_t op_mask() override { + return RGW_OP_TYPE_READ; + } + void execute(optional_yield y) override; + const char* name() const override { return "get_bucket_policy"; } + RGWOpType get_type() override { + return RGW_OP_GET_BUCKET_POLICY; + } +}; + +class RGWDeleteBucketPolicy : public RGWOp { +public: + RGWDeleteBucketPolicy() = default; + void send_response() override; + int verify_permission(optional_yield y) override; + uint32_t op_mask() override { + return RGW_OP_TYPE_WRITE; + } + void execute(optional_yield y) override; + int get_params(optional_yield y); + const char* name() const override { return "delete_bucket_policy"; } + RGWOpType get_type() override { + return RGW_OP_DELETE_BUCKET_POLICY; + } +}; + +class RGWPutBucketObjectLock : public RGWOp { +protected: + bufferlist data; + bufferlist obj_lock_bl; + RGWObjectLock obj_lock; +public: + RGWPutBucketObjectLock() = default; + 
~RGWPutBucketObjectLock() {} + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual void send_response() override = 0; + virtual int get_params(optional_yield y) = 0; + const char* name() const override { return "put_bucket_object_lock"; } + RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_OBJ_LOCK; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetBucketObjectLock : public RGWOp { +public: + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual void send_response() override = 0; + const char* name() const override {return "get_bucket_object_lock"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_OBJ_LOCK; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutObjRetention : public RGWOp { +protected: + bufferlist data; + RGWObjectRetention obj_retention; + bool bypass_perm; + bool bypass_governance_mode; +public: + RGWPutObjRetention():bypass_perm(true), bypass_governance_mode(false) {} + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual void send_response() override = 0; + virtual int get_params(optional_yield y) = 0; + const char* name() const override { return "put_obj_retention"; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ_RETENTION; } +}; + +class RGWGetObjRetention : public RGWOp { +protected: + RGWObjectRetention obj_retention; +public: + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual void send_response() override = 0; + const char* name() const override {return "get_obj_retention"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_RETENTION; } + uint32_t op_mask() override { return 
RGW_OP_TYPE_READ; } +}; + +class RGWPutObjLegalHold : public RGWOp { +protected: + bufferlist data; + RGWObjectLegalHold obj_legal_hold; +public: + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual void send_response() override = 0; + virtual int get_params(optional_yield y) = 0; + const char* name() const override { return "put_obj_legal_hold"; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ_LEGAL_HOLD; } +}; + +class RGWGetObjLegalHold : public RGWOp { +protected: + RGWObjectLegalHold obj_legal_hold; +public: + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual void send_response() override = 0; + const char* name() const override {return "get_obj_legal_hold"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_LEGAL_HOLD; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + + +class RGWConfigBucketMetaSearch : public RGWOp { +protected: + std::map mdsearch_config; +public: + RGWConfigBucketMetaSearch() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + virtual int get_params(optional_yield y) = 0; + const char* name() const override { return "config_bucket_meta_search"; } + virtual RGWOpType get_type() override { return RGW_OP_CONFIG_BUCKET_META_SEARCH; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetBucketMetaSearch : public RGWOp { +public: + RGWGetBucketMetaSearch() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield) override {} + + const char* name() const override { return "get_bucket_meta_search"; } + virtual RGWOpType get_type() override { return RGW_OP_GET_BUCKET_META_SEARCH; } + virtual uint32_t op_mask() override { return 
RGW_OP_TYPE_READ; } +}; + +class RGWDelBucketMetaSearch : public RGWOp { +public: + RGWDelBucketMetaSearch() {} + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + + const char* name() const override { return "delete_bucket_meta_search"; } + /* get_type() is the hook the sibling ops override to report their + * RGWOpType; without this override the op would not report + * RGW_OP_DEL_BUCKET_META_SEARCH through it. */ + RGWOpType get_type() override { return RGW_OP_DEL_BUCKET_META_SEARCH; } + /* Legacy spelling, kept so any existing delete_type() caller still builds. */ + virtual RGWOpType delete_type() { return get_type(); } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetClusterStat : public RGWOp { +protected: + RGWClusterStat stats_op; +public: + RGWGetClusterStat() {} + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWOp::init(driver, s, h); + } + int verify_permission(optional_yield) override {return 0;} + virtual void send_response() override = 0; + virtual int get_params(optional_yield y) = 0; + void execute(optional_yield y) override; + const char* name() const override { return "get_cluster_stat"; } + dmc::client_id dmclock_client() override { return dmc::client_id::admin; } +}; + +class RGWGetBucketPolicyStatus : public RGWOp { +protected: + bool isPublic {false}; +public: + int verify_permission(optional_yield y) override; + const char* name() const override { return "get_bucket_policy_status"; } + virtual RGWOpType get_type() override { return RGW_OP_GET_BUCKET_POLICY_STATUS; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + void execute(optional_yield y) override; + dmc::client_id dmclock_client() override { return dmc::client_id::metadata; } +}; + +class RGWPutBucketPublicAccessBlock : public RGWOp { +protected: + bufferlist data; + PublicAccessBlockConfiguration access_conf; +public: + int verify_permission(optional_yield y) override; + const char* name() const override { return "put_bucket_public_access_block";} + virtual RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_PUBLIC_ACCESS_BLOCK; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + int
get_params(optional_yield y); + void execute(optional_yield y) override; + dmc::client_id dmclock_client() override { return dmc::client_id::metadata; } +}; + +class RGWGetBucketPublicAccessBlock : public RGWOp { +protected: + PublicAccessBlockConfiguration access_conf; +public: + int verify_permission(optional_yield y) override; + const char* name() const override { return "get_bucket_public_access_block";} + virtual RGWOpType get_type() override { return RGW_OP_GET_BUCKET_PUBLIC_ACCESS_BLOCK; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + int get_params(optional_yield y); + void execute(optional_yield y) override; + dmc::client_id dmclock_client() override { return dmc::client_id::metadata; } +}; + +class RGWDeleteBucketPublicAccessBlock : public RGWOp { +protected: + PublicAccessBlockConfiguration access_conf; +public: + int verify_permission(optional_yield y) override; + const char* name() const override { return "delete_bucket_public_access_block";} + virtual RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_PUBLIC_ACCESS_BLOCK; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + int get_params(optional_yield y); + void execute(optional_yield y) override; + void send_response() override; + dmc::client_id dmclock_client() override { return dmc::client_id::metadata; } +}; + +/** + * Parse a base-10 integer from `input`, clamp it to + * [lower_bound, upper_bound] and store the result in `output`. + * + * An empty `input` stores `default_val` (not clamped) and succeeds. + * Trailing white space is accepted. If no digits are found, or characters + * other than white space follow the number, -EINVAL is returned; `output` + * is still written in that case and then holds the clamped partial parse. + * + * The value is clamped while it is still a long, so an input that does not + * fit in an int cannot wrap around and slip past the bounds check. The + * bounds themselves are assumed to fit in an int. + * + * Returns 0 on success, -EINVAL on malformed input. + */ +inline int parse_value_and_bound( + const std::string &input, + int &output, + const long lower_bound, + const long upper_bound, + const long default_val) +{ + if (input.empty()) { + output = default_val; + return 0; + } + + char *endptr = nullptr; + long value = strtol(input.c_str(), &endptr, 10); + + /* clamp before narrowing to int */ + if (value > upper_bound) { + value = upper_bound; + } + if (value < lower_bound) { + value = lower_bound; + } + output = static_cast<int>(value); + + if (endptr == input.c_str()) { + return -EINVAL; /* no digits consumed */ + } + /* ignore trailing white space; the cast keeps isspace() well defined + * for bytes with the high bit set */ + while (*endptr && isspace(static_cast<unsigned char>(*endptr))) { + endptr++; + } + if (*endptr) { + return -EINVAL; + } + + return 0; +} + +int rgw_policy_from_attrset(const
DoutPrefixProvider *dpp, + CephContext *cct, + std::map& attrset, + RGWAccessControlPolicy *policy); diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h new file mode 100644 index 000000000..375c7348b --- /dev/null +++ b/src/rgw/rgw_op_type.h @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +enum RGWOpType { + RGW_OP_UNKNOWN = 0, + RGW_OP_GET_OBJ, + RGW_OP_LIST_BUCKETS, + RGW_OP_STAT_ACCOUNT, + RGW_OP_LIST_BUCKET, + RGW_OP_GET_BUCKET_LOGGING, + RGW_OP_GET_BUCKET_LOCATION, + RGW_OP_GET_BUCKET_VERSIONING, + RGW_OP_SET_BUCKET_VERSIONING, + RGW_OP_GET_BUCKET_WEBSITE, + RGW_OP_SET_BUCKET_WEBSITE, + RGW_OP_STAT_BUCKET, + RGW_OP_CREATE_BUCKET, + RGW_OP_DELETE_BUCKET, + RGW_OP_PUT_OBJ, + RGW_OP_STAT_OBJ, + RGW_OP_POST_OBJ, + RGW_OP_PUT_METADATA_ACCOUNT, + RGW_OP_PUT_METADATA_BUCKET, + RGW_OP_PUT_METADATA_OBJECT, + RGW_OP_SET_TEMPURL, + RGW_OP_DELETE_OBJ, + RGW_OP_COPY_OBJ, + RGW_OP_GET_ACLS, + RGW_OP_PUT_ACLS, + RGW_OP_GET_CORS, + RGW_OP_PUT_CORS, + RGW_OP_DELETE_CORS, + RGW_OP_OPTIONS_CORS, + RGW_OP_GET_BUCKET_ENCRYPTION, + RGW_OP_PUT_BUCKET_ENCRYPTION, + RGW_OP_DELETE_BUCKET_ENCRYPTION, + RGW_OP_GET_REQUEST_PAYMENT, + RGW_OP_SET_REQUEST_PAYMENT, + RGW_OP_INIT_MULTIPART, + RGW_OP_COMPLETE_MULTIPART, + RGW_OP_ABORT_MULTIPART, + RGW_OP_LIST_MULTIPART, + RGW_OP_LIST_BUCKET_MULTIPARTS, + RGW_OP_DELETE_MULTI_OBJ, + RGW_OP_BULK_DELETE, + RGW_OP_GET_KEYS, + RGW_OP_GET_ATTRS, + RGW_OP_DELETE_ATTRS, + RGW_OP_SET_ATTRS, + RGW_OP_GET_CROSS_DOMAIN_POLICY, + RGW_OP_GET_HEALTH_CHECK, + RGW_OP_GET_INFO, + RGW_OP_CREATE_ROLE, + RGW_OP_DELETE_ROLE, + RGW_OP_GET_ROLE, + RGW_OP_MODIFY_ROLE_TRUST_POLICY, + RGW_OP_LIST_ROLES, + RGW_OP_PUT_ROLE_POLICY, + RGW_OP_GET_ROLE_POLICY, + RGW_OP_LIST_ROLE_POLICIES, + RGW_OP_DELETE_ROLE_POLICY, + RGW_OP_TAG_ROLE, + RGW_OP_LIST_ROLE_TAGS, + RGW_OP_UNTAG_ROLE, + RGW_OP_UPDATE_ROLE, + RGW_OP_PUT_BUCKET_POLICY, + RGW_OP_GET_BUCKET_POLICY, + 
RGW_OP_DELETE_BUCKET_POLICY, + RGW_OP_PUT_OBJ_TAGGING, + RGW_OP_GET_OBJ_TAGGING, + RGW_OP_DELETE_OBJ_TAGGING, + RGW_OP_PUT_LC, + RGW_OP_GET_LC, + RGW_OP_DELETE_LC, + RGW_OP_PUT_USER_POLICY, + RGW_OP_GET_USER_POLICY, + RGW_OP_LIST_USER_POLICIES, + RGW_OP_DELETE_USER_POLICY, + RGW_OP_PUT_BUCKET_OBJ_LOCK, + RGW_OP_GET_BUCKET_OBJ_LOCK, + RGW_OP_PUT_OBJ_RETENTION, + RGW_OP_GET_OBJ_RETENTION, + RGW_OP_PUT_OBJ_LEGAL_HOLD, + RGW_OP_GET_OBJ_LEGAL_HOLD, + /* rgw specific */ + RGW_OP_ADMIN_SET_METADATA, + RGW_OP_GET_OBJ_LAYOUT, + RGW_OP_BULK_UPLOAD, + RGW_OP_METADATA_SEARCH, + RGW_OP_CONFIG_BUCKET_META_SEARCH, + RGW_OP_GET_BUCKET_META_SEARCH, + RGW_OP_DEL_BUCKET_META_SEARCH, + RGW_OP_SYNC_DATALOG_NOTIFY, + RGW_OP_SYNC_DATALOG_NOTIFY2, + RGW_OP_SYNC_MDLOG_NOTIFY, + RGW_OP_PERIOD_POST, + /* sts specific*/ + RGW_STS_ASSUME_ROLE, + RGW_STS_GET_SESSION_TOKEN, + RGW_STS_ASSUME_ROLE_WEB_IDENTITY, + /* pubsub */ + RGW_OP_PUBSUB_TOPIC_CREATE, + RGW_OP_PUBSUB_TOPICS_LIST, + RGW_OP_PUBSUB_TOPIC_GET, + RGW_OP_PUBSUB_TOPIC_DELETE, + RGW_OP_PUBSUB_SUB_CREATE, + RGW_OP_PUBSUB_SUB_GET, + RGW_OP_PUBSUB_SUB_DELETE, + RGW_OP_PUBSUB_SUB_PULL, + RGW_OP_PUBSUB_SUB_ACK, + RGW_OP_PUBSUB_NOTIF_CREATE, + RGW_OP_PUBSUB_NOTIF_DELETE, + RGW_OP_PUBSUB_NOTIF_LIST, + RGW_OP_GET_BUCKET_TAGGING, + RGW_OP_PUT_BUCKET_TAGGING, + RGW_OP_DELETE_BUCKET_TAGGING, + RGW_OP_GET_BUCKET_REPLICATION, + RGW_OP_PUT_BUCKET_REPLICATION, + RGW_OP_DELETE_BUCKET_REPLICATION, + /* public access */ + RGW_OP_GET_BUCKET_POLICY_STATUS, + RGW_OP_PUT_BUCKET_PUBLIC_ACCESS_BLOCK, + RGW_OP_GET_BUCKET_PUBLIC_ACCESS_BLOCK, + RGW_OP_DELETE_BUCKET_PUBLIC_ACCESS_BLOCK, + /*OIDC provider specific*/ + RGW_OP_CREATE_OIDC_PROVIDER, + RGW_OP_DELETE_OIDC_PROVIDER, + RGW_OP_GET_OIDC_PROVIDER, + RGW_OP_LIST_OIDC_PROVIDERS, +}; + diff --git a/src/rgw/rgw_opa.cc b/src/rgw/rgw_opa.cc new file mode 100644 index 000000000..7422615ae --- /dev/null +++ b/src/rgw/rgw_opa.cc @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_opa.h" +#include "rgw_http_client.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int rgw_opa_authorize(RGWOp *& op, + req_state * const s) +{ + + ldpp_dout(op, 2) << "authorizing request using OPA" << dendl; + + /* get OPA url */ + const string& opa_url = s->cct->_conf->rgw_opa_url; + if (opa_url == "") { + ldpp_dout(op, 2) << "OPA_URL not provided" << dendl; + return -ERR_INVALID_REQUEST; + } + ldpp_dout(op, 2) << "OPA URL= " << opa_url.c_str() << dendl; + + /* get authentication token for OPA */ + const string& opa_token = s->cct->_conf->rgw_opa_token; + + int ret; + bufferlist bl; + RGWHTTPTransceiver req(s->cct, "POST", opa_url.c_str(), &bl); + + /* set required headers for OPA request */ + req.append_header("X-Auth-Token", opa_token); + req.append_header("Content-Type", "application/json"); + req.append_header("Expect", "100-continue"); + + /* check if we want to verify OPA server SSL certificate */ + req.set_verify_ssl(s->cct->_conf->rgw_opa_verify_ssl); + + /* create json request body */ + JSONFormatter jf; + jf.open_object_section(""); + jf.open_object_section("input"); + const char *request_method = s->info.env->get("REQUEST_METHOD"); + if (request_method) { + jf.dump_string("method", request_method); + } + jf.dump_string("relative_uri", s->relative_uri.c_str()); + jf.dump_string("decoded_uri", s->decoded_uri.c_str()); + jf.dump_string("params", s->info.request_params.c_str()); + jf.dump_string("request_uri_aws4", s->info.request_uri_aws4.c_str()); + if (s->object) { + jf.dump_string("object_name", s->object->get_name().c_str()); + } + if (s->auth.identity) { + jf.dump_string("subuser", s->auth.identity->get_subuser().c_str()); + } + if (s->user) { + jf.dump_object("user_info", s->user->get_info()); + } + if (s->bucket) { + jf.dump_object("bucket_info", s->bucket->get_info()); + } + jf.close_section(); + jf.close_section(); + + 
std::stringstream ss; + jf.flush(ss); + req.set_post_data(ss.str()); + req.set_send_length(ss.str().length()); + + /* send request */ + ret = req.process(null_yield); + if (ret < 0) { + ldpp_dout(op, 2) << "OPA process error:" << bl.c_str() << dendl; + return ret; + } + + /* check OPA response */ + JSONParser parser; + if (!parser.parse(bl.c_str(), bl.length())) { + ldpp_dout(op, 2) << "OPA parse error: malformed json" << dendl; + return -EINVAL; + } + + /* start from "deny": the request is accepted only if the decoder stores + * true, so a reply without a usable "result" can never authorize */ + bool opa_result = false; + JSONDecoder::decode_json("result", opa_result, &parser); + + if (opa_result == false) { + ldpp_dout(op, 2) << "OPA rejecting request" << dendl; + return -EPERM; + } + + ldpp_dout(op, 2) << "OPA accepting request" << dendl; + return 0; +} diff --git a/src/rgw/rgw_opa.h b/src/rgw/rgw_opa.h new file mode 100644 index 000000000..6fd3b21bd --- /dev/null +++ b/src/rgw/rgw_opa.h @@ -0,0 +1,11 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" +#include "rgw_op.h" + +/* authorize request using OPA */ +int rgw_opa_authorize(RGWOp*& op, + req_state* s); diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/rgw_orphan.cc new file mode 100644 index 000000000..a8b4f5296 --- /dev/null +++ b/src/rgw/rgw_orphan.cc @@ -0,0 +1,1598 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + + +#include "common/config.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "rgw_op.h" +#include "rgw_multi.h" +#include "rgw_orphan.h" +#include "rgw_zone.h" +#include "rgw_bucket.h" +#include "rgw_sal_rados.h" + +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +#define DEFAULT_NUM_SHARDS 64 + +using namespace std; + +static string obj_fingerprint(const string& oid, const char *force_ns = NULL) +{ + ssize_t pos = oid.find('_'); + if (pos < 0) { + cerr << "ERROR: object does not have a bucket marker: " << oid << std::endl; + }
+ + string obj_marker = oid.substr(0, pos); + + rgw_obj_key key; + + rgw_obj_key::parse_raw_oid(oid.substr(pos + 1), &key); + + if (key.ns.empty()) { + return oid; + } + + string s = oid; + + if (force_ns) { + rgw_bucket b; + rgw_obj new_obj(b, key); + s = obj_marker + "_" + new_obj.get_oid(); + } + + /* cut out suffix: drop up to 10 trailing characters that are digits, '.' + * or '_'. An index counted down against s.size() - 10 wraps around for + * strings of 10 characters or fewer and can read out of bounds, so walk an + * end position down to an explicit stop point instead. Strings shorter + * than 10 characters are returned unchanged, which is what the wrapped + * comparison produced for them. */ + const size_t stop = (s.size() >= 10) ? s.size() - 10 : s.size(); + size_t end = s.size(); + while (end > stop) { + const char c = s[end - 1]; + /* the cast keeps isdigit() well defined for bytes with the high bit set */ + if (!isdigit(static_cast<unsigned char>(c)) && c != '.' && c != '_') { + break; + } + --end; + } + + return s.substr(0, end); +} + +int RGWOrphanStore::read_job(const string& job_name, RGWOrphanSearchState & state) +{ + set keys; + map vals; + keys.insert(job_name); + int r = ioctx.omap_get_vals_by_keys(oid, keys, &vals); + if (r < 0) { + return r; + } + + map::iterator iter = vals.find(job_name); + if (iter == vals.end()) { + return -ENOENT; + } + + try { + bufferlist& bl = iter->second; + decode(state, bl); + } catch (buffer::error& err) { + lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl; + return -EIO; + } + + return 0; +} + +int RGWOrphanStore::write_job(const string& job_name, const RGWOrphanSearchState& state) +{ + map vals; + bufferlist bl; + encode(state, bl); + vals[job_name] = bl; + int r = ioctx.omap_set(oid, vals); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWOrphanStore::remove_job(const string& job_name) +{ + set keys; + keys.insert(job_name); + + int r = ioctx.omap_rm_keys(oid, keys); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWOrphanStore::list_jobs(map & job_list) +{ + map vals; + int MAX_READ=1024; + string marker=""; + int r = 0; + + // loop through all the omap vals from index object, storing them to job_list, + // read in batches of 1024, we update the marker every iteration and exit the + // loop when we find that total size read out is less than batch size + do { + r = ioctx.omap_get_vals(oid, marker, MAX_READ, &vals); + if (r < 0) { + return r; + } + r = vals.size(); + + for (const auto &it : vals) { + marker=it.first; +
RGWOrphanSearchState state; + try { + bufferlist bl = it.second; + decode(state, bl); + } catch (buffer::error& err) { + lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl; + return -EIO; + } + job_list[it.first] = state; + } + } while (r == MAX_READ); + + return 0; +} + +int RGWOrphanStore::init(const DoutPrefixProvider *dpp) +{ + const rgw_pool& log_pool = static_cast(store)->svc()->zone->get_zone_params().log_pool; + int r = rgw_init_ioctx(dpp, static_cast(store)->getRados()->get_rados_handle(), log_pool, ioctx); + if (r < 0) { + cerr << "ERROR: failed to open log pool (" << log_pool << " ret=" << r << std::endl; + return r; + } + + return 0; +} + +/** + * Write `entries` as omap keys on the index object `oid`. + * Returns 0 on success, or the negative error code from the rados write. + */ +int RGWOrphanStore::store_entries(const DoutPrefixProvider *dpp, const string& oid, const map& entries) +{ + librados::ObjectWriteOperation op; + op.omap_set(entries); + cout << "storing " << entries.size() << " entries at " << oid << std::endl; + ldpp_dout(dpp, 20) << "storing " << entries.size() << " entries at " << oid << ": " << dendl; + for (map::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) { + ldpp_dout(dpp, 20) << " > " << iter->first << dendl; + } + int ret = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << dendl; + /* report the failed write instead of claiming success; + * RGWOrphanSearch::log_oids() checks for a negative return */ + return ret; + } + + return 0; +} + +int RGWOrphanStore::read_entries(const string& oid, const string& marker, map *entries, bool *truncated) +{ +#define MAX_OMAP_GET 100 + int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET, entries); + /* NOTE(review): errors other than -ENOENT are printed but 0 is still + * returned below; confirm against the callers whether the code should be + * propagated here as well. */ + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << cpp_strerror(-ret) << std::endl; + } + + *truncated = (entries->size() == MAX_OMAP_GET); + + return 0; +} + +int RGWOrphanSearch::init(const DoutPrefixProvider *dpp, const string& job_name, RGWOrphanSearchInfo *info, bool _detailed_mode) +{ + int r = orphan_store.init(dpp); + if (r < 0) { + return r; + } + + constexpr int64_t
MAX_LIST_OBJS_ENTRIES=100; + + max_list_bucket_entries = std::max(store->ctx()->_conf->rgw_list_bucket_min_readahead, + MAX_LIST_OBJS_ENTRIES); + + detailed_mode = _detailed_mode; + RGWOrphanSearchState state; + r = orphan_store.read_job(job_name, state); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: failed to read state ret=" << r << dendl; + return r; + } + + if (r == 0) { + search_info = state.info; + search_stage = state.stage; + } else if (info) { /* r == -ENOENT, initiate a new job if info was provided */ + search_info = *info; + search_info.job_name = job_name; + search_info.num_shards = (info->num_shards ? info->num_shards : DEFAULT_NUM_SHARDS); + search_info.start_time = ceph_clock_now(); + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_INIT); + + r = save_state(); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to write state ret=" << r << dendl; + return r; + } + } else { + ldpp_dout(dpp, -1) << "ERROR: job not found" << dendl; + return r; + } + + index_objs_prefix = RGW_ORPHAN_INDEX_PREFIX + string("."); + index_objs_prefix += job_name; + + for (int i = 0; i < search_info.num_shards; i++) { + char buf[128]; + + snprintf(buf, sizeof(buf), "%s.rados.%d", index_objs_prefix.c_str(), i); + all_objs_index[i] = buf; + + snprintf(buf, sizeof(buf), "%s.buckets.%d", index_objs_prefix.c_str(), i); + buckets_instance_index[i] = buf; + + snprintf(buf, sizeof(buf), "%s.linked.%d", index_objs_prefix.c_str(), i); + linked_objs_index[i] = buf; + } + return 0; +} + +int RGWOrphanSearch::log_oids(const DoutPrefixProvider *dpp, map& log_shards, map >& oids) +{ + map >::iterator miter = oids.begin(); + + list liters; /* a list of iterator pairs for begin and end */ + + for (; miter != oids.end(); ++miter) { + log_iter_info info; + info.oid = log_shards[miter->first]; + info.cur = miter->second.begin(); + info.end = miter->second.end(); + liters.push_back(info); + } + + list::iterator list_iter; + while (!liters.empty()) { + list_iter = 
liters.begin(); + + while (list_iter != liters.end()) { + log_iter_info& cur_info = *list_iter; + + list::iterator& cur = cur_info.cur; + list::iterator& end = cur_info.end; + + map entries; +#define MAX_OMAP_SET_ENTRIES 100 + for (int j = 0; cur != end && j != MAX_OMAP_SET_ENTRIES; ++cur, ++j) { + ldpp_dout(dpp, 20) << "adding obj: " << *cur << dendl; + entries[*cur] = bufferlist(); + } + + int ret = orphan_store.store_entries(dpp, cur_info.oid, entries); + if (ret < 0) { + return ret; + } + list::iterator tmp = list_iter; + ++list_iter; + if (cur == end) { + liters.erase(tmp); + } + } + } + return 0; +} + +int RGWOrphanSearch::build_all_oids_index(const DoutPrefixProvider *dpp) +{ + librados::IoCtx ioctx; + + int ret = rgw_init_ioctx(dpp, static_cast(store)->getRados()->get_rados_handle(), search_info.pool, ioctx); + if (ret < 0) { + ldpp_dout(dpp, -1) << __func__ << ": rgw_init_ioctx() returned ret=" << ret << dendl; + return ret; + } + + ioctx.set_namespace(librados::all_nspaces); + librados::NObjectIterator i = ioctx.nobjects_begin(); + librados::NObjectIterator i_end = ioctx.nobjects_end(); + + map > oids; + + int count = 0; + uint64_t total = 0; + + cout << "logging all objects in the pool" << std::endl; + + for (; i != i_end; ++i) { + string nspace = i->get_nspace(); + string oid = i->get_oid(); + string locator = i->get_locator(); + + ssize_t pos = oid.find('_'); + if (pos < 0) { + cout << "unidentified oid: " << oid << ", skipping" << std::endl; + /* what is this object, oids should be in the format of _, + * skip this entry + */ + continue; + } + string stripped_oid = oid.substr(pos + 1); + rgw_obj_key key; + if (!rgw_obj_key::parse_raw_oid(stripped_oid, &key)) { + cout << "cannot parse oid: " << oid << ", skipping" << std::endl; + continue; + } + + if (key.ns.empty()) { + /* skipping head objects, we don't want to remove these as they are mutable and + * cleaning them up is racy (can race with object removal and a later recreation) + */ + cout << 
"skipping head object: oid=" << oid << std::endl; + continue; + } + + string oid_fp = obj_fingerprint(oid); + + ldout(store->ctx(), 20) << "oid_fp=" << oid_fp << dendl; + + int shard = orphan_shard(oid_fp); + oids[shard].push_back(oid); + +#define COUNT_BEFORE_FLUSH 1000 + ++total; + if (++count >= COUNT_BEFORE_FLUSH) { + ldout(store->ctx(), 1) << "iterated through " << total << " objects" << dendl; + ret = log_oids(dpp, all_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + count = 0; + oids.clear(); + } + } + ret = log_oids(dpp, all_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + + return 0; +} + +int RGWOrphanSearch::build_buckets_instance_index(const DoutPrefixProvider *dpp) +{ + void *handle; + int max = 1000; + string section = "bucket.instance"; + int ret = store->meta_list_keys_init(dpp, section, string(), &handle); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl; + return ret; + } + + map > instances; + + bool truncated; + + RGWObjectCtx obj_ctx(store); + + int count = 0; + uint64_t total = 0; + + do { + list keys; + ret = store->meta_list_keys_next(dpp, handle, max, keys, &truncated); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + ++total; + ldpp_dout(dpp, 10) << "bucket_instance=" << *iter << " total=" << total << dendl; + int shard = orphan_shard(*iter); + instances[shard].push_back(*iter); + + if (++count >= COUNT_BEFORE_FLUSH) { + ret = log_oids(dpp, buckets_instance_index, instances); + if (ret < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + count = 0; + instances.clear(); + } + } + + } while (truncated); + + 
store->meta_list_keys_complete(handle); + + ret = log_oids(dpp, buckets_instance_index, instances); + if (ret < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWOrphanSearch::handle_stat_result(const DoutPrefixProvider *dpp, map >& oids, RGWRados::Object::Stat::Result& result) +{ + set obj_oids; + rgw_bucket& bucket = result.obj.bucket; + if (!result.manifest) { /* a very very old object, or part of a multipart upload during upload */ + const string loc = bucket.bucket_id + "_" + result.obj.get_oid(); + obj_oids.insert(obj_fingerprint(loc)); + + /* + * multipart parts don't have manifest on them, it's in the meta object. Instead of reading the + * meta object, just add a "shadow" object to the mix + */ + obj_oids.insert(obj_fingerprint(loc, "shadow")); + } else { + RGWObjManifest& manifest = *result.manifest; + + if (!detailed_mode && + manifest.get_obj_size() <= manifest.get_head_size()) { + ldpp_dout(dpp, 5) << "skipping object as it fits in a head" << dendl; + return 0; + } + + RGWObjManifest::obj_iterator miter; + for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) { + const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store->getRados()); + string s = loc.oid; + obj_oids.insert(obj_fingerprint(s)); + } + } + + for (set::iterator iter = obj_oids.begin(); iter != obj_oids.end(); ++iter) { + ldpp_dout(dpp, 20) << __func__ << ": oid for obj=" << result.obj << ": " << *iter << dendl; + + int shard = orphan_shard(*iter); + oids[shard].push_back(*iter); + } + + return 0; +} + +int RGWOrphanSearch::pop_and_handle_stat_op(const DoutPrefixProvider *dpp, map >& oids, std::deque& ops) +{ + RGWRados::Object::Stat& front_op = ops.front(); + + int ret = front_op.wait(dpp); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + } + goto done; + } + ret = 
handle_stat_result(dpp, oids, front_op.result); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: handle_stat_response() returned error: " << cpp_strerror(-ret) << dendl; + } +done: + ops.pop_front(); + return ret; +} + +int RGWOrphanSearch::build_linked_oids_for_bucket(const DoutPrefixProvider *dpp, const string& bucket_instance_id, map >& oids) +{ + RGWObjectCtx obj_ctx(store); + rgw_bucket orphan_bucket; + int shard_id; + int ret = rgw_bucket_parse_bucket_key(store->ctx(), bucket_instance_id, + &orphan_bucket, &shard_id); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << " failed to parse bucket instance: " + << bucket_instance_id << " skipping" << dendl; + return ret; + } + + std::unique_ptr cur_bucket; + ret = store->get_bucket(dpp, nullptr, orphan_bucket, &cur_bucket, null_yield); + if (ret < 0) { + if (ret == -ENOENT) { + /* probably raced with bucket removal */ + return 0; + } + ldpp_dout(dpp, -1) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl; + return ret; + } + + if (cur_bucket->get_bucket_id() != orphan_bucket.bucket_id) { + ldpp_dout(dpp, 0) << __func__ << ": Skipping stale bucket instance: " + << orphan_bucket.name << ": " + << orphan_bucket.bucket_id << dendl; + return 0; + } + + if (cur_bucket->get_info().layout.resharding != rgw::BucketReshardState::None) { + ldpp_dout(dpp, 0) << __func__ << ": reshard in progress. 
Skipping " + << orphan_bucket.name << ": " + << orphan_bucket.bucket_id << dendl; + return 0; + } + + rgw_bucket b; + rgw_bucket_parse_bucket_key(store->ctx(), bucket_instance_id, &b, nullptr); + std::unique_ptr bucket; + ret = store->get_bucket(dpp, nullptr, b, &bucket, null_yield); + if (ret < 0) { + if (ret == -ENOENT) { + /* probably raced with bucket removal */ + return 0; + } + ldpp_dout(dpp, -1) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl; + return ret; + } + + ldpp_dout(dpp, 10) << "building linked oids for bucket instance: " << bucket_instance_id << dendl; + RGWRados::Bucket target(store->getRados(), cur_bucket->get_info()); + RGWRados::Bucket::List list_op(&target); + + string marker; + list_op.params.marker = rgw_obj_key(marker); + list_op.params.list_versions = true; + list_op.params.enforce_ns = false; + + bool truncated; + + deque stat_ops; + + do { + vector result; + + ret = list_op.list_objects(dpp, max_list_bucket_entries, + &result, nullptr, &truncated, null_yield); + if (ret < 0) { + cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl; + return ret; + } + + for (vector::iterator iter = result.begin(); iter != result.end(); ++iter) { + rgw_bucket_dir_entry& entry = *iter; + if (entry.key.instance.empty()) { + ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << dendl; + } else { + ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << " [" << entry.key.instance << "]" << dendl; + } + + ldpp_dout(dpp, 20) << __func__ << ": entry.key.name=" << entry.key.name << " entry.key.instance=" << entry.key.instance << dendl; + + if (!detailed_mode && + entry.meta.accounted_size <= (uint64_t)store->ctx()->_conf->rgw_max_chunk_size) { + ldpp_dout(dpp, 5) << __func__ << "skipping stat as the object " << entry.key.name + << "fits in a head" << dendl; + continue; + } + + rgw_obj obj(cur_bucket->get_key(), entry.key); + + RGWRados::Object op_target(store->getRados(), 
cur_bucket->get_info(), obj_ctx, obj); + + stat_ops.push_back(RGWRados::Object::Stat(&op_target)); + RGWRados::Object::Stat& op = stat_ops.back(); + + ret = op.stat_async(dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + if (stat_ops.size() >= max_concurrent_ios) { + ret = pop_and_handle_stat_op(dpp, oids, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + } + } + } + if (oids.size() >= COUNT_BEFORE_FLUSH) { + ret = log_oids(dpp, linked_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + oids.clear(); + } + } + } while (truncated); + + while (!stat_ops.empty()) { + ret = pop_and_handle_stat_op(dpp, oids, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + } + } + } + + return 0; +} + +int RGWOrphanSearch::build_linked_oids_index(const DoutPrefixProvider *dpp) +{ + map > oids; + map::iterator iter = buckets_instance_index.find(search_stage.shard); + for (; iter != buckets_instance_index.end(); ++iter) { + ldpp_dout(dpp, 0) << "building linked oids index: " << iter->first << "/" << buckets_instance_index.size() << dendl; + bool truncated; + + string oid = iter->second; + + do { + map entries; + int ret = orphan_store.read_entries(oid, search_stage.marker, &entries, &truncated); + if (ret == -ENOENT) { + truncated = false; + ret = 0; + } + + if (ret < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: read_entries() oid=" << oid << " returned ret=" << ret << dendl; + return ret; + } + + if (entries.empty()) { + break; + } + + for (map::iterator eiter = entries.begin(); eiter != entries.end(); ++eiter) { + ldpp_dout(dpp, 20) << " indexed entry: " << eiter->first << dendl; + ret = 
build_linked_oids_for_bucket(dpp, eiter->first, oids); + if (ret < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_linked_oids_for_bucket() indexed entry=" << eiter->first + << " returned ret=" << ret << dendl; + return ret; + } + } + + search_stage.shard = iter->first; + search_stage.marker = entries.rbegin()->first; /* last entry */ + } while (truncated); + + search_stage.marker.clear(); + } + + int ret = log_oids(dpp, linked_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + + ret = save_state(); + if (ret < 0) { + cerr << __func__ << ": ERROR: failed to write state ret=" << ret << std::endl; + return ret; + } + + return 0; +} + +class OMAPReader { + librados::IoCtx ioctx; + string oid; + + map entries; + map::iterator iter; + string marker; + bool truncated; + +public: + OMAPReader(librados::IoCtx& _ioctx, const string& _oid) : ioctx(_ioctx), oid(_oid), truncated(true) { + iter = entries.end(); + } + + int get_next(string *key, bufferlist *pbl, bool *done); +}; + +int OMAPReader::get_next(string *key, bufferlist *pbl, bool *done) +{ + if (iter != entries.end()) { + *key = iter->first; + if (pbl) { + *pbl = iter->second; + } + ++iter; + *done = false; + marker = *key; + return 0; + } + + if (!truncated) { + *done = true; + return 0; + } + +#define MAX_OMAP_GET_ENTRIES 100 + int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET_ENTRIES, &entries); + if (ret < 0) { + if (ret == -ENOENT) { + *done = true; + return 0; + } + return ret; + } + + truncated = (entries.size() == MAX_OMAP_GET_ENTRIES); + iter = entries.begin(); + return get_next(key, pbl, done); +} + +int RGWOrphanSearch::compare_oid_indexes(const DoutPrefixProvider *dpp) +{ + ceph_assert(linked_objs_index.size() == all_objs_index.size()); + + librados::IoCtx& ioctx = orphan_store.get_ioctx(); + + librados::IoCtx data_ioctx; + + int ret = rgw_init_ioctx(dpp, static_cast(store)->getRados()->get_rados_handle(), 
search_info.pool, data_ioctx); + if (ret < 0) { + ldpp_dout(dpp, -1) << __func__ << ": rgw_init_ioctx() returned ret=" << ret << dendl; + return ret; + } + + uint64_t time_threshold = search_info.start_time.sec() - stale_secs; + + map::iterator liter = linked_objs_index.begin(); + map::iterator aiter = all_objs_index.begin(); + + for (; liter != linked_objs_index.end(); ++liter, ++aiter) { + OMAPReader linked_entries(ioctx, liter->second); + OMAPReader all_entries(ioctx, aiter->second); + + bool done; + + string cur_linked; + bool linked_done = false; + + + do { + string key; + int r = all_entries.get_next(&key, NULL, &done); + if (r < 0) { + return r; + } + if (done) { + break; + } + + string key_fp = obj_fingerprint(key); + + while (cur_linked < key_fp && !linked_done) { + r = linked_entries.get_next(&cur_linked, NULL, &linked_done); + if (r < 0) { + return r; + } + } + + if (cur_linked == key_fp) { + ldpp_dout(dpp, 20) << "linked: " << key << dendl; + continue; + } + + time_t mtime; + r = data_ioctx.stat(key, NULL, &mtime); + if (r < 0) { + if (r != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: ioctx.stat(" << key << ") returned ret=" << r << dendl; + } + continue; + } + if (stale_secs && (uint64_t)mtime >= time_threshold) { + ldpp_dout(dpp, 20) << "skipping: " << key << " (mtime=" << mtime << " threshold=" << time_threshold << ")" << dendl; + continue; + } + ldpp_dout(dpp, 20) << "leaked: " << key << dendl; + cout << "leaked: " << key << std::endl; + } while (!done); + } + + return 0; +} + +int RGWOrphanSearch::run(const DoutPrefixProvider *dpp) +{ + int r; + + switch (search_stage.stage) { + + case ORPHAN_SEARCH_STAGE_INIT: + ldpp_dout(dpp, 0) << __func__ << "(): initializing state" << dendl; + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSPOOL); + r = save_state(); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + case ORPHAN_SEARCH_STAGE_LSPOOL: + 
ldpp_dout(dpp, 0) << __func__ << "(): building index of all objects in pool" << dendl; + r = build_all_oids_index(dpp); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSBUCKETS); + r = save_state(); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + + case ORPHAN_SEARCH_STAGE_LSBUCKETS: + ldpp_dout(dpp, 0) << __func__ << "(): building index of all bucket indexes" << dendl; + r = build_buckets_instance_index(dpp); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_ITERATE_BI); + r = save_state(); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + + + case ORPHAN_SEARCH_STAGE_ITERATE_BI: + ldpp_dout(dpp, 0) << __func__ << "(): building index of all linked objects" << dendl; + r = build_linked_oids_index(dpp); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_COMPARE); + r = save_state(); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + + case ORPHAN_SEARCH_STAGE_COMPARE: + r = compare_oid_indexes(dpp); + if (r < 0) { + ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + break; + + default: + ceph_abort(); + }; + + return 0; +} + + +int RGWOrphanSearch::remove_index(map& index) +{ + librados::IoCtx& ioctx = orphan_store.get_ioctx(); + + for (map::iterator iter = index.begin(); iter != index.end(); ++iter) { + int r = 
ioctx.remove(iter->second); + if (r < 0) { + if (r != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: couldn't remove " << iter->second << ": ret=" << r << dendl; + } + } + } + return 0; +} + +int RGWOrphanSearch::finish() +{ + int r = remove_index(all_objs_index); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: remove_index(" << all_objs_index << ") returned ret=" << r << dendl; + } + r = remove_index(buckets_instance_index); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: remove_index(" << buckets_instance_index << ") returned ret=" << r << dendl; + } + r = remove_index(linked_objs_index); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: remove_index(" << linked_objs_index << ") returned ret=" << r << dendl; + } + + r = orphan_store.remove_job(search_info.job_name); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: could not remove job name (" << search_info.job_name << ") ret=" << r << dendl; + } + + return r; +} + + +int RGWRadosList::handle_stat_result(const DoutPrefixProvider *dpp, + RGWRados::Object::Stat::Result& result, + std::string& bucket_name, + rgw_obj_key& obj_key, + std::set& obj_oids) +{ + obj_oids.clear(); + + rgw_bucket& bucket = result.obj.bucket; + + ldpp_dout(dpp, 20) << "RGWRadosList::" << __func__ << + " bucket=" << bucket << + ", has_manifest=" << result.manifest.has_value() << + dendl; + + // iterator to store result of dlo/slo attribute find + decltype(result.attrs)::iterator attr_it = result.attrs.end(); + const std::string oid = bucket.marker + "_" + result.obj.get_oid(); + ldpp_dout(dpp, 20) << "radoslist processing object=\"" << + oid << "\"" << dendl; + if (visited_oids.find(oid) != visited_oids.end()) { + // apparently we hit a loop; don't continue with this oid + ldpp_dout(dpp, 15) << + "radoslist stopped loop at already visited object=\"" << + oid << "\"" << dendl; + return 0; + } + + bucket_name = bucket.name; + obj_key = result.obj.key; + + if (!result.manifest) { + /* a very very old object, or part of a multipart 
upload during upload */ + obj_oids.insert(oid); + + /* + * multipart parts don't have manifest on them, it's in the meta + * object; we'll process them in + * RGWRadosList::do_incomplete_multipart + */ + } else if ((attr_it = result.attrs.find(RGW_ATTR_USER_MANIFEST)) != + result.attrs.end()) { + // *** handle DLO object *** + + obj_oids.insert(oid); + visited_oids.insert(oid); // prevent dlo loops + ldpp_dout(dpp, 15) << "radoslist added to visited list DLO=\"" << + oid << "\"" << dendl; + + char* prefix_path_c = attr_it->second.c_str(); + const std::string& prefix_path = prefix_path_c; + + const size_t sep_pos = prefix_path.find('/'); + if (string::npos == sep_pos) { + return -EINVAL; + } + + const std::string bucket_name = prefix_path.substr(0, sep_pos); + const std::string prefix = prefix_path.substr(sep_pos + 1); + + add_bucket_prefix(bucket_name, prefix); + ldpp_dout(dpp, 25) << "radoslist DLO oid=\"" << oid << + "\" added bucket=\"" << bucket_name << "\" prefix=\"" << + prefix << "\" to process list" << dendl; + } else if ((attr_it = result.attrs.find(RGW_ATTR_USER_MANIFEST)) != + result.attrs.end()) { + // *** handle SLO object *** + + obj_oids.insert(oid); + visited_oids.insert(oid); // prevent slo loops + ldpp_dout(dpp, 15) << "radoslist added to visited list SLO=\"" << + oid << "\"" << dendl; + + RGWSLOInfo slo_info; + bufferlist::const_iterator bliter = attr_it->second.begin(); + try { + ::decode(slo_info, bliter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << + "ERROR: failed to decode slo manifest for " << oid << dendl; + return -EIO; + } + + for (const auto& iter : slo_info.entries) { + const string& path_str = iter.path; + + const size_t sep_pos = path_str.find('/', 1 /* skip initial slash */); + if (string::npos == sep_pos) { + return -EINVAL; + } + + std::string bucket_name; + std::string obj_name; + + bucket_name = url_decode(path_str.substr(1, sep_pos - 1)); + obj_name = url_decode(path_str.substr(sep_pos + 1)); + + const rgw_obj_key 
obj_key(obj_name); + add_bucket_filter(bucket_name, obj_key); + ldpp_dout(dpp, 25) << "radoslist SLO oid=\"" << oid << + "\" added bucket=\"" << bucket_name << "\" obj_key=\"" << + obj_key << "\" to process list" << dendl; + } + } else { + RGWObjManifest& manifest = *result.manifest; + + // in multipart, the head object contains no data and just has the + // manifest AND empty objects have no manifest, but they're + // realized as empty rados objects + if (0 == manifest.get_max_head_size() || + manifest.obj_begin(dpp) == manifest.obj_end(dpp)) { + obj_oids.insert(oid); + // first_insert = true; + } + + RGWObjManifest::obj_iterator miter; + for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) { + const rgw_raw_obj& loc = + miter.get_location().get_raw_obj(store->getRados()); + string s = loc.oid; + obj_oids.insert(s); + } + } + + return 0; +} // RGWRadosList::handle_stat_result + +int RGWRadosList::pop_and_handle_stat_op( + const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + std::deque& ops) +{ + std::string bucket_name; + rgw_obj_key obj_key; + std::set obj_oids; + RGWRados::Object::Stat& front_op = ops.front(); + + int ret = front_op.wait(dpp); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << + cpp_strerror(-ret) << dendl; + } + goto done; + } + + ret = handle_stat_result(dpp, front_op.result, bucket_name, obj_key, obj_oids); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: handle_stat_result() returned error: " << + cpp_strerror(-ret) << dendl; + } + + // output results + for (const auto& o : obj_oids) { + if (include_rgw_obj_name) { + std::cout << o << + field_separator << bucket_name << + field_separator << obj_key << + std::endl; + } else { + std::cout << o << std::endl; + } + } + +done: + + // invalidate object context for this object to avoid memory leak + // (see pr https://github.com/ceph/ceph/pull/30174) + obj_ctx.invalidate(front_op.result.obj); + + ops.pop_front(); 
+ return ret; +} + + +#if 0 // code that may be the basis for expansion +int RGWRadosList::build_buckets_instance_index() +{ + void *handle; + int max = 1000; + string section = "bucket.instance"; + int ret = store->meta_mgr->list_keys_init(section, &handle); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl; + return ret; + } + + map > instances; + + bool truncated; + + RGWObjectCtx obj_ctx(store); + + int count = 0; + uint64_t total = 0; + + do { + list keys; + ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + ++total; + ldout(store->ctx(), 10) << "bucket_instance=" << *iter << " total=" << total << dendl; + int shard = orphan_shard(*iter); + instances[shard].push_back(*iter); + + if (++count >= COUNT_BEFORE_FLUSH) { + ret = log_oids(buckets_instance_index, instances); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + count = 0; + instances.clear(); + } + } + } while (truncated); + + ret = log_oids(buckets_instance_index, instances); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + store->meta_mgr->list_keys_complete(handle); + + return 0; +} +#endif + + +int RGWRadosList::process_bucket( + const DoutPrefixProvider *dpp, + const std::string& bucket_instance_id, + const std::string& prefix, + const std::set& entries_filter) +{ + ldpp_dout(dpp, 10) << "RGWRadosList::" << __func__ << + " bucket_instance_id=" << bucket_instance_id << + ", prefix=" << prefix << + ", entries_filter.size=" << entries_filter.size() << dendl; + + RGWBucketInfo bucket_info; + int ret = store->getRados()->get_bucket_instance_info(bucket_instance_id, + bucket_info, + nullptr, + 
nullptr, + null_yield, + dpp); + if (ret < 0) { + if (ret == -ENOENT) { + // probably raced with bucket removal + return 0; + } + ldpp_dout(dpp, -1) << __func__ << + ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << + ret << dendl; + return ret; + } + + RGWRados::Bucket target(store->getRados(), bucket_info); + RGWRados::Bucket::List list_op(&target); + + std::string marker; + list_op.params.marker = rgw_obj_key(marker); + list_op.params.list_versions = true; + list_op.params.enforce_ns = false; + list_op.params.allow_unordered = false; + list_op.params.prefix = prefix; + + bool truncated; + + std::deque stat_ops; + std::string prev_versioned_key_name = ""; + + RGWObjectCtx obj_ctx(store); + + do { + std::vector result; + constexpr int64_t LIST_OBJS_MAX_ENTRIES = 100; + ret = list_op.list_objects(dpp, LIST_OBJS_MAX_ENTRIES, &result, + NULL, &truncated, null_yield); + if (ret == -ENOENT) { + // race with bucket delete? + ret = 0; + break; + } else if (ret < 0) { + std::cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << + std::endl; + return ret; + } + + for (std::vector::iterator iter = result.begin(); + iter != result.end(); + ++iter) { + rgw_bucket_dir_entry& entry = *iter; + + if (entry.key.instance.empty()) { + ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << dendl; + } else { + ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << + " [" << entry.key.instance << "]" << dendl; + } + + ldpp_dout(dpp, 20) << __func__ << ": entry.key.name=" << + entry.key.name << " entry.key.instance=" << entry.key.instance << + dendl; + + // ignore entries that are not in the filter if there is a filter + if (!entries_filter.empty() && + entries_filter.find(entry.key) == entries_filter.cend()) { + continue; + } + + std::unique_ptr bucket; + store->get_bucket(nullptr, bucket_info, &bucket); + // we need to do this in two cases below, so use a lambda + auto do_stat_key = + [&](const rgw_obj_key& key) -> int { + int ret; + + rgw_obj 
obj(bucket_info.bucket, key); + RGWRados::Object op_target(store->getRados(), bucket_info, + obj_ctx, obj); + + stat_ops.push_back(RGWRados::Object::Stat(&op_target)); + RGWRados::Object::Stat& op = stat_ops.back(); + + ret = op.stat_async(dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << + cpp_strerror(-ret) << dendl; + return ret; + } + + if (stat_ops.size() >= max_concurrent_ios) { + ret = pop_and_handle_stat_op(dpp, obj_ctx, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << + "ERROR: pop_and_handle_stat_op() returned error: " << + cpp_strerror(-ret) << dendl; + } + + // clear error, so we'll continue processing directory + ret = 0; + } + } + + return ret; + }; // do_stat_key lambda + + // for versioned objects, make sure the head object is handled + // as well by ignoring the instance identifier + if (!entry.key.instance.empty() && + entry.key.name != prev_versioned_key_name) { + // don't do the same key twice; even though out bucket index + // listing allows unordered, since all versions of an object + // use the same bucket index key, they'll all end up together + // and sorted + prev_versioned_key_name = entry.key.name; + + rgw_obj_key uninstanced(entry.key.name); + + ret = do_stat_key(uninstanced); + if (ret < 0) { + return ret; + } + } + + ret = do_stat_key(entry.key); + if (ret < 0) { + return ret; + } + } // for iter loop + } while (truncated); + + while (!stat_ops.empty()) { + ret = pop_and_handle_stat_op(dpp, obj_ctx, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << + cpp_strerror(-ret) << dendl; + } + } + } + + return 0; +} + + +int RGWRadosList::run(const DoutPrefixProvider *dpp, + const bool yes_i_really_mean_it) +{ + int ret; + void* handle = nullptr; + + ret = store->meta_list_keys_init(dpp, "bucket", string(), &handle); + if (ret < 0) { + ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ << + " ERROR: 
list_keys_init returned " << + cpp_strerror(-ret) << dendl; + return ret; + } + + constexpr int max_keys = 1000; + bool truncated = true; + bool warned_indexless = false; + + do { + std::list buckets; + ret = store->meta_list_keys_next(dpp, handle, max_keys, buckets, &truncated); + + for (std::string& bucket_id : buckets) { + ret = run(dpp, bucket_id, true); + if (ret == -ENOENT) { + continue; + } else if (ret == -EINVAL) { + if (! warned_indexless) { + if (yes_i_really_mean_it) { + std::cerr << + "WARNING: because there is at least one indexless bucket (" << + bucket_id << + ") the results of radoslist are *incomplete*; continuing due to --yes-i-really-mean-it" << + std::endl; + warned_indexless = true; + } else { + std::cerr << "ERROR: because there is at least one indexless bucket (" << + bucket_id << + ") the results of radoslist are *incomplete*; use --yes-i-really-mean-it to bypass error" << + std::endl; + return ret; + } + } + continue; + } else if (ret < 0) { + return ret; + } + } + } while (truncated); + + return 0; +} // RGWRadosList::run(DoutPrefixProvider, bool) + + +int RGWRadosList::run(const DoutPrefixProvider *dpp, + const std::string& start_bucket_name, + const bool silent_indexless) +{ + int ret; + + add_bucket_entire(start_bucket_name); + + while (! bucket_process_map.empty()) { + // pop item from map and capture its key data + auto front = bucket_process_map.begin(); + std::string bucket_name = front->first; + process_t process; + std::swap(process, front->second); + bucket_process_map.erase(front); + + std::unique_ptr bucket; + ret = store->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield); + if (ret == -ENOENT) { + std::cerr << "WARNING: bucket " << bucket_name << + " does not exist; could it have been deleted very recently?" 
<< + std::endl; + continue; + } else if (ret < 0) { + std::cerr << "ERROR: could not get info for bucket " << bucket_name << + " -- " << cpp_strerror(-ret) << std::endl; + return ret; + } else if (bucket->get_info().is_indexless()) { + if (! silent_indexless) { + std::cerr << "ERROR: unable to run radoslist on indexless bucket " << + bucket_name << std::endl; + } + return -EINVAL; + } + + const std::string bucket_id = bucket->get_key().get_key(); + + static const std::set empty_filter; + static const std::string empty_prefix; + + auto do_process_bucket = + [dpp, &bucket_id, this] + (const std::string& prefix, + const std::set& entries_filter) -> int { + int ret = process_bucket(dpp, bucket_id, prefix, entries_filter); + if (ret == -ENOENT) { + // bucket deletion race? + return 0; + } if (ret < 0) { + ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ << + ": ERROR: process_bucket(); bucket_id=" << + bucket_id << " returned ret=" << ret << dendl; + } + + return ret; + }; + + // either process the whole bucket *or* process the filters and/or + // the prefixes + if (process.entire_container) { + ret = do_process_bucket(empty_prefix, empty_filter); + if (ret < 0) { + return ret; + } + } else { + if (! process.filter_keys.empty()) { + ret = do_process_bucket(empty_prefix, process.filter_keys); + if (ret < 0) { + return ret; + } + } + for (const auto& p : process.prefixes) { + ret = do_process_bucket(p, empty_filter); + if (ret < 0) { + return ret; + } + } + } + } // while (! bucket_process_map.empty()) + + if (include_rgw_obj_name) { + return 0; + } + + // now handle incomplete multipart uploads by going back to the + // initial bucket + + std::unique_ptr bucket; + ret = store->get_bucket(dpp, nullptr, tenant_name, start_bucket_name, &bucket, null_yield); + if (ret == -ENOENT) { + // bucket deletion race? 
+ return 0; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ << + ": ERROR: get_bucket_info returned ret=" << ret << dendl; + return ret; + } + + ret = do_incomplete_multipart(dpp, bucket.get()); + if (ret < 0) { + ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ << + ": ERROR: do_incomplete_multipart returned ret=" << ret << dendl; + return ret; + } + + return 0; +} // RGWRadosList::run(DoutPrefixProvider, string, bool) + + +int RGWRadosList::do_incomplete_multipart(const DoutPrefixProvider *dpp, + rgw::sal::Bucket* bucket) +{ + constexpr int max_uploads = 1000; + constexpr int max_parts = 1000; + std::string marker; + vector> uploads; + bool is_truncated; + int ret; + + // use empty strings for params.{prefix,delim} + + do { + ret = bucket->list_multiparts(dpp, string(), marker, string(), max_uploads, uploads, nullptr, &is_truncated); + if (ret == -ENOENT) { + // could bucket have been removed while this is running? + ldpp_dout(dpp, 5) << "RGWRadosList::" << __func__ << + ": WARNING: call to list_objects of multipart namespace got ENOENT; " + "assuming bucket removal race" << dendl; + break; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ << + ": ERROR: list_objects op returned ret=" << ret << dendl; + return ret; + } + + if (!uploads.empty()) { + // now process the uploads vector + for (const auto& upload : uploads) { + int parts_marker = 0; + bool is_parts_truncated = false; + + do { // while (is_parts_truncated); + ret = upload->list_parts(dpp, store->ctx(), max_parts, parts_marker, + &parts_marker, &is_parts_truncated); + if (ret == -ENOENT) { + ldpp_dout(dpp, 5) << "RGWRadosList::" << __func__ << + ": WARNING: list_multipart_parts returned ret=-ENOENT " + "for " << upload->get_upload_id() << ", moving on" << dendl; + break; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ << + ": ERROR: list_multipart_parts returned ret=" << ret << + dendl; + return ret; + } + + for (auto& 
p : upload->get_parts()) { + rgw::sal::RadosMultipartPart* part = + dynamic_cast(p.second.get()); + RGWObjManifest& manifest = part->get_manifest(); + for (auto obj_it = manifest.obj_begin(dpp); + obj_it != manifest.obj_end(dpp); + ++obj_it) { + const rgw_raw_obj& loc = + obj_it.get_location().get_raw_obj(store->getRados()); + std::cout << loc.oid << std::endl; + } // for (auto obj_it + } // for (auto& p + } while (is_parts_truncated); + } // for (const auto& upload + } // if objs not empty + } while (is_truncated); + + return 0; +} // RGWRadosList::do_incomplete_multipart + +void RGWOrphanSearchStage::dump(Formatter *f) const +{ + f->open_object_section("orphan_search_stage"); + string s; + switch(stage){ + case ORPHAN_SEARCH_STAGE_INIT: + s = "init"; + break; + case ORPHAN_SEARCH_STAGE_LSPOOL: + s = "lspool"; + break; + case ORPHAN_SEARCH_STAGE_LSBUCKETS: + s = "lsbuckets"; + break; + case ORPHAN_SEARCH_STAGE_ITERATE_BI: + s = "iterate_bucket_index"; + break; + case ORPHAN_SEARCH_STAGE_COMPARE: + s = "comparing"; + break; + default: + s = "unknown"; + } + f->dump_string("search_stage", s); + f->dump_int("shard",shard); + f->dump_string("marker",marker); + f->close_section(); +} + +void RGWOrphanSearchInfo::dump(Formatter *f) const +{ + f->open_object_section("orphan_search_info"); + f->dump_string("job_name", job_name); + encode_json("pool", pool, f); + f->dump_int("num_shards", num_shards); + encode_json("start_time", start_time, f); + f->close_section(); +} + +void RGWOrphanSearchState::dump(Formatter *f) const +{ + f->open_object_section("orphan_search_state"); + encode_json("info", info, f); + encode_json("stage", stage, f); + f->close_section(); +} + + diff --git a/src/rgw/rgw_orphan.h b/src/rgw/rgw_orphan.h new file mode 100644 index 000000000..db811d31d --- /dev/null +++ b/src/rgw/rgw_orphan.h @@ -0,0 +1,304 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed 
file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "common/config.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "rgw_sal_rados.h" + +#define RGW_ORPHAN_INDEX_OID "orphan.index" +#define RGW_ORPHAN_INDEX_PREFIX "orphan.scan" + + +enum RGWOrphanSearchStageId { + ORPHAN_SEARCH_STAGE_UNKNOWN = 0, + ORPHAN_SEARCH_STAGE_INIT = 1, + ORPHAN_SEARCH_STAGE_LSPOOL = 2, + ORPHAN_SEARCH_STAGE_LSBUCKETS = 3, + ORPHAN_SEARCH_STAGE_ITERATE_BI = 4, + ORPHAN_SEARCH_STAGE_COMPARE = 5, +}; + + +struct RGWOrphanSearchStage { + RGWOrphanSearchStageId stage; + int shard; + std::string marker; + + RGWOrphanSearchStage() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN), shard(0) {} + explicit RGWOrphanSearchStage(RGWOrphanSearchStageId _stage) : stage(_stage), shard(0) {} + RGWOrphanSearchStage(RGWOrphanSearchStageId _stage, int _shard, const std::string& _marker) : stage(_stage), shard(_shard), marker(_marker) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode((int)stage, bl); + encode(shard, bl); + encode(marker, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + int s; + decode(s, bl); + stage = (RGWOrphanSearchStageId)s; + decode(shard, bl); + decode(marker, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOrphanSearchStage) + +struct RGWOrphanSearchInfo { + std::string job_name; + rgw_pool pool; + uint16_t num_shards; + utime_t start_time; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(job_name, bl); + encode(pool.to_str(), bl); + encode(num_shards, bl); + encode(start_time, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, 
bl); + decode(job_name, bl); + std::string s; + decode(s, bl); + pool.from_str(s); + decode(num_shards, bl); + decode(start_time, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOrphanSearchInfo) + +struct RGWOrphanSearchState { + RGWOrphanSearchInfo info; + RGWOrphanSearchStage stage; + + RGWOrphanSearchState() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(info, bl); + encode(stage, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(info, bl); + decode(stage, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOrphanSearchState) + +class RGWOrphanStore { + rgw::sal::RadosStore* store; + librados::IoCtx ioctx; + + std::string oid; + +public: + explicit RGWOrphanStore(rgw::sal::RadosStore* _store) : store(_store), oid(RGW_ORPHAN_INDEX_OID) {} + + librados::IoCtx& get_ioctx() { return ioctx; } + + int init(const DoutPrefixProvider *dpp); + + int read_job(const std::string& job_name, RGWOrphanSearchState& state); + int write_job(const std::string& job_name, const RGWOrphanSearchState& state); + int remove_job(const std::string& job_name); + int list_jobs(std::map &job_list); + + + int store_entries(const DoutPrefixProvider *dpp, const std::string& oid, const std::map& entries); + int read_entries(const std::string& oid, const std::string& marker, std::map *entries, bool *truncated); +}; + + +class RGWOrphanSearch { + rgw::sal::RadosStore* store; + + RGWOrphanStore orphan_store; + + RGWOrphanSearchInfo search_info; + RGWOrphanSearchStage search_stage; + + std::map all_objs_index; + std::map buckets_instance_index; + std::map linked_objs_index; + + std::string index_objs_prefix; + + uint16_t max_concurrent_ios; + uint64_t stale_secs; + int64_t max_list_bucket_entries; + + bool detailed_mode; + + struct log_iter_info { + std::string oid; + std::list::iterator 
cur; + std::list::iterator end; + }; + + int log_oids(const DoutPrefixProvider *dpp, std::map& log_shards, std::map >& oids); + +#define RGW_ORPHANSEARCH_HASH_PRIME 7877 + int orphan_shard(const std::string& str) { + return ceph_str_hash_linux(str.c_str(), str.size()) % RGW_ORPHANSEARCH_HASH_PRIME % search_info.num_shards; + } + + int handle_stat_result(const DoutPrefixProvider *dpp, std::map >& oids, RGWRados::Object::Stat::Result& result); + int pop_and_handle_stat_op(const DoutPrefixProvider *dpp, std::map >& oids, std::deque& ops); + + int remove_index(std::map& index); +public: + RGWOrphanSearch(rgw::sal::RadosStore* _store, int _max_ios, uint64_t _stale_secs) : store(_store), orphan_store(store), max_concurrent_ios(_max_ios), stale_secs(_stale_secs) {} + + int save_state() { + RGWOrphanSearchState state; + state.info = search_info; + state.stage = search_stage; + return orphan_store.write_job(search_info.job_name, state); + } + + int init(const DoutPrefixProvider *dpp, const std::string& job_name, RGWOrphanSearchInfo *info, bool _detailed_mode=false); + + int create(const std::string& job_name, int num_shards); + + int build_all_oids_index(const DoutPrefixProvider *dpp); + int build_buckets_instance_index(const DoutPrefixProvider *dpp); + int build_linked_oids_for_bucket(const DoutPrefixProvider *dpp, const std::string& bucket_instance_id, std::map >& oids); + int build_linked_oids_index(const DoutPrefixProvider *dpp); + int compare_oid_indexes(const DoutPrefixProvider *dpp); + + int run(const DoutPrefixProvider *dpp); + int finish(); +}; + + +class RGWRadosList { + + /* + * process_t describes how to process a directory, we will either + * process the whole thing (entire_container == true) or a portion + * of it (entire_container == false). When we only process a + * portion, we will list the specific keys and/or specific lexical + * prefixes. 
+ */ + struct process_t { + bool entire_container; + std::set filter_keys; + std::set prefixes; + + process_t() : + entire_container(false) + {} + }; + + std::map bucket_process_map; + std::set visited_oids; + + void add_bucket_entire(const std::string& bucket_name) { + auto p = bucket_process_map.emplace(std::make_pair(bucket_name, + process_t())); + p.first->second.entire_container = true; + } + + void add_bucket_prefix(const std::string& bucket_name, + const std::string& prefix) { + auto p = bucket_process_map.emplace(std::make_pair(bucket_name, + process_t())); + p.first->second.prefixes.insert(prefix); + } + + void add_bucket_filter(const std::string& bucket_name, + const rgw_obj_key& obj_key) { + auto p = bucket_process_map.emplace(std::make_pair(bucket_name, + process_t())); + p.first->second.filter_keys.insert(obj_key); + } + + rgw::sal::RadosStore* store; + + uint16_t max_concurrent_ios; + uint64_t stale_secs; + std::string tenant_name; + + bool include_rgw_obj_name; + std::string field_separator; + + int handle_stat_result(const DoutPrefixProvider *dpp, + RGWRados::Object::Stat::Result& result, + std::string& bucket_name, + rgw_obj_key& obj_key, + std::set& obj_oids); + int pop_and_handle_stat_op(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + std::deque& ops); + +public: + + RGWRadosList(rgw::sal::RadosStore* _store, + int _max_ios, + uint64_t _stale_secs, + const std::string& _tenant_name) : + store(_store), + max_concurrent_ios(_max_ios), + stale_secs(_stale_secs), + tenant_name(_tenant_name), + include_rgw_obj_name(false) + {} + + int process_bucket(const DoutPrefixProvider *dpp, + const std::string& bucket_instance_id, + const std::string& prefix, + const std::set& entries_filter); + + int do_incomplete_multipart(const DoutPrefixProvider *dpp, + rgw::sal::Bucket* bucket); + + int build_linked_oids_index(); + + int run(const DoutPrefixProvider *dpp, + const std::string& bucket_id, + const bool silent_indexless = false); + int run(const 
DoutPrefixProvider *dpp, + const bool yes_i_really_mean_it = false); + + // if there's a non-empty field separator, that means we'll display + // bucket and object names + void set_field_separator(const std::string& fs) { + field_separator = fs; + include_rgw_obj_name = !field_separator.empty(); + } +}; // class RGWRadosList diff --git a/src/rgw/rgw_os_lib.cc b/src/rgw/rgw_os_lib.cc new file mode 100644 index 000000000..55eb2fb4b --- /dev/null +++ b/src/rgw/rgw_os_lib.cc @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_user.h" +#include "rgw_os_lib.h" +#include "rgw_file.h" +#include "rgw_lib_frontend.h" + +namespace rgw { + +/* static */ + int RGWHandler_Lib::init_from_header(rgw::sal::Driver* driver, + req_state *s) + { + string req; + string first; + + const char *req_name = s->relative_uri.c_str(); + const char *p; + + /* skip request_params parsing, rgw_file should not be + * seeing any */ + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(s); + + if (*req_name != '/') + return 0; + + req_name++; + + if (!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } + + if (s->bucket_name.empty()) { + s->bucket_name = std::move(first); + if (pos >= 0) { + // XXX ugh, another copy + string encoded_obj_str = req.substr(pos+1); + s->object = driver->get_object(rgw_obj_key(encoded_obj_str, s->info.args.get("versionId"))); + } + } else { + s->object = driver->get_object(rgw_obj_key(req_name, s->info.args.get("versionId"))); + } + return 0; + } /* init_from_header */ + +} /* namespace rgw */ diff --git a/src/rgw/rgw_os_lib.h b/src/rgw/rgw_os_lib.h new file mode 100644 index 000000000..65df0a726 --- /dev/null +++ b/src/rgw/rgw_os_lib.h @@ -0,0 +1,9 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include "rgw_common.h" +#include "rgw_lib.h" + diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc new file mode 100644 index 000000000..fd058ab00 --- /dev/null +++ b/src/rgw/rgw_perf_counters.cc @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_perf_counters.h" +#include "common/perf_counters.h" +#include "common/ceph_context.h" + +PerfCounters *perfcounter = NULL; + +int rgw_perf_start(CephContext *cct) +{ + PerfCountersBuilder plb(cct, "rgw", l_rgw_first, l_rgw_last); + + // RGW emits comparatively few metrics, so let's be generous + // and mark them all USEFUL to get transmission to ceph-mgr by default. + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + plb.add_u64_counter(l_rgw_req, "req", "Requests"); + plb.add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests"); + + plb.add_u64_counter(l_rgw_get, "get", "Gets"); + plb.add_u64_counter(l_rgw_get_b, "get_b", "Size of gets"); + plb.add_time_avg(l_rgw_get_lat, "get_initial_lat", "Get latency"); + plb.add_u64_counter(l_rgw_put, "put", "Puts"); + plb.add_u64_counter(l_rgw_put_b, "put_b", "Size of puts"); + plb.add_time_avg(l_rgw_put_lat, "put_initial_lat", "Put latency"); + + plb.add_u64(l_rgw_qlen, "qlen", "Queue length"); + plb.add_u64(l_rgw_qactive, "qactive", "Active requests queue"); + + plb.add_u64_counter(l_rgw_cache_hit, "cache_hit", "Cache hits"); + plb.add_u64_counter(l_rgw_cache_miss, "cache_miss", "Cache miss"); + + plb.add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit", "Keystone token cache hits"); + plb.add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss", "Keystone token cache miss"); + + plb.add_u64_counter(l_rgw_gc_retire, "gc_retire_object", "GC object retires"); + + 
plb.add_u64_counter(l_rgw_lc_expire_current, "lc_expire_current", + "Lifecycle current expiration"); + plb.add_u64_counter(l_rgw_lc_expire_noncurrent, "lc_expire_noncurrent", + "Lifecycle non-current expiration"); + plb.add_u64_counter(l_rgw_lc_expire_dm, "lc_expire_dm", + "Lifecycle delete-marker expiration"); + plb.add_u64_counter(l_rgw_lc_transition_current, "lc_transition_current", + "Lifecycle current transition"); + plb.add_u64_counter(l_rgw_lc_transition_noncurrent, + "lc_transition_noncurrent", + "Lifecycle non-current transition"); + plb.add_u64_counter(l_rgw_lc_abort_mpu, "lc_abort_mpu", + "Lifecycle abort multipart upload"); + + plb.add_u64_counter(l_rgw_pubsub_event_triggered, "pubsub_event_triggered", "Pubsub events with at least one topic"); + plb.add_u64_counter(l_rgw_pubsub_event_lost, "pubsub_event_lost", "Pubsub events lost"); + plb.add_u64_counter(l_rgw_pubsub_store_ok, "pubsub_store_ok", "Pubsub events successfully stored"); + plb.add_u64_counter(l_rgw_pubsub_store_fail, "pubsub_store_fail", "Pubsub events failed to be stored"); + plb.add_u64(l_rgw_pubsub_events, "pubsub_events", "Pubsub events in store"); + plb.add_u64_counter(l_rgw_pubsub_push_ok, "pubsub_push_ok", "Pubsub events pushed to an endpoint"); + plb.add_u64_counter(l_rgw_pubsub_push_failed, "pubsub_push_failed", "Pubsub events failed to be pushed to an endpoint"); + plb.add_u64(l_rgw_pubsub_push_pending, "pubsub_push_pending", "Pubsub events pending reply from endpoint"); + plb.add_u64_counter(l_rgw_pubsub_missing_conf, "pubsub_missing_conf", "Pubsub events could not be handled because of missing configuration"); + + plb.add_u64_counter(l_rgw_lua_script_ok, "lua_script_ok", "Successfull executions of lua scripts"); + plb.add_u64_counter(l_rgw_lua_script_fail, "lua_script_fail", "Failed executions of lua scripts"); + plb.add_u64(l_rgw_lua_current_vms, "lua_current_vms", "Number of Lua VMs currently being executed"); + + perfcounter = plb.create_perf_counters(); + 
cct->get_perfcounters_collection()->add(perfcounter); + return 0; +} + +void rgw_perf_stop(CephContext *cct) +{ + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; +} + diff --git a/src/rgw/rgw_perf_counters.h b/src/rgw/rgw_perf_counters.h new file mode 100644 index 000000000..3c4e4e97f --- /dev/null +++ b/src/rgw/rgw_perf_counters.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "include/common_fwd.h" + +extern PerfCounters *perfcounter; + +extern int rgw_perf_start(CephContext *cct); +extern void rgw_perf_stop(CephContext *cct); + +enum { + l_rgw_first = 15000, + l_rgw_req, + l_rgw_failed_req, + + l_rgw_get, + l_rgw_get_b, + l_rgw_get_lat, + + l_rgw_put, + l_rgw_put_b, + l_rgw_put_lat, + + l_rgw_qlen, + l_rgw_qactive, + + l_rgw_cache_hit, + l_rgw_cache_miss, + + l_rgw_keystone_token_cache_hit, + l_rgw_keystone_token_cache_miss, + + l_rgw_gc_retire, + + l_rgw_lc_expire_current, + l_rgw_lc_expire_noncurrent, + l_rgw_lc_expire_dm, + l_rgw_lc_transition_current, + l_rgw_lc_transition_noncurrent, + l_rgw_lc_abort_mpu, + + l_rgw_pubsub_event_triggered, + l_rgw_pubsub_event_lost, + l_rgw_pubsub_store_ok, + l_rgw_pubsub_store_fail, + l_rgw_pubsub_events, + l_rgw_pubsub_push_ok, + l_rgw_pubsub_push_failed, + l_rgw_pubsub_push_pending, + l_rgw_pubsub_missing_conf, + + l_rgw_lua_current_vms, + l_rgw_lua_script_ok, + l_rgw_lua_script_fail, + + l_rgw_last, +}; + diff --git a/src/rgw/rgw_period.cc b/src/rgw/rgw_period.cc new file mode 100644 index 000000000..1e7de60ea --- /dev/null +++ b/src/rgw/rgw_period.cc @@ -0,0 +1,350 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_sync.h" + +using namespace std; +using namespace rgw_zone_defaults; + +std::string period_latest_epoch_info_oid = ".latest_epoch"; +std::string period_info_oid_prefix = 
"periods."; + +#define FIRST_EPOCH 1 + +int RGWPeriod::init(const DoutPrefixProvider *dpp, + CephContext *_cct, RGWSI_SysObj *_sysobj_svc, + optional_yield y, bool setup_obj) +{ + cct = _cct; + sysobj_svc = _sysobj_svc; + + if (!setup_obj) + return 0; + + if (id.empty()) { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldpp_dout(dpp, 4) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " << + cpp_strerror(-ret) << dendl; + return ret; + } + id = realm.get_current_period(); + realm_id = realm.get_id(); + } + + if (!epoch) { + int ret = use_latest_epoch(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id + << " : " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + return read_info(dpp, y); +} + +int RGWPeriod::init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, + const string& period_realm_id, optional_yield y, + const string& period_realm_name, bool setup_obj) +{ + cct = _cct; + sysobj_svc = _sysobj_svc; + + realm_id = period_realm_id; + realm_name = period_realm_name; + + if (!setup_obj) + return 0; + + return init(dpp, _cct, _sysobj_svc, y, setup_obj); +} + +const string& RGWPeriod::get_latest_epoch_oid() const +{ + if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) { + return period_latest_epoch_info_oid; + } + return cct->_conf->rgw_period_latest_epoch_info_oid; +} + +const string& RGWPeriod::get_info_oid_prefix() const +{ + return period_info_oid_prefix; +} + +const string RGWPeriod::get_period_oid_prefix() const +{ + return get_info_oid_prefix() + id; +} + +const string RGWPeriod::get_period_oid() const +{ + std::ostringstream oss; + oss << get_period_oid_prefix(); + // skip the epoch for the staging period + if (id != get_staging_id(realm_id)) + oss << "." 
<< epoch; + return oss.str(); +} + +bool RGWPeriod::find_zone(const DoutPrefixProvider *dpp, + const rgw_zone_id& zid, + RGWZoneGroup *pzonegroup, + optional_yield y) const +{ + RGWZoneGroup zg; + RGWZone zone; + + bool found = period_map.find_zone_by_id(zid, &zg, &zone); + if (found) { + *pzonegroup = zg; + } + + return found; +} + +rgw_pool RGWPeriod::get_pool(CephContext *cct) const +{ + if (cct->_conf->rgw_period_root_pool.empty()) { + return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL); + } + return rgw_pool(cct->_conf->rgw_period_root_pool); +} + +int RGWPeriod::set_latest_epoch(const DoutPrefixProvider *dpp, + optional_yield y, + epoch_t epoch, bool exclusive, + RGWObjVersionTracker *objv) +{ + string oid = get_period_oid_prefix() + get_latest_epoch_oid(); + + rgw_pool pool(get_pool(cct)); + bufferlist bl; + + RGWPeriodLatestEpochInfo info; + info.epoch = epoch; + + using ceph::encode; + encode(info, bl); + + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid)); + return sysobj.wop() + .set_exclusive(exclusive) + .write(dpp, bl, y); +} + +int RGWPeriod::read_info(const DoutPrefixProvider *dpp, optional_yield y) +{ + rgw_pool pool(get_pool(cct)); + + bufferlist bl; + + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, get_period_oid()}); + int ret = sysobj.rop().read(dpp, &bl, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(*this, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl; + return -EIO; + } + + return 0; +} + +int RGWPeriod::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + rgw_pool pool(get_pool(cct)); + + string oid = get_period_oid(); + bufferlist bl; + using ceph::encode; + encode(*this, bl); + + auto sysobj = 
sysobj_svc->get_obj(rgw_raw_obj(pool, oid)); + return sysobj.wop() + .set_exclusive(exclusive) + .write(dpp, bl, y); +} + +int RGWPeriod::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + int ret; + + /* create unique id */ + uuid_d new_uuid; + char uuid_str[37]; + new_uuid.generate_random(); + new_uuid.print(uuid_str); + id = uuid_str; + + epoch = FIRST_EPOCH; + + period_map.id = id; + + ret = store_info(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = set_latest_epoch(dpp, y, epoch); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl; + } + + return ret; +} + +int RGWPeriod::reflect(const DoutPrefixProvider *dpp, optional_yield y) +{ + for (auto& iter : period_map.zonegroups) { + RGWZoneGroup& zg = iter.second; + zg.reinit_instance(cct, sysobj_svc); + int r = zg.write(dpp, false, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl; + return r; + } + if (zg.is_master_zonegroup()) { + // set master as default if no default exists + r = zg.set_as_default(dpp, y, true); + if (r == 0) { + ldpp_dout(dpp, 1) << "Set the period's master zonegroup " << zg.get_id() + << " as the default" << dendl; + } + } + } + + int r = period_config.write(dpp, sysobj_svc, realm_id, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to store period config: " + << cpp_strerror(-r) << dendl; + return r; + } + return 0; +} + +void RGWPeriod::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("epoch", epoch , f); + encode_json("predecessor_uuid", predecessor_uuid, f); + encode_json("sync_status", sync_status, f); + encode_json("period_map", period_map, f); + encode_json("master_zonegroup", master_zonegroup, f); + encode_json("master_zone", master_zone, f); + 
encode_json("period_config", period_config, f); + encode_json("realm_id", realm_id, f); + encode_json("realm_name", realm_name, f); + encode_json("realm_epoch", realm_epoch, f); +} + +void RGWPeriod::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("epoch", epoch, obj); + JSONDecoder::decode_json("predecessor_uuid", predecessor_uuid, obj); + JSONDecoder::decode_json("sync_status", sync_status, obj); + JSONDecoder::decode_json("period_map", period_map, obj); + JSONDecoder::decode_json("master_zonegroup", master_zonegroup, obj); + JSONDecoder::decode_json("master_zone", master_zone, obj); + JSONDecoder::decode_json("period_config", period_config, obj); + JSONDecoder::decode_json("realm_id", realm_id, obj); + JSONDecoder::decode_json("realm_name", realm_name, obj); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +int RGWPeriod::update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y) +{ + static constexpr int MAX_RETRIES = 20; + + for (int i = 0; i < MAX_RETRIES; i++) { + RGWPeriodLatestEpochInfo info; + RGWObjVersionTracker objv; + bool exclusive = false; + + // read existing epoch + int r = read_latest_epoch(dpp, info, y, &objv); + if (r == -ENOENT) { + // use an exclusive create to set the epoch atomically + exclusive = true; + ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch + << " for period=" << id << dendl; + } else if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl; + return r; + } else if (epoch <= info.epoch) { + r = -EEXIST; // fail with EEXIST if epoch is not newer + ldpp_dout(dpp, 10) << "found existing latest_epoch " << info.epoch + << " >= given epoch " << epoch << ", returning r=" << r << dendl; + return r; + } else { + ldpp_dout(dpp, 20) << "updating latest_epoch from " << info.epoch + << " -> " << epoch << " on period=" << id << dendl; + } + + r = set_latest_epoch(dpp, y, epoch, exclusive, &objv); + if (r == 
-EEXIST) { + continue; // exclusive create raced with another update, retry + } else if (r == -ECANCELED) { + continue; // write raced with a conflicting version, retry + } + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl; + return r; + } + return 0; // return success + } + + return -ECANCELED; // fail after max retries +} + +int RGWPeriod::read_latest_epoch(const DoutPrefixProvider *dpp, + RGWPeriodLatestEpochInfo& info, + optional_yield y, + RGWObjVersionTracker *objv) +{ + string oid = get_period_oid_prefix() + get_latest_epoch_oid(); + + rgw_pool pool(get_pool(cct)); + bufferlist bl; + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + int ret = sysobj.rop().read(dpp, &bl, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl; + return ret; + } + try { + auto iter = bl.cbegin(); + using ceph::decode; + decode(info, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "error decoding data from " << pool << ":" << oid << dendl; + return -EIO; + } + + return 0; +} + +int RGWPeriod::use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y) +{ + RGWPeriodLatestEpochInfo info; + int ret = read_latest_epoch(dpp, info, y); + if (ret < 0) { + return ret; + } + + epoch = info.epoch; + + return 0; +} + diff --git a/src/rgw/rgw_period_history.cc b/src/rgw/rgw_period_history.cc new file mode 100644 index 000000000..abbd998cf --- /dev/null +++ b/src/rgw/rgw_period_history.cc @@ -0,0 +1,353 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_period_history.h" +#include "rgw_zone.h" + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw period history: ") + +/// an ordered history of consecutive periods +class RGWPeriodHistory::History : public bi::avl_set_base_hook<> { + public: + std::deque periods; + + epoch_t 
get_oldest_epoch() const { + return periods.front().get_realm_epoch(); + } + epoch_t get_newest_epoch() const { + return periods.back().get_realm_epoch(); + } + bool contains(epoch_t epoch) const { + return get_oldest_epoch() <= epoch && epoch <= get_newest_epoch(); + } + RGWPeriod& get(epoch_t epoch) { + return periods[epoch - get_oldest_epoch()]; + } + const RGWPeriod& get(epoch_t epoch) const { + return periods[epoch - get_oldest_epoch()]; + } + const std::string& get_predecessor_id() const { + return periods.front().get_predecessor(); + } +}; + +/// value comparison for avl_set +bool operator<(const RGWPeriodHistory::History& lhs, + const RGWPeriodHistory::History& rhs) +{ + return lhs.get_newest_epoch() < rhs.get_newest_epoch(); +} + +/// key-value comparison for avl_set +struct NewestEpochLess { + bool operator()(const RGWPeriodHistory::History& value, epoch_t key) const { + return value.get_newest_epoch() < key; + } +}; + + +using Cursor = RGWPeriodHistory::Cursor; + +const RGWPeriod& Cursor::get_period() const +{ + std::lock_guard lock(*mutex); + return history->get(epoch); +} +bool Cursor::has_prev() const +{ + std::lock_guard lock(*mutex); + return epoch > history->get_oldest_epoch(); +} +bool Cursor::has_next() const +{ + std::lock_guard lock(*mutex); + return epoch < history->get_newest_epoch(); +} + +bool operator==(const Cursor& lhs, const Cursor& rhs) +{ + return lhs.history == rhs.history && lhs.epoch == rhs.epoch; +} + +bool operator!=(const Cursor& lhs, const Cursor& rhs) +{ + return !(lhs == rhs); +} + +class RGWPeriodHistory::Impl final { + public: + Impl(CephContext* cct, Puller* puller, const RGWPeriod& current_period); + ~Impl(); + + Cursor get_current() const { return current_cursor; } + Cursor attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y); + Cursor insert(RGWPeriod&& period); + Cursor lookup(epoch_t realm_epoch); + + private: + /// an intrusive set of histories, ordered by their newest epoch. 
although + /// the newest epoch of each history is mutable, the ordering cannot change + /// because we prevent the histories from overlapping + using Set = bi::avl_set; + + /// insert the given period into the period history, creating new unconnected + /// histories or merging existing histories as necessary. expects the caller + /// to hold a lock on mutex. returns a valid cursor regardless of whether it + /// ends up in current_history, though cursors in other histories are only + /// valid within the context of the lock + Cursor insert_locked(RGWPeriod&& period); + + /// merge the periods from the src history onto the end of the dst history, + /// and return an iterator to the merged history + Set::iterator merge(Set::iterator dst, Set::iterator src); + + /// construct a Cursor object using Cursor's private constructor + Cursor make_cursor(Set::const_iterator history, epoch_t epoch); + + CephContext *const cct; + Puller *const puller; //< interface for pulling missing periods + Cursor current_cursor; //< Cursor to realm's current period + + mutable std::mutex mutex; //< protects the histories + + /// set of disjoint histories that are missing intermediate periods needed to + /// connect them together + Set histories; + + /// iterator to the history that contains the realm's current period + Set::const_iterator current_history; +}; + +RGWPeriodHistory::Impl::Impl(CephContext* cct, Puller* puller, + const RGWPeriod& current_period) + : cct(cct), puller(puller) +{ + if (!current_period.get_id().empty()) { + // copy the current period into a new history + auto history = new History; + history->periods.push_back(current_period); + + // insert as our current history + current_history = histories.insert(*history).first; + + // get a cursor to the current period + current_cursor = make_cursor(current_history, current_period.get_realm_epoch()); + } else { + current_history = histories.end(); + } +} + +RGWPeriodHistory::Impl::~Impl() +{ + // clear the histories and delete 
each entry + histories.clear_and_dispose(std::default_delete{}); +} + +Cursor RGWPeriodHistory::Impl::attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y) +{ + if (current_history == histories.end()) { + return Cursor{-EINVAL}; + } + + const auto epoch = period.get_realm_epoch(); + + std::string predecessor_id; + for (;;) { + { + // hold the lock over insert, and while accessing the unsafe cursor + std::lock_guard lock(mutex); + + auto cursor = insert_locked(std::move(period)); + if (!cursor) { + return cursor; + } + if (current_history->contains(epoch)) { + break; // the history is complete + } + + // take the predecessor id of the most recent history + if (cursor.get_epoch() > current_cursor.get_epoch()) { + predecessor_id = cursor.history->get_predecessor_id(); + } else { + predecessor_id = current_history->get_predecessor_id(); + } + } + + if (predecessor_id.empty()) { + ldpp_dout(dpp, -1) << "reached a period with an empty predecessor id" << dendl; + return Cursor{-EINVAL}; + } + + // pull the period outside of the lock + int r = puller->pull(dpp, predecessor_id, period, y); + if (r < 0) { + return Cursor{r}; + } + } + + // return a cursor to the requested period + return make_cursor(current_history, epoch); +} + +Cursor RGWPeriodHistory::Impl::insert(RGWPeriod&& period) +{ + if (current_history == histories.end()) { + return Cursor{-EINVAL}; + } + + std::lock_guard lock(mutex); + + auto cursor = insert_locked(std::move(period)); + + if (cursor.get_error()) { + return cursor; + } + // we can only provide cursors that are safe to use outside of the mutex if + // they're within the current_history, because other histories can disappear + // in a merge. 
see merge() for the special handling of current_history + if (cursor.history == &*current_history) { + return cursor; + } + return Cursor{}; +} + +Cursor RGWPeriodHistory::Impl::lookup(epoch_t realm_epoch) +{ + if (current_history != histories.end() && + current_history->contains(realm_epoch)) { + return make_cursor(current_history, realm_epoch); + } + return Cursor{}; +} + +Cursor RGWPeriodHistory::Impl::insert_locked(RGWPeriod&& period) +{ + auto epoch = period.get_realm_epoch(); + + // find the first history whose newest epoch comes at or after this period + auto i = histories.lower_bound(epoch, NewestEpochLess{}); + + if (i == histories.end()) { + // epoch is past the end of our newest history + auto last = --Set::iterator{i}; // last = i - 1 + + if (epoch == last->get_newest_epoch() + 1) { + // insert at the back of the last history + last->periods.emplace_back(std::move(period)); + return make_cursor(last, epoch); + } + + // create a new history for this period + auto history = new History; + history->periods.emplace_back(std::move(period)); + histories.insert(last, *history); + + i = Set::s_iterator_to(*history); + return make_cursor(i, epoch); + } + + if (i->contains(epoch)) { + // already resident in this history + auto& existing = i->get(epoch); + // verify that the period ids match; otherwise we've forked the history + if (period.get_id() != existing.get_id()) { + lderr(cct) << "Got two different periods, " << period.get_id() + << " and " << existing.get_id() << ", with the same realm epoch " + << epoch << "! This indicates a fork in the period history." 
<< dendl; + return Cursor{-EEXIST}; + } + // update the existing period if we got a newer period epoch + if (period.get_epoch() > existing.get_epoch()) { + existing = std::move(period); + } + return make_cursor(i, epoch); + } + + if (epoch + 1 == i->get_oldest_epoch()) { + // insert at the front of this history + i->periods.emplace_front(std::move(period)); + + // try to merge with the previous history + if (i != histories.begin()) { + auto prev = --Set::iterator{i}; + if (epoch == prev->get_newest_epoch() + 1) { + i = merge(prev, i); + } + } + return make_cursor(i, epoch); + } + + if (i != histories.begin()) { + auto prev = --Set::iterator{i}; + if (epoch == prev->get_newest_epoch() + 1) { + // insert at the back of the previous history + prev->periods.emplace_back(std::move(period)); + return make_cursor(prev, epoch); + } + } + + // create a new history for this period + auto history = new History; + history->periods.emplace_back(std::move(period)); + histories.insert(i, *history); + + i = Set::s_iterator_to(*history); + return make_cursor(i, epoch); +} + +RGWPeriodHistory::Impl::Set::iterator +RGWPeriodHistory::Impl::merge(Set::iterator dst, Set::iterator src) +{ + ceph_assert(dst->get_newest_epoch() + 1 == src->get_oldest_epoch()); + + // always merge into current_history + if (src == current_history) { + // move the periods from dst onto the front of src + src->periods.insert(src->periods.begin(), + std::make_move_iterator(dst->periods.begin()), + std::make_move_iterator(dst->periods.end())); + histories.erase_and_dispose(dst, std::default_delete{}); + return src; + } + + // move the periods from src onto the end of dst + dst->periods.insert(dst->periods.end(), + std::make_move_iterator(src->periods.begin()), + std::make_move_iterator(src->periods.end())); + histories.erase_and_dispose(src, std::default_delete{}); + return dst; +} + +Cursor RGWPeriodHistory::Impl::make_cursor(Set::const_iterator history, + epoch_t epoch) { + return Cursor{&*history, &mutex, 
epoch}; +} + + +RGWPeriodHistory::RGWPeriodHistory(CephContext* cct, Puller* puller, + const RGWPeriod& current_period) + : impl(new Impl(cct, puller, current_period)) {} + +RGWPeriodHistory::~RGWPeriodHistory() = default; + +Cursor RGWPeriodHistory::get_current() const +{ + return impl->get_current(); +} +Cursor RGWPeriodHistory::attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y) +{ + return impl->attach(dpp, std::move(period), y); +} +Cursor RGWPeriodHistory::insert(RGWPeriod&& period) +{ + return impl->insert(std::move(period)); +} +Cursor RGWPeriodHistory::lookup(epoch_t realm_epoch) +{ + return impl->lookup(realm_epoch); +} diff --git a/src/rgw/rgw_period_history.h b/src/rgw/rgw_period_history.h new file mode 100644 index 000000000..3d18fbf9e --- /dev/null +++ b/src/rgw/rgw_period_history.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include "include/ceph_assert.h" +#include "include/types.h" +#include "common/async/yield_context.h" +#include "common/dout.h" + +namespace bi = boost::intrusive; + +class RGWPeriod; + +/** + * RGWPeriodHistory tracks the relative history of all inserted periods, + * coordinates the pulling of missing intermediate periods, and provides a + * Cursor object for traversing through the connected history. + */ +class RGWPeriodHistory final { + private: + /// an ordered history of consecutive periods + class History; + + // comparisons for avl_set ordering + friend bool operator<(const History& lhs, const History& rhs); + friend struct NewestEpochLess; + + class Impl; + std::unique_ptr impl; + + public: + /** + * Puller is a synchronous interface for pulling periods from the master + * zone. The abstraction exists mainly to support unit testing. 
+ */ + class Puller { + public: + virtual ~Puller() = default; + + virtual int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, + optional_yield y) = 0; + }; + + RGWPeriodHistory(CephContext* cct, Puller* puller, + const RGWPeriod& current_period); + ~RGWPeriodHistory(); + + /** + * Cursor tracks a position in the period history and allows forward and + * backward traversal. Only periods that are fully connected to the + * current_period are reachable via a Cursor, because other histories are + * temporary and can be merged away. Cursors to periods in disjoint + * histories, as provided by insert() or lookup(), are therefore invalid and + * their operator bool() will return false. + */ + class Cursor final { + public: + Cursor() = default; + explicit Cursor(int error) : error(error) {} + + int get_error() const { return error; } + + /// return false for a default-constructed or error Cursor + operator bool() const { return history != nullptr; } + + epoch_t get_epoch() const { return epoch; } + const RGWPeriod& get_period() const; + + bool has_prev() const; + bool has_next() const; + + void prev() { epoch--; } + void next() { epoch++; } + + friend bool operator==(const Cursor& lhs, const Cursor& rhs); + friend bool operator!=(const Cursor& lhs, const Cursor& rhs); + + private: + // private constructors for RGWPeriodHistory + friend class RGWPeriodHistory::Impl; + + Cursor(const History* history, std::mutex* mutex, epoch_t epoch) + : history(history), mutex(mutex), epoch(epoch) {} + + int error{0}; + const History* history{nullptr}; + std::mutex* mutex{nullptr}; + epoch_t epoch{0}; //< realm epoch of cursor position + }; + + /// return a cursor to the current period + Cursor get_current() const; + + /// build up a connected period history that covers the span between + /// current_period and the given period, reading predecessor periods or + /// fetching them from the master as necessary. 
returns a cursor at the + /// given period that can be used to traverse the current_history + Cursor attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y); + + /// insert the given period into an existing history, or create a new + /// unconnected history. similar to attach(), but it doesn't try to fetch + /// missing periods. returns a cursor to the inserted period iff it's in + /// the current_history + Cursor insert(RGWPeriod&& period); + + /// search for a period by realm epoch, returning a valid Cursor iff it's in + /// the current_history + Cursor lookup(epoch_t realm_epoch); +}; diff --git a/src/rgw/rgw_period_puller.cc b/src/rgw/rgw_period_puller.cc new file mode 100644 index 000000000..ea2f28e56 --- /dev/null +++ b/src/rgw/rgw_period_puller.cc @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_rest_conn.h" +#include "common/ceph_json.h" +#include "common/errno.h" + +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw period puller: ") + +RGWPeriodPuller::RGWPeriodPuller(RGWSI_Zone *zone_svc, RGWSI_SysObj *sysobj_svc) +{ + cct = zone_svc->ctx(); + svc.zone = zone_svc; + svc.sysobj = sysobj_svc; +} + +namespace { + +// pull the given period over the connection +int pull_period(const DoutPrefixProvider *dpp, RGWRESTConn* conn, const std::string& period_id, + const std::string& realm_id, RGWPeriod& period, + optional_yield y) +{ + rgw_user user; + RGWEnv env; + req_info info(conn->get_ctx(), &env); + info.method = "GET"; + info.request_uri = "/admin/realm/period"; + + auto& params = info.args.get_params(); + params["realm_id"] = realm_id; + params["period_id"] = period_id; + + bufferlist data; +#define MAX_REST_RESPONSE (128 * 1024) + int r = conn->forward(dpp, user, info, nullptr, MAX_REST_RESPONSE, nullptr, &data, y); + if (r 
< 0) { + return r; + } + + JSONParser parser; + r = parser.parse(data.c_str(), data.length()); + if (r < 0) { + ldpp_dout(dpp, -1) << "request failed: " << cpp_strerror(-r) << dendl; + return r; + } + + try { + decode_json_obj(period, &parser); + } catch (const JSONDecoder::err& e) { + ldpp_dout(dpp, -1) << "failed to decode JSON input: " + << e.what() << dendl; + return -EINVAL; + } + return 0; +} + +} // anonymous namespace + +int RGWPeriodPuller::pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, + optional_yield y) +{ + // try to read the period from rados + period.set_id(period_id); + period.set_epoch(0); + int r = period.init(dpp, cct, svc.sysobj, y); + if (r < 0) { + if (svc.zone->is_meta_master()) { + // can't pull if we're the master + ldpp_dout(dpp, 1) << "metadata master failed to read period " + << period_id << " from local storage: " << cpp_strerror(r) << dendl; + return r; + } + ldpp_dout(dpp, 14) << "pulling period " << period_id + << " from master" << dendl; + // request the period from the master zone + r = pull_period(dpp, svc.zone->get_master_conn(), period_id, + svc.zone->get_realm().get_id(), period, y); + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to pull period " << period_id << dendl; + return r; + } + // write the period to rados + r = period.store_info(dpp, true, y); + if (r == -EEXIST) { + r = 0; + } else if (r < 0) { + ldpp_dout(dpp, -1) << "failed to store period " << period_id << dendl; + return r; + } + // update latest epoch + r = period.update_latest_epoch(dpp, period.get_epoch(), y); + if (r == -EEXIST) { + // already have this epoch (or a more recent one) + return 0; + } + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to update latest_epoch for period " + << period_id << dendl; + return r; + } + // reflect period objects if this is the latest version + if (svc.zone->get_realm().get_current_period() == period_id) { + r = period.reflect(dpp, y); + if (r < 0) { + return r; + } + } + ldpp_dout(dpp, 
14) << "period " << period_id + << " pulled and written to local storage" << dendl; + } else { + ldpp_dout(dpp, 14) << "found period " << period_id + << " in local storage" << dendl; + } + return 0; +} diff --git a/src/rgw/rgw_period_puller.h b/src/rgw/rgw_period_puller.h new file mode 100644 index 000000000..88138d36b --- /dev/null +++ b/src/rgw/rgw_period_puller.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_period_history.h" +#include "include/common_fwd.h" +#include "rgw/services/svc_sys_obj.h" + +class RGWPeriod; + +class RGWPeriodPuller : public RGWPeriodHistory::Puller { + CephContext *cct; + + struct { + RGWSI_Zone *zone; + RGWSI_SysObj *sysobj; + } svc; + + public: + explicit RGWPeriodPuller(RGWSI_Zone *zone_svc, RGWSI_SysObj *sysobj_svc); + + int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y) override; +}; diff --git a/src/rgw/rgw_period_pusher.cc b/src/rgw/rgw_period_pusher.cc new file mode 100644 index 000000000..d9c899e5c --- /dev/null +++ b/src/rgw/rgw_period_pusher.cc @@ -0,0 +1,316 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "rgw_period_pusher.h" +#include "rgw_cr_rest.h" +#include "rgw_zone.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" + +#include "services/svc_zone.h" + +#include "common/errno.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw period pusher: ") + +/// A coroutine to post the period over the given connection. +using PushCR = RGWPostRESTResourceCR; + +/// A coroutine that calls PushCR, and retries with backoff until success. 
+class PushAndRetryCR : public RGWCoroutine { + const std::string& zone; + RGWRESTConn *const conn; + RGWHTTPManager *const http; + RGWPeriod& period; + const std::string epoch; //< epoch string for params + double timeout; //< current interval between retries + const double timeout_max; //< maximum interval between retries + uint32_t counter; //< number of failures since backoff increased + + public: + PushAndRetryCR(CephContext* cct, const std::string& zone, RGWRESTConn* conn, + RGWHTTPManager* http, RGWPeriod& period) + : RGWCoroutine(cct), zone(zone), conn(conn), http(http), period(period), + epoch(std::to_string(period.get_epoch())), + timeout(cct->_conf->rgw_period_push_interval), + timeout_max(cct->_conf->rgw_period_push_interval_max), + counter(0) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int PushAndRetryCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + for (;;) { + yield { + ldpp_dout(dpp, 10) << "pushing period " << period.get_id() + << " to " << zone << dendl; + // initialize the http params + rgw_http_param_pair params[] = { + { "period", period.get_id().c_str() }, + { "epoch", epoch.c_str() }, + { nullptr, nullptr } + }; + call(new PushCR(cct, conn, http, "/admin/realm/period", + params, period, nullptr)); + } + + // stop on success + if (get_ret_status() == 0) { + ldpp_dout(dpp, 10) << "push to " << zone << " succeeded" << dendl; + return set_cr_done(); + } + + // try each endpoint in the connection before waiting + if (++counter < conn->get_endpoint_count()) + continue; + counter = 0; + + // wait with exponential backoff up to timeout_max + yield { + utime_t dur; + dur.set_from_double(timeout); + + ldpp_dout(dpp, 10) << "waiting " << dur << "s for retry.." 
<< dendl; + wait(dur); + + timeout *= 2; + if (timeout > timeout_max) + timeout = timeout_max; + } + } + } + return 0; +} + +/** + * PushAllCR is a coroutine that sends the period over all of the given + * connections, retrying until they are all marked as completed. + */ +class PushAllCR : public RGWCoroutine { + RGWHTTPManager *const http; + RGWPeriod period; //< period object to push + std::map conns; //< zones that need the period + + public: + PushAllCR(CephContext* cct, RGWHTTPManager* http, RGWPeriod&& period, + std::map&& conns) + : RGWCoroutine(cct), http(http), + period(std::move(period)), + conns(std::move(conns)) + {} + + int operate(const DoutPrefixProvider *dpp) override; +}; + +int PushAllCR::operate(const DoutPrefixProvider *dpp) +{ + reenter(this) { + // spawn a coroutine to push the period over each connection + yield { + ldpp_dout(dpp, 4) << "sending " << conns.size() << " periods" << dendl; + for (auto& c : conns) + spawn(new PushAndRetryCR(cct, c.first, &c.second, http, period), false); + } + // wait for all to complete + drain_all(); + return set_cr_done(); + } + return 0; +} + +/// A background thread to run the PushAllCR coroutine and exit. 
+class RGWPeriodPusher::CRThread : public DoutPrefixProvider { + CephContext* cct; + RGWCoroutinesManager coroutines; + RGWHTTPManager http; + boost::intrusive_ptr push_all; + std::thread thread; + + public: + CRThread(CephContext* cct, RGWPeriod&& period, + std::map&& conns) + : cct(cct), coroutines(cct, NULL), + http(cct, coroutines.get_completion_mgr()), + push_all(new PushAllCR(cct, &http, std::move(period), std::move(conns))) + { + http.start(); + // must spawn the CR thread after start + thread = std::thread([this]() noexcept { coroutines.run(this, push_all.get()); }); + } + ~CRThread() + { + push_all.reset(); + coroutines.stop(); + http.stop(); + if (thread.joinable()) + thread.join(); + } + + CephContext *get_cct() const override { return cct; } + unsigned get_subsys() const override { return dout_subsys; } + std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw period pusher CR thread: "; } +}; + + +RGWPeriodPusher::RGWPeriodPusher(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + optional_yield y) + : cct(driver->ctx()), driver(driver) +{ + rgw::sal::Zone* zone = driver->get_zone(); + auto& realm_id = zone->get_realm_id(); + if (realm_id.empty()) // no realm configuration + return; + + // always send out the current period on startup + RGWPeriod period; + // XXX dang + int r = period.init(dpp, cct, static_cast(driver)->svc()->sysobj, realm_id, y, zone->get_realm_name()); + if (r < 0) { + ldpp_dout(dpp, -1) << "failed to load period for realm " << realm_id << dendl; + return; + } + + std::lock_guard lock(mutex); + handle_notify(std::move(period)); +} + +// destructor is here because CRThread is incomplete in the header +RGWPeriodPusher::~RGWPeriodPusher() = default; + +void RGWPeriodPusher::handle_notify(RGWRealmNotify type, + bufferlist::const_iterator& p) +{ + // decode the period + RGWZonesNeedPeriod info; + try { + decode(info, p); + } catch (buffer::error& e) { + lderr(cct) << "Failed to decode the period: " << 
e.what() << dendl; + return; + } + + std::lock_guard lock(mutex); + + // we can't process this notification without access to our current realm + // configuration. queue it until resume() + if (driver == nullptr) { + pending_periods.emplace_back(std::move(info)); + return; + } + + handle_notify(std::move(info)); +} + +// expects the caller to hold a lock on mutex +void RGWPeriodPusher::handle_notify(RGWZonesNeedPeriod&& period) +{ + if (period.get_realm_epoch() < realm_epoch) { + ldout(cct, 10) << "period's realm epoch " << period.get_realm_epoch() + << " is not newer than current realm epoch " << realm_epoch + << ", discarding update" << dendl; + return; + } + if (period.get_realm_epoch() == realm_epoch && + period.get_epoch() <= period_epoch) { + ldout(cct, 10) << "period epoch " << period.get_epoch() << " is not newer " + "than current epoch " << period_epoch << ", discarding update" << dendl; + return; + } + + // find our zonegroup in the new period + auto& zonegroups = period.get_map().zonegroups; + auto i = zonegroups.find(driver->get_zone()->get_zonegroup().get_id()); + if (i == zonegroups.end()) { + lderr(cct) << "The new period does not contain my zonegroup!" << dendl; + return; + } + auto& my_zonegroup = i->second; + + // if we're not a master zone, we're not responsible for pushing any updates + if (my_zonegroup.master_zone != driver->get_zone()->get_id()) + return; + + // construct a map of the zones that need this period. the map uses the same + // keys/ordering as the zone[group] map, so we can use a hint for insertions + std::map conns; + auto hint = conns.end(); + + // are we the master zonegroup in this period? 
+ if (period.get_map().master_zonegroup == driver->get_zone()->get_zonegroup().get_id()) { + // update other zonegroup endpoints + for (auto& zg : zonegroups) { + auto& zonegroup = zg.second; + if (zonegroup.get_id() == driver->get_zone()->get_zonegroup().get_id()) + continue; + if (zonegroup.endpoints.empty()) + continue; + + hint = conns.emplace_hint( + hint, std::piecewise_construct, + std::forward_as_tuple(zonegroup.get_id()), + std::forward_as_tuple(cct, driver, zonegroup.get_id(), zonegroup.endpoints, zonegroup.api_name)); + } + } + + // update other zone endpoints + for (auto& z : my_zonegroup.zones) { + auto& zone = z.second; + if (zone.id == driver->get_zone()->get_id()) + continue; + if (zone.endpoints.empty()) + continue; + + hint = conns.emplace_hint( + hint, std::piecewise_construct, + std::forward_as_tuple(zone.id), + std::forward_as_tuple(cct, driver, zone.id, zone.endpoints, my_zonegroup.api_name)); + } + + if (conns.empty()) { + ldout(cct, 4) << "No zones to update" << dendl; + return; + } + + realm_epoch = period.get_realm_epoch(); + period_epoch = period.get_epoch(); + + ldout(cct, 4) << "Zone master pushing period " << period.get_id() + << " epoch " << period_epoch << " to " + << conns.size() << " other zones" << dendl; + + // spawn a new coroutine thread, destroying the previous one + cr_thread.reset(new CRThread(cct, std::move(period), std::move(conns))); +} + +void RGWPeriodPusher::pause() +{ + ldout(cct, 4) << "paused for realm update" << dendl; + std::lock_guard lock(mutex); + driver = nullptr; +} + +void RGWPeriodPusher::resume(rgw::sal::Driver* driver) +{ + std::lock_guard lock(mutex); + this->driver = driver; + + ldout(cct, 4) << "resume with " << pending_periods.size() + << " periods pending" << dendl; + + // process notification queue + for (auto& info : pending_periods) { + handle_notify(std::move(info)); + } + pending_periods.clear(); +} diff --git a/src/rgw/rgw_period_pusher.h b/src/rgw/rgw_period_pusher.h new file mode 100644 index 
000000000..3ea7bd7dd --- /dev/null +++ b/src/rgw/rgw_period_pusher.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +#include "common/async/yield_context.h" +#include "rgw_realm_reloader.h" +#include "rgw_sal_fwd.h" + +class RGWPeriod; + +// RGWRealmNotify payload for push coordination +using RGWZonesNeedPeriod = RGWPeriod; + +/** + * RGWPeriodPusher coordinates with other nodes via the realm watcher to manage + * the responsibility for pushing period updates to other zones or zonegroups. + */ +class RGWPeriodPusher final : public RGWRealmWatcher::Watcher, + public RGWRealmReloader::Pauser { + public: + explicit RGWPeriodPusher(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y); + ~RGWPeriodPusher() override; + + /// respond to realm notifications by pushing new periods to other zones + void handle_notify(RGWRealmNotify type, bufferlist::const_iterator& p) override; + + /// avoid accessing RGWRados while dynamic reconfiguration is in progress. 
+ /// notifications will be enqueued until resume() + void pause() override; + + /// continue processing notifications with a new RGWRados instance + void resume(rgw::sal::Driver* driver) override; + + private: + void handle_notify(RGWZonesNeedPeriod&& period); + + CephContext *const cct; + rgw::sal::Driver* driver; + + std::mutex mutex; + epoch_t realm_epoch{0}; //< the current realm epoch being sent + epoch_t period_epoch{0}; //< the current period epoch being sent + + /// while paused for reconfiguration, we need to queue up notifications + std::vector pending_periods; + + class CRThread; //< contains thread, coroutine manager, http manager + std::unique_ptr cr_thread; //< thread to run the push coroutines +}; diff --git a/src/rgw/rgw_placement_types.h b/src/rgw/rgw_placement_types.h new file mode 100644 index 000000000..bcf7a4af7 --- /dev/null +++ b/src/rgw/rgw_placement_types.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "include/types.h" + +#include "common/Formatter.h" + + +static std::string RGW_STORAGE_CLASS_STANDARD = "STANDARD"; + +struct rgw_placement_rule { + std::string name; + std::string storage_class; + + rgw_placement_rule() {} + rgw_placement_rule(const std::string& _n, const std::string& _sc) : name(_n), storage_class(_sc) {} + rgw_placement_rule(const rgw_placement_rule& _r, const std::string& _sc) : name(_r.name) { + if (!_sc.empty()) { + storage_class = _sc; + } else { + storage_class = _r.storage_class; + } + } + + bool empty() const { + return name.empty() && storage_class.empty(); + } + + void inherit_from(const rgw_placement_rule& r) { + if (name.empty()) { + name = r.name; + } + if (storage_class.empty()) { + storage_class = r.storage_class; + } + } + + void clear() { + name.clear(); + storage_class.clear(); + } + + void init(const std::string& n, const std::string& c) { + name = n; + storage_class = c; + } + + static 
const std::string& get_canonical_storage_class(const std::string& storage_class) { + if (storage_class.empty()) { + return RGW_STORAGE_CLASS_STANDARD; + } + return storage_class; + } + + const std::string& get_storage_class() const { + return get_canonical_storage_class(storage_class); + } + + int compare(const rgw_placement_rule& r) const { + int c = name.compare(r.name); + if (c != 0) { + return c; + } + return get_storage_class().compare(r.get_storage_class()); + } + + bool operator==(const rgw_placement_rule& r) const { + return (name == r.name && + get_storage_class() == r.get_storage_class()); + } + + bool operator!=(const rgw_placement_rule& r) const { + return !(*this == r); + } + + void encode(bufferlist& bl) const { + /* no ENCODE_START/END due to backward compatibility */ + std::string s = to_str(); + ceph::encode(s, bl); + } + + void decode(bufferlist::const_iterator& bl) { + std::string s; + ceph::decode(s, bl); + from_str(s); + } + + std::string to_str() const { + if (standard_storage_class()) { + return name; + } + return to_str_explicit(); + } + + std::string to_str_explicit() const { + return name + "/" + storage_class; + } + + void from_str(const std::string& s) { + size_t pos = s.find("/"); + if (pos == std::string::npos) { + name = s; + storage_class.clear(); + return; + } + name = s.substr(0, pos); + storage_class = s.substr(pos + 1); + } + + bool standard_storage_class() const { + return storage_class.empty() || storage_class == RGW_STORAGE_CLASS_STANDARD; + } +}; +WRITE_CLASS_ENCODER(rgw_placement_rule) diff --git a/src/rgw/rgw_policy_s3.cc b/src/rgw/rgw_policy_s3.cc new file mode 100644 index 000000000..e017cc887 --- /dev/null +++ b/src/rgw/rgw_policy_s3.cc @@ -0,0 +1,305 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include "common/ceph_json.h" +#include "rgw_policy_s3.h" +#include "rgw_common.h" +#include "rgw_crypt_sanitize.h" + +#define dout_context 
g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +class RGWPolicyCondition { +protected: + string v1; + string v2; + + virtual bool check(const string& first, const string& second, string& err_msg) = 0; + +public: + virtual ~RGWPolicyCondition() {} + + void set_vals(const string& _v1, const string& _v2) { + v1 = _v1; + v2 = _v2; + } + + bool check(RGWPolicyEnv *env, map& checked_vars, string& err_msg) { + string first, second; + env->get_value(v1, first, checked_vars); + env->get_value(v2, second, checked_vars); + dout(1) << "policy condition check " << v1 << " [" + << rgw::crypt_sanitize::s3_policy{v1, first} + << "] " << v2 << " [" + << rgw::crypt_sanitize::s3_policy{v2, second} + << "]" << dendl; + bool ret = check(first, second, err_msg); + if (!ret) { + err_msg.append(": "); + err_msg.append(v1); + err_msg.append(", "); + err_msg.append(v2); + } + return ret; + } + +}; + + +class RGWPolicyCondition_StrEqual : public RGWPolicyCondition { +protected: + bool check(const string& first, const string& second, string& msg) override { + bool ret = first.compare(second) == 0; + if (!ret) { + msg = "Policy condition failed: eq"; + } + return ret; + } +}; + +class RGWPolicyCondition_StrStartsWith : public RGWPolicyCondition { +protected: + bool check(const string& first, const string& second, string& msg) override { + bool ret = first.compare(0, second.size(), second) == 0; + if (!ret) { + msg = "Policy condition failed: starts-with"; + } + return ret; + } +}; + +void RGWPolicyEnv::add_var(const string& name, const string& value) +{ + vars[name] = value; +} + +bool RGWPolicyEnv::get_var(const string& name, string& val) +{ + map::iterator iter = vars.find(name); + if (iter == vars.end()) + return false; + + val = iter->second; + + return true; +} + +bool RGWPolicyEnv::get_value(const string& s, string& val, map& checked_vars) +{ + if (s.empty() || s[0] != '$') { + val = s; + return true; + } + + const string& var = s.substr(1); + 
checked_vars[var] = true; + + return get_var(var, val); +} + + +bool RGWPolicyEnv::match_policy_vars(map& policy_vars, string& err_msg) +{ + map::iterator iter; + string ignore_prefix = "x-ignore-"; + for (iter = vars.begin(); iter != vars.end(); ++iter) { + const string& var = iter->first; + if (strncasecmp(ignore_prefix.c_str(), var.c_str(), ignore_prefix.size()) == 0) + continue; + if (policy_vars.count(var) == 0) { + err_msg = "Policy missing condition: "; + err_msg.append(iter->first); + dout(1) << "env var missing in policy: " << iter->first << dendl; + return false; + } + } + return true; +} + +RGWPolicy::~RGWPolicy() +{ + list::iterator citer; + for (citer = conditions.begin(); citer != conditions.end(); ++citer) { + RGWPolicyCondition *cond = *citer; + delete cond; + } +} + +int RGWPolicy::set_expires(const string& e) +{ + struct tm t; + if (!parse_iso8601(e.c_str(), &t)) + return -EINVAL; + + expires = internal_timegm(&t); + + return 0; +} + +int RGWPolicy::add_condition(const string& op, const string& first, const string& second, string& err_msg) +{ + RGWPolicyCondition *cond = NULL; + if (stringcasecmp(op, "eq") == 0) { + cond = new RGWPolicyCondition_StrEqual; + } else if (stringcasecmp(op, "starts-with") == 0) { + cond = new RGWPolicyCondition_StrStartsWith; + } else if (stringcasecmp(op, "content-length-range") == 0) { + off_t min, max; + int r = stringtoll(first, &min); + if (r < 0) { + err_msg = "Bad content-length-range param"; + dout(0) << "bad content-length-range param: " << first << dendl; + return r; + } + + r = stringtoll(second, &max); + if (r < 0) { + err_msg = "Bad content-length-range param"; + dout(0) << "bad content-length-range param: " << second << dendl; + return r; + } + + if (min > min_length) + min_length = min; + + if (max < max_length) + max_length = max; + + return 0; + } + + if (!cond) { + err_msg = "Invalid condition: "; + err_msg.append(op); + dout(0) << "invalid condition: " << op << dendl; + return -EINVAL; + } + + 
cond->set_vals(first, second); + + conditions.push_back(cond); + + return 0; +} + +int RGWPolicy::check(RGWPolicyEnv *env, string& err_msg) +{ + uint64_t now = ceph_clock_now().sec(); + if (expires <= now) { + dout(0) << "NOTICE: policy calculated as expired: " << expiration_str << dendl; + err_msg = "Policy expired"; + return -EACCES; // change to condition about expired policy following S3 + } + + list >::iterator viter; + for (viter = var_checks.begin(); viter != var_checks.end(); ++viter) { + pair& p = *viter; + const string& name = p.first; + const string& check_val = p.second; + string val; + if (!env->get_var(name, val)) { + dout(20) << " policy check failed, variable not found: '" << name << "'" << dendl; + err_msg = "Policy check failed, variable not found: "; + err_msg.append(name); + return -EACCES; + } + + set_var_checked(name); + + dout(20) << "comparing " << name << " [" << val << "], " << check_val << dendl; + if (val.compare(check_val) != 0) { + err_msg = "Policy check failed, variable not met condition: "; + err_msg.append(name); + dout(1) << "policy check failed, val=" << val << " != " << check_val << dendl; + return -EACCES; + } + } + + list::iterator citer; + for (citer = conditions.begin(); citer != conditions.end(); ++citer) { + RGWPolicyCondition *cond = *citer; + if (!cond->check(env, checked_vars, err_msg)) { + return -EACCES; + } + } + + if (!env->match_policy_vars(checked_vars, err_msg)) { + dout(1) << "missing policy condition" << dendl; + return -EACCES; + } + return 0; +} + + +int RGWPolicy::from_json(bufferlist& bl, string& err_msg) +{ + JSONParser parser; + + if (!parser.parse(bl.c_str(), bl.length())) { + err_msg = "Malformed JSON"; + dout(0) << "malformed json" << dendl; + return -EINVAL; + } + + // as no time was included in the request, we hope that the user has included a short timeout + JSONObjIter iter = parser.find_first("expiration"); + if (iter.end()) { + err_msg = "Policy missing expiration"; + dout(0) << "expiration not 
found" << dendl; + return -EINVAL; // change to a "no expiration" error following S3 + } + + JSONObj *obj = *iter; + expiration_str = obj->get_data(); + int r = set_expires(expiration_str); + if (r < 0) { + err_msg = "Failed to parse policy expiration"; + return r; + } + + iter = parser.find_first("conditions"); + if (iter.end()) { + err_msg = "Policy missing conditions"; + dout(0) << "conditions not found" << dendl; + return -EINVAL; // change to a "no conditions" error following S3 + } + + obj = *iter; + + iter = obj->find_first(); + for (; !iter.end(); ++iter) { + JSONObj *child = *iter; + dout(20) << "data=" << child->get_data() << dendl; + dout(20) << "is_object=" << child->is_object() << dendl; + dout(20) << "is_array=" << child->is_array() << dendl; + JSONObjIter citer = child->find_first(); + if (child->is_array()) { + vector v; + int i; + for (i = 0; !citer.end() && i < 3; ++citer, ++i) { + JSONObj *o = *citer; + v.push_back(o->get_data()); + } + if (i != 3 || !citer.end()) { /* we expect exactly 3 arguments here */ + err_msg = "Bad condition array, expecting 3 arguments"; + return -EINVAL; + } + + int r = add_condition(v[0], v[1], v[2], err_msg); + if (r < 0) + return r; + } else if (!citer.end()) { + JSONObj *c = *citer; + dout(20) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl; + + add_simple_check(c->get_name(), c->get_data()); + } else { + return -EINVAL; + } + } + return 0; +} diff --git a/src/rgw/rgw_policy_s3.h b/src/rgw/rgw_policy_s3.h new file mode 100644 index 000000000..2a8a7ab09 --- /dev/null +++ b/src/rgw/rgw_policy_s3.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include +#include +#include + +#include "include/utime.h" + +#include "rgw_string.h" + + +class RGWPolicyEnv { + std::map vars; + +public: + void add_var(const std::string& name, const std::string& value); + bool get_var(const std::string& 
name, std::string& val); + bool get_value(const std::string& s, std::string& val, std::map& checked_vars); + bool match_policy_vars(std::map& policy_vars, std::string& err_msg); +}; + +class RGWPolicyCondition; + + +class RGWPolicy { + uint64_t expires; + std::string expiration_str; + std::list conditions; + std::list > var_checks; + std::map checked_vars; + +public: + off_t min_length; + off_t max_length; + + RGWPolicy() : expires(0), min_length(0), max_length(LLONG_MAX) {} + ~RGWPolicy(); + + int set_expires(const std::string& e); + + void set_var_checked(const std::string& var) { + checked_vars[var] = true; + } + + int add_condition(const std::string& op, const std::string& first, const std::string& second, std::string& err_msg); + void add_simple_check(const std::string& var, const std::string& value) { + var_checks.emplace_back(var, value); + } + + int check(RGWPolicyEnv *env, std::string& err_msg); + int from_json(bufferlist& bl, std::string& err_msg); +}; diff --git a/src/rgw/rgw_polparser.cc b/src/rgw/rgw_polparser.cc new file mode 100644 index 000000000..eca5066b3 --- /dev/null +++ b/src/rgw/rgw_polparser.cc @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include +#include +#include + +#include "include/buffer.h" + +#include "common/ceph_argparse.h" +#include "common/common_init.h" + +#include "global/global_init.h" + +#include "rgw/rgw_iam_policy.h" + +// Returns true on success +bool parse(CephContext* cct, const std::string& tenant, + const std::string& fname, std::istream& in) noexcept +{ + bufferlist bl; + bl.append(in); + try { + auto p = rgw::IAM::Policy( + cct, tenant, bl, + cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + } catch (const rgw::IAM::PolicyParseException& e) { + std::cerr << fname << ": " << e.what() << std::endl; + return false; + } catch (const std::exception& e) { + std::cerr << fname << ": caught 
exception: " << e.what() << std::endl;; + return false; + } + return true; +} + +void helpful_exit(std::string_view cmdname) +{ + std::cerr << cmdname << "-h for usage" << std::endl; + exit(1); +} + +void usage(std::string_view cmdname) +{ + std::cout << "usage: " << cmdname << " -t [filename]" + << std::endl; +} + +int main(int argc, const char** argv) +{ + std::string_view cmdname = argv[0]; + std::string tenant; + + auto args = argv_to_vec(argc, argv); + if (ceph_argparse_need_usage(args)) { + usage(cmdname); + exit(0); + } + + auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DAEMON_ACTIONS | + CINIT_FLAG_NO_MON_CONFIG); + common_init_finish(cct.get()); + std::string val; + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "--tenant", "-t", + (char*)nullptr)) { + tenant = std::move(val); + } else { + ++i; + } + } + + if (tenant.empty()) { + std::cerr << cmdname << ": must specify tenant name" << std::endl; + helpful_exit(cmdname); + } + + bool success = true; + + if (args.empty()) { + success = parse(cct.get(), tenant, "(stdin)", std::cin); + } else { + for (const auto& file : args) { + std::ifstream in; + in.open(file, std::ifstream::in); + if (!in.is_open()) { + std::cerr << "Can't read " << file << std::endl; + success = false; + } + if (!parse(cct.get(), tenant, file, in)) { + success = false; + } + } + } + + return success ? 0 : 1; +} diff --git a/src/rgw/rgw_pool_types.h b/src/rgw/rgw_pool_types.h new file mode 100644 index 000000000..b23e7d005 --- /dev/null +++ b/src/rgw/rgw_pool_types.h @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. Do not + * introduce changes or include files which can only be compiled in + * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h) + */ + +#pragma once + +#include +#include + +#include "include/types.h" +#include "common/Formatter.h" + +class JSONObj; + +struct rgw_pool { + std::string name; + std::string ns; + + rgw_pool() = default; + rgw_pool(const rgw_pool& _p) : name(_p.name), ns(_p.ns) {} + rgw_pool(rgw_pool&&) = default; + rgw_pool(const std::string& _s) { + from_str(_s); + } + rgw_pool(const std::string& _name, const std::string& _ns) : name(_name), ns(_ns) {} + + std::string to_str() const; + void from_str(const std::string& s); + + void init(const std::string& _s) { + from_str(_s); + } + + bool empty() const { + return name.empty(); + } + + int compare(const rgw_pool& p) const { + int r = name.compare(p.name); + if (r != 0) { + return r; + } + return ns.compare(p.ns); + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(10, 10, bl); + encode(name, bl); + encode(ns, bl); + ENCODE_FINISH(bl); + } + + void decode_from_bucket(ceph::buffer::list::const_iterator& bl); + + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(10, 3, 3, bl); + + decode(name, bl); + + if (struct_v < 10) { + + /* + * note that rgw_pool can be used where rgw_bucket was used before + * therefore we inherit rgw_bucket's old versions. However, we only + * need the first field from rgw_bucket. unless we add more fields + * in which case we'll need to look at struct_v, and check the actual + * version. 
Anything older than 10 needs to be treated as old rgw_bucket + */ + + } else { + decode(ns, bl); + } + + DECODE_FINISH(bl); + } + + rgw_pool& operator=(const rgw_pool&) = default; + + bool operator==(const rgw_pool& p) const { + return (compare(p) == 0); + } + bool operator!=(const rgw_pool& p) const { + return !(*this == p); + } + bool operator<(const rgw_pool& p) const { + int r = name.compare(p.name); + if (r == 0) { + return (ns.compare(p.ns) < 0); + } + return (r < 0); + } +}; +WRITE_CLASS_ENCODER(rgw_pool) + +inline std::ostream& operator<<(std::ostream& out, const rgw_pool& p) { + out << p.to_str(); + return out; +} + +struct rgw_data_placement_target { + rgw_pool data_pool; + rgw_pool data_extra_pool; + rgw_pool index_pool; + + rgw_data_placement_target() = default; + rgw_data_placement_target(const rgw_data_placement_target&) = default; + rgw_data_placement_target(rgw_data_placement_target&&) = default; + + rgw_data_placement_target(const rgw_pool& data_pool, + const rgw_pool& data_extra_pool, + const rgw_pool& index_pool) + : data_pool(data_pool), + data_extra_pool(data_extra_pool), + index_pool(index_pool) { + } + + rgw_data_placement_target& + operator=(const rgw_data_placement_target&) = default; + + const rgw_pool& get_data_extra_pool() const { + if (data_extra_pool.empty()) { + return data_pool; + } + return data_extra_pool; + } + + int compare(const rgw_data_placement_target& t) { + int c = data_pool.compare(t.data_pool); + if (c != 0) { + return c; + } + c = data_extra_pool.compare(t.data_extra_pool); + if (c != 0) { + return c; + } + return index_pool.compare(t.index_pool); + }; + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); +}; diff --git a/src/rgw/rgw_process.cc b/src/rgw/rgw_process.cc new file mode 100644 index 000000000..8d20251f8 --- /dev/null +++ b/src/rgw/rgw_process.cc @@ -0,0 +1,472 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include 
"common/errno.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" +#include "include/scope_guard.h" + +#include +#include "rgw_auth_registry.h" +#include "rgw_dmclock_scheduler.h" +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_loadgen.h" +#include "rgw_client_io.h" +#include "rgw_opa.h" +#include "rgw_perf_counters.h" +#include "rgw_lua.h" +#include "rgw_lua_request.h" +#include "rgw_tracer.h" +#include "rgw_ratelimit.h" + +#include "services/svc_zone_utils.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using rgw::dmclock::Scheduler; + +void RGWProcess::RGWWQ::_dump_queue() +{ + if (!g_conf()->subsys.should_gather()) { + return; + } + deque::iterator iter; + if (process->m_req_queue.empty()) { + dout(20) << "RGWWQ: empty" << dendl; + return; + } + dout(20) << "RGWWQ:" << dendl; + for (iter = process->m_req_queue.begin(); + iter != process->m_req_queue.end(); ++iter) { + dout(20) << "req: " << hex << *iter << dec << dendl; + } +} /* RGWProcess::RGWWQ::_dump_queue */ + +auto schedule_request(Scheduler *scheduler, req_state *s, RGWOp *op) +{ + using rgw::dmclock::SchedulerCompleter; + if (!scheduler) + return std::make_pair(0,SchedulerCompleter{}); + + const auto client = op->dmclock_client(); + const auto cost = op->dmclock_cost(); + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 10)) { + ldpp_dout(op,10) << "scheduling with " + << s->cct->_conf.get_val("rgw_scheduler_type") + << " client=" << static_cast(client) + << " cost=" << cost << dendl; + } + return scheduler->schedule_request(client, {}, + req_state::Clock::to_double(s->time), + cost, + s->yield); +} + +bool RGWProcess::RGWWQ::_enqueue(RGWRequest* req) { + process->m_req_queue.push_back(req); + perfcounter->inc(l_rgw_qlen); + dout(20) << "enqueued request req=" << hex << req << dec << dendl; + _dump_queue(); + return true; +} + +RGWRequest* RGWProcess::RGWWQ::_dequeue() { + if 
(process->m_req_queue.empty()) + return NULL; + RGWRequest *req = process->m_req_queue.front(); + process->m_req_queue.pop_front(); + dout(20) << "dequeued request req=" << hex << req << dec << dendl; + _dump_queue(); + perfcounter->inc(l_rgw_qlen, -1); + return req; +} + +void RGWProcess::RGWWQ::_process(RGWRequest *req, ThreadPool::TPHandle &) { + perfcounter->inc(l_rgw_qactive); + process->handle_request(this, req); + process->req_throttle.put(1); + perfcounter->inc(l_rgw_qactive, -1); +} +bool rate_limit(rgw::sal::Driver* driver, req_state* s) { + // we dont want to limit health check or system or admin requests + const auto& is_admin_or_system = s->user->get_info(); + if ((s->op_type == RGW_OP_GET_HEALTH_CHECK) || is_admin_or_system.admin || is_admin_or_system.system) + return false; + std::string userfind; + RGWRateLimitInfo global_user; + RGWRateLimitInfo global_bucket; + RGWRateLimitInfo global_anon; + RGWRateLimitInfo* bucket_ratelimit; + RGWRateLimitInfo* user_ratelimit; + driver->get_ratelimit(global_bucket, global_user, global_anon); + bucket_ratelimit = &global_bucket; + user_ratelimit = &global_user; + s->user->get_id().to_str(userfind); + userfind = "u" + userfind; + s->ratelimit_user_name = userfind; + std::string bucketfind = !rgw::sal::Bucket::empty(s->bucket.get()) ? 
"b" + s->bucket->get_marker() : ""; + s->ratelimit_bucket_marker = bucketfind; + const char *method = s->info.method; + + auto iter = s->user->get_attrs().find(RGW_ATTR_RATELIMIT); + if(iter != s->user->get_attrs().end()) { + try { + RGWRateLimitInfo user_ratelimit_temp; + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(user_ratelimit_temp, biter); + // override global rate limiting only if local rate limiting is enabled + if (user_ratelimit_temp.enabled) + *user_ratelimit = user_ratelimit_temp; + } catch (buffer::error& err) { + ldpp_dout(s, 0) << "ERROR: failed to decode rate limit" << dendl; + return -EIO; + } + } + if (s->user->get_id().id == RGW_USER_ANON_ID && global_anon.enabled) { + *user_ratelimit = global_anon; + } + bool limit_bucket = false; + bool limit_user = s->ratelimit_data->should_rate_limit(method, s->ratelimit_user_name, s->time, user_ratelimit); + + if(!rgw::sal::Bucket::empty(s->bucket.get())) + { + iter = s->bucket->get_attrs().find(RGW_ATTR_RATELIMIT); + if(iter != s->bucket->get_attrs().end()) { + try { + RGWRateLimitInfo bucket_ratelimit_temp; + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(bucket_ratelimit_temp, biter); + // override global rate limiting only if local rate limiting is enabled + if (bucket_ratelimit_temp.enabled) + *bucket_ratelimit = bucket_ratelimit_temp; + } catch (buffer::error& err) { + ldpp_dout(s, 0) << "ERROR: failed to decode rate limit" << dendl; + return -EIO; + } + } + if (!limit_user) { + limit_bucket = s->ratelimit_data->should_rate_limit(method, s->ratelimit_bucket_marker, s->time, bucket_ratelimit); + } + } + if(limit_bucket && !limit_user) { + s->ratelimit_data->giveback_tokens(method, s->ratelimit_user_name); + } + s->user_ratelimit = *user_ratelimit; + s->bucket_ratelimit = *bucket_ratelimit; + return (limit_user || limit_bucket); +} + +int rgw_process_authenticated(RGWHandler_REST * const handler, + RGWOp *& op, + RGWRequest * const req, + req_state * const s, + 
optional_yield y, + rgw::sal::Driver* driver, + const bool skip_retarget) +{ + ldpp_dout(op, 2) << "init permissions" << dendl; + int ret = handler->init_permissions(op, y); + if (ret < 0) { + return ret; + } + + /** + * Only some accesses support website mode, and website mode does NOT apply + * if you are using the REST endpoint either (ergo, no authenticated access) + */ + if (! skip_retarget) { + ldpp_dout(op, 2) << "recalculating target" << dendl; + ret = handler->retarget(op, &op, y); + if (ret < 0) { + return ret; + } + req->op = op; + } else { + ldpp_dout(op, 2) << "retargeting skipped because of SubOp mode" << dendl; + } + + /* If necessary extract object ACL and put them into req_state. */ + ldpp_dout(op, 2) << "reading permissions" << dendl; + ret = handler->read_permissions(op, y); + if (ret < 0) { + return ret; + } + + ldpp_dout(op, 2) << "init op" << dendl; + ret = op->init_processing(y); + if (ret < 0) { + return ret; + } + + ldpp_dout(op, 2) << "verifying op mask" << dendl; + ret = op->verify_op_mask(); + if (ret < 0) { + return ret; + } + + /* Check if OPA is used to authorize requests */ + if (s->cct->_conf->rgw_use_opa_authz) { + ret = rgw_opa_authorize(op, s); + if (ret < 0) { + return ret; + } + } + + ldpp_dout(op, 2) << "verifying op permissions" << dendl; + { + auto span = tracing::rgw::tracer.add_span("verify_permission", s->trace); + std::swap(span, s->trace); + ret = op->verify_permission(y); + std::swap(span, s->trace); + } + if (ret < 0) { + if (s->system_request) { + dout(2) << "overriding permissions due to system operation" << dendl; + } else if (s->auth.identity->is_admin_of(s->user->get_id())) { + dout(2) << "overriding permissions due to admin operation" << dendl; + } else { + return ret; + } + } + + ldpp_dout(op, 2) << "verifying op params" << dendl; + ret = op->verify_params(); + if (ret < 0) { + return ret; + } + + ldpp_dout(op, 2) << "pre-executing" << dendl; + op->pre_exec(); + + ldpp_dout(op, 2) << "check rate limiting" << 
dendl; + if (rate_limit(driver, s)) { + return -ERR_RATE_LIMITED; + } + ldpp_dout(op, 2) << "executing" << dendl; + { + auto span = tracing::rgw::tracer.add_span("execute", s->trace); + std::swap(span, s->trace); + op->execute(y); + std::swap(span, s->trace); + } + + ldpp_dout(op, 2) << "completing" << dendl; + op->complete(); + + return 0; +} + +int process_request(const RGWProcessEnv& penv, + RGWRequest* const req, + const std::string& frontend_prefix, + RGWRestfulIO* const client_io, + optional_yield yield, + rgw::dmclock::Scheduler *scheduler, + string* user, + ceph::coarse_real_clock::duration* latency, + int* http_ret) +{ + int ret = client_io->init(g_ceph_context); + dout(1) << "====== starting new request req=" << hex << req << dec + << " =====" << dendl; + perfcounter->inc(l_rgw_req); + + RGWEnv& rgw_env = client_io->get_env(); + + req_state rstate(g_ceph_context, penv, &rgw_env, req->id); + req_state *s = &rstate; + + s->ratelimit_data = penv.ratelimiting->get_active(); + + rgw::sal::Driver* driver = penv.driver; + std::unique_ptr u = driver->get_user(rgw_user()); + s->set_user(u); + + if (ret < 0) { + s->cio = client_io; + abort_early(s, nullptr, ret, nullptr, yield); + return ret; + } + + s->req_id = driver->zone_unique_id(req->id); + s->trans_id = driver->zone_unique_trans_id(req->id); + s->host_id = driver->get_host_id(); + s->yield = yield; + + ldpp_dout(s, 2) << "initializing for trans_id = " << s->trans_id << dendl; + + RGWOp* op = nullptr; + int init_error = 0; + bool should_log = false; + RGWREST* rest = penv.rest; + RGWRESTMgr *mgr; + RGWHandler_REST *handler = rest->get_handler(driver, s, + *penv.auth_registry, + frontend_prefix, + client_io, &mgr, &init_error); + rgw::dmclock::SchedulerCompleter c; + + if (init_error != 0) { + abort_early(s, nullptr, init_error, nullptr, yield); + goto done; + } + ldpp_dout(s, 10) << "handler=" << typeid(*handler).name() << dendl; + + should_log = mgr->get_logging(); + + ldpp_dout(s, 2) << "getting op " << 
s->op << dendl; + op = handler->get_op(); + if (!op) { + abort_early(s, NULL, -ERR_METHOD_NOT_ALLOWED, handler, yield); + goto done; + } + { + s->trace_enabled = tracing::rgw::tracer.is_enabled(); + std::string script; + auto rc = rgw::lua::read_script(s, penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::preRequest, script); + if (rc == -ENOENT) { + // no script, nothing to do + } else if (rc < 0) { + ldpp_dout(op, 5) << "WARNING: failed to read pre request script. error: " << rc << dendl; + } else { + rc = rgw::lua::request::execute(driver, rest, penv.olog, s, op, script); + if (rc < 0) { + ldpp_dout(op, 5) << "WARNING: failed to execute pre request script. error: " << rc << dendl; + } + } + } + std::tie(ret,c) = schedule_request(scheduler, s, op); + if (ret < 0) { + if (ret == -EAGAIN) { + ret = -ERR_RATE_LIMITED; + } + ldpp_dout(op,0) << "Scheduling request failed with " << ret << dendl; + abort_early(s, op, ret, handler, yield); + goto done; + } + req->op = op; + ldpp_dout(op, 10) << "op=" << typeid(*op).name() << dendl; + s->op_type = op->get_type(); + + try { + ldpp_dout(op, 2) << "verifying requester" << dendl; + ret = op->verify_requester(*penv.auth_registry, yield); + if (ret < 0) { + dout(10) << "failed to authorize request" << dendl; + abort_early(s, op, ret, handler, yield); + goto done; + } + + /* FIXME: remove this after switching all handlers to the new authentication + * infrastructure. 
*/ + if (nullptr == s->auth.identity) { + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } + + ldpp_dout(op, 2) << "normalizing buckets and tenants" << dendl; + ret = handler->postauth_init(yield); + if (ret < 0) { + dout(10) << "failed to run post-auth init" << dendl; + abort_early(s, op, ret, handler, yield); + goto done; + } + + if (s->user->get_info().suspended) { + dout(10) << "user is suspended, uid=" << s->user->get_id() << dendl; + abort_early(s, op, -ERR_USER_SUSPENDED, handler, yield); + goto done; + } + + + const auto trace_name = std::string(op->name()) + " " + s->trans_id; + s->trace = tracing::rgw::tracer.start_trace(trace_name, s->trace_enabled); + s->trace->SetAttribute(tracing::rgw::OP, op->name()); + s->trace->SetAttribute(tracing::rgw::TYPE, tracing::rgw::REQUEST); + + ret = rgw_process_authenticated(handler, op, req, s, yield, driver); + if (ret < 0) { + abort_early(s, op, ret, handler, yield); + goto done; + } + } catch (const ceph::crypto::DigestException& e) { + dout(0) << "authentication failed" << e.what() << dendl; + abort_early(s, op, -ERR_INVALID_SECRET_KEY, handler, yield); + } + +done: + if (op) { + if (s->trace) { + s->trace->SetAttribute(tracing::rgw::RETURN, op->get_ret()); + if (!rgw::sal::User::empty(s->user)) { + s->trace->SetAttribute(tracing::rgw::USER_ID, s->user->get_id().id); + } + if (!rgw::sal::Bucket::empty(s->bucket)) { + s->trace->SetAttribute(tracing::rgw::BUCKET_NAME, s->bucket->get_name()); + } + if (!rgw::sal::Object::empty(s->object)) { + s->trace->SetAttribute(tracing::rgw::OBJECT_NAME, s->object->get_name()); + } + } + std::string script; + auto rc = rgw::lua::read_script(s, penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::postRequest, script); + if (rc == -ENOENT) { + // no script, nothing to do + } else if (rc < 0) { + ldpp_dout(op, 5) << "WARNING: failed to read post request script. 
error: " << rc << dendl; + } else { + rc = rgw::lua::request::execute(driver, rest, penv.olog, s, op, script); + if (rc < 0) { + ldpp_dout(op, 5) << "WARNING: failed to execute post request script. error: " << rc << dendl; + } + } + } + + try { + client_io->complete_request(); + } catch (rgw::io::Exception& e) { + dout(0) << "ERROR: client_io->complete_request() returned " + << e.what() << dendl; + } + if (should_log) { + rgw_log_op(rest, s, op, penv.olog); + } + + if (http_ret != nullptr) { + *http_ret = s->err.http_ret; + } + int op_ret = 0; + + if (user && !rgw::sal::User::empty(s->user.get())) { + *user = s->user->get_id().to_str(); + } + + if (op) { + op_ret = op->get_ret(); + ldpp_dout(op, 2) << "op status=" << op_ret << dendl; + ldpp_dout(op, 2) << "http status=" << s->err.http_ret << dendl; + } else { + ldpp_dout(s, 2) << "http status=" << s->err.http_ret << dendl; + } + if (handler) + handler->put_op(op); + rest->put_handler(handler); + + const auto lat = s->time_elapsed(); + if (latency) { + *latency = lat; + } + dout(1) << "====== req done req=" << hex << req << dec + << " op status=" << op_ret + << " http_status=" << s->err.http_ret + << " latency=" << lat + << " ======" + << dendl; + + return (ret < 0 ? 
ret : s->err.ret); +} /* process_request */ diff --git a/src/rgw/rgw_process.h b/src/rgw/rgw_process.h new file mode 100644 index 000000000..640f07842 --- /dev/null +++ b/src/rgw/rgw_process.h @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" +#include "rgw_acl.h" +#include "rgw_user.h" +#include "rgw_rest.h" +#include "include/ceph_assert.h" + +#include "common/WorkQueue.h" +#include "common/Throttle.h" + +#include + +#define dout_context g_ceph_context + + +namespace rgw::dmclock { + class Scheduler; +} + +struct RGWProcessEnv; +class RGWFrontendConfig; +class RGWRequest; + +class RGWProcess { + std::deque m_req_queue; +protected: + CephContext *cct; + RGWProcessEnv& env; + ThreadPool m_tp; + Throttle req_throttle; + RGWFrontendConfig* conf; + int sock_fd; + std::string uri_prefix; + + struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue { + RGWProcess* process; + RGWWQ(RGWProcess* p, ceph::timespan timeout, ceph::timespan suicide_timeout, + ThreadPool* tp) + : ThreadPool::WorkQueue("RGWWQ", timeout, suicide_timeout, + tp), process(p) {} + + bool _enqueue(RGWRequest* req) override; + + void _dequeue(RGWRequest* req) override { + ceph_abort(); + } + + bool _empty() override { + return process->m_req_queue.empty(); + } + + RGWRequest* _dequeue() override; + + using ThreadPool::WorkQueue::_process; + + void _process(RGWRequest *req, ThreadPool::TPHandle &) override; + + void _dump_queue(); + + void _clear() override { + ceph_assert(process->m_req_queue.empty()); + } + + CephContext *get_cct() const override { return process->cct; } + unsigned get_subsys() const { return ceph_subsys_rgw; } + std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw request work queue: ";} + + } req_wq; + +public: + RGWProcess(CephContext* const cct, + RGWProcessEnv& env, + const int num_threads, + std::string uri_prefix, + 
RGWFrontendConfig* const conf) + : cct(cct), env(env), + m_tp(cct, "RGWProcess::m_tp", "tp_rgw_process", num_threads), + req_throttle(cct, "rgw_ops", num_threads * 2), + conf(conf), + sock_fd(-1), + uri_prefix(std::move(uri_prefix)), + req_wq(this, + ceph::make_timespan(g_conf()->rgw_op_thread_timeout), + ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout), + &m_tp) { + } + + virtual ~RGWProcess() = default; + + const RGWProcessEnv& get_env() const { return env; } + + virtual void run() = 0; + virtual void handle_request(const DoutPrefixProvider *dpp, RGWRequest *req) = 0; + + void pause() { + m_tp.pause(); + } + + void unpause_with_new_config() { + m_tp.unpause(); + } + + void close_fd() { + if (sock_fd >= 0) { + ::close(sock_fd); + sock_fd = -1; + } + } +}; /* RGWProcess */ + +class RGWProcessControlThread : public Thread { + RGWProcess *pprocess; +public: + explicit RGWProcessControlThread(RGWProcess *_pprocess) : pprocess(_pprocess) {} + + void *entry() override { + pprocess->run(); + return NULL; + } +}; + +class RGWLoadGenProcess : public RGWProcess { + RGWAccessKey access_key; +public: + RGWLoadGenProcess(CephContext* cct, RGWProcessEnv& env, int num_threads, + std::string uri_prefix, RGWFrontendConfig* _conf) + : RGWProcess(cct, env, num_threads, std::move(uri_prefix), _conf) {} + void run() override; + void checkpoint(); + void handle_request(const DoutPrefixProvider *dpp, RGWRequest* req) override; + void gen_request(const std::string& method, const std::string& resource, + int content_length, std::atomic* fail_flag); + + void set_access_key(RGWAccessKey& key) { access_key = key; } +}; +/* process stream request */ +extern int process_request(const RGWProcessEnv& penv, + RGWRequest* req, + const std::string& frontend_prefix, + RGWRestfulIO* client_io, + optional_yield y, + rgw::dmclock::Scheduler *scheduler, + std::string* user, + ceph::coarse_real_clock::duration* latency, + int* http_ret = nullptr); + +extern int 
#include <memory>
#include <string>

class ActiveRateLimiter;
class OpsLogSink;
class RGWREST;

namespace rgw::auth {
  class StrategyRegistry;
}
namespace rgw::lua {
  class Background;
}
namespace rgw::sal {
  // RGWProcessEnv::driver below needs this declaration; without it the
  // header only compiles when another header declaring Driver comes first
  class Driver;
  class Store;
  class LuaManager;
}

#ifdef WITH_ARROW_FLIGHT
namespace rgw::flight {
  class FlightServer;
  class FlightStore;
}
#endif

// Lua-related state shared by the request-processing code.
struct RGWLuaProcessEnv {
  std::string luarocks_path;
  rgw::lua::Background* background = nullptr;      // not owned
  std::unique_ptr<rgw::sal::LuaManager> manager;   // owned
};

// Process-wide environment handed to request processing. All raw pointers
// are non-owning; only auth_registry and lua.manager are owned here.
struct RGWProcessEnv {
  RGWLuaProcessEnv lua;
  rgw::sal::Driver* driver = nullptr;
  RGWREST *rest = nullptr;
  OpsLogSink *olog = nullptr;
  std::unique_ptr<rgw::auth::StrategyRegistry> auth_registry;
  ActiveRateLimiter* ratelimiting = nullptr;

#ifdef WITH_ARROW_FLIGHT
  // managed by rgw:flight::FlightFrontend in rgw_flight_frontend.cc
  rgw::flight::FlightServer* flight_server = nullptr;
  rgw::flight::FlightStore* flight_store = nullptr;
#endif
};
RestrictPublicBuckets, obj); +} + +void PublicAccessBlockConfiguration::dump_xml(Formatter *f) const { + Formatter::ObjectSection os(*f, "BlockPublicAccessBlockConfiguration"); + // Note: AWS spec mentions the values to be ALL CAPs, but clients seem to + // require all small letters, and S3 itself doesn't seem to follow the API + // spec here + f->dump_bool("BlockPublicAcls", BlockPublicAcls); + f->dump_bool("IgnorePublicAcls", IgnorePublicAcls); + f->dump_bool("BlockPublicPolicy", BlockPublicPolicy); + f->dump_bool("RestrictPublicBuckets", RestrictPublicBuckets); +} + + +std::ostream& operator<< (std::ostream& os, const PublicAccessBlockConfiguration& access_conf) +{ + os << std::boolalpha + << "BlockPublicAcls: " << access_conf.block_public_acls() << std::endl + << "IgnorePublicAcls: " << access_conf.ignore_public_acls() << std::endl + << "BlockPublicPolicy" << access_conf.block_public_policy() << std::endl + << "RestrictPublicBuckets" << access_conf.restrict_public_buckets() << std::endl; + + return os; +} + diff --git a/src/rgw/rgw_public_access.h b/src/rgw/rgw_public_access.h new file mode 100644 index 000000000..87d2a16a3 --- /dev/null +++ b/src/rgw/rgw_public_access.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once +#include + +class XMLObj; + +class PublicAccessBlockConfiguration { + bool BlockPublicAcls; + bool IgnorePublicAcls; + bool BlockPublicPolicy; + bool RestrictPublicBuckets; + public: + PublicAccessBlockConfiguration(): + BlockPublicAcls(false), IgnorePublicAcls(false), + BlockPublicPolicy(false), RestrictPublicBuckets(false) + {} + + auto block_public_acls() const { + return BlockPublicAcls; + } + auto ignore_public_acls() const { + return IgnorePublicAcls; + } + auto block_public_policy() const { + return BlockPublicPolicy; + } + auto restrict_public_buckets() const { + return RestrictPublicBuckets; + } + + void encode(ceph::bufferlist& bl) const { + ENCODE_START(1,1, bl); + encode(BlockPublicAcls, bl); + encode(IgnorePublicAcls, bl); + encode(BlockPublicPolicy, bl); + encode(RestrictPublicBuckets, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::bufferlist::const_iterator& bl) { + DECODE_START(1,bl); + decode(BlockPublicAcls, bl); + decode(IgnorePublicAcls, bl); + decode(BlockPublicPolicy, bl); + decode(RestrictPublicBuckets, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(PublicAccessBlockConfiguration) +std::ostream& operator<< (std::ostream& os, const PublicAccessBlockConfiguration& access_conf); diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc new file mode 100644 index 000000000..2b0cffd47 --- /dev/null +++ b/src/rgw/rgw_pubsub.cc @@ -0,0 +1,736 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "services/svc_zone.h" +#include "rgw_b64.h" +#include "rgw_sal.h" +#include "rgw_pubsub.h" +#include "rgw_tools.h" +#include "rgw_xml.h" +#include "rgw_arn.h" +#include "rgw_pubsub_push.h" +#include +#include + +#define dout_subsys ceph_subsys_rgw + +void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) { + char buf[64]; + const auto len = snprintf(buf, 
sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str()); + if (len > 0) { + id.assign(buf, static_cast<size_t>(len) < sizeof(buf) ? static_cast<size_t>(len) : sizeof(buf) - 1); // snprintf returns the untruncated length; never read past what buf actually holds + } +} + +void rgw_s3_key_filter::dump(Formatter *f) const { + if (!prefix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_json("Name", "prefix", f); + ::encode_json("Value", prefix_rule, f); + f->close_section(); + } + if (!suffix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_json("Name", "suffix", f); + ::encode_json("Value", suffix_rule, f); + f->close_section(); + } + if (!regex_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_json("Name", "regex", f); + ::encode_json("Value", regex_rule, f); + f->close_section(); + } +} + +bool rgw_s3_key_filter::decode_xml(XMLObj* obj) { + XMLObjIter iter = obj->find("FilterRule"); + XMLObj *o; + + const auto throw_if_missing = true; + auto prefix_not_set = true; + auto suffix_not_set = true; + auto regex_not_set = true; + std::string name; + + while ((o = iter.get_next())) { + RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing); + if (name == "prefix" && prefix_not_set) { + prefix_not_set = false; + RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing); + } else if (name == "suffix" && suffix_not_set) { + suffix_not_set = false; + RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing); + } else if (name == "regex" && regex_not_set) { + regex_not_set = false; + RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing); + } else { + throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'"); + } + } + return true; +} + +void rgw_s3_key_filter::dump_xml(Formatter *f) const { + if (!prefix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "prefix", f); + ::encode_xml("Value", prefix_rule, f); + f->close_section(); + } + if (!suffix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "suffix", f); + ::encode_xml("Value", 
suffix_rule, f); + f->close_section(); + } + if (!regex_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "regex", f); + ::encode_xml("Value", regex_rule, f); + f->close_section(); + } +} + +bool rgw_s3_key_filter::has_content() const { + return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty()); +} + +void rgw_s3_key_value_filter::dump(Formatter *f) const { + for (const auto& key_value : kv) { + f->open_object_section("FilterRule"); + ::encode_json("Name", key_value.first, f); + ::encode_json("Value", key_value.second, f); + f->close_section(); + } +} + +bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) { + kv.clear(); + XMLObjIter iter = obj->find("FilterRule"); + XMLObj *o; + + const auto throw_if_missing = true; + + std::string key; + std::string value; + + while ((o = iter.get_next())) { + RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing); + RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing); + kv.emplace(key, value); + } + return true; +} + +void rgw_s3_key_value_filter::dump_xml(Formatter *f) const { + for (const auto& key_value : kv) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", key_value.first, f); + ::encode_xml("Value", key_value.second, f); + f->close_section(); + } +} + +bool rgw_s3_key_value_filter::has_content() const { + return !kv.empty(); +} + +void rgw_s3_filter::dump(Formatter *f) const { + encode_json("S3Key", key_filter, f); + encode_json("S3Metadata", metadata_filter, f); + encode_json("S3Tags", tag_filter, f); +} + +bool rgw_s3_filter::decode_xml(XMLObj* obj) { + RGWXMLDecoder::decode_xml("S3Key", key_filter, obj); + RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj); + RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj); + return true; +} + +void rgw_s3_filter::dump_xml(Formatter *f) const { + if (key_filter.has_content()) { + ::encode_xml("S3Key", key_filter, f); + } + if (metadata_filter.has_content()) { + ::encode_xml("S3Metadata", 
metadata_filter, f); + } + if (tag_filter.has_content()) { + ::encode_xml("S3Tags", tag_filter, f); + } +} + +bool rgw_s3_filter::has_content() const { + return key_filter.has_content() || + metadata_filter.has_content() || + tag_filter.has_content(); +} + +bool match(const rgw_s3_key_filter& filter, const std::string& key) { + const auto key_size = key.size(); + const auto prefix_size = filter.prefix_rule.size(); + if (prefix_size != 0) { + // prefix rule exists + if (prefix_size > key_size) { + // if prefix is longer than key, we fail + return false; + } + if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) { + return false; + } + } + const auto suffix_size = filter.suffix_rule.size(); + if (suffix_size != 0) { + // suffix rule exists + if (suffix_size > key_size) { + // if suffix is longer than key, we fail + return false; + } + if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) { + return false; + } + } + if (!filter.regex_rule.empty()) { + // TODO add regex chaching in the filter + const std::regex base_regex(filter.regex_rule); + if (!std::regex_match(key, base_regex)) { + return false; + } + } + return true; +} + +bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) { + // all filter pairs must exist with the same value in the object's metadata/tags + // object metadata/tags may include items not in the filter + return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end()); +} + +bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) { + // all filter pairs must exist with the same value in the object's metadata/tags + // object metadata/tags may include items not in the filter + for (auto& filter : filter.kv) { + auto result = kv.equal_range(filter.first); + if (std::any_of(result.first, result.second, [&filter](const std::pair& p) { return p.second == filter.second;})) + continue; + else + return false; + } + return 
true; +} + +bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) { + // if event list exists, and none of the events in the list matches the event type, filter the message + if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) { + return false; + } + return true; +} + +void do_decode_xml_obj(rgw::notify::EventTypeList& l, const std::string& name, XMLObj *obj) { + l.clear(); + + XMLObjIter iter = obj->find(name); + XMLObj *o; + + while ((o = iter.get_next())) { + std::string val; + decode_xml_obj(val, o); + l.push_back(rgw::notify::from_string(val)); + } +} + +bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) { + const auto throw_if_missing = true; + RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing); + + RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing); + + RGWXMLDecoder::decode_xml("Filter", filter, obj); + + do_decode_xml_obj(events, "Event", obj); + if (events.empty()) { + // if no events are provided, we assume all events + events.push_back(rgw::notify::ObjectCreated); + events.push_back(rgw::notify::ObjectRemoved); + } + return true; +} + +void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const { + ::encode_xml("Id", id, f); + ::encode_xml("Topic", topic_arn.c_str(), f); + if (filter.has_content()) { + ::encode_xml("Filter", filter, f); + } + for (const auto& event : events) { + ::encode_xml("Event", rgw::notify::to_string(event), f); + } +} + +bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) { + do_decode_xml_obj(list, "TopicConfiguration", obj); + return true; +} + +rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) : + id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {} + +void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const { + do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f); +} + +void 
rgw_pubsub_s3_event::dump(Formatter *f) const { + encode_json("eventVersion", eventVersion, f); + encode_json("eventSource", eventSource, f); + encode_json("awsRegion", awsRegion, f); + utime_t ut(eventTime); + encode_json("eventTime", ut, f); + encode_json("eventName", eventName, f); + { + Formatter::ObjectSection s(*f, "userIdentity"); + encode_json("principalId", userIdentity, f); + } + { + Formatter::ObjectSection s(*f, "requestParameters"); + encode_json("sourceIPAddress", sourceIPAddress, f); + } + { + Formatter::ObjectSection s(*f, "responseElements"); + encode_json("x-amz-request-id", x_amz_request_id, f); + encode_json("x-amz-id-2", x_amz_id_2, f); + } + { + Formatter::ObjectSection s(*f, "s3"); + encode_json("s3SchemaVersion", s3SchemaVersion, f); + encode_json("configurationId", configurationId, f); + { + Formatter::ObjectSection sub_s(*f, "bucket"); + encode_json("name", bucket_name, f); + { + Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity"); + encode_json("principalId", bucket_ownerIdentity, f); + } + encode_json("arn", bucket_arn, f); + encode_json("id", bucket_id, f); + } + { + Formatter::ObjectSection sub_s(*f, "object"); + encode_json("key", object_key, f); + encode_json("size", object_size, f); + encode_json("eTag", object_etag, f); + encode_json("versionId", object_versionId, f); + encode_json("sequencer", object_sequencer, f); + encode_json("metadata", x_meta_map, f); + encode_json("tags", tags, f); + } + } + encode_json("eventId", id, f); + encode_json("opaqueData", opaque_data, f); +} + +void rgw_pubsub_topic::dump(Formatter *f) const +{ + encode_json("user", user, f); + encode_json("name", name, f); + encode_json("dest", dest, f); + encode_json("arn", arn, f); + encode_json("opaqueData", opaque_data, f); +} + +void rgw_pubsub_topic::dump_xml(Formatter *f) const +{ + encode_xml("User", user, f); + encode_xml("Name", name, f); + encode_xml("EndPoint", dest, f); + encode_xml("TopicArn", arn, f); + encode_xml("OpaqueData", opaque_data, f); 
+} + +void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) { + f->open_object_section("entry"); + encode_xml("key", key, f); + encode_xml("value", value, f); + f->close_section(); // entry +} + +void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const +{ + f->open_array_section("Attributes"); + std::string str_user; + user.to_str(str_user); + encode_xml_key_value_entry("User", str_user, f); + encode_xml_key_value_entry("Name", name, f); + encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f); + encode_xml_key_value_entry("TopicArn", arn, f); + encode_xml_key_value_entry("OpaqueData", opaque_data, f); + f->close_section(); // Attributes +} + +void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f) +{ + f->open_array_section(name); + for (auto iter = l.cbegin(); iter != l.cend(); ++iter) { + f->dump_string("obj", rgw::notify::to_string(*iter)); + } + f->close_section(); +} + +void rgw_pubsub_topic_filter::dump(Formatter *f) const +{ + encode_json("TopicArn", topic.arn, f); + encode_json("Id", s3_id, f); + encode_json("Events", events, f); + encode_json("Filter", s3_filter, f); +} + +void rgw_pubsub_bucket_topics::dump(Formatter *f) const +{ + Formatter::ArraySection s(*f, "notifications"); + for (auto& t : topics) { + encode_json(t.first.c_str(), t.second, f); + } +} + +void rgw_pubsub_topics::dump(Formatter *f) const +{ + Formatter::ArraySection s(*f, "topics"); + for (auto& t : topics) { + auto& topic = t.second; + if (topic.name == topic.dest.arn_topic) { + encode_json(t.first.c_str(), topic, f); + } + } +} + +void rgw_pubsub_topics::dump_xml(Formatter *f) const +{ + for (auto& t : topics) { + encode_xml("member", t.second, f); + } +} + +void rgw_pubsub_dest::dump(Formatter *f) const +{ + encode_json("push_endpoint", push_endpoint, f); + encode_json("push_endpoint_args", push_endpoint_args, f); + encode_json("push_endpoint_topic", arn_topic, f); + encode_json("stored_secret", 
stored_secret, f); + encode_json("persistent", persistent, f); +} + +void rgw_pubsub_dest::dump_xml(Formatter *f) const +{ + encode_xml("EndpointAddress", push_endpoint, f); + encode_xml("EndpointArgs", push_endpoint_args, f); + encode_xml("EndpointTopic", arn_topic, f); + encode_xml("HasStoredSecret", stored_secret, f); + encode_xml("Persistent", persistent, f); +} + +std::string rgw_pubsub_dest::to_json_str() const +{ + JSONFormatter f; + f.open_object_section(""); + encode_json("EndpointAddress", push_endpoint, &f); + encode_json("EndpointArgs", push_endpoint_args, &f); + encode_json("EndpointTopic", arn_topic, &f); + encode_json("HasStoredSecret", stored_secret, &f); + encode_json("Persistent", persistent, &f); + f.close_section(); + std::stringstream ss; + f.flush(ss); + return ss.str(); +} + +RGWPubSub::RGWPubSub(rgw::sal::Driver* _driver, const std::string& _tenant) + : driver(_driver), tenant(_tenant) +{} + +int RGWPubSub::read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_topics& result, + RGWObjVersionTracker *objv_tracker, optional_yield y) const +{ + const int ret = driver->read_topics(tenant, result, objv_tracker, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 10) << "WARNING: failed to read topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics, + RGWObjVersionTracker *objv_tracker, optional_yield y) const +{ + const int ret = driver->write_topics(tenant, topics, objv_tracker, y, dpp); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWPubSub::Bucket::read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_bucket_topics& result, + RGWObjVersionTracker *objv_tracker, optional_yield y) const +{ + const int ret = bucket->read_topics(result, objv_tracker, y, dpp); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to read 
bucket topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics, + RGWObjVersionTracker *objv_tracker, + optional_yield y) const +{ + const int ret = bucket->write_topics(topics, objv_tracker, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::get_topic(const DoutPrefixProvider *dpp, const std::string& name, rgw_pubsub_topic& result, optional_yield y) const +{ + rgw_pubsub_topics topics; + const int ret = read_topics(dpp, topics, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + auto iter = topics.topics.find(name); + if (iter == topics.topics.end()) { + ldpp_dout(dpp, 1) << "ERROR: topic not found" << dendl; + return -ENOENT; + } + + result = iter->second; + return 0; +} + +// from list of bucket topics, find the one that was auto-generated by a notification +auto find_unique_topic(const rgw_pubsub_bucket_topics &bucket_topics, const std::string ¬ification_id) { + auto it = std::find_if(bucket_topics.topics.begin(), bucket_topics.topics.end(), + [&](const auto& val) { return notification_id == val.second.s3_id; }); + return it != bucket_topics.topics.end() ? 
+ std::optional>(it->second): + std::nullopt; +} + +int RGWPubSub::Bucket::get_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notification_id, + rgw_pubsub_topic_filter& result, optional_yield y) const { + rgw_pubsub_bucket_topics bucket_topics; + const int ret = read_topics(dpp, bucket_topics, nullptr, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read bucket_topics info: ret=" << ret << dendl; + return ret; + } + + auto iter = find_unique_topic(bucket_topics, notification_id); + if (!iter) { + ldpp_dout(dpp, 1) << "ERROR: notification was not found" << dendl; + return -ENOENT; + } + + result = iter->get(); + return 0; +} + + +int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, + const rgw::notify::EventTypeList& events, optional_yield y) const { + return create_notification(dpp, topic_name, events, std::nullopt, "", y); +} + +int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, + const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) const { + rgw_pubsub_topic topic_info; + + int ret = ps.get_topic(dpp, topic_name, topic_info, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl; + + RGWObjVersionTracker objv_tracker; + rgw_pubsub_bucket_topics bucket_topics; + + ret = read_topics(dpp, bucket_topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" << + bucket->get_name() << "': ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" << + bucket->get_name() << "'" << dendl; + + auto& topic_filter = bucket_topics.topics[topic_name]; + 
topic_filter.topic = topic_info; + topic_filter.events = events; + topic_filter.s3_id = notif_name; + if (s3_filter) { + topic_filter.s3_filter = *s3_filter; + } + + ret = write_topics(dpp, bucket_topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket->get_name() << "': ret=" << ret << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket->get_name() << "'" << dendl; + + return 0; +} + +int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y) const +{ + return remove_notification_inner(dpp, topic_name, false, y); +} + +int RGWPubSub::Bucket::remove_notification_inner(const DoutPrefixProvider *dpp, const std::string& notification_id, + bool is_notification_id, optional_yield y) const +{ + RGWObjVersionTracker objv_tracker; + rgw_pubsub_bucket_topics bucket_topics; + + auto ret = read_topics(dpp, bucket_topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; + return ret; + } + + + std::unique_ptr topic_name = std::make_unique(notification_id); + if(is_notification_id) { + auto iter = find_unique_topic(bucket_topics, notification_id); + if (!iter) { + ldpp_dout(dpp, 1) << "ERROR: notification was not found" << dendl; + return -ENOENT; + } + topic_name = std::make_unique(iter->get().topic.name); + } + + if (bucket_topics.topics.erase(*topic_name) == 0) { + ldpp_dout(dpp, 1) << "INFO: no need to remove, topic does not exist" << dendl; + return 0; + } + + if (bucket_topics.topics.empty()) { + // no more topics - delete the notification object of the bucket + ret = bucket->remove_topics(&objv_tracker, y, dpp); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl; + return ret; + } + return 0; + } + + // write back 
the notifications without the deleted one + ret = write_topics(dpp, bucket_topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::Bucket::remove_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notif_id, optional_yield y) const +{ + return remove_notification_inner(dpp, notif_id, true, y); +} + +int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y) const +{ + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + auto ret = get_topics(dpp, bucket_topics, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket->get_name() << "', ret=" << ret << dendl; + return ret ; + } + + // remove all auto-genrated topics + for (const auto& topic : bucket_topics.topics) { + const auto& topic_name = topic.first; + ret = ps.remove_topic(dpp, topic_name, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl; + } + } + + // delete the notification object of the bucket + ret = bucket->remove_topics(nullptr, y, dpp); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const { + return create_topic(dpp, name, rgw_pubsub_dest{}, "", "", y); +} + +int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_dest& dest, + const std::string& arn, const std::string& opaque_data, optional_yield y) const { + RGWObjVersionTracker objv_tracker; + rgw_pubsub_topics topics; + + int ret = read_topics(dpp, topics, &objv_tracker, y); + if (ret < 0 && ret != -ENOENT) { + // its not an error 
if not topics exist, we create one + ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + rgw_pubsub_topic& new_topic = topics.topics[name]; + new_topic.user = rgw_user("", tenant); + new_topic.name = name; + new_topic.dest = dest; + new_topic.arn = arn; + new_topic.opaque_data = opaque_data; + + ret = write_topics(dpp, topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const +{ + RGWObjVersionTracker objv_tracker; + rgw_pubsub_topics topics; + + int ret = read_topics(dpp, topics, &objv_tracker, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } else if (ret == -ENOENT) { + // its not an error if no topics exist, just a no-op + ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl; + return 0; + } + + topics.topics.erase(name); + + ret = write_topics(dpp, topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h new file mode 100644 index 000000000..290c52c2b --- /dev/null +++ b/src/rgw/rgw_pubsub.h @@ -0,0 +1,629 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_sal.h" +#include "rgw_tools.h" +#include "rgw_zone.h" +#include "rgw_notify_event_type.h" +#include + +class XMLObj; + +struct rgw_s3_key_filter { + std::string prefix_rule; + std::string suffix_rule; + std::string regex_rule; + + bool has_content() const; + + void dump(Formatter *f) const; + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + 
+ void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(prefix_rule, bl); + encode(suffix_rule, bl); + encode(regex_rule, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(prefix_rule, bl); + decode(suffix_rule, bl); + decode(regex_rule, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_key_filter) + +using KeyValueMap = boost::container::flat_map; +using KeyMultiValueMap = std::multimap; + +struct rgw_s3_key_value_filter { + KeyValueMap kv; + + bool has_content() const; + + void dump(Formatter *f) const; + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(kv, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(kv, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_key_value_filter) + +struct rgw_s3_filter { + rgw_s3_key_filter key_filter; + rgw_s3_key_value_filter metadata_filter; + rgw_s3_key_value_filter tag_filter; + + bool has_content() const; + + void dump(Formatter *f) const; + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(key_filter, bl); + encode(metadata_filter, bl); + encode(tag_filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(key_filter, bl); + decode(metadata_filter, bl); + if (struct_v >= 2) { + decode(tag_filter, bl); + } + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_filter) + +using OptionalFilter = std::optional; + +struct rgw_pubsub_topic_filter; +/* S3 notification configuration + * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html + + + + + + suffix + jpg + + + + + + + + + + + + + + + + notification1 + arn:aws:sns::: + s3:ObjectCreated:* + s3:ObjectRemoved:* + + +*/ +struct 
rgw_pubsub_s3_notification { + // notification id + std::string id; + // types of events + rgw::notify::EventTypeList events; + // topic ARN + std::string topic_arn; + // filter rules + rgw_s3_filter filter; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + rgw_pubsub_s3_notification() = default; + // construct from rgw_pubsub_topic_filter (used by get/list notifications) + explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter); +}; + +// return true if the key matches the prefix/suffix/regex rules of the key filter +bool match(const rgw_s3_key_filter& filter, const std::string& key); + +// return true if the key matches the metadata rules of the metadata filter +bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv); + +// return true if the key matches the tag rules of the tag filter +bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv); + +// return true if the event type matches (equal or contained in) one of the events in the list +bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event); + +struct rgw_pubsub_s3_notifications { + std::list list; + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +/* S3 event records structure + * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html +{ +"Records":[ + { + "eventVersion":"" + "eventSource":"", + "awsRegion":"", + "eventTime":"", + "eventName":"", + "userIdentity":{ + "principalId":"" + }, + "requestParameters":{ + "sourceIPAddress":"" + }, + "responseElements":{ + "x-amz-request-id":"", + "x-amz-id-2":"" + }, + "s3":{ + "s3SchemaVersion":"1.0", + "configurationId":"", + "bucket":{ + "name":"", + "ownerIdentity":{ + "principalId":"" + }, + "arn":"" + "id": "" + }, + "object":{ + "key":"", + "size": , + "eTag":"", + "versionId":"", + "sequencer": "", + "metadata": "" + "tags": "" + } + }, + "eventId":"", + } +] +}*/ + +struct 
rgw_pubsub_s3_event { + constexpr static const char* const json_type_plural = "Records"; + std::string eventVersion = "2.2"; + // aws:s3 + std::string eventSource = "ceph:s3"; + // zonegroup + std::string awsRegion; + // time of the request + ceph::real_time eventTime; + // type of the event + std::string eventName; + // user that sent the request + std::string userIdentity; + // IP address of source of the request (not implemented) + std::string sourceIPAddress; + // request ID (not implemented) + std::string x_amz_request_id; + // radosgw that received the request + std::string x_amz_id_2; + std::string s3SchemaVersion = "1.0"; + // ID received in the notification request + std::string configurationId; + // bucket name + std::string bucket_name; + // bucket owner + std::string bucket_ownerIdentity; + // bucket ARN + std::string bucket_arn; + // object key + std::string object_key; + // object size + uint64_t object_size = 0; + // object etag + std::string object_etag; + // object version id bucket is versioned + std::string object_versionId; + // hexadecimal value used to determine event order for specific key + std::string object_sequencer; + // this is an rgw extension (not S3 standard) + // used to store a globally unique identifier of the event + // that could be used for acking or any other identification of the event + std::string id; + // this is an rgw extension holding the internal bucket id + std::string bucket_id; + // meta data + KeyValueMap x_meta_map; + // tags + KeyMultiValueMap tags; + // opaque data received from the topic + // could be used to identify the gateway + std::string opaque_data; + + void encode(bufferlist& bl) const { + ENCODE_START(4, 1, bl); + encode(eventVersion, bl); + encode(eventSource, bl); + encode(awsRegion, bl); + encode(eventTime, bl); + encode(eventName, bl); + encode(userIdentity, bl); + encode(sourceIPAddress, bl); + encode(x_amz_request_id, bl); + encode(x_amz_id_2, bl); + encode(s3SchemaVersion, bl); + 
encode(configurationId, bl); + encode(bucket_name, bl); + encode(bucket_ownerIdentity, bl); + encode(bucket_arn, bl); + encode(object_key, bl); + encode(object_size, bl); + encode(object_etag, bl); + encode(object_versionId, bl); + encode(object_sequencer, bl); + encode(id, bl); + encode(bucket_id, bl); + encode(x_meta_map, bl); + encode(tags, bl); + encode(opaque_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(4, bl); + decode(eventVersion, bl); + decode(eventSource, bl); + decode(awsRegion, bl); + decode(eventTime, bl); + decode(eventName, bl); + decode(userIdentity, bl); + decode(sourceIPAddress, bl); + decode(x_amz_request_id, bl); + decode(x_amz_id_2, bl); + decode(s3SchemaVersion, bl); + decode(configurationId, bl); + decode(bucket_name, bl); + decode(bucket_ownerIdentity, bl); + decode(bucket_arn, bl); + decode(object_key, bl); + decode(object_size, bl); + decode(object_etag, bl); + decode(object_versionId, bl); + decode(object_sequencer, bl); + decode(id, bl); + if (struct_v >= 2) { + decode(bucket_id, bl); + decode(x_meta_map, bl); + } + if (struct_v >= 3) { + decode(tags, bl); + } + if (struct_v >= 4) { + decode(opaque_data, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_s3_event) + +// setting a unique ID for an event based on object hash and timestamp +void set_event_id(std::string& id, const std::string& hash, const utime_t& ts); + +struct rgw_pubsub_dest { + std::string push_endpoint; + std::string push_endpoint_args; + std::string arn_topic; + bool stored_secret = false; + bool persistent = false; + + void encode(bufferlist& bl) const { + ENCODE_START(5, 1, bl); + encode("", bl); + encode("", bl); + encode(push_endpoint, bl); + encode(push_endpoint_args, bl); + encode(arn_topic, bl); + encode(stored_secret, bl); + encode(persistent, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(5, bl); + 
std::string dummy; + decode(dummy, bl); + decode(dummy, bl); + decode(push_endpoint, bl); + if (struct_v >= 2) { + decode(push_endpoint_args, bl); + } + if (struct_v >= 3) { + decode(arn_topic, bl); + } + if (struct_v >= 4) { + decode(stored_secret, bl); + } + if (struct_v >= 5) { + decode(persistent, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + std::string to_json_str() const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_dest) + +struct rgw_pubsub_topic { + rgw_user user; + std::string name; + rgw_pubsub_dest dest; + std::string arn; + std::string opaque_data; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(user, bl); + encode(name, bl); + encode(dest, bl); + encode(arn, bl); + encode(opaque_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(user, bl); + decode(name, bl); + if (struct_v >= 2) { + decode(dest, bl); + decode(arn, bl); + } + if (struct_v >= 3) { + decode(opaque_data, bl); + } + DECODE_FINISH(bl); + } + + std::string to_str() const { + return user.tenant + "/" + name; + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void dump_xml_as_attributes(Formatter *f) const; + + bool operator<(const rgw_pubsub_topic& t) const { + return to_str().compare(t.to_str()) < 0; // compare() is three-way (negative/zero/positive); only a negative result means "less than" + } +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic) + +// this struct deprecated and remain only for backward compatibility +struct rgw_pubsub_topic_subs { + rgw_pubsub_topic topic; + std::set subs; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topic, bl); + encode(subs, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topic, bl); + decode(subs, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs) + +struct rgw_pubsub_topic_filter { + rgw_pubsub_topic topic; + 
rgw::notify::EventTypeList events; + std::string s3_id; + rgw_s3_filter s3_filter; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(topic, bl); + // events are stored as a vector of std::strings + std::vector tmp_events; + std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string); + encode(tmp_events, bl); + encode(s3_id, bl); + encode(s3_filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(topic, bl); + // events are stored as a vector of std::strings + events.clear(); + std::vector tmp_events; + decode(tmp_events, bl); + std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string); + if (struct_v >= 2) { + decode(s3_id, bl); + } + if (struct_v >= 3) { + decode(s3_filter, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter) + +struct rgw_pubsub_bucket_topics { + std::map topics; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topics, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topics, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics) + +struct rgw_pubsub_topics { + std::map topics; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(topics, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + if (struct_v >= 2) { + decode(topics, bl); + } else { + std::map v1topics; + decode(v1topics, bl); + std::transform(v1topics.begin(), v1topics.end(), std::inserter(topics, topics.end()), + [](const auto& entry) { + return std::pair(entry.first, entry.second.topic); + }); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; +}; 
+WRITE_CLASS_ENCODER(rgw_pubsub_topics) + +class RGWPubSub +{ + friend class Bucket; + + rgw::sal::Driver* const driver; + const std::string tenant; + + int read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_topics& result, + RGWObjVersionTracker* objv_tracker, optional_yield y) const; + int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics, + RGWObjVersionTracker* objv_tracker, optional_yield y) const; + +public: + RGWPubSub(rgw::sal::Driver* _driver, const std::string& tenant); + + class Bucket { + friend class RGWPubSub; + const RGWPubSub& ps; + rgw::sal::Bucket* const bucket; + + // read the list of topics associated with a bucket and populate into result + // use version tracker to enforce atomicity between read/write + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_bucket_topics& result, + RGWObjVersionTracker* objv_tracker, optional_yield y) const; + // set the list of topics associated with a bucket + // use version tracker to enforce atomicity between read/write + // return 0 on success, error code otherwise + int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics, + RGWObjVersionTracker* objv_tracker, optional_yield y) const; + int remove_notification_inner(const DoutPrefixProvider *dpp, const std::string& notification_id, + bool notif_id_or_topic, optional_yield y) const; + public: + Bucket(const RGWPubSub& _ps, rgw::sal::Bucket* _bucket) : + ps(_ps), bucket(_bucket) + {} + + // get the list of topics associated with a bucket and populate into result + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int get_topics(const DoutPrefixProvider *dpp, rgw_pubsub_bucket_topics& result, optional_yield y) const { + return read_topics(dpp, result, nullptr, y); + } + // get a bucket_topic by its notification id and populate it into "result" + // return -ENOENT if the 
topic does not exist + // return 0 on success, error code otherwise + int get_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notification_id, rgw_pubsub_topic_filter& result, optional_yield y) const; + // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket + // assigning a notification name is optional (needed for S3 compatible notifications) + // if the topic already exists on the bucket, the filter event list may be updated + // for S3 compliant notifications the version with: s3_filter and notif_name should be used + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, + const rgw::notify::EventTypeList& events, optional_yield y) const; + int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, + const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) const; + // remove a topic and filter from bucket + // if the topic does not exist on the bucket it is a no-op (considered success) + // return -ENOENT if the notification-id/topic does not exist + // return 0 on success, error code otherwise + int remove_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notif_id, optional_yield y) const; + int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y) const; + // remove all notifications (and autogenerated topics) associated with the bucket + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y) const; + }; + + // get the list of topics + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int get_topics(const DoutPrefixProvider *dpp, rgw_pubsub_topics& result, optional_yield 
y) const { + return read_topics(dpp, result, nullptr, y); + } + // get a topic by its name and populate it into "result" + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int get_topic(const DoutPrefixProvider *dpp, const std::string& name, rgw_pubsub_topic& result, optional_yield y) const; + // create a topic with a name only + // if the topic already exists it is a no-op (considered success) + // return 0 on success, error code otherwise + int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const; + // create a topic with push destination information and ARN + // if the topic already exists the destination and ARN values may be updated (considered success) + // return 0 on success, error code otherwise + int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_dest& dest, + const std::string& arn, const std::string& opaque_data, optional_yield y) const; + // remove a topic according to its name + // if the topic does not exist it is a no-op (considered success) + // return 0 on success, error code otherwise + int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const; +}; + diff --git a/src/rgw/rgw_putobj.cc b/src/rgw/rgw_putobj.cc new file mode 100644 index 000000000..24a4b3275 --- /dev/null +++ b/src/rgw/rgw_putobj.cc @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "rgw_putobj.h" + +namespace rgw::putobj { + +int ChunkProcessor::process(bufferlist&& data, uint64_t offset) +{ + ceph_assert(offset >= chunk.length()); + uint64_t position = offset - chunk.length(); + + const bool flush = (data.length() == 0); + if (flush) { + if (chunk.length() > 0) { + int r = Pipe::process(std::move(chunk), position); + if (r < 0) { + return r; + } + } + return Pipe::process({}, offset); + } + chunk.claim_append(data); + + // write each full chunk + while (chunk.length() >= chunk_size) { + bufferlist bl; + chunk.splice(0, chunk_size, &bl); + + int r = Pipe::process(std::move(bl), position); + if (r < 0) { + return r; + } + position += chunk_size; + } + return 0; +} + + +int StripeProcessor::process(bufferlist&& data, uint64_t offset) +{ + ceph_assert(offset >= bounds.first); + + const bool flush = (data.length() == 0); + if (flush) { + return Pipe::process({}, offset - bounds.first); + } + + auto max = bounds.second - offset; + while (data.length() > max) { + if (max > 0) { + bufferlist bl; + data.splice(0, max, &bl); + + int r = Pipe::process(std::move(bl), offset - bounds.first); + if (r < 0) { + return r; + } + offset += max; + } + + // flush the current chunk + int r = Pipe::process({}, offset - bounds.first); + if (r < 0) { + return r; + } + // generate the next stripe + uint64_t stripe_size; + r = gen->next(offset, &stripe_size); + if (r < 0) { + return r; + } + ceph_assert(stripe_size > 0); + + bounds.first = offset; + bounds.second = offset + stripe_size; + + max = stripe_size; + } + + if (data.length() == 0) { // don't flush the chunk here + return 0; + } + return Pipe::process(std::move(data), offset - bounds.first); +} + +} // namespace rgw::putobj diff --git a/src/rgw/rgw_putobj.h b/src/rgw/rgw_putobj.h new file mode 100644 index 000000000..6740e88ce --- /dev/null +++ b/src/rgw/rgw_putobj.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp 
+ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "include/buffer.h" +#include "rgw_sal.h" + +namespace rgw::putobj { + +// for composing data processors into a pipeline +class Pipe : public rgw::sal::DataProcessor { + rgw::sal::DataProcessor *next; + public: + explicit Pipe(rgw::sal::DataProcessor *next) : next(next) {} + + virtual ~Pipe() override {} + + // passes the data on to the next processor + int process(bufferlist&& data, uint64_t offset) override { + return next->process(std::move(data), offset); + } +}; + +// pipe that writes to the next processor in discrete chunks +class ChunkProcessor : public Pipe { + uint64_t chunk_size; + bufferlist chunk; // leftover bytes from the last call to process() + public: + ChunkProcessor(rgw::sal::DataProcessor *next, uint64_t chunk_size) + : Pipe(next), chunk_size(chunk_size) + {} + virtual ~ChunkProcessor() override {} + + int process(bufferlist&& data, uint64_t offset) override; +}; + + +// interface to generate the next stripe description +class StripeGenerator { + public: + virtual ~StripeGenerator() {} + + virtual int next(uint64_t offset, uint64_t *stripe_size) = 0; +}; + +// pipe that respects stripe boundaries and restarts each stripe at offset 0 +class StripeProcessor : public Pipe { + StripeGenerator *gen; + std::pair bounds; // bounds of current stripe + public: + StripeProcessor(rgw::sal::DataProcessor *next, StripeGenerator *gen, + uint64_t first_stripe_size) + : Pipe(next), gen(gen), bounds(0, first_stripe_size) + {} + virtual ~StripeProcessor() override {} + + int process(bufferlist&& data, uint64_t data_offset) override; +}; + +} // namespace rgw::putobj diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc 
new file mode 100644 index 000000000..f1ae34f93 --- /dev/null +++ b/src/rgw/rgw_quota.cc @@ -0,0 +1,1049 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "include/utime.h" +#include "common/lru_map.h" +#include "common/RefCountedObj.h" +#include "common/Thread.h" +#include "common/ceph_mutex.h" + +#include "rgw_common.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "rgw_quota.h" +#include "rgw_bucket.h" +#include "rgw_user.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_meta.h" + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +struct RGWQuotaCacheStats { + RGWStorageStats stats; + utime_t expiration; + utime_t async_refresh_time; +}; + +template +class RGWQuotaCache { +protected: + rgw::sal::Driver* driver; + lru_map stats_map; + RefCountedWaitObject *async_refcount; + + class StatsAsyncTestSet : public lru_map::UpdateContext { + int objs_delta; + uint64_t added_bytes; + uint64_t removed_bytes; + public: + StatsAsyncTestSet() : objs_delta(0), added_bytes(0), removed_bytes(0) {} + bool update(RGWQuotaCacheStats *entry) override { + if (entry->async_refresh_time.sec() == 0) + return false; + + entry->async_refresh_time = utime_t(0, 0); + + return true; + } + }; + + virtual int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) = 0; + + virtual bool map_find(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0; + + virtual bool map_find_and_update(const rgw_user& user, 
const rgw_bucket& bucket, typename lru_map::UpdateContext *ctx) = 0; + virtual void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0; + + virtual void data_modified(const rgw_user& user, rgw_bucket& bucket) {} +public: + RGWQuotaCache(rgw::sal::Driver* _driver, int size) : driver(_driver), stats_map(size) { + async_refcount = new RefCountedWaitObject; + } + virtual ~RGWQuotaCache() { + async_refcount->put_wait(); /* wait for all pending async requests to complete */ + } + + int get_stats(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, + const DoutPrefixProvider* dpp); + void adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes); + + void set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats); + int async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs); + void async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats); + void async_refresh_fail(const rgw_user& user, rgw_bucket& bucket); + + class AsyncRefreshHandler { + protected: + rgw::sal::Driver* driver; + RGWQuotaCache *cache; + public: + AsyncRefreshHandler(rgw::sal::Driver* _driver, RGWQuotaCache *_cache) : driver(_driver), cache(_cache) {} + virtual ~AsyncRefreshHandler() {} + + virtual int init_fetch() = 0; + virtual void drop_reference() = 0; + }; + + virtual AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) = 0; +}; + +template +int RGWQuotaCache::async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) +{ + /* protect against multiple updates */ + StatsAsyncTestSet test_update; + if (!map_find_and_update(user, bucket, &test_update)) { + /* most likely we just raced with another update */ + return 0; + } + + async_refcount->get(); + + + AsyncRefreshHandler *handler = 
allocate_refresh_handler(user, bucket); + + int ret = handler->init_fetch(); + if (ret < 0) { + async_refcount->put(); + handler->drop_reference(); + return ret; + } + + return 0; +} + +template +void RGWQuotaCache::async_refresh_fail(const rgw_user& user, rgw_bucket& bucket) +{ + ldout(driver->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl; + + async_refcount->put(); +} + +template +void RGWQuotaCache::async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats) +{ + ldout(driver->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl; + + RGWQuotaCacheStats qs; + + map_find(user, bucket, qs); + + set_stats(user, bucket, qs, stats); + + async_refcount->put(); +} + +template +void RGWQuotaCache::set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats) +{ + qs.stats = stats; + qs.expiration = ceph_clock_now(); + qs.async_refresh_time = qs.expiration; + qs.expiration += driver->ctx()->_conf->rgw_bucket_quota_ttl; + qs.async_refresh_time += driver->ctx()->_conf->rgw_bucket_quota_ttl / 2; + + map_add(user, bucket, qs); +} + +template +int RGWQuotaCache::get_stats(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider* dpp) { + RGWQuotaCacheStats qs; + utime_t now = ceph_clock_now(); + if (map_find(user, bucket, qs)) { + if (qs.async_refresh_time.sec() > 0 && now >= qs.async_refresh_time) { + int r = async_refresh(user, bucket, qs); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: quota async refresh returned ret=" << r << dendl; + + /* continue processing, might be a transient error, async refresh is just optimization */ + } + } + + if (qs.expiration > ceph_clock_now()) { + stats = qs.stats; + return 0; + } + } + + int ret = fetch_stats_from_storage(user, bucket, stats, y, dpp); + if (ret < 0 && ret != -ENOENT) + return ret; + + set_stats(user, bucket, qs, stats); + + return 0; +} + + 
+template +class RGWQuotaStatsUpdate : public lru_map::UpdateContext { + const int objs_delta; + const uint64_t added_bytes; + const uint64_t removed_bytes; +public: + RGWQuotaStatsUpdate(const int objs_delta, + const uint64_t added_bytes, + const uint64_t removed_bytes) + : objs_delta(objs_delta), + added_bytes(added_bytes), + removed_bytes(removed_bytes) { + } + + bool update(RGWQuotaCacheStats * const entry) override { + const uint64_t rounded_added = rgw_rounded_objsize(added_bytes); + const uint64_t rounded_removed = rgw_rounded_objsize(removed_bytes); + + if (((int64_t)(entry->stats.size + added_bytes - removed_bytes)) >= 0) { + entry->stats.size += added_bytes - removed_bytes; + } else { + entry->stats.size = 0; + } + + if (((int64_t)(entry->stats.size_rounded + rounded_added - rounded_removed)) >= 0) { + entry->stats.size_rounded += rounded_added - rounded_removed; + } else { + entry->stats.size_rounded = 0; + } + + if (((int64_t)(entry->stats.num_objects + objs_delta)) >= 0) { + entry->stats.num_objects += objs_delta; + } else { + entry->stats.num_objects = 0; + } + + return true; + } +}; + + +template +void RGWQuotaCache::adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta, + uint64_t added_bytes, uint64_t removed_bytes) +{ + RGWQuotaStatsUpdate update(objs_delta, added_bytes, removed_bytes); + map_find_and_update(user, bucket, &update); + + data_modified(user, bucket); +} + +class BucketAsyncRefreshHandler : public RGWQuotaCache::AsyncRefreshHandler, + public RGWGetBucketStats_CB { + rgw_user user; +public: + BucketAsyncRefreshHandler(rgw::sal::Driver* _driver, RGWQuotaCache *_cache, + const rgw_user& _user, const rgw_bucket& _bucket) : + RGWQuotaCache::AsyncRefreshHandler(_driver, _cache), + RGWGetBucketStats_CB(_bucket), user(_user) {} + + void drop_reference() override { put(); } + void handle_response(int r) override; + int init_fetch() override; +}; + +int BucketAsyncRefreshHandler::init_fetch() +{ + std::unique_ptr rbucket; + + 
const DoutPrefix dp(driver->ctx(), dout_subsys, "rgw bucket async refresh handler: "); + int r = driver->get_bucket(&dp, nullptr, bucket, &rbucket, null_yield); + if (r < 0) { + ldpp_dout(&dp, 0) << "could not get bucket info for bucket=" << bucket << " r=" << r << dendl; + return r; + } + + ldpp_dout(&dp, 20) << "initiating async quota refresh for bucket=" << bucket << dendl; + + const auto& index = rbucket->get_info().get_current_index(); + if (is_layout_indexless(index)) { + return 0; + } + + r = rbucket->read_stats_async(&dp, index, RGW_NO_SHARD, this); + if (r < 0) { + ldpp_dout(&dp, 0) << "could not get bucket info for bucket=" << bucket.name << dendl; + + /* read_stats_async() dropped our reference already */ + return r; + } + + return 0; +} + +void BucketAsyncRefreshHandler::handle_response(const int r) +{ + if (r < 0) { + ldout(driver->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl; + cache->async_refresh_fail(user, bucket); + return; + } + + RGWStorageStats bs; + + for (const auto& pair : *stats) { + const RGWStorageStats& s = pair.second; + + bs.size += s.size; + bs.size_rounded += s.size_rounded; + bs.num_objects += s.num_objects; + } + + cache->async_refresh_response(user, bucket, bs); +} + +class RGWBucketStatsCache : public RGWQuotaCache { +protected: + bool map_find(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override { + return stats_map.find(bucket, qs); + } + + bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, lru_map::UpdateContext *ctx) override { + return stats_map.find_and_update(bucket, NULL, ctx); + } + + void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override { + stats_map.add(bucket, qs); + } + + int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) override; + +public: + explicit RGWBucketStatsCache(rgw::sal::Driver* _driver) : 
RGWQuotaCache(_driver, _driver->ctx()->_conf->rgw_bucket_quota_cache_size) { + } + + AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) override { + return new BucketAsyncRefreshHandler(driver, this, user, bucket); + } +}; + +int RGWBucketStatsCache::fetch_stats_from_storage(const rgw_user& _u, const rgw_bucket& _b, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) +{ + std::unique_ptr user = driver->get_user(_u); + std::unique_ptr bucket; + + int r = driver->get_bucket(dpp, user.get(), _b, &bucket, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "could not get bucket info for bucket=" << _b << " r=" << r << dendl; + return r; + } + + stats = RGWStorageStats(); + + const auto& index = bucket->get_info().get_current_index(); + if (is_layout_indexless(index)) { + return 0; + } + + string bucket_ver; + string master_ver; + + map bucket_stats; + r = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, + &master_ver, bucket_stats, nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << "could not get bucket stats for bucket=" + << _b.name << dendl; + return r; + } + + for (const auto& pair : bucket_stats) { + const RGWStorageStats& s = pair.second; + + stats.size += s.size; + stats.size_rounded += s.size_rounded; + stats.num_objects += s.num_objects; + } + + return 0; +} + +class UserAsyncRefreshHandler : public RGWQuotaCache::AsyncRefreshHandler, + public RGWGetUserStats_CB { + const DoutPrefixProvider *dpp; + rgw_bucket bucket; +public: + UserAsyncRefreshHandler(const DoutPrefixProvider *_dpp, rgw::sal::Driver* _driver, RGWQuotaCache *_cache, + const rgw_user& _user, const rgw_bucket& _bucket) : + RGWQuotaCache::AsyncRefreshHandler(_driver, _cache), + RGWGetUserStats_CB(_user), + dpp(_dpp), + bucket(_bucket) {} + + void drop_reference() override { put(); } + int init_fetch() override; + void handle_response(int r) override; +}; + +int UserAsyncRefreshHandler::init_fetch() +{ + std::unique_ptr ruser = 
driver->get_user(user); + + ldpp_dout(dpp, 20) << "initiating async quota refresh for user=" << user << dendl; + int r = ruser->read_stats_async(dpp, this); + if (r < 0) { + ldpp_dout(dpp, 0) << "could not get bucket info for user=" << user << dendl; + + /* get_bucket_stats_async() dropped our reference already */ + return r; + } + + return 0; +} + +void UserAsyncRefreshHandler::handle_response(int r) +{ + if (r < 0) { + ldout(driver->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl; + cache->async_refresh_fail(user, bucket); + return; + } + + cache->async_refresh_response(user, bucket, stats); +} + +class RGWUserStatsCache : public RGWQuotaCache { + const DoutPrefixProvider *dpp; + std::atomic down_flag = { false }; + ceph::shared_mutex mutex = ceph::make_shared_mutex("RGWUserStatsCache"); + map modified_buckets; + + /* thread, sync recent modified buckets info */ + class BucketsSyncThread : public Thread { + CephContext *cct; + RGWUserStatsCache *stats; + + ceph::mutex lock = ceph::make_mutex("RGWUserStatsCache::BucketsSyncThread"); + ceph::condition_variable cond; + public: + + BucketsSyncThread(CephContext *_cct, RGWUserStatsCache *_s) : cct(_cct), stats(_s) {} + + void *entry() override { + ldout(cct, 20) << "BucketsSyncThread: start" << dendl; + do { + map buckets; + + stats->swap_modified_buckets(buckets); + + for (map::iterator iter = buckets.begin(); iter != buckets.end(); ++iter) { + rgw_bucket bucket = iter->first; + rgw_user& user = iter->second; + ldout(cct, 20) << "BucketsSyncThread: sync user=" << user << " bucket=" << bucket << dendl; + const DoutPrefix dp(cct, dout_subsys, "rgw bucket sync thread: "); + int r = stats->sync_bucket(user, bucket, null_yield, &dp); + if (r < 0) { + ldout(cct, 0) << "WARNING: sync_bucket() returned r=" << r << dendl; + } + } + + if (stats->going_down()) + break; + + std::unique_lock locker{lock}; + cond.wait_for( + locker, + std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval)); + 
} while (!stats->going_down()); + ldout(cct, 20) << "BucketsSyncThread: done" << dendl; + + return NULL; + } + + void stop() { + std::lock_guard l{lock}; + cond.notify_all(); + } + }; + + /* + * thread, full sync all users stats periodically + * + * only sync non idle users or ones that never got synced before, this is needed so that + * users that didn't have quota turned on before (or existed before the user objclass + * tracked stats) need to get their backend stats up to date. + */ + class UserSyncThread : public Thread { + CephContext *cct; + RGWUserStatsCache *stats; + + ceph::mutex lock = ceph::make_mutex("RGWUserStatsCache::UserSyncThread"); + ceph::condition_variable cond; + public: + + UserSyncThread(CephContext *_cct, RGWUserStatsCache *_s) : cct(_cct), stats(_s) {} + + void *entry() override { + ldout(cct, 20) << "UserSyncThread: start" << dendl; + do { + const DoutPrefix dp(cct, dout_subsys, "rgw user sync thread: "); + int ret = stats->sync_all_users(&dp, null_yield); + if (ret < 0) { + ldout(cct, 5) << "ERROR: sync_all_users() returned ret=" << ret << dendl; + } + + if (stats->going_down()) + break; + + std::unique_lock l{lock}; + cond.wait_for(l, std::chrono::seconds(cct->_conf->rgw_user_quota_sync_interval)); + } while (!stats->going_down()); + ldout(cct, 20) << "UserSyncThread: done" << dendl; + + return NULL; + } + + void stop() { + std::lock_guard l{lock}; + cond.notify_all(); + } + }; + + BucketsSyncThread *buckets_sync_thread; + UserSyncThread *user_sync_thread; +protected: + bool map_find(const rgw_user& user,const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override { + return stats_map.find(user, qs); + } + + bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, lru_map::UpdateContext *ctx) override { + return stats_map.find_and_update(user, NULL, ctx); + } + + void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override { + stats_map.add(user, qs); + } + + int 
fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) override; + int sync_bucket(const rgw_user& rgw_user, rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp); + int sync_user(const DoutPrefixProvider *dpp, const rgw_user& user, optional_yield y); + int sync_all_users(const DoutPrefixProvider *dpp, optional_yield y); + + void data_modified(const rgw_user& user, rgw_bucket& bucket) override; + + void swap_modified_buckets(map& out) { + std::unique_lock lock{mutex}; + modified_buckets.swap(out); + } + + template /* easier doing it as a template, Thread doesn't have ->stop() */ + void stop_thread(T **pthr) { + T *thread = *pthr; + if (!thread) + return; + + thread->stop(); + thread->join(); + delete thread; + *pthr = NULL; + } + +public: + RGWUserStatsCache(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver, bool quota_threads) + : RGWQuotaCache(_driver, _driver->ctx()->_conf->rgw_bucket_quota_cache_size), dpp(dpp) + { + if (quota_threads) { + buckets_sync_thread = new BucketsSyncThread(driver->ctx(), this); + buckets_sync_thread->create("rgw_buck_st_syn"); + user_sync_thread = new UserSyncThread(driver->ctx(), this); + user_sync_thread->create("rgw_user_st_syn"); + } else { + buckets_sync_thread = NULL; + user_sync_thread = NULL; + } + } + ~RGWUserStatsCache() override { + stop(); + } + + AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) override { + return new UserAsyncRefreshHandler(dpp, driver, this, user, bucket); + } + + bool going_down() { + return down_flag; + } + + void stop() { + down_flag = true; + { + std::unique_lock lock{mutex}; + stop_thread(&buckets_sync_thread); + } + stop_thread(&user_sync_thread); + } +}; + +int RGWUserStatsCache::fetch_stats_from_storage(const rgw_user& _u, + const rgw_bucket& _b, + RGWStorageStats& stats, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + 
std::unique_ptr user = driver->get_user(_u); + int r = user->read_stats(dpp, y, &stats); + if (r < 0) { + ldpp_dout(dpp, 0) << "could not get user stats for user=" << user << dendl; + return r; + } + + return 0; +} + +int RGWUserStatsCache::sync_bucket(const rgw_user& _u, rgw_bucket& _b, optional_yield y, const DoutPrefixProvider *dpp) +{ + std::unique_ptr user = driver->get_user(_u); + std::unique_ptr bucket; + + int r = driver->get_bucket(dpp, user.get(), _b, &bucket, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "could not get bucket info for bucket=" << _b << " r=" << r << dendl; + return r; + } + + r = bucket->sync_user_stats(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: sync_user_stats() for user=" << _u << ", bucket=" << bucket << " returned " << r << dendl; + return r; + } + + return bucket->check_bucket_shards(dpp); +} + +int RGWUserStatsCache::sync_user(const DoutPrefixProvider *dpp, const rgw_user& _u, optional_yield y) +{ + RGWStorageStats stats; + ceph::real_time last_stats_sync; + ceph::real_time last_stats_update; + std::unique_ptr user = driver->get_user(rgw_user(_u.to_str())); + + int ret = user->read_stats(dpp, y, &stats, &last_stats_sync, &last_stats_update); + if (ret < 0) { + ldpp_dout(dpp, 5) << "ERROR: can't read user header: ret=" << ret << dendl; + return ret; + } + + if (!driver->ctx()->_conf->rgw_user_quota_sync_idle_users && + last_stats_update < last_stats_sync) { + ldpp_dout(dpp, 20) << "user is idle, not doing a full sync (user=" << user << ")" << dendl; + return 0; + } + + real_time when_need_full_sync = last_stats_sync; + when_need_full_sync += make_timespan(driver->ctx()->_conf->rgw_user_quota_sync_wait_time); + + // check if enough time passed since last full sync + /* FIXME: missing check? 
*/ + + ret = rgw_user_sync_all_stats(dpp, driver, user.get(), y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed user stats sync, ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWUserStatsCache::sync_all_users(const DoutPrefixProvider *dpp, optional_yield y) +{ + string key = "user"; + void *handle; + + int ret = driver->meta_list_keys_init(dpp, key, string(), &handle); + if (ret < 0) { + ldpp_dout(dpp, 10) << "ERROR: can't get key: ret=" << ret << dendl; + return ret; + } + + bool truncated; + int max = 1000; + + do { + list keys; + ret = driver->meta_list_keys_next(dpp, handle, max, keys, &truncated); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: lists_keys_next(): ret=" << ret << dendl; + goto done; + } + for (list::iterator iter = keys.begin(); + iter != keys.end() && !going_down(); + ++iter) { + rgw_user user(*iter); + ldpp_dout(dpp, 20) << "RGWUserStatsCache: sync user=" << user << dendl; + int ret = sync_user(dpp, user, y); + if (ret < 0) { + ldpp_dout(dpp, 5) << "ERROR: sync_user() failed, user=" << user << " ret=" << ret << dendl; + + /* continuing to next user */ + continue; + } + } + } while (truncated); + + ret = 0; +done: + driver->meta_list_keys_complete(handle); + return ret; +} + +void RGWUserStatsCache::data_modified(const rgw_user& user, rgw_bucket& bucket) +{ + /* racy, but it's ok */ + mutex.lock_shared(); + bool need_update = modified_buckets.find(bucket) == modified_buckets.end(); + mutex.unlock_shared(); + + if (need_update) { + std::unique_lock lock{mutex}; + modified_buckets[bucket] = user; + } +} + + +class RGWQuotaInfoApplier { + /* NOTE: no non-static field allowed as instances are supposed to live in + * the static memory only. 
*/ +protected: + RGWQuotaInfoApplier() = default; + +public: + virtual ~RGWQuotaInfoApplier() {} + + virtual bool is_size_exceeded(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t size) const = 0; + + virtual bool is_num_objs_exceeded(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t num_objs) const = 0; + + static const RGWQuotaInfoApplier& get_instance(const RGWQuotaInfo& qinfo); +}; + +class RGWQuotaInfoDefApplier : public RGWQuotaInfoApplier { +public: + bool is_size_exceeded(const DoutPrefixProvider *dpp, const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t size) const override; + + bool is_num_objs_exceeded(const DoutPrefixProvider *dpp, const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t num_objs) const override; +}; + +class RGWQuotaInfoRawApplier : public RGWQuotaInfoApplier { +public: + bool is_size_exceeded(const DoutPrefixProvider *dpp, const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t size) const override; + + bool is_num_objs_exceeded(const DoutPrefixProvider *dpp, const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t num_objs) const override; +}; + + +bool RGWQuotaInfoDefApplier::is_size_exceeded(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t size) const +{ + if (qinfo.max_size < 0) { + /* The limit is not enabled. 
*/ + return false; + } + + const uint64_t cur_size = stats.size_rounded; + const uint64_t new_size = rgw_rounded_objsize(size); + + if (std::cmp_greater(cur_size + new_size, qinfo.max_size)) { + ldpp_dout(dpp, 10) << "quota exceeded: stats.size_rounded=" << stats.size_rounded + << " size=" << new_size << " " + << entity << "_quota.max_size=" << qinfo.max_size << dendl; + return true; + } + + return false; +} + +bool RGWQuotaInfoDefApplier::is_num_objs_exceeded(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t num_objs) const +{ + if (qinfo.max_objects < 0) { + /* The limit is not enabled. */ + return false; + } + + if (std::cmp_greater(stats.num_objects + num_objs, qinfo.max_objects)) { + ldpp_dout(dpp, 10) << "quota exceeded: stats.num_objects=" << stats.num_objects + << " " << entity << "_quota.max_objects=" << qinfo.max_objects + << dendl; + return true; + } + + return false; +} + +bool RGWQuotaInfoRawApplier::is_size_exceeded(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t size) const +{ + if (qinfo.max_size < 0) { + /* The limit is not enabled. */ + return false; + } + + const uint64_t cur_size = stats.size; + + if (std::cmp_greater(cur_size + size, qinfo.max_size)) { + ldpp_dout(dpp, 10) << "quota exceeded: stats.size=" << stats.size + << " size=" << size << " " + << entity << "_quota.max_size=" << qinfo.max_size << dendl; + return true; + } + + return false; +} + +bool RGWQuotaInfoRawApplier::is_num_objs_exceeded(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& qinfo, + const RGWStorageStats& stats, + const uint64_t num_objs) const +{ + if (qinfo.max_objects < 0) { + /* The limit is not enabled. 
*/ + return false; + } + + if (std::cmp_greater(stats.num_objects + num_objs, qinfo.max_objects)) { + ldpp_dout(dpp, 10) << "quota exceeded: stats.num_objects=" << stats.num_objects + << " " << entity << "_quota.max_objects=" << qinfo.max_objects + << dendl; + return true; + } + + return false; +} + +const RGWQuotaInfoApplier& RGWQuotaInfoApplier::get_instance( + const RGWQuotaInfo& qinfo) +{ + static RGWQuotaInfoDefApplier default_qapplier; + static RGWQuotaInfoRawApplier raw_qapplier; + + if (qinfo.check_on_raw) { + return raw_qapplier; + } else { + return default_qapplier; + } +} + + +class RGWQuotaHandlerImpl : public RGWQuotaHandler { + rgw::sal::Driver* driver; + RGWBucketStatsCache bucket_stats_cache; + RGWUserStatsCache user_stats_cache; + + int check_quota(const DoutPrefixProvider *dpp, + const char * const entity, + const RGWQuotaInfo& quota, + const RGWStorageStats& stats, + const uint64_t num_objs, + const uint64_t size) { + if (!quota.enabled) { + return 0; + } + + const auto& quota_applier = RGWQuotaInfoApplier::get_instance(quota); + + ldpp_dout(dpp, 20) << entity + << " quota: max_objects=" << quota.max_objects + << " max_size=" << quota.max_size << dendl; + + + if (quota_applier.is_num_objs_exceeded(dpp, entity, quota, stats, num_objs)) { + return -ERR_QUOTA_EXCEEDED; + } + + if (quota_applier.is_size_exceeded(dpp, entity, quota, stats, size)) { + return -ERR_QUOTA_EXCEEDED; + } + + ldpp_dout(dpp, 20) << entity << " quota OK:" + << " stats.num_objects=" << stats.num_objects + << " stats.size=" << stats.size << dendl; + return 0; + } +public: + RGWQuotaHandlerImpl(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver, bool quota_threads) : driver(_driver), + bucket_stats_cache(_driver), + user_stats_cache(dpp, _driver, quota_threads) {} + + int check_quota(const DoutPrefixProvider *dpp, + const rgw_user& user, + rgw_bucket& bucket, + RGWQuota& quota, + uint64_t num_objs, + uint64_t size, optional_yield y) override { + + if 
(!quota.bucket_quota.enabled && !quota.user_quota.enabled) {
+      return 0;
+    }
+
+    /*
+     * we need to fetch bucket stats if the user quota is enabled, because
+     * the whole system relies on us periodically updating the user's bucket
+     * stats in the user's header, this happens in get_stats() if we actually
+     * fetch that info and not rely on cached data
+     */
+
+    const DoutPrefix dp(driver->ctx(), dout_subsys, "rgw quota handler: ");
+    if (quota.bucket_quota.enabled) {
+      RGWStorageStats bucket_stats;
+      int ret = bucket_stats_cache.get_stats(user, bucket, bucket_stats, y, &dp);
+      if (ret < 0) {
+        return ret;
+      }
+      ret = check_quota(dpp, "bucket", quota.bucket_quota, bucket_stats, num_objs, size);
+      if (ret < 0) {
+        return ret;
+      }
+    }
+
+    if (quota.user_quota.enabled) {
+      RGWStorageStats user_stats;
+      int ret = user_stats_cache.get_stats(user, bucket, user_stats, y, &dp);
+      if (ret < 0) {
+        return ret;
+      }
+      ret = check_quota(dpp, "user", quota.user_quota, user_stats, num_objs, size);
+      if (ret < 0) {
+        return ret;
+      }
+    }
+    return 0;
+  }
+
+  // Forward a stats delta to both the bucket-level and the user-level cache.
+  void update_stats(const rgw_user& user, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) override {
+    bucket_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes);
+    user_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes);
+  }
+
+  /*
+   * Decide whether a bucket index needs more shards.
+   *
+   * max_objs_per_shard:   object-count limit of a single shard
+   * num_shards:           current shard count
+   * num_objs:             current object count
+   * is_multisite:         selects the growth multiplier (8 instead of 2)
+   * need_resharding:      [out] true when num_objs > num_shards * max_objs_per_shard
+   * suggested_num_shards: [out, may be null] written only when resharding is needed
+   */
+  void check_bucket_shards(const DoutPrefixProvider *dpp, uint64_t max_objs_per_shard,
+                           uint64_t num_shards, uint64_t num_objs, bool is_multisite,
+                           bool& need_resharding, uint32_t *suggested_num_shards) override
+  {
+    if (max_objs_per_shard == 0) {
+      // a zero per-shard limit would make the suggestion below divide by
+      // zero, and no meaningful shard target exists for it
+      ldpp_dout(dpp, 10) << __func__ << ": max_objs_per_shard is 0, skipping reshard check" << dendl;
+      need_resharding = false;
+      return;
+    }
+    if (num_objs > num_shards * max_objs_per_shard) {
+      ldpp_dout(dpp, 0) << __func__ << ": resharding needed: stats.num_objects=" << num_objs
+             << " shard max_objects=" <<  max_objs_per_shard * num_shards << dendl;
+      need_resharding = true;
+      if (suggested_num_shards) {
+        uint32_t obj_multiplier = 2;
+        if (is_multisite) {
+          // if we're maintaining bilogs for multisite, reshards are significantly
+          // more expensive. scale up the shard count much faster to minimize the
+          // number of reshard events during a write workload
+          obj_multiplier = 8;
+        }
+        *suggested_num_shards = num_objs * obj_multiplier / max_objs_per_shard;
+      }
+    } else {
+      need_resharding = false;
+    }
+  }
+};
+
+
+// Allocate the concrete handler with new; release it with free_handler().
+RGWQuotaHandler *RGWQuotaHandler::generate_handler(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, bool quota_threads)
+{
+  return new RGWQuotaHandlerImpl(dpp, driver, quota_threads);
+}
+
+// Delete a handler obtained from generate_handler().
+void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler)
+{
+  delete handler;
+}
+
+
+// Copy the configured bucket defaults into quota. Each limit is applied only
+// when its configured value is >= 0, and applying either one enables the quota.
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf)
+{
+  if (conf->rgw_bucket_default_quota_max_objects >= 0) {
+    quota.max_objects = conf->rgw_bucket_default_quota_max_objects;
+    quota.enabled = true;
+  }
+  if (conf->rgw_bucket_default_quota_max_size >= 0) {
+    quota.max_size = conf->rgw_bucket_default_quota_max_size;
+    quota.enabled = true;
+  }
+}
+
+// Same as rgw_apply_default_bucket_quota(), using the rgw_user_default_quota_* options.
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf)
+{
+  if (conf->rgw_user_default_quota_max_objects >= 0) {
+    quota.max_objects = conf->rgw_user_default_quota_max_objects;
+    quota.enabled = true;
+  }
+  if (conf->rgw_user_default_quota_max_size >= 0) {
+    quota.max_size = conf->rgw_user_default_quota_max_size;
+    quota.enabled = true;
+  }
+}
+
+// Emit all fields; max_size is written both in bytes and, as max_size_kb,
+// rounded up to whole KiB via rgw_rounded_kb().
+void RGWQuotaInfo::dump(Formatter *f) const
+{
+  f->dump_bool("enabled", enabled);
+  f->dump_bool("check_on_raw", check_on_raw);
+
+  f->dump_int("max_size", max_size);
+  f->dump_int("max_size_kb", rgw_rounded_kb(max_size));
+  f->dump_int("max_objects", max_objects);
+}
+
+void RGWQuotaInfo::decode_json(JSONObj *obj)
+{
+  if (false == JSONDecoder::decode_json("max_size", max_size, obj)) {
+    /* We're parsing an older version of the struct. 
*/
+    int64_t max_size_kb = 0;
+
+    JSONDecoder::decode_json("max_size_kb", max_size_kb, obj);
+    max_size = max_size_kb * 1024;
+  }
+  JSONDecoder::decode_json("max_objects", max_objects, obj);
+
+  JSONDecoder::decode_json("check_on_raw", check_on_raw, obj);
+  JSONDecoder::decode_json("enabled", enabled, obj);
+}
+
diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h
new file mode 100644
index 000000000..632cb4817
--- /dev/null
+++ b/src/rgw/rgw_quota.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/utime.h"
+#include "common/config_fwd.h"
+#include "common/lru_map.h"
+
+#include "rgw/rgw_quota_types.h"
+#include "common/async/yield_context.h"
+#include "rgw_sal_fwd.h"
+
+struct rgw_bucket;
+
+// Abstract quota interface. Instances are created with generate_handler()
+// and released with free_handler().
+class RGWQuotaHandler {
+public:
+  RGWQuotaHandler() {}
+  virtual ~RGWQuotaHandler() {
+  }
+  // Check the bucket and user quotas in `quota` for adding num_objs objects
+  // totalling `size` bytes; implementations return a negative value on failure.
+  virtual int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+                          RGWQuota& quota,
+                          uint64_t num_objs, uint64_t size, optional_yield y) = 0;
+
+  // Report through need_resharding (and, if non-null, suggested_num_shards)
+  // whether num_objs is too many for num_shards shards of max_objs_per_shard each.
+  virtual void check_bucket_shards(const DoutPrefixProvider *dpp, uint64_t max_objs_per_shard,
+                                   uint64_t num_shards, uint64_t num_objs, bool is_multisite,
+                                   bool& need_resharding, uint32_t *suggested_num_shards) = 0;
+
+  // Apply an object-count / byte delta to the cached stats of the bucket and its owner.
+  virtual void update_stats(const rgw_user& bucket_owner, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0;
+
+  static RGWQuotaHandler *generate_handler(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, bool quota_threads);
+  static void free_handler(RGWQuotaHandler *handler);
+};
+
+// apply default quotas from 
configuration
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
diff --git a/src/rgw/rgw_quota_types.h b/src/rgw/rgw_quota_types.h
new file mode 100644
index 000000000..830696815
--- /dev/null
+++ b/src/rgw/rgw_quota_types.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+// Convert a byte count to KiB, rounding up.
+static inline int64_t rgw_rounded_kb(int64_t bytes)
+{
+  return (bytes + 1023) / 1024;
+}
+
+class JSONObj;
+
+struct RGWQuotaInfo {
+  // NOTE(review): the template parameter list was lost in extraction;
+  // restored as <class T> -- confirm against upstream
+  template<class T> friend class RGWQuotaCache;
+public:
+  int64_t max_size;     // byte limit; negative means "not enabled"
+  int64_t max_objects;  // object-count limit; negative means "not enabled"
+  bool enabled;
+  /* Do we want to compare with raw, not rounded RGWStorageStats::size (true)
+   * or maybe rounded-to-4KiB RGWStorageStats::size_rounded (false)? */
+  bool check_on_raw;
+
+  RGWQuotaInfo()
+    : max_size(-1),
+      max_objects(-1),
+      enabled(false),
+      check_on_raw(false) {
+  }
+
+  // Version 3 layout: max_size rounded up to KiB (sign preserved), max_objects,
+  // enabled, exact max_size in bytes, check_on_raw. decode() below uses the
+  // leading KiB field only for struct_v < 2.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    if (max_size < 0) {
+      encode(-rgw_rounded_kb(abs(max_size)), bl);
+    } else {
+      encode(rgw_rounded_kb(max_size), bl);
+    }
+    encode(max_objects, bl);
+    encode(enabled, bl);
+    encode(max_size, bl);
+    encode(check_on_raw, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Mirror of encode(); check_on_raw is left untouched when struct_v < 3.
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(3, 1, 1, bl);
+    int64_t max_size_kb;
+    decode(max_size_kb, bl);
+    decode(max_objects, bl);
+    decode(enabled, bl);
+    if (struct_v < 2) {
+      max_size = max_size_kb * 1024;
+    } else {
+      decode(max_size, bl);
+    }
+    if (struct_v >= 3) {
+      decode(check_on_raw, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+
+  void decode_json(JSONObj *obj);
+
+};
+WRITE_CLASS_ENCODER(RGWQuotaInfo)
+
+// The pair of quotas checked for one write: the owner's and the bucket's.
+struct RGWQuota {
+  RGWQuotaInfo user_quota;
+  RGWQuotaInfo bucket_quota;
+};
diff --git a/src/rgw/rgw_ratelimit.h b/src/rgw/rgw_ratelimit.h
new file mode 100644
index 000000000..2639d4d42
--- /dev/null
+++ b/src/rgw/rgw_ratelimit.h
@@ -0,0 +1,292 @@
+#pragma once
+// NOTE(review): the three <...> include targets were lost in extraction; the
+// headers below are the ones this file's std::chrono / std::thread /
+// std::condition_variable usage requires -- confirm against upstream
+#include <chrono>
+#include <thread>
+#include <condition_variable>
+#include "rgw_common.h"
+
+
+class RateLimiterEntry {
+  /*
+    fixed_point_rgw_ratelimit is important to preserve the precision of the token calculation
+    for example: a user have a limit of single op per minute, the user will consume its single token and then will send another request, 1s after it.
+    in that case, without this method, the user will get 0 tokens although it should get 0.016 tokens.
+    using this method it will add 16 tokens to the user, and the user will have 16 tokens, each time rgw will do comparison rgw will divide by fixed_point_rgw_ratelimit, so the user will be blocked anyway until it has enough tokens.
+  */
+  static constexpr int64_t fixed_point_rgw_ratelimit = 1000;
+  // counters are tracked in multiples of fixed_point_rgw_ratelimit
+  struct counters {
+    int64_t ops = 0;
+    int64_t bytes = 0;
+  };
+  counters read;
+  counters write;
+  ceph::timespan ts;      // timestamp of the last token refill
+  bool first_run = true;  // true until increase_tokens() has seeded the counters
+  std::mutex ts_lock;     // taken by the public entry points
+  // Those functions are returning the integer value of the tokens
+  int64_t read_ops () const
+  {
+    return read.ops / fixed_point_rgw_ratelimit;
+  }
+  int64_t write_ops() const
+  {
+    return write.ops / fixed_point_rgw_ratelimit;
+  }
+  int64_t read_bytes() const
+  {
+    return read.bytes / fixed_point_rgw_ratelimit;
+  }
+  int64_t write_bytes() const
+  {
+    return write.bytes / fixed_point_rgw_ratelimit;
+  }
+  // Returns true when a read must be rejected. Otherwise consumes one read-op
+  // token and returns false.
+  bool should_rate_limit_read(int64_t ops_limit, int64_t bw_limit) {
+    //check if tenants did not reach their bw or ops limits and that the limits are not 0 (which is unlimited)
+    if(((read_ops() - 1 < 0) && (ops_limit > 0)) ||
+      (read_bytes() < 0 && bw_limit > 0))
+    {
+      return true;
+    }
+    // we don't want to reduce ops' tokens if we've rejected it.
+    read.ops -= fixed_point_rgw_ratelimit;
+    return false;
+  }
+  // Write-side counterpart of should_rate_limit_read().
+  bool should_rate_limit_write(int64_t ops_limit, int64_t bw_limit)
+  {
+    //check if tenants did not reach their bw or ops limits and that the limits are not 0 (which is unlimited)
+    if(((write_ops() - 1 < 0) && (ops_limit > 0)) ||
+      (write_bytes() < 0 && bw_limit > 0))
+    {
+      return true;
+    }
+
+    // we don't want to reduce ops' tokens if we've rejected it.
+    write.ops -= fixed_point_rgw_ratelimit;
+    return false;
+  }
+  /* Returns true once at least 60s / fixed_point_rgw_ratelimit has passed since the stored timestamp.
+     This check is necessary to force increase_tokens() to add at least 1 token when it updates the last stored timestamp.
+     That way the user/bucket will not lose tokens because of rounding
+  */
+  bool minimum_time_reached(ceph::timespan curr_timestamp) const
+  {
+    using namespace std::chrono;
+    // NOTE(review): the duration_cast template argument was lost in extraction;
+    // ceph::timespan restored to match the type of delta -- confirm
+    constexpr auto min_duration = duration_cast<ceph::timespan>(seconds(60)) / fixed_point_rgw_ratelimit;
+    const auto delta = curr_timestamp - ts;
+    if (delta < min_duration)
+    {
+      return false;
+    }
+    return true;
+  }
+
+  // Refill the counters. The first call seeds every counter with its full
+  // per-minute limit; later calls add limit * elapsed time, capped at the limit.
+  void increase_tokens(ceph::timespan curr_timestamp,
+                       const RGWRateLimitInfo* info)
+  {
+    constexpr int fixed_point = fixed_point_rgw_ratelimit;
+    if (first_run)
+    {
+      write.ops = info->max_write_ops * fixed_point;
+      write.bytes = info->max_write_bytes * fixed_point;
+      read.ops = info->max_read_ops * fixed_point;
+      read.bytes = info->max_read_bytes * fixed_point;
+      ts = curr_timestamp;
+      first_run = false;
+      return;
+    }
+    else if(curr_timestamp > ts && minimum_time_reached(curr_timestamp))
+    {
+      // duration_cast target restored as milliseconds: the value is named
+      // time_in_ms and is scaled by std::milli::den
+      const int64_t time_in_ms = std::chrono::duration_cast<std::chrono::milliseconds>(curr_timestamp - ts).count() / 60.0 / std::milli::den * fixed_point; // / 60 to make it work with 1 min token bucket
+      ts = curr_timestamp;
+      const int64_t write_ops = info->max_write_ops * time_in_ms;
+      const int64_t write_bw = info->max_write_bytes * time_in_ms;
+      const int64_t read_ops = info->max_read_ops * time_in_ms;
+      const int64_t read_bw = info->max_read_bytes * time_in_ms;
+      read.ops = std::min(info->max_read_ops * fixed_point, read_ops + read.ops);
+      read.bytes = std::min(info->max_read_bytes * fixed_point, read_bw + read.bytes);
+      write.ops = std::min(info->max_write_ops * fixed_point, write_ops + write.ops);
+      write.bytes = std::min(info->max_write_bytes * fixed_point, write_bw + write.bytes);
+    }
+  }
+
+  public:
+  // Refill tokens for curr_timestamp, then apply the read or write check.
+  // Returns true when the request must be rejected.
+  bool should_rate_limit(bool is_read, const RGWRateLimitInfo* ratelimit_info, ceph::timespan curr_timestamp)
+  {
+    std::unique_lock lock(ts_lock);
+    increase_tokens(curr_timestamp, ratelimit_info);
+    if (is_read)
+    {
+      return should_rate_limit_read(ratelimit_info->max_read_ops, ratelimit_info->max_read_bytes);
+    }
+    return 
should_rate_limit_write(ratelimit_info->max_write_ops, ratelimit_info->max_write_bytes);
+  }
+  // Charge `amount` bytes against the read or write byte counter.
+  void decrease_bytes(bool is_read, int64_t amount, const RGWRateLimitInfo* info) {
+    std::unique_lock lock(ts_lock);
+    // we don't want the tenant to be with higher debt than 120 seconds(2 min) of its limit
+    if (is_read)
+    {
+      read.bytes = std::max(read.bytes - amount * fixed_point_rgw_ratelimit,info->max_read_bytes * fixed_point_rgw_ratelimit * -2);
+    } else {
+      write.bytes = std::max(write.bytes - amount * fixed_point_rgw_ratelimit,info->max_write_bytes * fixed_point_rgw_ratelimit * -2);
+    }
+  }
+  // Return one op token to the read or write counter.
+  void giveback_tokens(bool is_read)
+  {
+    std::unique_lock lock(ts_lock);
+    if (is_read)
+    {
+      read.ops += fixed_point_rgw_ratelimit;
+    } else {
+      write.ops += fixed_point_rgw_ratelimit;
+    }
+  }
+};
+
+// Map from key to RateLimiterEntry. Once the map passes 90% of map_size it
+// sets `replacing` and signals `cv`; both are references supplied by the owner.
+class RateLimiter {
+
+  static constexpr size_t map_size = 2000000; // will create it with the closest upper prime number
+  std::shared_mutex insert_lock;
+  std::atomic_bool& replacing;
+  std::condition_variable& cv;
+  // template arguments restored after extraction loss: keys are the
+  // std::string passed to find_or_create(), values are RateLimiterEntry
+  typedef std::unordered_map<std::string, RateLimiterEntry> hash_map;
+  hash_map ratelimit_entries{map_size};
+  // GET and HEAD count as reads; every other method counts as a write
+  static bool is_read_op(const std::string_view method) {
+    if (method == "GET" || method == "HEAD")
+    {
+      return true;
+    }
+    return false;
+  }
+
+  // find or create the entry for key, and return a reference to it
+  auto& find_or_create(const std::string& key) {
+    std::shared_lock rlock(insert_lock);
+    if (ratelimit_entries.size() > 0.9 * map_size && replacing == false)
+    {
+      replacing = true;
+      cv.notify_all();
+    }
+    auto ret = ratelimit_entries.find(key);
+    // decide hit or miss while the shared lock is still held, so the map is
+    // not read while another thread may be inserting
+    const bool found = (ret != ratelimit_entries.end());
+    rlock.unlock();
+    if (!found)
+    {
+      // emplace() returns the existing element if another thread inserted
+      // the same key between the two locks
+      std::unique_lock wlock(insert_lock);
+      ret = ratelimit_entries.emplace(std::piecewise_construct,
+                                      std::forward_as_tuple(key),
+                                      std::forward_as_tuple()).first;
+    }
+    return ret->second;
+  }
+
+
+
+  public:
+  RateLimiter(const RateLimiter&) = delete;
+  RateLimiter& operator =(const RateLimiter&) = delete;
+  RateLimiter(RateLimiter&&) = delete;
+  RateLimiter& operator =(RateLimiter&&) = delete;
+  RateLimiter() = delete;
+  RateLimiter(std::atomic_bool& replacing, std::condition_variable& cv)
+    : replacing(replacing), cv(cv)
+  {
+    // prevents rehash, so no iterators invalidation
+    ratelimit_entries.max_load_factor(1000);
+  };
+
+  // Returns true when the request must be rejected. Keys of length 0 or 1 and
+  // disabled limits are never limited.
+  bool should_rate_limit(const char *method, const std::string& key, ceph::coarse_real_time curr_timestamp, const RGWRateLimitInfo* ratelimit_info) {
+    if (key.empty() || key.length() == 1 || !ratelimit_info->enabled)
+    {
+      return false;
+    }
+    bool is_read = is_read_op(method);
+    auto& it = find_or_create(key);
+    auto curr_ts = curr_timestamp.time_since_epoch();
+    return it.should_rate_limit(is_read ,ratelimit_info, curr_ts);
+  }
+  // Return one op token to the entry of key.
+  void giveback_tokens(const char *method, const std::string& key)
+  {
+    bool is_read = is_read_op(method);
+    auto& it = find_or_create(key);
+    it.giveback_tokens(is_read);
+  }
+  // Charge `amount` bytes to the entry of key. Does nothing when the matching
+  // byte limit is 0.
+  void decrease_bytes(const char *method, const std::string& key, const int64_t amount, const RGWRateLimitInfo* info) {
+    if (key.empty() || key.length() == 1 || !info->enabled)
+    {
+      return;
+    }
+    bool is_read = is_read_op(method);
+    if ((is_read && !info->max_read_bytes) || (!is_read && !info->max_write_bytes))
+    {
+      return;
+    }
+    auto& it = find_or_create(key);
+    it.decrease_bytes(is_read, amount, info);
+  }
+  // Drop every entry. Takes no lock itself.
+  void clear() {
+    ratelimit_entries.clear();
+  }
+};
+// This class purpose is to hold 2 RateLimiter instances, one active and one passive.
+// once the active has reached the watermark for clearing it will call the replace_active() thread using cv
+// The replace_active will clear the previous RateLimiter after all requests to it has been done (use_count() > 1)
+// In the meanwhile new requests will come into the newer active
+class ActiveRateLimiter : public DoutPrefix  {
+  std::atomic_uint8_t stopped = {false};   // set by the destructor so replace_active() exits
+  std::condition_variable cv;              // passed to both RateLimiters; also notified by the destructor
+  std::mutex cv_m;
+  std::thread runner;                      // runs replace_active(); assigned only in start()
+  std::atomic_bool replacing = false;      // passed to both RateLimiters; reset after the passive one is cleared
+  std::atomic_uint8_t current_active = 0;  // index of the limiter returned by get_active()
+  // element type restored after extraction loss: clear() and the
+  // (replacing, cv) constructor arguments identify it as RateLimiter
+  std::shared_ptr<RateLimiter> ratelimit[2];
+  // Thread body: on each replacement request switch the active index, wait
+  // until no one else holds the previous limiter, then clear it.
+  void replace_active() {
+    using namespace std::chrono_literals;
+    std::unique_lock lk(cv_m);
+    while (!stopped) {
+      // wait with a predicate: a spurious wakeup would otherwise swap the
+      // limiters and clear the one that was active a moment ago
+      cv.wait(lk, [this] { return stopped || replacing; });
+      if (stopped)
+      {
+        return;
+      }
+      current_active = current_active ^ 1;
+      ldpp_dout(this, 20) << "replacing active ratelimit data structure" << dendl;
+      while (!stopped && ratelimit[(current_active ^ 1)].use_count() > 1 ) {
+        if (cv.wait_for(lk, 1min) != std::cv_status::timeout && stopped)
+        {
+          return;
+        }
+      }
+      if (stopped)
+      {
+        return;
+      }
+      ldpp_dout(this, 20) << "clearing passive ratelimit data structure" << dendl;
+      ratelimit[(current_active ^ 1)]->clear();
+      replacing = false;
+    }
+  }
+  public:
+  ActiveRateLimiter(const ActiveRateLimiter&) = delete;
+  ActiveRateLimiter& operator =(const ActiveRateLimiter&) = delete;
+  ActiveRateLimiter(ActiveRateLimiter&&) = delete;
+  ActiveRateLimiter& operator =(ActiveRateLimiter&&) = delete;
+  ActiveRateLimiter() = delete;
+  ActiveRateLimiter(CephContext* cct) :
+    DoutPrefix(cct, ceph_subsys_rgw, "rate limiter: ")
+  {
+    ratelimit[0] = std::make_shared<RateLimiter>(replacing, cv);
+    ratelimit[1] = std::make_shared<RateLimiter>(replacing, cv);
+  }
+  ~ActiveRateLimiter() {
+    ldpp_dout(this, 20) << "stopping ratelimit_gc thread" << dendl;
+    {
+      std::lock_guard l(cv_m);
+      stopped = true;
+    }
+    cv.notify_all();
+    // start() may never have been called; join() on a thread that is not
+    // joinable throws std::system_error, which would terminate from a destructor
+    if (runner.joinable()) {
+      runner.join();
+    }
+  }
+  // Returns the limiter that new requests should use.
+  std::shared_ptr<RateLimiter> get_active() {
+    return ratelimit[current_active];
+  }
+  void start() {
+    ldpp_dout(this, 20) << "starting ratelimit_gc thread" << dendl;
+    runner = 
std::thread(&ActiveRateLimiter::replace_active, this); + const auto rc = ceph_pthread_setname(runner.native_handle(), "ratelimit_gc"); + ceph_assert(rc==0); + } +}; diff --git a/src/rgw/rgw_realm.cc b/src/rgw/rgw_realm.cc new file mode 100644 index 000000000..8dd6d6f50 --- /dev/null +++ b/src/rgw/rgw_realm.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include "common/errno.h" + +#include "rgw_zone.h" +#include "rgw_realm_watcher.h" +#include "rgw_meta_sync_status.h" +#include "rgw_sal_config.h" +#include "rgw_string.h" +#include "rgw_sync.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#include "common/ceph_json.h" +#include "common/Formatter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +namespace rgw_zone_defaults { + +std::string realm_info_oid_prefix = "realms."; +std::string realm_names_oid_prefix = "realms_names."; +std::string default_realm_info_oid = "default.realm"; +std::string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root"; + +} + +using namespace std; +using namespace rgw_zone_defaults; + +RGWRealm::~RGWRealm() {} + +RGWRemoteMetaLog::~RGWRemoteMetaLog() +{ + delete error_logger; +} + +string RGWRealm::get_predefined_id(CephContext *cct) const { + return cct->_conf.get_val("rgw_realm_id"); +} + +const string& RGWRealm::get_predefined_name(CephContext *cct) const { + return cct->_conf->rgw_realm; +} + +int RGWRealm::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + int ret = RGWSystemMetaObj::create(dpp, y, exclusive); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + // create the control object for watch/notify + ret = create_control(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl; + 
return ret; + } + RGWPeriod period; + if (current_period.empty()) { + /* create new period for the realm */ + ret = period.init(dpp, cct, sysobj_svc, id, y, name, false); + if (ret < 0 ) { + return ret; + } + ret = period.create(dpp, y, true); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + } else { + period = RGWPeriod(current_period, 0); + int ret = period.init(dpp, cct, sysobj_svc, id, y, name); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init period " << current_period << dendl; + return ret; + } + } + ret = set_current_period(dpp, period, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed set current period " << current_period << dendl; + return ret; + } + // try to set as default. may race with another create, so pass exclusive=true + // so we don't override an existing default + ret = set_as_default(dpp, y, true); + if (ret < 0 && ret != -EEXIST) { + ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl; + } + + return 0; +} + +int RGWRealm::delete_obj(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = RGWSystemMetaObj::delete_obj(dpp, y); + if (ret < 0) { + return ret; + } + return delete_control(dpp, y); +} + +int RGWRealm::create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + auto pool = rgw_pool{get_pool(cct)}; + auto oid = get_control_oid(); + bufferlist bl; + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + return sysobj.wop() + .set_exclusive(exclusive) + .write(dpp, bl, y); +} + +int RGWRealm::delete_control(const DoutPrefixProvider *dpp, optional_yield y) +{ + auto pool = rgw_pool{get_pool(cct)}; + auto obj = rgw_raw_obj{pool, get_control_oid()}; + auto sysobj = sysobj_svc->get_obj(obj); + return sysobj.wop().remove(dpp, y); +} + +rgw_pool RGWRealm::get_pool(CephContext *cct) const +{ + if (cct->_conf->rgw_realm_root_pool.empty()) { + 
return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL); + } + return rgw_pool(cct->_conf->rgw_realm_root_pool); +} + +const string RGWRealm::get_default_oid(bool old_format) const +{ + if (cct->_conf->rgw_default_realm_info_oid.empty()) { + return default_realm_info_oid; + } + return cct->_conf->rgw_default_realm_info_oid; +} + +const string& RGWRealm::get_names_oid_prefix() const +{ + return realm_names_oid_prefix; +} + +const string& RGWRealm::get_info_oid_prefix(bool old_format) const +{ + return realm_info_oid_prefix; +} + +int RGWRealm::set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y) +{ + // update realm epoch to match the period's + if (epoch > period.get_realm_epoch()) { + ldpp_dout(dpp, 0) << "ERROR: set_current_period with old realm epoch " + << period.get_realm_epoch() << ", current epoch=" << epoch << dendl; + return -EINVAL; + } + if (epoch == period.get_realm_epoch() && current_period != period.get_id()) { + ldpp_dout(dpp, 0) << "ERROR: set_current_period with same realm epoch " + << period.get_realm_epoch() << ", but different period id " + << period.get_id() << " != " << current_period << dendl; + return -EINVAL; + } + + epoch = period.get_realm_epoch(); + current_period = period.get_id(); + + int ret = update(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = period.reflect(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +string RGWRealm::get_control_oid() const +{ + return get_info_oid_prefix() + id + ".control"; +} + +int RGWRealm::notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y) +{ + rgw_pool pool{get_pool(cct)}; + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, get_control_oid()}); + int ret = sysobj.wn().notify(dpp, bl, 0, nullptr, y); + if (ret < 0) { + return ret; + } + return 0; +} + +int 
RGWRealm::notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y) +{ + bufferlist bl; + using ceph::encode; + // push the period to dependent zonegroups/zones + encode(RGWRealmNotify::ZonesNeedPeriod, bl); + encode(period, bl); + // reload the gateway with the new period + encode(RGWRealmNotify::Reload, bl); + + return notify_zone(dpp, bl, y); +} + + +int RGWRealm::find_zone(const DoutPrefixProvider *dpp, + const rgw_zone_id& zid, + RGWPeriod *pperiod, + RGWZoneGroup *pzonegroup, + bool *pfound, + optional_yield y) const +{ + auto& found = *pfound; + + found = false; + + string period_id; + epoch_t epoch = 0; + + RGWPeriod period(period_id, epoch); + int r = period.init(dpp, cct, sysobj_svc, get_id(), y, get_name()); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: period init failed: " << cpp_strerror(-r) << " ... skipping" << dendl; + return r; + } + + found = period.find_zone(dpp, zid, pzonegroup, y); + if (found) { + *pperiod = period; + } + return 0; +} + +void RGWRealm::generate_test_instances(list &o) +{ + RGWRealm *z = new RGWRealm; + o.push_back(z); + o.push_back(new RGWRealm); +} + +void RGWRealm::dump(Formatter *f) const +{ + RGWSystemMetaObj::dump(f); + encode_json("current_period", current_period, f); + encode_json("epoch", epoch, f); +} + + +void RGWRealm::decode_json(JSONObj *obj) +{ + RGWSystemMetaObj::decode_json(obj); + JSONDecoder::decode_json("current_period", current_period, obj); + JSONDecoder::decode_json("epoch", epoch, obj); +} + diff --git a/src/rgw/rgw_realm_reloader.cc b/src/rgw/rgw_realm_reloader.cc new file mode 100644 index 000000000..182cf1639 --- /dev/null +++ b/src/rgw/rgw_realm_reloader.cc @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_realm_reloader.h" + +#include "rgw_auth_registry.h" +#include "rgw_bucket.h" +#include "rgw_log.h" +#include "rgw_rest.h" +#include "rgw_user.h" +#include 
"rgw_process_env.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" + +#include "services/svc_zone.h" + +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw realm reloader: ") + + +// safe callbacks from SafeTimer are unneccessary. reload() can take a long +// time, so we don't want to hold the mutex and block handle_notify() for the +// duration +static constexpr bool USE_SAFE_TIMER_CALLBACKS = false; + + +RGWRealmReloader::RGWRealmReloader(RGWProcessEnv& env, + const rgw::auth::ImplicitTenants& implicit_tenants, + std::map& service_map_meta, + Pauser* frontends) + : env(env), + implicit_tenants(implicit_tenants), + service_map_meta(service_map_meta), + frontends(frontends), + timer(env.driver->ctx(), mutex, USE_SAFE_TIMER_CALLBACKS), + mutex(ceph::make_mutex("RGWRealmReloader")), + reload_scheduled(nullptr) +{ + timer.init(); +} + +RGWRealmReloader::~RGWRealmReloader() +{ + std::lock_guard lock{mutex}; + timer.shutdown(); +} + +class RGWRealmReloader::C_Reload : public Context { + RGWRealmReloader* reloader; + public: + explicit C_Reload(RGWRealmReloader* reloader) : reloader(reloader) {} + void finish(int r) override { reloader->reload(); } +}; + +void RGWRealmReloader::handle_notify(RGWRealmNotify type, + bufferlist::const_iterator& p) +{ + if (!env.driver) { + /* we're in the middle of reload */ + return; + } + + CephContext *const cct = env.driver->ctx(); + + std::lock_guard lock{mutex}; + if (reload_scheduled) { + ldout(cct, 4) << "Notification on realm, reconfiguration " + "already scheduled" << dendl; + return; + } + + reload_scheduled = new C_Reload(this); + cond.notify_one(); // wake reload() if it blocked on a bad configuration + + // schedule reload() without delay + timer.add_event_after(0, reload_scheduled); + + ldout(cct, 4) << "Notification on realm, reconfiguration scheduled" << dendl; +} + +void RGWRealmReloader::reload() +{ + CephContext *const cct = env.driver->ctx(); + 
const DoutPrefix dp(cct, dout_subsys, "rgw realm reloader: "); + ldpp_dout(&dp, 1) << "Pausing frontends for realm update..." << dendl; + + frontends->pause(); + + ldpp_dout(&dp, 1) << "Frontends paused" << dendl; + + // TODO: make RGWRados responsible for rgw_log_usage lifetime + rgw_log_usage_finalize(); + + // destroy the existing driver + DriverManager::close_storage(env.driver); + env.driver = nullptr; + + ldpp_dout(&dp, 1) << "driver closed" << dendl; + { + // allow a new notify to reschedule us. it's important that we do this + // before we start loading the new realm, or we could miss some updates + std::lock_guard lock{mutex}; + reload_scheduled = nullptr; + } + + + while (!env.driver) { + // recreate and initialize a new driver + DriverManager::Config cfg; + cfg.store_name = "rados"; + cfg.filter_name = "none"; + env.driver = + DriverManager::get_storage(&dp, cct, + cfg, + cct->_conf->rgw_enable_gc_threads, + cct->_conf->rgw_enable_lc_threads, + cct->_conf->rgw_enable_quota_threads, + cct->_conf->rgw_run_sync_thread, + cct->_conf.get_val("rgw_dynamic_resharding"), + cct->_conf->rgw_cache_enabled); + + ldpp_dout(&dp, 1) << "Creating new driver" << dendl; + + rgw::sal::Driver* store_cleanup = nullptr; + { + std::unique_lock lock{mutex}; + + // failure to recreate RGWRados is not a recoverable error, but we + // don't want to assert or abort the entire cluster. instead, just + // sleep until we get another notification, and retry until we get + // a working configuration + if (env.driver == nullptr) { + ldpp_dout(&dp, -1) << "Failed to reinitialize RGWRados after a realm " + "configuration update. Waiting for a new update." << dendl; + + // sleep until another event is scheduled + cond.wait(lock, [this] { return reload_scheduled; }); + ldout(cct, 1) << "Woke up with a new configuration, retrying " + "RGWRados initialization." 
<< dendl; + } + + if (reload_scheduled) { + // cancel the event; we'll handle it now + timer.cancel_event(reload_scheduled); + reload_scheduled = nullptr; + + // if we successfully created a driver, clean it up outside of the lock, + // then continue to loop and recreate another + std::swap(env.driver, store_cleanup); + } + } + + if (store_cleanup) { + ldpp_dout(&dp, 4) << "Got another notification, restarting RGWRados " + "initialization." << dendl; + + DriverManager::close_storage(store_cleanup); + } + } + + int r = env.driver->register_to_service_map(&dp, "rgw", service_map_meta); + if (r < 0) { + ldpp_dout(&dp, -1) << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl; + + /* ignore error */ + } + + ldpp_dout(&dp, 1) << "Finishing initialization of new driver" << dendl; + // finish initializing the new driver + ldpp_dout(&dp, 1) << " - REST subsystem init" << dendl; + rgw_rest_init(cct, env.driver->get_zone()->get_zonegroup()); + ldpp_dout(&dp, 1) << " - usage subsystem init" << dendl; + rgw_log_usage_init(cct, env.driver); + + /* Initialize the registry of auth strategies which will coordinate + * the dynamic reconfiguration. */ + env.auth_registry = rgw::auth::StrategyRegistry::create( + cct, implicit_tenants, env.driver); + env.lua.manager = env.driver->get_lua_manager(); + + ldpp_dout(&dp, 1) << "Resuming frontends with new realm configuration." 
<< dendl; + + frontends->resume(env.driver); +} diff --git a/src/rgw/rgw_realm_reloader.h b/src/rgw/rgw_realm_reloader.h new file mode 100644 index 000000000..25082a2e4 --- /dev/null +++ b/src/rgw/rgw_realm_reloader.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_realm_watcher.h" +#include "common/Cond.h" +#include "rgw_sal_fwd.h" + +struct RGWProcessEnv; +namespace rgw::auth { class ImplicitTenants; } + +/** + * RGWRealmReloader responds to new period notifications by recreating RGWRados + * with the updated realm configuration. + */ +class RGWRealmReloader : public RGWRealmWatcher::Watcher { + public: + /** + * Pauser is an interface to pause/resume frontends. Frontend cooperation + * is required to ensure that they stop issuing requests on the old + * RGWRados instance, and restart with the updated configuration. + * + * This abstraction avoids a dependency on class RGWFrontend. 
+ */ + class Pauser { + public: + virtual ~Pauser() = default; + + /// pause all frontends while realm reconfiguration is in progress + virtual void pause() = 0; + /// resume all frontends with the given RGWRados instance + virtual void resume(rgw::sal::Driver* driver) = 0; + }; + + RGWRealmReloader(RGWProcessEnv& env, + const rgw::auth::ImplicitTenants& implicit_tenants, + std::map& service_map_meta, + Pauser* frontends); + ~RGWRealmReloader() override; + + /// respond to realm notifications by scheduling a reload() + void handle_notify(RGWRealmNotify type, bufferlist::const_iterator& p) override; + + private: + /// pause frontends and replace the RGWRados instance + void reload(); + + class C_Reload; //< Context that calls reload() + + RGWProcessEnv& env; + const rgw::auth::ImplicitTenants& implicit_tenants; + std::map& service_map_meta; + Pauser *const frontends; + + /// reload() takes a significant amount of time, so we don't want to run + /// it in the handle_notify() thread. we choose a timer thread instead of a + /// Finisher because it allows us to cancel events that were scheduled while + /// reload() is still running + SafeTimer timer; + ceph::mutex mutex; //< protects access to timer and reload_scheduled + ceph::condition_variable cond; //< to signal reload() after an invalid realm config + C_Reload* reload_scheduled; //< reload() context if scheduled +}; diff --git a/src/rgw/rgw_realm_watcher.cc b/src/rgw/rgw_realm_watcher.cc new file mode 100644 index 000000000..f6cd34759 --- /dev/null +++ b/src/rgw/rgw_realm_watcher.cc @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/errno.h" + +#include "rgw_realm_watcher.h" +#include "rgw_tools.h" +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw realm watcher: ") + + +RGWRealmWatcher::RGWRealmWatcher(const DoutPrefixProvider *dpp, CephContext* cct, 
const RGWRealm& realm) + : cct(cct) +{ + // no default realm, nothing to watch + if (realm.get_id().empty()) { + ldpp_dout(dpp, 4) << "No realm, disabling dynamic reconfiguration." << dendl; + return; + } + + // establish the watch on RGWRealm + int r = watch_start(dpp, realm); + if (r < 0) { + ldpp_dout(dpp, -1) << "Failed to establish a watch on RGWRealm, " + "disabling dynamic reconfiguration." << dendl; + return; + } +} + +RGWRealmWatcher::~RGWRealmWatcher() +{ + watch_stop(); +} + +void RGWRealmWatcher::add_watcher(RGWRealmNotify type, Watcher& watcher) +{ + watchers.emplace(type, watcher); +} + +void RGWRealmWatcher::handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) +{ + if (cookie != watch_handle) + return; + + // send an empty notify ack + bufferlist reply; + pool_ctx.notify_ack(watch_oid, notify_id, cookie, reply); + + try { + auto p = bl.cbegin(); + while (!p.end()) { + RGWRealmNotify notify; + decode(notify, p); + auto watcher = watchers.find(notify); + if (watcher == watchers.end()) { + lderr(cct) << "Failed to find a watcher for notify type " + << static_cast(notify) << dendl; + break; + } + watcher->second.handle_notify(notify, p); + } + } catch (const buffer::error &e) { + lderr(cct) << "Failed to decode realm notifications." 
<< dendl; + } +} + +void RGWRealmWatcher::handle_error(uint64_t cookie, int err) +{ + lderr(cct) << "RGWRealmWatcher::handle_error oid=" << watch_oid << " err=" << err << dendl; + if (cookie != watch_handle) + return; + + watch_restart(); +} + +int RGWRealmWatcher::watch_start(const DoutPrefixProvider *dpp, const RGWRealm& realm) +{ + // initialize a Rados client + int r = rados.init_with_context(cct); + if (r < 0) { + ldpp_dout(dpp, -1) << "Rados client initialization failed with " + << cpp_strerror(-r) << dendl; + return r; + } + r = rados.connect(); + if (r < 0) { + ldpp_dout(dpp, -1) << "Rados client connection failed with " + << cpp_strerror(-r) << dendl; + return r; + } + + // open an IoCtx for the realm's pool + rgw_pool pool(realm.get_pool(cct)); + r = rgw_init_ioctx(dpp, &rados, pool, pool_ctx); + if (r < 0) { + ldpp_dout(dpp, -1) << "Failed to open pool " << pool + << " with " << cpp_strerror(-r) << dendl; + rados.shutdown(); + return r; + } + + // register a watch on the realm's control object + auto oid = realm.get_control_oid(); + r = pool_ctx.watch2(oid, &watch_handle, this); + if (r < 0) { + ldpp_dout(dpp, -1) << "Failed to watch " << oid + << " with " << cpp_strerror(-r) << dendl; + pool_ctx.close(); + rados.shutdown(); + return r; + } + + ldpp_dout(dpp, 10) << "Watching " << oid << dendl; + std::swap(watch_oid, oid); + return 0; +} + +int RGWRealmWatcher::watch_restart() +{ + ceph_assert(!watch_oid.empty()); + int r = pool_ctx.unwatch2(watch_handle); + if (r < 0) { + lderr(cct) << "Failed to unwatch on " << watch_oid + << " with " << cpp_strerror(-r) << dendl; + } + r = pool_ctx.watch2(watch_oid, &watch_handle, this); + if (r < 0) { + lderr(cct) << "Failed to restart watch on " << watch_oid + << " with " << cpp_strerror(-r) << dendl; + pool_ctx.close(); + watch_oid.clear(); + } + return r; +} + +void RGWRealmWatcher::watch_stop() +{ + if (!watch_oid.empty()) { + pool_ctx.unwatch2(watch_handle); + pool_ctx.close(); + watch_oid.clear(); + } +} diff 
--git a/src/rgw/rgw_realm_watcher.h b/src/rgw/rgw_realm_watcher.h new file mode 100644 index 000000000..2a0c0d076 --- /dev/null +++ b/src/rgw/rgw_realm_watcher.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "include/rados/librados.hpp" +#include "include/ceph_assert.h" +#include "common/Timer.h" +#include "common/Cond.h" + +class RGWRados; +class RGWRealm; + +enum class RGWRealmNotify { + Reload, + ZonesNeedPeriod, +}; +WRITE_RAW_ENCODER(RGWRealmNotify); + +/** + * RGWRealmWatcher establishes a watch on the current RGWRealm's control object, + * and forwards notifications to registered observers. + */ +class RGWRealmWatcher : public librados::WatchCtx2 { + public: + /** + * Watcher is an interface that allows the RGWRealmWatcher to pass + * notifications on to other interested objects. + */ + class Watcher { + public: + virtual ~Watcher() = default; + + virtual void handle_notify(RGWRealmNotify type, + bufferlist::const_iterator& p) = 0; + }; + + RGWRealmWatcher(const DoutPrefixProvider *dpp, CephContext* cct, const RGWRealm& realm); + ~RGWRealmWatcher() override; + + /// register a watcher for the given notification type + void add_watcher(RGWRealmNotify type, Watcher& watcher); + + /// respond to realm notifications by calling the appropriate watcher + void handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) override; + + /// reestablish the watch if it gets disconnected + void handle_error(uint64_t cookie, int err) override; + + private: + CephContext *const cct; + + /// keep a separate Rados client whose lifetime is independent of RGWRados + /// so that we don't miss notifications during realm reconfiguration + librados::Rados rados; + librados::IoCtx pool_ctx; + uint64_t watch_handle = 0; + std::string watch_oid; + + int watch_start(const DoutPrefixProvider *dpp, const RGWRealm& realm); + int watch_restart(); + void 
watch_stop(); + + std::map watchers; +}; diff --git a/src/rgw/rgw_request.h b/src/rgw/rgw_request.h new file mode 100644 index 000000000..cd05f51c9 --- /dev/null +++ b/src/rgw/rgw_request.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" +#include "rgw_acl.h" +#include "rgw_user.h" +#include "rgw_op.h" + +#include "common/QueueRing.h" + +#include + +struct RGWRequest +{ + uint64_t id; + req_state *s; + RGWOp *op; + + explicit RGWRequest(uint64_t id) : id(id), s(NULL), op(NULL) {} + + virtual ~RGWRequest() {} + + void init_state(req_state *_s) { + s = _s; + } +}; /* RGWRequest */ + +struct RGWLoadGenRequest : public RGWRequest { + std::string method; + std::string resource; + int content_length; + std::atomic* fail_flag = nullptr; + +RGWLoadGenRequest(uint64_t req_id, const std::string& _m, const std::string& _r, int _cl, + std::atomic *ff) + : RGWRequest(req_id), method(_m), resource(_r), content_length(_cl), + fail_flag(ff) {} +}; diff --git a/src/rgw/rgw_resolve.cc b/src/rgw/rgw_resolve.cc new file mode 100644 index 000000000..b6f258ee0 --- /dev/null +++ b/src/rgw/rgw_resolve.cc @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include "acconfig.h" + +#ifdef HAVE_ARPA_NAMESER_COMPAT_H +#include +#endif + +#include "rgw_common.h" +#include "rgw_resolve.h" +#include "common/dns_resolve.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWResolver::~RGWResolver() { +} + +RGWResolver::RGWResolver() { + resolver = DNSResolver::get_instance(); +} + +int RGWResolver::resolve_cname(const string& hostname, string& cname, bool *found) { + return resolver->resolve_cname(g_ceph_context, hostname, &cname, found); +} + +RGWResolver *rgw_resolver; + + +void rgw_init_resolver() +{ + rgw_resolver = new RGWResolver(); +} + 
+void rgw_shutdown_resolver() +{ + delete rgw_resolver; +} diff --git a/src/rgw/rgw_resolve.h b/src/rgw/rgw_resolve.h new file mode 100644 index 000000000..0428e0a02 --- /dev/null +++ b/src/rgw/rgw_resolve.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" + +namespace ceph { + class DNSResolver; +} + +class RGWResolver { + DNSResolver *resolver; + +public: + ~RGWResolver(); + RGWResolver(); + int resolve_cname(const std::string& hostname, std::string& cname, bool *found); +}; + + +extern void rgw_init_resolver(void); +extern void rgw_shutdown_resolver(void); +extern RGWResolver *rgw_resolver; diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc new file mode 100644 index 000000000..a1741e6dc --- /dev/null +++ b/src/rgw/rgw_rest.cc @@ -0,0 +1,2335 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include +#include + +#include +#include +#include "common/Formatter.h" +#include "common/HTMLFormatter.h" +#include "common/utf8.h" +#include "include/str_list.h" +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_auth_s3.h" +#include "rgw_formats.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_swift.h" +#include "rgw_rest_s3.h" +#include "rgw_swift_auth.h" +#include "rgw_cors_s3.h" +#include "rgw_perf_counters.h" + +#include "rgw_client_io.h" +#include "rgw_resolve.h" +#include "rgw_sal_rados.h" + +#include "rgw_ratelimit.h" +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +struct rgw_http_status_code { + int code; + const char *name; +}; + +const static struct rgw_http_status_code http_codes[] = { + { 100, "Continue" }, + { 200, "OK" }, + { 201, "Created" }, + { 202, "Accepted" }, + { 204, "No Content" }, + { 205, "Reset Content" }, + { 206, "Partial Content" }, + { 207, "Multi Status" }, + { 208, "Already 
Reported" }, + { 300, "Multiple Choices" }, + { 301, "Moved Permanently" }, + { 302, "Found" }, + { 303, "See Other" }, + { 304, "Not Modified" }, + { 305, "Use Proxy" }, // reason phrase as registered in RFC 9110 section 15.4.6 + { 306, "Switch Proxy" }, + { 307, "Temporary Redirect" }, + { 308, "Permanent Redirect" }, + { 400, "Bad Request" }, + { 401, "Unauthorized" }, + { 402, "Payment Required" }, + { 403, "Forbidden" }, + { 404, "Not Found" }, + { 405, "Method Not Allowed" }, + { 406, "Not Acceptable" }, + { 407, "Proxy Authentication Required" }, + { 408, "Request Timeout" }, + { 409, "Conflict" }, + { 410, "Gone" }, + { 411, "Length Required" }, + { 412, "Precondition Failed" }, + { 413, "Request Entity Too Large" }, + { 414, "Request-URI Too Long" }, + { 415, "Unsupported Media Type" }, + { 416, "Requested Range Not Satisfiable" }, + { 417, "Expectation Failed" }, + { 422, "Unprocessable Entity" }, + { 498, "Rate Limited"}, + { 500, "Internal Server Error" }, + { 501, "Not Implemented" }, + { 503, "Slow Down"}, + { 0, NULL }, // sentinel: the loop that fills http_status_names stops at code 0 +}; + +struct rgw_http_attr { + const char *rgw_attr; + const char *http_attr; +}; + +/* + * mapping between rgw object attrs and output http fields + */ +static const struct rgw_http_attr base_rgw_to_http_attrs[] = { + { RGW_ATTR_CONTENT_LANG, "Content-Language" }, + { RGW_ATTR_EXPIRES, "Expires" }, + { RGW_ATTR_CACHE_CONTROL, "Cache-Control" }, + { RGW_ATTR_CONTENT_DISP, "Content-Disposition" }, + { RGW_ATTR_CONTENT_ENC, "Content-Encoding" }, + { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" }, + { RGW_ATTR_X_ROBOTS_TAG , "X-Robots-Tag" }, + { RGW_ATTR_STORAGE_CLASS , "X-Amz-Storage-Class" }, + /* RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode: + * S3 endpoint: x-amz-website-redirect-location + * S3Website endpoint: Location + */ + { RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" }, +}; + + +struct generic_attr { + const char *http_header; + const char *rgw_attr; +}; + +/* + * mapping between http env fields and rgw object attrs + */ +static 
const struct generic_attr generic_attrs[] = { + { "CONTENT_TYPE", RGW_ATTR_CONTENT_TYPE }, + { "HTTP_CONTENT_LANGUAGE", RGW_ATTR_CONTENT_LANG }, + { "HTTP_EXPIRES", RGW_ATTR_EXPIRES }, + { "HTTP_CACHE_CONTROL", RGW_ATTR_CACHE_CONTROL }, + { "HTTP_CONTENT_DISPOSITION", RGW_ATTR_CONTENT_DISP }, + { "HTTP_CONTENT_ENCODING", RGW_ATTR_CONTENT_ENC }, + { "HTTP_X_ROBOTS_TAG", RGW_ATTR_X_ROBOTS_TAG }, +}; + +map rgw_to_http_attrs; +static map generic_attrs_map; +map http_status_names; + +/* + * make attrs look_like_this + * converts dashes to underscores and every other character to lower case + */ +string lowercase_underscore_http_attr(const string& orig) +{ + // work on a std::string copy: no variable-length stack array sized by the caller's input + string out(orig); + for (char& c : out) { + // tolower() is undefined for negative char values, so convert via unsigned char + c = (c == '-') ? '_' : static_cast<char>(tolower(static_cast<unsigned char>(c))); + } + return out; +} + +/* + * make attrs LOOK_LIKE_THIS + * converts dashes to underscores and every other character to upper case + */ +string uppercase_underscore_http_attr(const string& orig) +{ + // work on a std::string copy: no variable-length stack array sized by the caller's input + string out(orig); + for (char& c : out) { + // toupper() is undefined for negative char values, so convert via unsigned char + c = (c == '-') ? '_' : static_cast<char>(toupper(static_cast<unsigned char>(c))); + } + return out; +} + +/* avoid duplicate hostnames in hostnames lists */ +static set hostnames_set; +static set hostnames_s3website_set; + +void rgw_rest_init(CephContext *cct, const rgw::sal::ZoneGroup& zone_group) +{ + for (const auto& rgw2http : base_rgw_to_http_attrs) { + rgw_to_http_attrs[rgw2http.rgw_attr] = rgw2http.http_attr; + } + + for (const auto& http2rgw : generic_attrs) { + generic_attrs_map[http2rgw.http_header] = http2rgw.rgw_attr; + } + + list extended_http_attrs; + get_str_list(cct->_conf->rgw_extended_http_attrs, extended_http_attrs); + + list::iterator iter; + for (iter = extended_http_attrs.begin(); iter != extended_http_attrs.end(); ++iter) { + string rgw_attr = RGW_ATTR_PREFIX; + 
rgw_attr.append(lowercase_underscore_http_attr(*iter)); + + rgw_to_http_attrs[rgw_attr] = camelcase_dash_http_attr(*iter); + + string http_header = "HTTP_"; + http_header.append(uppercase_underscore_http_attr(*iter)); + + generic_attrs_map[http_header] = rgw_attr; + } + + for (const struct rgw_http_status_code *h = http_codes; h->code; h++) { + http_status_names[h->code] = h->name; + } + + std::list rgw_dns_names; + std::string rgw_dns_names_str = cct->_conf->rgw_dns_name; + get_str_list(rgw_dns_names_str, ", ", rgw_dns_names); + hostnames_set.insert(rgw_dns_names.begin(), rgw_dns_names.end()); + + std::list names; + zone_group.get_hostnames(names); + hostnames_set.insert(names.begin(), names.end()); + hostnames_set.erase(""); // filter out empty hostnames + ldout(cct, 20) << "RGW hostnames: " << hostnames_set << dendl; + /* TODO: We should have a sanity check that no hostname matches the end of + * any other hostname, otherwise we will get ambiguous results from + * rgw_find_host_in_domains. 
+ * Eg: + * Hostnames: [A, B.A] + * Inputs: [Z.A, X.B.A] + * Z.A clearly splits to subdomain=Z, domain=A + * X.B.A ambiguously splits to both {X, B.A} and {X.B, A} + */ + + zone_group.get_s3website_hostnames(names); + hostnames_s3website_set.insert(cct->_conf->rgw_dns_s3website_name); + hostnames_s3website_set.insert(names.begin(), names.end()); + hostnames_s3website_set.erase(""); // filter out empty hostnames + ldout(cct, 20) << "RGW S3website hostnames: " << hostnames_s3website_set << dendl; + /* TODO: we should repeat the hostnames_set sanity check here + * and ALSO decide about overlap, if any + */ +} + +/* + * case-insensitive test for whether s ends with suffix. + * when pos is non-null and suffix is not longer than s, *pos receives + * s.size() - suffix.size(), whether or not the suffix actually matches + */ +static bool str_ends_with_nocase(const string& s, const string& suffix, size_t *pos) +{ + size_t len = suffix.size(); + if (len > (size_t)s.size()) { + return false; + } + + ssize_t p = s.size() - len; + if (pos) { + *pos = p; + } + + return boost::algorithm::iends_with(s, suffix); +} + +static bool rgw_find_host_in_domains(const string& host, string *domain, string *subdomain, + const set& valid_hostnames_set) +{ + set::iterator iter; + /** TODO, Future optimization + * store hostnames_set elements _reversed_, and look for a prefix match, + * which is much faster than a suffix match. 
+ */ + for (iter = valid_hostnames_set.begin(); iter != valid_hostnames_set.end(); ++iter) { + size_t pos; + if (!str_ends_with_nocase(host, *iter, &pos)) + continue; + + if (pos == 0) { + *domain = host; + subdomain->clear(); + } else { + if (host[pos - 1] != '.') { + continue; + } + + *domain = host.substr(pos); + *subdomain = host.substr(0, pos - 1); + } + return true; + } + return false; +} + +static void dump_status(req_state *s, int status, + const char *status_name) +{ + s->formatter->set_status(status, status_name); + try { + RESTFUL_IO(s)->send_status(status, status_name); + } catch (rgw::io::Exception& e) { + ldpp_dout(s, 0) << "ERROR: s->cio->send_status() returned err=" + << e.what() << dendl; + } +} + +void rgw_flush_formatter_and_reset(req_state *s, Formatter *formatter) +{ + std::ostringstream oss; + formatter->output_footer(); + formatter->flush(oss); + std::string outs(oss.str()); + if (!outs.empty() && s->op != OP_HEAD) { + dump_body(s, outs); + } + + s->formatter->reset(); +} + +void rgw_flush_formatter(req_state *s, Formatter *formatter) +{ + std::ostringstream oss; + formatter->flush(oss); + std::string outs(oss.str()); + if (!outs.empty() && s->op != OP_HEAD) { + dump_body(s, outs); + } +} + +void dump_errno(int http_ret, string& out) { + stringstream ss; + + ss << http_ret << " " << http_status_names[http_ret]; + out = ss.str(); +} + +void dump_errno(const struct rgw_err &err, string& out) { + dump_errno(err.http_ret, out); +} + +void dump_errno(req_state *s) +{ + dump_status(s, s->err.http_ret, http_status_names[s->err.http_ret]); +} + +void dump_errno(req_state *s, int http_ret) +{ + dump_status(s, http_ret, http_status_names[http_ret]); +} + +void dump_header(req_state* const s, + const std::string_view& name, + const std::string_view& val) +{ + try { + RESTFUL_IO(s)->send_header(name, val); + } catch (rgw::io::Exception& e) { + ldpp_dout(s, 0) << "ERROR: s->cio->send_header() returned err=" + << e.what() << dendl; + } +} + +void 
dump_header(req_state* const s, + const std::string_view& name, + ceph::buffer::list& bl) +{ + return dump_header(s, name, rgw_sanitized_hdrval(bl)); +} + +void dump_header(req_state* const s, + const std::string_view& name, + const long long val) +{ + char buf[32]; + const auto len = snprintf(buf, sizeof(buf), "%lld", val); + + return dump_header(s, name, std::string_view(buf, len)); +} + +void dump_header(req_state* const s, + const std::string_view& name, + const utime_t& ut) +{ + char buf[32]; + const auto len = snprintf(buf, sizeof(buf), "%lld.%05d", + static_cast(ut.sec()), + static_cast(ut.usec() / 10)); + + return dump_header(s, name, std::string_view(buf, len)); +} + +void dump_content_length(req_state* const s, const uint64_t len) +{ + try { + RESTFUL_IO(s)->send_content_length(len); + } catch (rgw::io::Exception& e) { + ldpp_dout(s, 0) << "ERROR: s->cio->send_content_length() returned err=" + << e.what() << dendl; + } + dump_header(s, "Accept-Ranges", "bytes"); +} + +static void dump_chunked_encoding(req_state* const s) +{ + try { + RESTFUL_IO(s)->send_chunked_transfer_encoding(); + } catch (rgw::io::Exception& e) { + ldpp_dout(s, 0) << "ERROR: RESTFUL_IO(s)->send_chunked_transfer_encoding()" + << " returned err=" << e.what() << dendl; + } +} + +void dump_etag(req_state* const s, + const std::string_view& etag, + const bool quoted) +{ + if (etag.empty()) { + return; + } + + if (s->prot_flags & RGW_REST_SWIFT && ! quoted) { + return dump_header(s, "etag", etag); + } else { + return dump_header_quoted(s, "ETag", etag); + } +} + +void dump_bucket_from_state(req_state *s) +{ + if (g_conf()->rgw_expose_bucket && ! s->bucket_name.empty()) { + if (! 
s->bucket_tenant.empty()) { + dump_header(s, "Bucket", + url_encode(s->bucket_tenant + "/" + s->bucket_name)); + } else { + dump_header(s, "Bucket", url_encode(s->bucket_name)); + } + } +} + +void dump_redirect(req_state * const s, const std::string& redirect) +{ + return dump_header_if_nonempty(s, "Location", redirect); +} + +/* + * format t in UTC as "%a, %d %b %Y %H:%M:%S %Z" into timestr. + * returns the number of bytes written (excluding the terminating NUL), or 0 + * when the time cannot be converted or formatted; on 0 the buffer contents + * must not be used + */ +static size_t dump_time_header_impl(char (&timestr)[TIME_BUF_SIZE], + const real_time t) +{ + const utime_t ut(t); + time_t secs = static_cast<time_t>(ut.sec()); + + struct tm result; + const struct tm * const tmp = gmtime_r(&secs, &result); + if (tmp == nullptr) { + return 0; + } + + return strftime(timestr, sizeof(timestr), + "%a, %d %b %Y %H:%M:%S %Z", tmp); +} + +void dump_time_header(req_state *s, const char *name, real_time t) +{ + char timestr[TIME_BUF_SIZE]; + + const size_t len = dump_time_header_impl(timestr, t); + if (len == 0) { + return; + } + + return dump_header(s, name, std::string_view(timestr, len)); +} + +/* + * same formatting as dump_time_header(), returned as a string. + * returns an empty string when the time cannot be formatted + */ +std::string dump_time_to_str(const real_time& t) +{ + char timestr[TIME_BUF_SIZE]; + const size_t len = dump_time_header_impl(timestr, t); // 0 means timestr was not filled in + + return std::string(timestr, len); +} + + +void dump_last_modified(req_state *s, real_time t) +{ + dump_time_header(s, "Last-Modified", t); +} + +void dump_epoch_header(req_state *s, const char *name, real_time t) +{ + utime_t ut(t); + char buf[65]; + const auto len = snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + (long long)ut.nsec()); + + return dump_header(s, name, std::string_view(buf, len)); +} + +void dump_time(req_state *s, const char *name, real_time t) +{ + char buf[TIME_BUF_SIZE]; + rgw_to_iso8601(t, buf, sizeof(buf)); + + s->formatter->dump_string(name, buf); +} + +void dump_owner(req_state *s, const rgw_user& id, const string& name, + const char *section) +{ + if (!section) + section = "Owner"; + s->formatter->open_object_section(section); + s->formatter->dump_string("ID", id.to_str()); + s->formatter->dump_string("DisplayName", name); + s->formatter->close_section(); +} + +void 
dump_access_control(req_state *s, const char *origin, + const char *meth, + const char *hdr, const char *exp_hdr, + uint32_t max_age) { + if (origin && (origin[0] != '\0')) { + dump_header(s, "Access-Control-Allow-Origin", origin); + /* If the server specifies an origin host rather than "*", + * then it must also include Origin in the Vary response header + * to indicate to clients that server responses will differ + * based on the value of the Origin request header. + */ + if (strcmp(origin, "*") != 0) { + dump_header(s, "Vary", "Origin"); + } + + if (meth && (meth[0] != '\0')) { + dump_header(s, "Access-Control-Allow-Methods", meth); + } + if (hdr && (hdr[0] != '\0')) { + dump_header(s, "Access-Control-Allow-Headers", hdr); + } + if (exp_hdr && (exp_hdr[0] != '\0')) { + dump_header(s, "Access-Control-Expose-Headers", exp_hdr); + } + if (max_age != CORS_MAX_AGE_INVALID) { + dump_header(s, "Access-Control-Max-Age", max_age); + } + } +} + +void dump_access_control(req_state *s, RGWOp *op) +{ + string origin; + string method; + string header; + string exp_header; + unsigned max_age = CORS_MAX_AGE_INVALID; + + if (!op->generate_cors_headers(origin, method, header, exp_header, &max_age)) + return; + + dump_access_control(s, origin.c_str(), method.c_str(), header.c_str(), + exp_header.c_str(), max_age); +} + +void dump_start(req_state *s) +{ + if (!s->content_started) { + s->formatter->output_header(); + s->content_started = true; + } +} + +void dump_trans_id(req_state *s) +{ + if (s->prot_flags & RGW_REST_SWIFT) { + dump_header(s, "X-Trans-Id", s->trans_id); + dump_header(s, "X-Openstack-Request-Id", s->trans_id); + } else if (s->trans_id.length()) { + dump_header(s, "x-amz-request-id", s->trans_id); + } +} + +void end_header(req_state* s, RGWOp* op, const char *content_type, + const int64_t proposed_content_length, bool force_content_type, + bool force_no_error) +{ + string ctype; + + dump_trans_id(s); + + if ((!s->is_err()) && s->bucket && + 
(s->bucket->get_info().owner != s->user->get_id()) && + (s->bucket->get_info().requester_pays)) { + dump_header(s, "x-amz-request-charged", "requester"); + } + + if (op) { + dump_access_control(s, op); + } + + if (s->prot_flags & RGW_REST_SWIFT && !content_type) { + force_content_type = true; + } + + /* do not send content type if content length is zero + and the content type was not set by the user */ + if (force_content_type || + (!content_type && s->formatter->get_len() != 0) || s->is_err()){ + ctype = to_mime_type(s->format); + if (s->prot_flags & RGW_REST_SWIFT) + ctype.append("; charset=utf-8"); + content_type = ctype.c_str(); + } + if (!force_no_error && s->is_err()) { + dump_start(s); + dump(s); + dump_content_length(s, s->formatter->get_len()); + } else { + if (proposed_content_length == CHUNKED_TRANSFER_ENCODING) { + dump_chunked_encoding(s); + } else if (proposed_content_length != NO_CONTENT_LENGTH) { + dump_content_length(s, proposed_content_length); + } + } + + if (content_type) { + dump_header(s, "Content-Type", content_type); + } + dump_header_if_nonempty(s, "Server", g_conf()->rgw_service_provider_name); + + try { + RESTFUL_IO(s)->complete_header(); + } catch (rgw::io::Exception& e) { + ldpp_dout(s, 0) << "ERROR: RESTFUL_IO(s)->complete_header() returned err=" + << e.what() << dendl; + } + + ACCOUNTING_IO(s)->set_account(true); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void build_redirect_url(req_state *s, const string& redirect_base, string *redirect_url) +{ + string& dest_uri = *redirect_url; + + dest_uri = redirect_base; + /* + * reqest_uri is always start with slash, so we need to remove + * the unnecessary slash at the end of dest_uri. 
+ */ + if (dest_uri[dest_uri.size() - 1] == '/') { + dest_uri = dest_uri.substr(0, dest_uri.size() - 1); + } + dest_uri += s->info.request_uri; + dest_uri += "?"; + dest_uri += s->info.request_params; +} + +void abort_early(req_state *s, RGWOp* op, int err_no, + RGWHandler* handler, optional_yield y) +{ + string error_content(""); + if (!s->formatter) { + s->formatter = new JSONFormatter; + s->format = RGWFormat::JSON; + } + + // op->error_handler is responsible for calling it's handler error_handler + if (op != NULL) { + int new_err_no; + new_err_no = op->error_handler(err_no, &error_content, y); + ldpp_dout(s, 1) << "op->ERRORHANDLER: err_no=" << err_no + << " new_err_no=" << new_err_no << dendl; + err_no = new_err_no; + } else if (handler != NULL) { + int new_err_no; + new_err_no = handler->error_handler(err_no, &error_content, y); + ldpp_dout(s, 1) << "handler->ERRORHANDLER: err_no=" << err_no + << " new_err_no=" << new_err_no << dendl; + err_no = new_err_no; + } + + // If the error handler(s) above dealt with it completely, they should have + // returned 0. If non-zero, we need to continue here. + if (err_no) { + // Watch out, we might have a custom error state already set! 
+ if (!s->err.http_ret || s->err.http_ret == 200) { + set_req_state_err(s, err_no); + } + + if (s->err.http_ret == 404 && !s->redirect_zone_endpoint.empty()) { + s->err.http_ret = 301; + err_no = -ERR_PERMANENT_REDIRECT; + build_redirect_url(s, s->redirect_zone_endpoint, &s->redirect); + } + + dump_errno(s); + dump_bucket_from_state(s); + if (err_no == -ERR_PERMANENT_REDIRECT || err_no == -ERR_WEBSITE_REDIRECT) { + string dest_uri; + if (!s->redirect.empty()) { + dest_uri = s->redirect; + } else if (!s->zonegroup_endpoint.empty()) { + build_redirect_url(s, s->zonegroup_endpoint, &dest_uri); + } + + if (!dest_uri.empty()) { + dump_redirect(s, dest_uri); + } + } + + if (!error_content.empty()) { + /* + * TODO we must add all error entries as headers here: + * when having a working errordoc, then the s3 error fields are + * rendered as HTTP headers, e.g.: + * x-amz-error-code: NoSuchKey + * x-amz-error-message: The specified key does not exist. + * x-amz-error-detail-Key: foo + */ + end_header(s, op, NULL, error_content.size(), false, true); + RESTFUL_IO(s)->send_body(error_content.c_str(), error_content.size()); + } else { + end_header(s, op); + } + } + perfcounter->inc(l_rgw_failed_req); +} + +void dump_continue(req_state * const s) +{ + try { + RESTFUL_IO(s)->send_100_continue(); + } catch (rgw::io::Exception& e) { + ldpp_dout(s, 0) << "ERROR: RESTFUL_IO(s)->send_100_continue() returned err=" + << e.what() << dendl; + } +} + +void dump_range(req_state* const s, + const uint64_t ofs, + const uint64_t end, + const uint64_t total) +{ + /* dumping range into temp buffer first, as libfcgi will fail to digest + * %lld */ + char range_buf[128]; + size_t len; + + if (! 
total) { + len = snprintf(range_buf, sizeof(range_buf), "bytes */%lld", + static_cast(total)); + } else { + len = snprintf(range_buf, sizeof(range_buf), "bytes %lld-%lld/%lld", + static_cast(ofs), + static_cast(end), + static_cast(total)); + } + + return dump_header(s, "Content-Range", std::string_view(range_buf, len)); +} + + +int dump_body(req_state* const s, + const char* const buf, + const size_t len) +{ + bool healthchk = false; + // we dont want to limit health checks + if(s->op_type == RGW_OP_GET_HEALTH_CHECK) + healthchk = true; + if(len > 0 && !healthchk) { + const char *method = s->info.method; + s->ratelimit_data->decrease_bytes(method, s->ratelimit_user_name, len, &s->user_ratelimit); + if(!rgw::sal::Bucket::empty(s->bucket.get())) + s->ratelimit_data->decrease_bytes(method, s->ratelimit_bucket_marker, len, &s->bucket_ratelimit); + } + try { + return RESTFUL_IO(s)->send_body(buf, len); + } catch (rgw::io::Exception& e) { + return -e.code().value(); + } +} + +int dump_body(req_state* const s, /* const */ ceph::buffer::list& bl) +{ + return dump_body(s, bl.c_str(), bl.length()); +} + +int dump_body(req_state* const s, const std::string& str) +{ + return dump_body(s, str.c_str(), str.length()); +} + +int recv_body(req_state* const s, + char* const buf, + const size_t max) +{ + int len; + try { + len = RESTFUL_IO(s)->recv_body(buf, max); + } catch (rgw::io::Exception& e) { + return -e.code().value(); + } + bool healthchk = false; + // we dont want to limit health checks + if(s->op_type == RGW_OP_GET_HEALTH_CHECK) + healthchk = true; + if(len > 0 && !healthchk) { + const char *method = s->info.method; + s->ratelimit_data->decrease_bytes(method, s->ratelimit_user_name, len, &s->user_ratelimit); + if(!rgw::sal::Bucket::empty(s->bucket.get())) + s->ratelimit_data->decrease_bytes(method, s->ratelimit_bucket_marker, len, &s->bucket_ratelimit); + } + return len; + +} + +int RGWGetObj_ObjStore::get_params(optional_yield y) +{ + range_str = 
s->info.env->get("HTTP_RANGE"); + if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_IF_NONE_MATCH"); + + if (s->system_request) { + mod_zone_id = s->info.env->get_int("HTTP_DEST_ZONE_SHORT_ID", 0); + mod_pg_ver = s->info.env->get_int("HTTP_DEST_PG_VER", 0); + rgwx_stat = s->info.args.exists(RGW_SYS_PARAM_PREFIX "stat"); + get_data &= (!rgwx_stat); + } + + if (s->info.args.exists(GET_TORRENT)) { + return torrent.get_params(); + } + return 0; +} + +int RESTArgs::get_string(req_state *s, const string& name, + const string& def_val, string *val, bool *existed) +{ + bool exists; + *val = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + return 0; +} + +int RESTArgs::get_uint64(req_state *s, const string& name, + uint64_t def_val, uint64_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtoull(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_int64(req_state *s, const string& name, + int64_t def_val, int64_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtoll(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_uint32(req_state *s, const string& name, + uint32_t def_val, uint32_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtoul(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_int32(req_state *s, const string& name, + int32_t def_val, 
int32_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtol(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_time(req_state *s, const string& name, + const utime_t& def_val, utime_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + uint64_t epoch, nsec; + + int r = utime_t::parse_date(sval, &epoch, &nsec); + if (r < 0) + return r; + + *val = utime_t(epoch, nsec); + + return 0; +} + +int RESTArgs::get_epoch(req_state *s, const string& name, uint64_t def_val, uint64_t *epoch, bool *existed) +{ + bool exists; + string date = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *epoch = def_val; + return 0; + } + + int r = utime_t::parse_date(date, epoch, NULL); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_bool(req_state *s, const string& name, bool def_val, bool *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + const char *str = sval.c_str(); + + if (sval.empty() || + strcasecmp(str, "true") == 0 || + sval.compare("1") == 0) { + *val = true; + return 0; + } + + if (strcasecmp(str, "false") != 0 && + sval.compare("0") != 0) { + *val = def_val; + return -EINVAL; + } + + *val = false; + return 0; +} + + +void RGWRESTFlusher::do_start(int ret) +{ + set_req_state_err(s, ret); /* no going back from here */ + dump_errno(s); + dump_start(s); + end_header(s, op); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWRESTFlusher::do_flush() +{ + rgw_flush_formatter(s, s->formatter); +} + +int RGWPutObj_ObjStore::verify_params() +{ + if (s->length) { + off_t len = 
atoll(s->length); + if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) { + return -ERR_TOO_LARGE; + } + } + + return 0; +} + +int RGWPutObj_ObjStore::get_params(optional_yield y) +{ + /* start gettorrent */ + if (s->cct->_conf->rgw_torrent_flag) + { + int ret = 0; + ret = torrent.get_params(); + ldpp_dout(s, 5) << "NOTICE: open produce torrent file " << dendl; + if (ret < 0) + { + return ret; + } + torrent.set_info_name(s->object->get_name()); + } + /* end gettorrent */ + supplied_md5_b64 = s->info.env->get("HTTP_CONTENT_MD5"); + + return 0; +} + +int RGWPutObj_ObjStore::get_data(bufferlist& bl) +{ + size_t cl; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + if (s->length) { + cl = atoll(s->length) - ofs; + if (cl > chunk_size) + cl = chunk_size; + } else { + cl = chunk_size; + } + + int len = 0; + { + ACCOUNTING_IO(s)->set_account(true); + bufferptr bp(cl); + + const auto read_len = recv_body(s, bp.c_str(), cl); + if (read_len < 0) { + return read_len; + } + + len = read_len; + bl.append(bp, 0, len); + + ACCOUNTING_IO(s)->set_account(false); + } + + if ((uint64_t)ofs + len > s->cct->_conf->rgw_max_put_size) { + return -ERR_TOO_LARGE; + } + + return len; +} + + +/* + * parses params in the format: 'first; param1=foo; param2=bar' + */ +void RGWPostObj_ObjStore::parse_boundary_params(const std::string& params_str, + std::string& first, + std::map& params) +{ + size_t pos = params_str.find(';'); + if (std::string::npos == pos) { + first = rgw_trim_whitespace(params_str); + return; + } + + first = rgw_trim_whitespace(params_str.substr(0, pos)); + pos++; + + while (pos < params_str.size()) { + size_t end = params_str.find(';', pos); + if (std::string::npos == end) { + end = params_str.size(); + } + + std::string param = params_str.substr(pos, end - pos); + size_t eqpos = param.find('='); + + if (std::string::npos != eqpos) { + std::string param_name = rgw_trim_whitespace(param.substr(0, eqpos)); + std::string val = rgw_trim_quotes(param.substr(eqpos + 1)); 
+ params[std::move(param_name)] = std::move(val); + } else { + params[rgw_trim_whitespace(param)] = ""; + } + + pos = end + 1; + } +} + +int RGWPostObj_ObjStore::parse_part_field(const std::string& line, + std::string& field_name, /* out */ + post_part_field& field) /* out */ +{ + size_t pos = line.find(':'); + if (pos == string::npos) + return -EINVAL; + + field_name = line.substr(0, pos); + if (pos >= line.size() - 1) + return 0; + + parse_boundary_params(line.substr(pos + 1), field.val, field.params); + + return 0; +} + +static bool is_crlf(const char *s) +{ + return (*s == '\r' && *(s + 1) == '\n'); +} + +/* + * find the index of the boundary, if exists, or optionally the next end of line + * also returns how many bytes to skip + */ +static int index_of(ceph::bufferlist& bl, + uint64_t max_len, + const std::string& str, + const bool check_crlf, + bool& reached_boundary, + int& skip) +{ + reached_boundary = false; + skip = 0; + + if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks) + return -EINVAL; + + if (bl.length() < str.size()) + return -1; + + const char *buf = bl.c_str(); + const char *s = str.c_str(); + + if (max_len > bl.length()) + max_len = bl.length(); + + for (uint64_t i = 0; i < max_len; i++, buf++) { + if (check_crlf && + i >= 1 && + is_crlf(buf - 1)) { + return i + 1; // skip the crlf + } + if ((i < max_len - str.size() + 1) && + (buf[0] == s[0] && buf[1] == s[1]) && + (strncmp(buf, s, str.size()) == 0)) { + reached_boundary = true; + skip = str.size(); + + /* oh, great, now we need to swallow the preceding crlf + * if exists + */ + if ((i >= 2) && + is_crlf(buf - 2)) { + i -= 2; + skip += 2; + } + return i; + } + } + + return -1; +} + +int RGWPostObj_ObjStore::read_with_boundary(ceph::bufferlist& bl, + uint64_t max, + const bool check_crlf, + bool& reached_boundary, + bool& done) +{ + uint64_t cl = max + 2 + boundary.size(); + + if (max > in_data.length()) { + uint64_t need_to_read = cl - 
in_data.length(); + + bufferptr bp(need_to_read); + + const auto read_len = recv_body(s, bp.c_str(), need_to_read); + if (read_len < 0) { + return read_len; + } + in_data.append(bp, 0, read_len); + } + + done = false; + int skip; + const int index = index_of(in_data, cl, boundary, check_crlf, + reached_boundary, skip); + if (index >= 0) { + max = index; + } + + if (max > in_data.length()) { + max = in_data.length(); + } + + bl.substr_of(in_data, 0, max); + + ceph::bufferlist new_read_data; + + /* + * now we need to skip boundary for next time, also skip any crlf, or + * check to see if it's the last final boundary (marked with "--" at the end + */ + if (reached_boundary) { + int left = in_data.length() - max; + if (left < skip + 2) { + int need = skip + 2 - left; + bufferptr boundary_bp(need); + const int r = recv_body(s, boundary_bp.c_str(), need); + if (r < 0) { + return r; + } + in_data.append(boundary_bp); + } + max += skip; // skip boundary for next time + if (in_data.length() >= max + 2) { + const char *data = in_data.c_str(); + if (is_crlf(data + max)) { + max += 2; + } else { + if (*(data + max) == '-' && + *(data + max + 1) == '-') { + done = true; + max += 2; + } + } + } + } + + new_read_data.substr_of(in_data, max, in_data.length() - max); + in_data = new_read_data; + + return 0; +} + +int RGWPostObj_ObjStore::read_line(ceph::bufferlist& bl, + const uint64_t max, + bool& reached_boundary, + bool& done) +{ + return read_with_boundary(bl, max, true, reached_boundary, done); +} + +int RGWPostObj_ObjStore::read_data(ceph::bufferlist& bl, + const uint64_t max, + bool& reached_boundary, + bool& done) +{ + return read_with_boundary(bl, max, false, reached_boundary, done); +} + + +int RGWPostObj_ObjStore::read_form_part_header(struct post_form_part* const part, + bool& done) +{ + bufferlist bl; + bool reached_boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + int r = read_line(bl, chunk_size, reached_boundary, done); + if (r < 0) { + return 
r; + } + + if (done) { + return 0; + } + + if (reached_boundary) { // skip the first boundary + r = read_line(bl, chunk_size, reached_boundary, done); + if (r < 0) { + return r; + } else if (done) { + return 0; + } + } + + while (true) { + /* + * iterate through fields + */ + std::string line = rgw_trim_whitespace(string(bl.c_str(), bl.length())); + + if (line.empty()) { + break; + } + + struct post_part_field field; + + string field_name; + r = parse_part_field(line, field_name, field); + if (r < 0) { + return r; + } + + part->fields[field_name] = field; + + if (stringcasecmp(field_name, "Content-Disposition") == 0) { + part->name = field.params["name"]; + } + + if (reached_boundary) { + break; + } + + r = read_line(bl, chunk_size, reached_boundary, done); + if (r < 0) { + return r; + } + } + + return 0; +} + +bool RGWPostObj_ObjStore::part_str(parts_collection_t& parts, + const std::string& name, + std::string* val) +{ + const auto iter = parts.find(name); + if (std::end(parts) == iter) { + return false; + } + + ceph::bufferlist& data = iter->second.data; + std::string str = string(data.c_str(), data.length()); + *val = rgw_trim_whitespace(str); + return true; +} + +std::string RGWPostObj_ObjStore::get_part_str(parts_collection_t& parts, + const std::string& name, + const std::string& def_val) +{ + std::string val; + + if (part_str(parts, name, &val)) { + return val; + } else { + return rgw_trim_whitespace(def_val); + } +} + +bool RGWPostObj_ObjStore::part_bl(parts_collection_t& parts, + const std::string& name, + ceph::bufferlist* pbl) +{ + const auto iter = parts.find(name); + if (std::end(parts) == iter) { + return false; + } + + *pbl = iter->second.data; + return true; +} + +int RGWPostObj_ObjStore::verify_params() +{ + /* check that we have enough memory to store the object + note that this test isn't exact and may fail unintentionally + for large requests is */ + if (!s->length) { + return -ERR_LENGTH_REQUIRED; + } + off_t len = atoll(s->length); + if (len 
> (off_t)(s->cct->_conf->rgw_max_put_size)) { + return -ERR_TOO_LARGE; + } + + supplied_md5_b64 = s->info.env->get("HTTP_CONTENT_MD5"); + + return 0; +} + +int RGWPostObj_ObjStore::get_params(optional_yield y) +{ + if (s->expect_cont) { + /* OK, here it really gets ugly. With POST, the params are embedded in the + * request body, so we need to continue before being able to actually look + * at them. This diverts from the usual request flow. */ + dump_continue(s); + s->expect_cont = false; + } + + std::string req_content_type_str = s->info.env->get("CONTENT_TYPE", ""); + std::string req_content_type; + std::map params; + parse_boundary_params(req_content_type_str, req_content_type, params); + + if (req_content_type.compare("multipart/form-data") != 0) { + err_msg = "Request Content-Type is not multipart/form-data"; + return -EINVAL; + } + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(s, 20) << "request content_type_str=" + << req_content_type_str << dendl; + ldpp_dout(s, 20) << "request content_type params:" << dendl; + + for (const auto& pair : params) { + ldpp_dout(s, 20) << " " << pair.first << " -> " << pair.second + << dendl; + } + } + + const auto iter = params.find("boundary"); + if (std::end(params) == iter) { + err_msg = "Missing multipart boundary specification"; + return -EINVAL; + } + + /* Create the boundary. 
*/ + boundary = "--"; + boundary.append(iter->second); + + return 0; +} + + +int RGWPutACLs_ObjStore::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + ldpp_dout(s, 0) << "RGWPutACLs_ObjStore::get_params read data is: " << data.c_str() << dendl; + return op_ret; +} + +int RGWPutLC_ObjStore::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + +int RGWPutBucketObjectLock_ObjStore::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + +int RGWPutObjLegalHold_ObjStore::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + + +static std::tuple read_all_chunked_input(req_state *s, const uint64_t max_read) +{ +#define READ_CHUNK 4096 +#define MAX_READ_CHUNK (128 * 1024) + int need_to_read = READ_CHUNK; + int total = need_to_read; + bufferlist bl; + + int read_len = 0; + do { + bufferptr bp(need_to_read + 1); + read_len = recv_body(s, bp.c_str(), need_to_read); + if (read_len < 0) { + return std::make_tuple(read_len, std::move(bl)); + } + + bp.c_str()[read_len] = '\0'; + bp.set_length(read_len); + bl.append(bp); + + if (read_len == need_to_read) { + if (need_to_read < MAX_READ_CHUNK) + need_to_read *= 2; + + if ((unsigned)total > max_read) { + return std::make_tuple(-ERANGE, std::move(bl)); + } + total += need_to_read; + } else { + break; + } + } while (true); + + return std::make_tuple(0, std::move(bl)); +} + +std::tuple rgw_rest_read_all_input(req_state *s, + const uint64_t max_len, + const bool allow_chunked) +{ + size_t cl = 0; + int len = 0; + bufferlist bl; + + if (s->length) + cl = 
atoll(s->length); + else if (!allow_chunked) + return std::make_tuple(-ERR_LENGTH_REQUIRED, std::move(bl)); + + if (cl) { + if (cl > (size_t)max_len) { + return std::make_tuple(-ERANGE, std::move(bl)); + } + + bufferptr bp(cl + 1); + + len = recv_body(s, bp.c_str(), cl); + if (len < 0) { + return std::make_tuple(len, std::move(bl)); + } + + bp.c_str()[len] = '\0'; + bp.set_length(len); + bl.append(bp); + + } else if (allow_chunked && !s->length) { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + if (!encoding || strcmp(encoding, "chunked") != 0) + return std::make_tuple(-ERR_LENGTH_REQUIRED, std::move(bl)); + + int ret = 0; + std::tie(ret, bl) = read_all_chunked_input(s, max_len); + if (ret < 0) + return std::make_tuple(ret, std::move(bl)); + } + + return std::make_tuple(0, std::move(bl)); +} + +int RGWCompleteMultipart_ObjStore::get_params(optional_yield y) +{ + upload_id = s->info.args.get("uploadId"); + + if (upload_id.empty()) { + op_ret = -ENOTSUP; + return op_ret; + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size); + if (op_ret < 0) + return op_ret; + + return 0; +} + +int RGWListMultipart_ObjStore::get_params(optional_yield y) +{ + upload_id = s->info.args.get("uploadId"); + + if (upload_id.empty()) { + op_ret = -ENOTSUP; + } + string marker_str = s->info.args.get("part-number-marker"); + + if (!marker_str.empty()) { + string err; + marker = strict_strtol(marker_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 20) << "bad marker: " << marker << dendl; + op_ret = -EINVAL; + return op_ret; + } + } + + string str = s->info.args.get("max-parts"); + op_ret = parse_value_and_bound(str, max_parts, 0, + g_conf().get_val("rgw_max_listing_results"), + max_parts); + + return op_ret; +} + +int RGWListBucketMultiparts_ObjStore::get_params(optional_yield y) +{ + delimiter = s->info.args.get("delimiter"); + prefix = s->info.args.get("prefix"); + string str = 
s->info.args.get("max-uploads"); + op_ret = parse_value_and_bound(str, max_uploads, 0, + g_conf().get_val("rgw_max_listing_results"), + default_max); + if (op_ret < 0) { + return op_ret; + } + + if (auto encoding_type = s->info.args.get_optional("encoding-type"); + encoding_type != boost::none) { + if (strcasecmp(encoding_type->c_str(), "url") != 0) { + op_ret = -EINVAL; + s->err.message="Invalid Encoding Method specified in Request"; + return op_ret; + } + encode_url = true; + } + + string key_marker = s->info.args.get("key-marker"); + string upload_id_marker = s->info.args.get("upload-id-marker"); + if (!key_marker.empty()) { + std::unique_ptr upload; + upload = s->bucket->get_multipart_upload(key_marker, + upload_id_marker); + marker_meta = upload->get_meta(); + marker_key = upload->get_key(); + marker_upload_id = upload->get_upload_id(); + } + + return 0; +} + +int RGWDeleteMultiObj_ObjStore::get_params(optional_yield y) +{ + + if (s->bucket_name.empty()) { + op_ret = -EINVAL; + return op_ret; + } + + // everything is probably fine, set the bucket + bucket = s->bucket.get(); + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + + +void RGWRESTOp::send_response() +{ + if (!flusher.did_start()) { + set_req_state_err(s, get_ret()); + dump_errno(s); + end_header(s, this); + } + flusher.flush(); +} + +int RGWRESTOp::verify_permission(optional_yield) +{ + return check_caps(s->user->get_info().caps); +} + +RGWOp* RGWHandler_REST::get_op(void) +{ + RGWOp *op; + switch (s->op) { + case OP_GET: + op = op_get(); + break; + case OP_PUT: + op = op_put(); + break; + case OP_DELETE: + op = op_delete(); + break; + case OP_HEAD: + op = op_head(); + break; + case OP_POST: + op = op_post(); + break; + case OP_COPY: + op = op_copy(); + break; + case OP_OPTIONS: + op = op_options(); + break; + default: + return NULL; + } + + if (op) { + op->init(driver, s, this); + } + return op; +} /* 
get_op */ + +void RGWHandler_REST::put_op(RGWOp* op) +{ + delete op; +} /* put_op */ + +int RGWHandler_REST::allocate_formatter(req_state *s, + RGWFormat default_type, + bool configurable) +{ + s->format = RGWFormat::BAD_FORMAT; // set to invalid value to allocation happens anyway + auto type = default_type; + if (configurable) { + string format_str = s->info.args.get("format"); + if (format_str.compare("xml") == 0) { + type = RGWFormat::XML; + } else if (format_str.compare("json") == 0) { + type = RGWFormat::JSON; + } else if (format_str.compare("html") == 0) { + type = RGWFormat::HTML; + } else { + const char *accept = s->info.env->get("HTTP_ACCEPT"); + if (accept) { + // trim at first ; + std::string_view format = accept; + format = format.substr(0, format.find(';')); + + if (format == "text/xml" || format == "application/xml") { + type = RGWFormat::XML; + } else if (format == "application/json") { + type = RGWFormat::JSON; + } else if (format == "text/html") { + type = RGWFormat::HTML; + } + } + } + } + return RGWHandler_REST::reallocate_formatter(s, type); +} + +int RGWHandler_REST::reallocate_formatter(req_state *s, const RGWFormat type) +{ + if (s->format == type) { + // do nothing, just reset + ceph_assert(s->formatter); + s->formatter->reset(); + return 0; + } + + delete s->formatter; + s->formatter = nullptr; + s->format = type; + + const string& mm = s->info.args.get("multipart-manifest"); + const bool multipart_delete = (mm.compare("delete") == 0); + const bool swift_bulkupload = s->prot_flags & RGW_REST_SWIFT && + s->info.args.exists("extract-archive"); + switch (s->format) { + case RGWFormat::PLAIN: + { + const bool use_kv_syntax = s->info.args.exists("bulk-delete") || + multipart_delete || swift_bulkupload; + s->formatter = new RGWFormatter_Plain(use_kv_syntax); + break; + } + case RGWFormat::XML: + { + const bool lowercase_underscore = s->info.args.exists("bulk-delete") || + multipart_delete || swift_bulkupload; + + s->formatter = new 
XMLFormatter(false, lowercase_underscore); + break; + } + case RGWFormat::JSON: + s->formatter = new JSONFormatter(false); + break; + case RGWFormat::HTML: + s->formatter = new HTMLFormatter(s->prot_flags & RGW_REST_WEBSITE); + break; + default: + return -EINVAL; + + }; + //s->formatter->reset(); // All formatters should reset on create already + + return 0; +} +// This function enforces Amazon's spec for bucket names. +// (The requirements, not the recommendations.) +int RGWHandler_REST::validate_bucket_name(const string& bucket) +{ + int len = bucket.size(); + if (len < 3) { + if (len == 0) { + // This request doesn't specify a bucket at all + return 0; + } + // Name too short + return -ERR_INVALID_BUCKET_NAME; + } + else if (len > MAX_BUCKET_NAME_LEN) { + // Name too long + return -ERR_INVALID_BUCKET_NAME; + } + + const char *s = bucket.c_str(); + for (int i = 0; i < len; ++i, ++s) { + if (*(unsigned char *)s == 0xff) + return -ERR_INVALID_BUCKET_NAME; + if (*(unsigned char *)s == '/') + return -ERR_INVALID_BUCKET_NAME; + } + + return 0; +} + +// "The name for a key is a sequence of Unicode characters whose UTF-8 encoding +// is at most 1024 bytes long." +// However, we can still have control characters and other nasties in there. +// Just as long as they're utf-8 nasties. +int RGWHandler_REST::validate_object_name(const string& object) +{ + int len = object.size(); + if (len > MAX_OBJ_NAME_LEN) { + // Name too long + return -ERR_INVALID_OBJECT_NAME; + } + + if (check_utf8(object.c_str(), len)) { + // Object names must be valid UTF-8. 
+ return -ERR_INVALID_OBJECT_NAME; + } + return 0; +} + +static http_op op_from_method(const char *method) +{ + if (!method) + return OP_UNKNOWN; + if (strcmp(method, "GET") == 0) + return OP_GET; + if (strcmp(method, "PUT") == 0) + return OP_PUT; + if (strcmp(method, "DELETE") == 0) + return OP_DELETE; + if (strcmp(method, "HEAD") == 0) + return OP_HEAD; + if (strcmp(method, "POST") == 0) + return OP_POST; + if (strcmp(method, "COPY") == 0) + return OP_COPY; + if (strcmp(method, "OPTIONS") == 0) + return OP_OPTIONS; + + return OP_UNKNOWN; +} + +int RGWHandler_REST::init_permissions(RGWOp* op, optional_yield y) +{ + if (op->get_type() == RGW_OP_CREATE_BUCKET) { + // We don't need user policies in case of STS token returned by AssumeRole, hence the check for user type + if (! s->user->get_id().empty() && s->auth.identity->get_identity_type() != TYPE_ROLE) { + try { + if (auto ret = s->user->read_attrs(s, y); ! ret) { + auto user_policies = get_iam_user_policy_from_attr(s->cct, s->user->get_attrs(), s->user->get_tenant()); + s->iam_user_policies.insert(s->iam_user_policies.end(), + std::make_move_iterator(user_policies.begin()), + std::make_move_iterator(user_policies.end())); + + } + } catch (const std::exception& e) { + ldpp_dout(op, -1) << "Error reading IAM User Policy: " << e.what() << dendl; + } + } + rgw_build_iam_environment(driver, s); + return 0; + } + + return do_init_permissions(op, y); +} + +int RGWHandler_REST::read_permissions(RGWOp* op_obj, optional_yield y) +{ + bool only_bucket = false; + + switch (s->op) { + case OP_HEAD: + case OP_GET: + only_bucket = false; + break; + case OP_PUT: + case OP_POST: + case OP_COPY: + /* is it a 'multi-object delete' request? */ + if (s->info.args.exists("delete")) { + only_bucket = true; + break; + } + if (is_obj_update_op()) { + only_bucket = false; + break; + } + /* is it a 'create bucket' request? 
*/ + if (op_obj->get_type() == RGW_OP_CREATE_BUCKET) + return 0; + + only_bucket = true; + break; + case OP_DELETE: + if (!s->info.args.exists("tagging")){ + only_bucket = true; + } + break; + case OP_OPTIONS: + only_bucket = true; + break; + default: + return -EINVAL; + } + + return do_read_permissions(op_obj, only_bucket, y); +} + +void RGWRESTMgr::register_resource(string resource, RGWRESTMgr *mgr) +{ + string r = "/"; + r.append(resource); + + /* do we have a resource manager registered for this entry point? */ + map::iterator iter = resource_mgrs.find(r); + if (iter != resource_mgrs.end()) { + delete iter->second; + } + resource_mgrs[r] = mgr; + resources_by_size.insert(pair(r.size(), r)); + + /* now build default resource managers for the path (instead of nested entry points) + * e.g., if the entry point is /auth/v1.0/ then we'd want to create a default + * manager for /auth/ + */ + + size_t pos = r.find('/', 1); + + while (pos != r.size() - 1 && pos != string::npos) { + string s = r.substr(0, pos); + + iter = resource_mgrs.find(s); + if (iter == resource_mgrs.end()) { /* only register it if one does not exist */ + resource_mgrs[s] = new RGWRESTMgr; /* a default do-nothing manager */ + resources_by_size.insert(pair(s.size(), s)); + } + + pos = r.find('/', pos + 1); + } +} + +void RGWRESTMgr::register_default_mgr(RGWRESTMgr *mgr) +{ + delete default_mgr; + default_mgr = mgr; +} + +RGWRESTMgr* RGWRESTMgr::get_resource_mgr(req_state* const s, + const std::string& uri, + std::string* const out_uri) +{ + *out_uri = uri; + + multimap::reverse_iterator iter; + + for (iter = resources_by_size.rbegin(); iter != resources_by_size.rend(); ++iter) { + string& resource = iter->second; + if (uri.compare(0, iter->first, resource) == 0 && + (uri.size() == iter->first || + uri[iter->first] == '/')) { + std::string suffix = uri.substr(iter->first); + return resource_mgrs[resource]->get_resource_mgr(s, suffix, out_uri); + } + } + + if (default_mgr) { + return 
default_mgr->get_resource_mgr_as_default(s, uri, out_uri); + } + + return this; +} + +void RGWREST::register_x_headers(const string& s_headers) +{ + std::vector hdrs = get_str_vec(s_headers); + for (auto& hdr : hdrs) { + boost::algorithm::to_upper(hdr); // XXX + (void) x_headers.insert(hdr); + } +} + +RGWRESTMgr::~RGWRESTMgr() +{ + map::iterator iter; + for (iter = resource_mgrs.begin(); iter != resource_mgrs.end(); ++iter) { + delete iter->second; + } + delete default_mgr; +} + +int64_t parse_content_length(const char *content_length) +{ + int64_t len = -1; + + if (*content_length == '\0') { + len = 0; + } else { + string err; + len = strict_strtoll(content_length, 10, &err); + if (!err.empty()) { + len = -1; + } + } + + return len; +} + +int RGWREST::preprocess(req_state *s, rgw::io::BasicClient* cio) +{ + req_info& info = s->info; + + /* save the request uri used to hash on the client side. request_uri may suffer + modifications as part of the bucket encoding in the subdomain calling format. + request_uri_aws4 will be used under aws4 auth */ + s->info.request_uri_aws4 = s->info.request_uri; + + s->cio = cio; + + // We need to know if this RGW instance is running the s3website API with a + // higher priority than regular S3 API, or possibly in place of the regular + // S3 API. + // Map the listing of rgw_enable_apis in REVERSE order, so that items near + // the front of the list have a higher number assigned (and -1 for items not in the list). 
+ list apis; + get_str_list(g_conf()->rgw_enable_apis, apis); + int api_priority_s3 = -1; + int api_priority_s3website = -1; + auto api_s3website_priority_rawpos = std::find(apis.begin(), apis.end(), "s3website"); + auto api_s3_priority_rawpos = std::find(apis.begin(), apis.end(), "s3"); + if (api_s3_priority_rawpos != apis.end()) { + api_priority_s3 = apis.size() - std::distance(apis.begin(), api_s3_priority_rawpos); + } + if (api_s3website_priority_rawpos != apis.end()) { + api_priority_s3website = apis.size() - std::distance(apis.begin(), api_s3website_priority_rawpos); + } + ldpp_dout(s, 10) << "rgw api priority: s3=" << api_priority_s3 << " s3website=" << api_priority_s3website << dendl; + bool s3website_enabled = api_priority_s3website >= 0; + + if (info.host.size()) { + ssize_t pos; + if (info.host.find('[') == 0) { + pos = info.host.find(']'); + if (pos >=1) { + info.host = info.host.substr(1, pos-1); + } + } else { + pos = info.host.find(':'); + if (pos >= 0) { + info.host = info.host.substr(0, pos); + } + } + ldpp_dout(s, 10) << "host=" << info.host << dendl; + string domain; + string subdomain; + bool in_hosted_domain_s3website = false; + bool in_hosted_domain = rgw_find_host_in_domains(info.host, &domain, &subdomain, hostnames_set); + + string s3website_domain; + string s3website_subdomain; + + if (s3website_enabled) { + in_hosted_domain_s3website = rgw_find_host_in_domains(info.host, &s3website_domain, &s3website_subdomain, hostnames_s3website_set); + if (in_hosted_domain_s3website) { + in_hosted_domain = true; // TODO: should hostnames be a strict superset of hostnames_s3website? 
+ domain = s3website_domain; + subdomain = s3website_subdomain; + } + } + + ldpp_dout(s, 20) + << "subdomain=" << subdomain + << " domain=" << domain + << " in_hosted_domain=" << in_hosted_domain + << " in_hosted_domain_s3website=" << in_hosted_domain_s3website + << dendl; + + if (g_conf()->rgw_resolve_cname + && !in_hosted_domain + && !in_hosted_domain_s3website) { + string cname; + bool found; + int r = rgw_resolver->resolve_cname(info.host, cname, &found); + if (r < 0) { + ldpp_dout(s, 0) + << "WARNING: rgw_resolver->resolve_cname() returned r=" << r + << dendl; + } + + if (found) { + ldpp_dout(s, 5) << "resolved host cname " << info.host << " -> " + << cname << dendl; + in_hosted_domain = + rgw_find_host_in_domains(cname, &domain, &subdomain, hostnames_set); + + if (s3website_enabled + && !in_hosted_domain_s3website) { + in_hosted_domain_s3website = + rgw_find_host_in_domains(cname, &s3website_domain, + &s3website_subdomain, + hostnames_s3website_set); + if (in_hosted_domain_s3website) { + in_hosted_domain = true; // TODO: should hostnames be a + // strict superset of hostnames_s3website? + domain = s3website_domain; + subdomain = s3website_subdomain; + } + } + + ldpp_dout(s, 20) + << "subdomain=" << subdomain + << " domain=" << domain + << " in_hosted_domain=" << in_hosted_domain + << " in_hosted_domain_s3website=" << in_hosted_domain_s3website + << dendl; + } + } + + // Handle A/CNAME records that point to the RGW storage, but do match the + // CNAME test above, per issue http://tracker.ceph.com/issues/15975 + // If BOTH domain & subdomain variables are empty, then none of the above + // cases matched anything, and we should fall back to using the Host header + // directly as the bucket name. + // As additional checks: + // - if the Host header is an IP, we're using path-style access without DNS + // - Also check that the Host header is a valid bucket name before using it. 
+ // - Don't enable virtual hosting if no hostnames are configured + if (subdomain.empty() + && (domain.empty() || domain != info.host) + && !looks_like_ip_address(info.host.c_str()) + && RGWHandler_REST::validate_bucket_name(info.host) == 0 + && !(hostnames_set.empty() && hostnames_s3website_set.empty())) { + subdomain.append(info.host); + in_hosted_domain = 1; + } + + if (s3website_enabled && api_priority_s3website > api_priority_s3) { + in_hosted_domain_s3website = 1; + } + + if (in_hosted_domain_s3website) { + s->prot_flags |= RGW_REST_WEBSITE; + } + + + if (in_hosted_domain && !subdomain.empty()) { + string encoded_bucket = "/"; + encoded_bucket.append(subdomain); + if (s->info.request_uri[0] != '/') + encoded_bucket.append("/"); + encoded_bucket.append(s->info.request_uri); + s->info.request_uri = encoded_bucket; + } + + if (!domain.empty()) { + s->info.domain = domain; + } + + ldpp_dout(s, 20) + << "final domain/bucket" + << " subdomain=" << subdomain + << " domain=" << domain + << " in_hosted_domain=" << in_hosted_domain + << " in_hosted_domain_s3website=" << in_hosted_domain_s3website + << " s->info.domain=" << s->info.domain + << " s->info.request_uri=" << s->info.request_uri + << dendl; + } + + if (s->info.domain.empty()) { + s->info.domain = s->cct->_conf->rgw_dns_name; + } + + s->decoded_uri = url_decode(s->info.request_uri); + /* Validate for being free of the '\0' buried in the middle of the string. */ + if (std::strlen(s->decoded_uri.c_str()) != s->decoded_uri.length()) { + return -ERR_ZERO_IN_URL; + } + + /* FastCGI specification, section 6.3 + * http://www.fastcgi.com/devkit/doc/fcgi-spec.html#S6.3 + * === + * The Authorizer application receives HTTP request information from the Web + * server on the FCGI_PARAMS stream, in the same format as a Responder. The + * Web server does not send CONTENT_LENGTH, PATH_INFO, PATH_TRANSLATED, and + * SCRIPT_NAME headers. 
+ * === + * Ergo if we are in Authorizer role, we MUST look at HTTP_CONTENT_LENGTH + * instead of CONTENT_LENGTH for the Content-Length. + * + * There is one slight wrinkle in this, and that's older versions of + * nginx/lighttpd/apache setting BOTH headers. As a result, we have to check + * both headers and can't always simply pick A or B. + */ + const char* content_length = info.env->get("CONTENT_LENGTH"); + const char* http_content_length = info.env->get("HTTP_CONTENT_LENGTH"); + if (!http_content_length != !content_length) { + /* Easy case: one or the other is missing */ + s->length = (content_length ? content_length : http_content_length); + } else if (s->cct->_conf->rgw_content_length_compat && + content_length && http_content_length) { + /* Hard case: Both are set, we have to disambiguate */ + int64_t content_length_i, http_content_length_i; + + content_length_i = parse_content_length(content_length); + http_content_length_i = parse_content_length(http_content_length); + + // Now check them: + if (http_content_length_i < 0) { + // HTTP_CONTENT_LENGTH is invalid, ignore it + } else if (content_length_i < 0) { + // CONTENT_LENGTH is invalid, and HTTP_CONTENT_LENGTH is valid + // Swap entries + content_length = http_content_length; + } else { + // both CONTENT_LENGTH and HTTP_CONTENT_LENGTH are valid + // Let's pick the larger size + if (content_length_i < http_content_length_i) { + // prefer the larger value + content_length = http_content_length; + } + } + s->length = content_length; + // End of: else if (s->cct->_conf->rgw_content_length_compat && + // content_length && + // http_content_length) + } else { + /* no content length was defined */ + s->length = NULL; + } + + if (s->length) { + if (*s->length == '\0') { + s->content_length = 0; + } else { + string err; + s->content_length = strict_strtoll(s->length, 10, &err); + if (!err.empty()) { + ldpp_dout(s, 10) << "bad content length, aborting" << dendl; + return -EINVAL; + } + } + } + + if 
(s->content_length < 0) { + ldpp_dout(s, 10) << "negative content length, aborting" << dendl; + return -EINVAL; + } + + map::iterator giter; + for (giter = generic_attrs_map.begin(); giter != generic_attrs_map.end(); + ++giter) { + const char *env = info.env->get(giter->first.c_str()); + if (env) { + s->generic_attrs[giter->second] = env; + } + } + + if (g_conf()->rgw_print_continue) { + const char *expect = info.env->get("HTTP_EXPECT"); + s->expect_cont = (expect && !strcasecmp(expect, "100-continue")); + } + s->op = op_from_method(info.method); + + info.init_meta_info(s, &s->has_bad_meta); + + return 0; +} + +RGWHandler_REST* RGWREST::get_handler( + rgw::sal::Driver* const driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix, + RGWRestfulIO* const rio, + RGWRESTMgr** const pmgr, + int* const init_error +) { + *init_error = preprocess(s, rio); + if (*init_error < 0) { + return nullptr; + } + + RGWRESTMgr *m = mgr.get_manager(s, frontend_prefix, s->decoded_uri, + &s->relative_uri); + if (! m) { + *init_error = -ERR_METHOD_NOT_ALLOWED; + return nullptr; + } + + if (pmgr) { + *pmgr = m; + } + + RGWHandler_REST* handler = m->get_handler(driver, s, auth_registry, frontend_prefix); + if (! 
handler) { + *init_error = -ERR_METHOD_NOT_ALLOWED; + return NULL; + } + + ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name() << dendl; + + *init_error = handler->init(driver, s, rio); + if (*init_error < 0) { + m->put_handler(handler); + return nullptr; + } + + return handler; +} /* get stream handler */ diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h new file mode 100644 index 000000000..434de99e9 --- /dev/null +++ b/src/rgw/rgw_rest.h @@ -0,0 +1,819 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#define TIME_BUF_SIZE 128 + +#include +#include +#include "common/sstring.hh" +#include "common/ceph_json.h" +#include "include/ceph_assert.h" /* needed because of common/ceph_json.h */ +#include "rgw_op.h" +#include "rgw_formats.h" +#include "rgw_client_io.h" +#include "rgw_lua_background.h" + +extern std::map rgw_to_http_attrs; + +extern void rgw_rest_init(CephContext *cct, const rgw::sal::ZoneGroup& zone_group); + +extern void rgw_flush_formatter_and_reset(req_state *s, + ceph::Formatter *formatter); + +extern void rgw_flush_formatter(req_state *s, + ceph::Formatter *formatter); + +inline std::string_view rgw_sanitized_hdrval(ceph::buffer::list& raw) +{ + /* std::string and thus std::string_view ARE OBLIGED to carry multiple + * 0x00 and count them to the length of a string. We need to take that + * into consideration and sanitize the size of a ceph::buffer::list used + * to store metadata values (x-amz-meta-*, X-Container-Meta-*, etags). + * Otherwise we might send 0x00 to clients. */ + const char* const data = raw.c_str(); + size_t len = raw.length(); + + if (len && data[len - 1] == '\0') { + /* That's the case - the null byte has been included at the last position + * of the bufferlist. We need to restore the proper string length we'll + * pass to string_ref. 
*/ + len--; + } + + return std::string_view(data, len); +} + +template +std::tuple rgw_rest_get_json_input_keep_data(CephContext *cct, req_state *s, T& out, uint64_t max_len) +{ + int rv = 0; + bufferlist data; + std::tie(rv, data) = rgw_rest_read_all_input(s, max_len); + if (rv < 0) { + return std::make_tuple(rv, std::move(data)); + } + + if (!data.length()) { + return std::make_tuple(-EINVAL, std::move(data)); + } + + JSONParser parser; + + if (!parser.parse(data.c_str(), data.length())) { + return std::make_tuple(-EINVAL, std::move(data)); + } + + try { + decode_json_obj(out, &parser); + } catch (JSONDecoder::err& e) { + return std::make_tuple(-EINVAL, std::move(data)); + } + + return std::make_tuple(0, std::move(data)); +} + +class RESTArgs { +public: + static int get_string(req_state *s, const std::string& name, + const std::string& def_val, std::string *val, + bool *existed = NULL); + static int get_uint64(req_state *s, const std::string& name, + uint64_t def_val, uint64_t *val, bool *existed = NULL); + static int get_int64(req_state *s, const std::string& name, + int64_t def_val, int64_t *val, bool *existed = NULL); + static int get_uint32(req_state *s, const std::string& name, + uint32_t def_val, uint32_t *val, bool *existed = NULL); + static int get_int32(req_state *s, const std::string& name, + int32_t def_val, int32_t *val, bool *existed = NULL); + static int get_time(req_state *s, const std::string& name, + const utime_t& def_val, utime_t *val, + bool *existed = NULL); + static int get_epoch(req_state *s, const std::string& name, + uint64_t def_val, uint64_t *epoch, + bool *existed = NULL); + static int get_bool(req_state *s, const std::string& name, bool def_val, + bool *val, bool *existed = NULL); +}; + +class RGWRESTFlusher : public RGWFormatterFlusher { + req_state *s; + RGWOp *op; +protected: + void do_flush() override; + void do_start(int ret) override; +public: + RGWRESTFlusher(req_state *_s, RGWOp *_op) : + RGWFormatterFlusher(_s->formatter), 
s(_s), op(_op) {} + RGWRESTFlusher() : RGWFormatterFlusher(NULL), s(NULL), op(NULL) {} + + void init(req_state *_s, RGWOp *_op) { + s = _s; + op = _op; + set_formatter(s->formatter); + } +}; + +class RGWGetObj_ObjStore : public RGWGetObj +{ +protected: + bool sent_header; +public: + RGWGetObj_ObjStore() : sent_header(false) {} + + void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override { + RGWGetObj::init(driver, s, h); + sent_header = false; + } + + int get_params(optional_yield y) override; +}; + +class RGWGetObjTags_ObjStore : public RGWGetObjTags { +public: + RGWGetObjTags_ObjStore() {}; + ~RGWGetObjTags_ObjStore() {}; +}; + +class RGWPutObjTags_ObjStore: public RGWPutObjTags { +public: + RGWPutObjTags_ObjStore() {}; + ~RGWPutObjTags_ObjStore() {}; +}; + +class RGWGetBucketTags_ObjStore : public RGWGetBucketTags { +public: + RGWGetBucketTags_ObjStore() = default; + virtual ~RGWGetBucketTags_ObjStore() = default; +}; + +class RGWPutBucketTags_ObjStore: public RGWPutBucketTags { +public: + RGWPutBucketTags_ObjStore() = default; + virtual ~RGWPutBucketTags_ObjStore() = default; +}; + +class RGWGetBucketReplication_ObjStore : public RGWGetBucketReplication { +public: + RGWGetBucketReplication_ObjStore() {}; + ~RGWGetBucketReplication_ObjStore() {}; +}; + +class RGWPutBucketReplication_ObjStore: public RGWPutBucketReplication { +public: + RGWPutBucketReplication_ObjStore() = default; + virtual ~RGWPutBucketReplication_ObjStore() = default; +}; + +class RGWDeleteBucketReplication_ObjStore: public RGWDeleteBucketReplication { +public: + RGWDeleteBucketReplication_ObjStore() = default; + virtual ~RGWDeleteBucketReplication_ObjStore() = default; +}; + +class RGWListBuckets_ObjStore : public RGWListBuckets { +public: + RGWListBuckets_ObjStore() {} + ~RGWListBuckets_ObjStore() override {} +}; + +class RGWGetUsage_ObjStore : public RGWGetUsage { +public: + RGWGetUsage_ObjStore() {} + ~RGWGetUsage_ObjStore() override {} +}; + +class RGWListBucket_ObjStore 
: public RGWListBucket { +public: + RGWListBucket_ObjStore() {} + ~RGWListBucket_ObjStore() override {} +}; + +class RGWStatAccount_ObjStore : public RGWStatAccount { +public: + RGWStatAccount_ObjStore() {} + ~RGWStatAccount_ObjStore() override {} +}; + +class RGWStatBucket_ObjStore : public RGWStatBucket { +public: + RGWStatBucket_ObjStore() {} + ~RGWStatBucket_ObjStore() override {} +}; + +class RGWCreateBucket_ObjStore : public RGWCreateBucket { +public: + RGWCreateBucket_ObjStore() {} + ~RGWCreateBucket_ObjStore() override {} +}; + +class RGWDeleteBucket_ObjStore : public RGWDeleteBucket { +public: + RGWDeleteBucket_ObjStore() {} + ~RGWDeleteBucket_ObjStore() override {} +}; + +class RGWPutObj_ObjStore : public RGWPutObj +{ +public: + RGWPutObj_ObjStore() {} + ~RGWPutObj_ObjStore() override {} + + int verify_params() override; + int get_params(optional_yield y) override; + int get_data(bufferlist& bl) override; +}; + +class RGWPostObj_ObjStore : public RGWPostObj +{ + std::string boundary; + +public: + struct post_part_field { + std::string val; + std::map params; + }; + + struct post_form_part { + std::string name; + std::map fields; + ceph::bufferlist data; + }; + +protected: + using parts_collection_t = \ + std::map; + + std::string err_msg; + ceph::bufferlist in_data; + + int read_with_boundary(ceph::bufferlist& bl, + uint64_t max, + bool check_eol, + bool& reached_boundary, + bool& done); + + int read_line(ceph::bufferlist& bl, + uint64_t max, + bool& reached_boundary, + bool& done); + + int read_data(ceph::bufferlist& bl, + uint64_t max, + bool& reached_boundary, + bool& done); + + int read_form_part_header(struct post_form_part *part, bool& done); + + int get_params(optional_yield y) override; + + static int parse_part_field(const std::string& line, + std::string& field_name, /* out */ + post_part_field& field); /* out */ + + static void parse_boundary_params(const std::string& params_str, + std::string& first, + std::map& params); + + static bool 
part_str(parts_collection_t& parts, + const std::string& name, + std::string *val); + + static std::string get_part_str(parts_collection_t& parts, + const std::string& name, + const std::string& def_val = std::string()); + + static bool part_bl(parts_collection_t& parts, + const std::string& name, + ceph::bufferlist *pbl); + +public: + RGWPostObj_ObjStore() {} + ~RGWPostObj_ObjStore() override {} + + int verify_params() override; +}; + + +class RGWPutMetadataAccount_ObjStore : public RGWPutMetadataAccount +{ +public: + RGWPutMetadataAccount_ObjStore() {} + ~RGWPutMetadataAccount_ObjStore() override {} +}; + +class RGWPutMetadataBucket_ObjStore : public RGWPutMetadataBucket +{ +public: + RGWPutMetadataBucket_ObjStore() {} + ~RGWPutMetadataBucket_ObjStore() override {} +}; + +class RGWPutMetadataObject_ObjStore : public RGWPutMetadataObject +{ +public: + RGWPutMetadataObject_ObjStore() {} + ~RGWPutMetadataObject_ObjStore() override {} +}; + +class RGWDeleteObj_ObjStore : public RGWDeleteObj { +public: + RGWDeleteObj_ObjStore() {} + ~RGWDeleteObj_ObjStore() override {} +}; + +class RGWGetCrossDomainPolicy_ObjStore : public RGWGetCrossDomainPolicy { +public: + RGWGetCrossDomainPolicy_ObjStore() = default; + ~RGWGetCrossDomainPolicy_ObjStore() override = default; +}; + +class RGWGetHealthCheck_ObjStore : public RGWGetHealthCheck { +public: + RGWGetHealthCheck_ObjStore() = default; + ~RGWGetHealthCheck_ObjStore() override = default; +}; + +class RGWCopyObj_ObjStore : public RGWCopyObj { +public: + RGWCopyObj_ObjStore() {} + ~RGWCopyObj_ObjStore() override {} +}; + +class RGWGetACLs_ObjStore : public RGWGetACLs { +public: + RGWGetACLs_ObjStore() {} + ~RGWGetACLs_ObjStore() override {} +}; + +class RGWPutACLs_ObjStore : public RGWPutACLs { +public: + RGWPutACLs_ObjStore() {} + ~RGWPutACLs_ObjStore() override {} + + int get_params(optional_yield y) override; +}; + +class RGWGetLC_ObjStore : public RGWGetLC { +public: + RGWGetLC_ObjStore() {} + ~RGWGetLC_ObjStore() override 
{} +}; + +class RGWPutLC_ObjStore : public RGWPutLC { +public: + RGWPutLC_ObjStore() {} + ~RGWPutLC_ObjStore() override {} + + int get_params(optional_yield y) override; +}; + +class RGWDeleteLC_ObjStore : public RGWDeleteLC { +public: + RGWDeleteLC_ObjStore() {} + ~RGWDeleteLC_ObjStore() override {} + +}; + +class RGWGetCORS_ObjStore : public RGWGetCORS { +public: + RGWGetCORS_ObjStore() {} + ~RGWGetCORS_ObjStore() override {} +}; + +class RGWPutCORS_ObjStore : public RGWPutCORS { +public: + RGWPutCORS_ObjStore() {} + ~RGWPutCORS_ObjStore() override {} +}; + +class RGWDeleteCORS_ObjStore : public RGWDeleteCORS { +public: + RGWDeleteCORS_ObjStore() {} + ~RGWDeleteCORS_ObjStore() override {} +}; + +class RGWOptionsCORS_ObjStore : public RGWOptionsCORS { +public: + RGWOptionsCORS_ObjStore() {} + ~RGWOptionsCORS_ObjStore() override {} +}; + +class RGWGetBucketEncryption_ObjStore : public RGWGetBucketEncryption { +public: + RGWGetBucketEncryption_ObjStore() {} + ~RGWGetBucketEncryption_ObjStore() override {} +}; + +class RGWPutBucketEncryption_ObjStore : public RGWPutBucketEncryption { +public: + RGWPutBucketEncryption_ObjStore() {} + ~RGWPutBucketEncryption_ObjStore() override {} +}; + +class RGWDeleteBucketEncryption_ObjStore : public RGWDeleteBucketEncryption { +public: + RGWDeleteBucketEncryption_ObjStore() {} + ~RGWDeleteBucketEncryption_ObjStore() override {} +}; + +class RGWInitMultipart_ObjStore : public RGWInitMultipart { +public: + RGWInitMultipart_ObjStore() {} + ~RGWInitMultipart_ObjStore() override {} +}; + +class RGWCompleteMultipart_ObjStore : public RGWCompleteMultipart { +public: + RGWCompleteMultipart_ObjStore() {} + ~RGWCompleteMultipart_ObjStore() override {} + + int get_params(optional_yield y) override; +}; + +class RGWAbortMultipart_ObjStore : public RGWAbortMultipart { +public: + RGWAbortMultipart_ObjStore() {} + ~RGWAbortMultipart_ObjStore() override {} +}; + +class RGWListMultipart_ObjStore : public RGWListMultipart { +public: + 
RGWListMultipart_ObjStore() {} + ~RGWListMultipart_ObjStore() override {} + + int get_params(optional_yield y) override; +}; + +class RGWListBucketMultiparts_ObjStore : public RGWListBucketMultiparts { +public: + RGWListBucketMultiparts_ObjStore() {} + ~RGWListBucketMultiparts_ObjStore() override {} + + int get_params(optional_yield y) override; +}; + +class RGWBulkDelete_ObjStore : public RGWBulkDelete { +public: + RGWBulkDelete_ObjStore() {} + ~RGWBulkDelete_ObjStore() override {} +}; + +class RGWBulkUploadOp_ObjStore : public RGWBulkUploadOp { +public: + RGWBulkUploadOp_ObjStore() = default; + ~RGWBulkUploadOp_ObjStore() = default; +}; + +class RGWDeleteMultiObj_ObjStore : public RGWDeleteMultiObj { +public: + RGWDeleteMultiObj_ObjStore() {} + ~RGWDeleteMultiObj_ObjStore() override {} + + int get_params(optional_yield y) override; +}; + +class RGWInfo_ObjStore : public RGWInfo { +public: + RGWInfo_ObjStore() = default; + ~RGWInfo_ObjStore() override = default; +}; + +class RGWPutBucketObjectLock_ObjStore : public RGWPutBucketObjectLock { +public: + RGWPutBucketObjectLock_ObjStore() = default; + ~RGWPutBucketObjectLock_ObjStore() = default; + int get_params(optional_yield y) override; +}; + +class RGWGetBucketObjectLock_ObjStore : public RGWGetBucketObjectLock { +public: + RGWGetBucketObjectLock_ObjStore() = default; + ~RGWGetBucketObjectLock_ObjStore() override = default; +}; + +class RGWPutObjRetention_ObjStore : public RGWPutObjRetention { +public: + RGWPutObjRetention_ObjStore() = default; + ~RGWPutObjRetention_ObjStore() override = default; +}; + +class RGWGetObjRetention_ObjStore : public RGWGetObjRetention { +public: + RGWGetObjRetention_ObjStore() = default; + ~RGWGetObjRetention_ObjStore() = default; +}; + +class RGWPutObjLegalHold_ObjStore : public RGWPutObjLegalHold { +public: + RGWPutObjLegalHold_ObjStore() = default; + ~RGWPutObjLegalHold_ObjStore() override = default; + int get_params(optional_yield y) override; +}; + +class 
RGWGetObjLegalHold_ObjStore : public RGWGetObjLegalHold { +public: + RGWGetObjLegalHold_ObjStore() = default; + ~RGWGetObjLegalHold_ObjStore() = default; +}; + +class RGWRESTOp : public RGWOp { +protected: + RGWRESTFlusher flusher; + +public: + void init(rgw::sal::Driver* driver, req_state *s, + RGWHandler *dialect_handler) override { + RGWOp::init(driver, s, dialect_handler); + flusher.init(s, this); + } + void send_response() override; + virtual int check_caps(const RGWUserCaps& caps) + { return -EPERM; } /* should to be implemented! */ + int verify_permission(optional_yield y) override; + dmc::client_id dmclock_client() override { return dmc::client_id::admin; } +}; + +class RGWHandler_REST : public RGWHandler { +protected: + + virtual bool is_obj_update_op() const { return false; } + virtual RGWOp *op_get() { return NULL; } + virtual RGWOp *op_put() { return NULL; } + virtual RGWOp *op_delete() { return NULL; } + virtual RGWOp *op_head() { return NULL; } + virtual RGWOp *op_post() { return NULL; } + virtual RGWOp *op_copy() { return NULL; } + virtual RGWOp *op_options() { return NULL; } + +public: + static int allocate_formatter(req_state *s, RGWFormat default_formatter, + bool configurable); + + static constexpr int MAX_BUCKET_NAME_LEN = 255; + static constexpr int MAX_OBJ_NAME_LEN = 1024; + + RGWHandler_REST() {} + ~RGWHandler_REST() override {} + + static int validate_bucket_name(const std::string& bucket); + static int validate_object_name(const std::string& object); + static int reallocate_formatter(req_state *s, RGWFormat type); + + int init_permissions(RGWOp* op, optional_yield y) override; + int read_permissions(RGWOp* op, optional_yield y) override; + + virtual RGWOp* get_op(void); + virtual void put_op(RGWOp* op); +}; + +class RGWHandler_REST_SWIFT; +class RGWHandler_SWIFT_Auth; +class RGWHandler_REST_S3; + +namespace rgw::auth { + +class StrategyRegistry; + +} + +class RGWRESTMgr { + bool should_log; + +protected: + std::map resource_mgrs; + 
std::multimap resources_by_size; + RGWRESTMgr* default_mgr; + + virtual RGWRESTMgr* get_resource_mgr(req_state* s, + const std::string& uri, + std::string* out_uri); + + virtual RGWRESTMgr* get_resource_mgr_as_default(req_state* const s, + const std::string& uri, + std::string* our_uri) { + return this; + } + +public: + RGWRESTMgr() + : should_log(false), + default_mgr(nullptr) { + } + virtual ~RGWRESTMgr(); + + void register_resource(std::string resource, RGWRESTMgr* mgr); + void register_default_mgr(RGWRESTMgr* mgr); + + virtual RGWRESTMgr* get_manager(req_state* const s, + /* Prefix to be concatenated with @uri + * during the lookup. */ + const std::string& frontend_prefix, + const std::string& uri, + std::string* out_uri) final { + return get_resource_mgr(s, frontend_prefix + uri, out_uri); + } + + virtual RGWHandler_REST* get_handler( + rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix + ) { + return nullptr; + } + + virtual void put_handler(RGWHandler_REST* const handler) { + delete handler; + } + + void set_logging(bool _should_log) { + should_log = _should_log; + } + + bool get_logging() const { + return should_log; + } +}; + +class RGWLibIO; +class RGWRestfulIO; + +class RGWREST { + using x_header = basic_sstring; + boost::container::flat_set x_headers; + RGWRESTMgr mgr; + + static int preprocess(req_state *s, rgw::io::BasicClient* rio); +public: + RGWREST() {} + RGWHandler_REST *get_handler(rgw::sal::Driver* driver, + req_state *s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix, + RGWRestfulIO *rio, + RGWRESTMgr **pmgr, + int *init_error); +#if 0 + RGWHandler *get_handler(RGWRados *driver, req_state *s, + RGWLibIO *io, RGWRESTMgr **pmgr, + int *init_error); +#endif + + void put_handler(RGWHandler_REST *handler) { + mgr.put_handler(handler); + } + + void register_resource(std::string resource, RGWRESTMgr *m, + bool register_empty 
= false) { + if (!register_empty && resource.empty()) + return; + + mgr.register_resource(resource, m); + } + + void register_default_mgr(RGWRESTMgr *m) { + mgr.register_default_mgr(m); + } + + void register_x_headers(const std::string& headers); + + bool log_x_headers(void) { + return (x_headers.size() > 0); + } + + bool log_x_header(const std::string& header) { + return (x_headers.find(header) != x_headers.end()); + } +}; + +static constexpr int64_t NO_CONTENT_LENGTH = -1; +static constexpr int64_t CHUNKED_TRANSFER_ENCODING = -2; + +extern void dump_errno(int http_ret, std::string& out); +extern void dump_errno(const struct rgw_err &err, std::string& out); +extern void dump_errno(req_state *s); +extern void dump_errno(req_state *s, int http_ret); +extern void end_header(req_state *s, + RGWOp* op = nullptr, + const char *content_type = nullptr, + const int64_t proposed_content_length = + NO_CONTENT_LENGTH, + bool force_content_type = false, + bool force_no_error = false); +extern void dump_start(req_state *s); +extern void list_all_buckets_start(req_state *s); +extern void dump_owner(req_state *s, const rgw_user& id, + const std::string& name, const char *section = NULL); +extern void dump_header(req_state* s, + const std::string_view& name, + const std::string_view& val); +extern void dump_header(req_state* s, + const std::string_view& name, + ceph::buffer::list& bl); +extern void dump_header(req_state* s, + const std::string_view& name, + long long val); +extern void dump_header(req_state* s, + const std::string_view& name, + const utime_t& val); + +template +inline void dump_header_prefixed(req_state* s, + const std::string_view& name_prefix, + const std::string_view& name, + Args&&... 
args) { + char full_name_buf[name_prefix.size() + name.size() + 1]; + const auto len = snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s", + static_cast(name_prefix.length()), + name_prefix.data(), + static_cast(name.length()), + name.data()); + std::string_view full_name(full_name_buf, len); + return dump_header(s, std::move(full_name), std::forward(args)...); +} + +template +inline void dump_header_infixed(req_state* s, + const std::string_view& prefix, + const std::string_view& infix, + const std::string_view& sufix, + Args&&... args) { + char full_name_buf[prefix.size() + infix.size() + sufix.size() + 1]; + const auto len = snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s%.*s", + static_cast(prefix.length()), + prefix.data(), + static_cast(infix.length()), + infix.data(), + static_cast(sufix.length()), + sufix.data()); + std::string_view full_name(full_name_buf, len); + return dump_header(s, std::move(full_name), std::forward(args)...); +} + +template +inline void dump_header_quoted(req_state* s, + const std::string_view& name, + const std::string_view& val) { + /* We need two extra bytes for quotes. */ + char qvalbuf[val.size() + 2 + 1]; + const auto len = snprintf(qvalbuf, sizeof(qvalbuf), "\"%.*s\"", + static_cast(val.length()), val.data()); + return dump_header(s, name, std::string_view(qvalbuf, len)); +} + +template +inline void dump_header_if_nonempty(req_state* s, + const std::string_view& name, + const ValueT& value) { + if (name.length() > 0 && value.length() > 0) { + return dump_header(s, name, value); + } +} + +inline std::string compute_domain_uri(const req_state *s) { + std::string uri = (!s->info.domain.empty()) ? s->info.domain : + [&s]() -> std::string { + RGWEnv const &env(*(s->info.env)); + std::string uri = + env.get("SERVER_PORT_SECURE") ? 
"https://" : "http://"; + if (env.exists("SERVER_NAME")) { + uri.append(env.get("SERVER_NAME", "")); + } else { + uri.append(env.get("HTTP_HOST", "")); + } + return uri; + }(); + return uri; +} + +extern void dump_content_length(req_state *s, uint64_t len); +extern int64_t parse_content_length(const char *content_length); +extern void dump_etag(req_state *s, + const std::string_view& etag, + bool quoted = false); +extern void dump_epoch_header(req_state *s, const char *name, real_time t); +extern void dump_time_header(req_state *s, const char *name, real_time t); +extern void dump_last_modified(req_state *s, real_time t); +extern void abort_early(req_state* s, RGWOp* op, int err, + RGWHandler* handler, optional_yield y); +extern void dump_range(req_state* s, uint64_t ofs, uint64_t end, + uint64_t total_size); +extern void dump_continue(req_state *s); +extern void list_all_buckets_end(req_state *s); +extern void dump_time(req_state *s, const char *name, real_time t); +extern std::string dump_time_to_str(const real_time& t); +extern void dump_bucket_from_state(req_state *s); +extern void dump_redirect(req_state *s, const std::string& redirect); +extern bool is_valid_url(const char *url); +extern void dump_access_control(req_state *s, const char *origin, + const char *meth, + const char *hdr, const char *exp_hdr, + uint32_t max_age); +extern void dump_access_control(req_state *s, RGWOp *op); + +extern int dump_body(req_state* s, const char* buf, size_t len); +extern int dump_body(req_state* s, /* const */ ceph::buffer::list& bl); +extern int dump_body(req_state* s, const std::string& str); +extern int recv_body(req_state* s, char* buf, size_t max); diff --git a/src/rgw/rgw_rest_admin.h b/src/rgw/rgw_rest_admin.h new file mode 100644 index 000000000..91230af6c --- /dev/null +++ b/src/rgw/rgw_rest_admin.h @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include 
"rgw/rgw_rest.h" + +class RGWRESTMgr_Admin : public RGWRESTMgr { +public: + RGWRESTMgr_Admin() {} + ~RGWRESTMgr_Admin() override {} +}; diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc new file mode 100644 index 000000000..b0b8fcc84 --- /dev/null +++ b/src/rgw/rgw_rest_client.cc @@ -0,0 +1,1124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_common.h" +#include "rgw_rest_client.h" +#include "rgw_auth_s3.h" +#include "rgw_http_errors.h" + +#include "common/armor.h" +#include "common/strtol.h" +#include "include/str_list.h" +#include "rgw_crypt_sanitize.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int RGWHTTPSimpleRequest::get_status() +{ + int retcode = get_req_retcode(); + if (retcode < 0) { + return retcode; + } + return status; +} + +int RGWHTTPSimpleRequest::handle_header(const string& name, const string& val) +{ + if (name == "CONTENT_LENGTH") { + string err; + long len = strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 0) << "ERROR: failed converting content length (" << val << ") to int " << dendl; + return -EINVAL; + } + + max_response = len; + } + + return 0; +} + +int RGWHTTPSimpleRequest::receive_header(void *ptr, size_t len) +{ + unique_lock guard(out_headers_lock); + + char line[len + 1]; + + char *s = (char *)ptr, *end = (char *)ptr + len; + char *p = line; + ldpp_dout(this, 30) << "receive_http_header" << dendl; + + while (s != end) { + if (*s == '\r') { + s++; + continue; + } + if (*s == '\n') { + *p = '\0'; + ldpp_dout(this, 30) << "received header:" << line << dendl; + // TODO: fill whatever data required here + char *l = line; + char *tok = strsep(&l, " \t:"); + if (tok && l) { + while (*l == ' ') + l++; + + if (strcmp(tok, "HTTP") == 0 || strncmp(tok, "HTTP/", 5) == 0) { + http_status = atoi(l); + if (http_status == 100) /* 100-continue response */ + continue; 
+ status = rgw_http_error_to_errno(http_status); + } else { + /* convert header field name to upper case */ + char *src = tok; + char buf[len + 1]; + size_t i; + for (i = 0; i < len && *src; ++i, ++src) { + switch (*src) { + case '-': + buf[i] = '_'; + break; + default: + buf[i] = toupper(*src); + } + } + buf[i] = '\0'; + out_headers[buf] = l; + int r = handle_header(buf, l); + if (r < 0) + return r; + } + } + } + if (s != end) + *p++ = *s++; + } + return 0; +} + +static void get_new_date_str(string& date_str) +{ + date_str = rgw_to_asctime(ceph_clock_now()); +} + +static void get_gmt_date_str(string& date_str) +{ + auto now_time = ceph::real_clock::now(); + time_t rawtime = ceph::real_clock::to_time_t(now_time); + + char buffer[80]; + + struct tm timeInfo; + gmtime_r(&rawtime, &timeInfo); + strftime(buffer, sizeof(buffer), "%a, %d %b %Y %H:%M:%S %z", &timeInfo); + + date_str = buffer; +} + +int RGWHTTPSimpleRequest::send_data(void *ptr, size_t len, bool* pause) +{ + if (!send_iter) + return 0; + + if (len > send_iter->get_remaining()) + len = send_iter->get_remaining(); + + send_iter->copy(len, (char *)ptr); + + return len; +} + +int RGWHTTPSimpleRequest::receive_data(void *ptr, size_t len, bool *pause) +{ + size_t cp_len, left_len; + + left_len = max_response > response.length() ? (max_response - response.length()) : 0; + if (left_len == 0) + return 0; /* don't read extra data */ + + cp_len = (len > left_len) ? 
left_len : len; + bufferptr p((char *)ptr, cp_len); + + response.append(p); + + return 0; +} + +static void append_param(string& dest, const string& name, const string& val) +{ + if (dest.empty()) { + dest.append("?"); + } else { + dest.append("&"); + } + string url_name; + url_encode(name, url_name); + dest.append(url_name); + + if (!val.empty()) { + string url_val; + url_encode(val, url_val); + dest.append("="); + dest.append(url_val); + } +} + +static void do_get_params_str(const param_vec_t& params, map& extra_args, string& dest) +{ + map::iterator miter; + for (miter = extra_args.begin(); miter != extra_args.end(); ++miter) { + append_param(dest, miter->first, miter->second); + } + for (auto iter = params.begin(); iter != params.end(); ++iter) { + append_param(dest, iter->first, iter->second); + } +} + +void RGWHTTPSimpleRequest::get_params_str(map& extra_args, string& dest) +{ + do_get_params_str(params, extra_args, dest); +} + +void RGWHTTPSimpleRequest::get_out_headers(map *pheaders) +{ + unique_lock guard(out_headers_lock); + pheaders->swap(out_headers); + out_headers.clear(); +} + +static int sign_request_v2(const DoutPrefixProvider *dpp, const RGWAccessKey& key, + const string& region, const string& service, + RGWEnv& env, req_info& info, + const bufferlist *opt_content) +{ + /* don't sign if no key is provided */ + if (key.key.empty()) { + return 0; + } + + auto cct = dpp->get_cct(); + + if (cct->_conf->subsys.should_gather()) { + for (const auto& i: env.get_map()) { + ldpp_dout(dpp, 20) << __func__ << "():> " << i.first << " -> " << rgw::crypt_sanitize::x_meta_map{i.first, i.second} << dendl; + } + } + + string canonical_header; + if (!rgw_create_s3_canonical_header(dpp, info, NULL, canonical_header, false)) { + ldpp_dout(dpp, 0) << "failed to create canonical s3 header" << dendl; + return -EINVAL; + } + + ldpp_dout(dpp, 10) << "generated canonical header: " << canonical_header << dendl; + + string digest; + try { + digest = 
rgw::auth::s3::get_v2_signature(cct, key.key, canonical_header); + } catch (int ret) { + return ret; + } + + string auth_hdr = "AWS " + key.id + ":" + digest; + ldpp_dout(dpp, 15) << "generated auth header: " << auth_hdr << dendl; + + env.set("AUTHORIZATION", auth_hdr); + + return 0; +} + +static int sign_request_v4(const DoutPrefixProvider *dpp, const RGWAccessKey& key, + const string& region, const string& service, + RGWEnv& env, req_info& info, + const bufferlist *opt_content) +{ + /* don't sign if no key is provided */ + if (key.key.empty()) { + return 0; + } + + auto cct = dpp->get_cct(); + + if (cct->_conf->subsys.should_gather()) { + for (const auto& i: env.get_map()) { + ldpp_dout(dpp, 20) << __func__ << "():> " << i.first << " -> " << rgw::crypt_sanitize::x_meta_map{i.first, i.second} << dendl; + } + } + + rgw::auth::s3::AWSSignerV4::prepare_result_t sigv4_data; + if (service == "s3") { + sigv4_data = rgw::auth::s3::AWSSignerV4::prepare(dpp, key.id, region, service, info, opt_content, true); + } else { + sigv4_data = rgw::auth::s3::AWSSignerV4::prepare(dpp, key.id, region, service, info, opt_content, false); + } + auto sigv4_headers = sigv4_data.signature_factory(dpp, key.key, sigv4_data); + + for (auto& entry : sigv4_headers) { + ldpp_dout(dpp, 20) << __func__ << "(): sigv4 header: " << entry.first << ": " << entry.second << dendl; + env.set(entry.first, entry.second); + } + + return 0; +} + +static int sign_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, + const string& region, const string& service, + RGWEnv& env, req_info& info, + const bufferlist *opt_content) +{ + auto authv = dpp->get_cct()->_conf.get_val("rgw_s3_client_max_sig_ver"); + if (authv > 0 && + authv <= 3) { + return sign_request_v2(dpp, key, region, service, env, info, opt_content); + } + + return sign_request_v4(dpp, key, region, service, env, info, opt_content); +} + +static string extract_region_name(string&& s) +{ + if (s == "s3") { + return "us-east-1"; + } + if 
(boost::algorithm::starts_with(s, "s3-")) { + return s.substr(3); + } + return std::move(s); +} + + +static bool identify_scope(const DoutPrefixProvider *dpp, + CephContext *cct, + const string& host, + string *region, + string& service) +{ + if (!boost::algorithm::ends_with(host, "amazonaws.com")) { + ldpp_dout(dpp, 20) << "NOTICE: cannot identify region for connection to: " << host << dendl; + return false; + } + + vector vec; + + get_str_vec(host, ".", vec); + + string ser = service; + if (service.empty()) { + service = "s3"; /* default */ + } + + for (auto iter = vec.begin(); iter != vec.end(); ++iter) { + auto& s = *iter; + if (s == "s3" || + s == "execute-api" || + s == "iam") { + if (s == "execute-api") { + service = s; + } + ++iter; + if (iter == vec.end()) { + ldpp_dout(dpp, 0) << "WARNING: cannot identify region name from host name: " << host << dendl; + return false; + } + auto& next = *iter; + if (next == "amazonaws") { + *region = "us-east-1"; + return true; + } + *region = next; + return true; + } else if (boost::algorithm::starts_with(s, "s3-")) { + *region = extract_region_name(std::move(s)); + return true; + } + } + + return false; +} + +static void scope_from_api_name(const DoutPrefixProvider *dpp, + CephContext *cct, + const string& host, + std::optional api_name, + string *region, + string& service) +{ + if (api_name && service.empty()) { + *region = *api_name; + service = "s3"; + return; + } + + if (!identify_scope(dpp, cct, host, region, service)) { + if (service == "iam") { + *region = cct->_conf->rgw_zonegroup; + } else { + *region = cct->_conf->rgw_zonegroup; + service = "s3"; + } + return; + } +} + +int RGWRESTSimpleRequest::forward_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y, std::string service) +{ + + string date_str; + get_new_date_str(date_str); + + RGWEnv new_env; + req_info new_info(cct, &new_env); + 
new_info.rebuild_from(info); + string bucket_encode; + string request_uri_encode; + size_t pos = new_info.request_uri.substr(1, new_info.request_uri.size() - 1).find("/"); + string bucket = new_info.request_uri.substr(1, pos); + url_encode(bucket, bucket_encode); + if (std::string::npos != pos) + request_uri_encode = string("/") + bucket_encode + new_info.request_uri.substr(pos + 1); + else + request_uri_encode = string("/") + bucket_encode; + new_info.request_uri = request_uri_encode; + + for (auto& param : params) { + new_info.args.append(param.first, param.second); + } + + new_env.set("HTTP_DATE", date_str.c_str()); + const char* const content_md5 = info.env->get("HTTP_CONTENT_MD5"); + if (content_md5) { + new_env.set("HTTP_CONTENT_MD5", content_md5); + } + + string region; + string s; + if (!service.empty()) { + s = service; + } + + scope_from_api_name(dpp, cct, host, api_name, &region, s); + + const char *maybe_payload_hash = info.env->get("HTTP_X_AMZ_CONTENT_SHA256"); + if (maybe_payload_hash && s != "iam") { + new_env.set("HTTP_X_AMZ_CONTENT_SHA256", maybe_payload_hash); + } + + int ret = sign_request(dpp, key, region, s, new_env, new_info, nullptr); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to sign request" << dendl; + return ret; + } + + if (s == "iam") { + info.args.remove("PayloadHash"); + } + + for (const auto& kv: new_env.get_map()) { + headers.emplace_back(kv); + } + + meta_map_t& meta_map = new_info.x_meta_map; + for (const auto& kv: meta_map) { + headers.emplace_back(kv); + } + + string params_str; + get_params_str(info.args.get_params(), params_str); + + string new_url = url; + string& resource = new_info.request_uri; + string new_resource = resource; + if (new_url[new_url.size() - 1] == '/' && resource[0] == '/') { + new_url = new_url.substr(0, new_url.size() - 1); + } else if (resource[0] != '/') { + new_resource = "/"; + new_resource.append(resource); + } + new_url.append(new_resource + params_str); + + bufferlist::iterator bliter; + + 
if (inbl) { + bliter = inbl->begin(); + send_iter = &bliter; + + set_send_length(inbl->length()); + } + + method = new_info.method; + url = new_url; + + int r = process(y); + if (r < 0){ + if (r == -EINVAL){ + // curl_easy has errored, generally means the service is not available + r = -ERR_SERVICE_UNAVAILABLE; + } + return r; + } + + response.append((char)0); /* NULL terminate response */ + + if (outbl) { + *outbl = std::move(response); + } + + return status; +} + +class RGWRESTStreamOutCB : public RGWGetDataCB { + RGWRESTStreamS3PutObj *req; +public: + explicit RGWRESTStreamOutCB(RGWRESTStreamS3PutObj *_req) : req(_req) {} + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override; /* callback for object iteration when sending data */ +}; + +int RGWRESTStreamOutCB::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + dout(20) << "RGWRESTStreamOutCB::handle_data bl.length()=" << bl.length() << " bl_ofs=" << bl_ofs << " bl_len=" << bl_len << dendl; + if (!bl_ofs && bl_len == bl.length()) { + req->add_send_data(bl); + return 0; + } + + bufferptr bp(bl.c_str() + bl_ofs, bl_len); + bufferlist new_bl; + new_bl.push_back(bp); + + req->add_send_data(new_bl); + return 0; +} + +RGWRESTStreamS3PutObj::~RGWRESTStreamS3PutObj() +{ + delete out_cb; +} + +static void grants_by_type_add_one_grant(map& grants_by_type, int perm, ACLGrant& grant) +{ + string& s = grants_by_type[perm]; + + if (!s.empty()) + s.append(", "); + + string id_type_str; + ACLGranteeType& type = grant.get_type(); + switch (type.get_type()) { + case ACL_TYPE_GROUP: + id_type_str = "uri"; + break; + case ACL_TYPE_EMAIL_USER: + id_type_str = "emailAddress"; + break; + default: + id_type_str = "id"; + } + rgw_user id; + grant.get_id(id); + s.append(id_type_str + "=\"" + id.to_str() + "\""); +} + +struct grant_type_to_header { + int type; + const char *header; +}; + +struct grant_type_to_header grants_headers_def[] = { + { RGW_PERM_FULL_CONTROL, "x-amz-grant-full-control"}, + { RGW_PERM_READ, 
"x-amz-grant-read"}, + { RGW_PERM_WRITE, "x-amz-grant-write"}, + { RGW_PERM_READ_ACP, "x-amz-grant-read-acp"}, + { RGW_PERM_WRITE_ACP, "x-amz-grant-write-acp"}, + { 0, NULL} +}; + +static bool grants_by_type_check_perm(map& grants_by_type, int perm, ACLGrant& grant, int check_perm) +{ + if ((perm & check_perm) == check_perm) { + grants_by_type_add_one_grant(grants_by_type, check_perm, grant); + return true; + } + return false; +} + +static void grants_by_type_add_perm(map& grants_by_type, int perm, ACLGrant& grant) +{ + struct grant_type_to_header *t; + + for (t = grants_headers_def; t->header; t++) { + if (grants_by_type_check_perm(grants_by_type, perm, grant, t->type)) + return; + } +} + +static void add_grants_headers(map& grants, RGWEnv& env, meta_map_t& meta_map) +{ + struct grant_type_to_header *t; + + for (t = grants_headers_def; t->header; t++) { + map::iterator iter = grants.find(t->type); + if (iter != grants.end()) { + env.set(t->header,iter->second); + meta_map[t->header] = iter->second; + } + } +} + +RGWRESTGenerateHTTPHeaders::RGWRESTGenerateHTTPHeaders(CephContext *_cct, RGWEnv *_env, req_info *_info) : + DoutPrefix(_cct, dout_subsys, "rest gen http headers: "), + cct(_cct), + new_env(_env), + new_info(_info) { +} + +void RGWRESTGenerateHTTPHeaders::init(const string& _method, const string& host, + const string& resource_prefix, const string& _url, + const string& resource, const param_vec_t& params, + std::optional api_name) +{ + scope_from_api_name(this, cct, host, api_name, &region, service); + + string params_str; + map& args = new_info->args.get_params(); + do_get_params_str(params, args, params_str); + + /* merge params with extra args so that we can sign correctly */ + for (auto iter = params.begin(); iter != params.end(); ++iter) { + new_info->args.append(iter->first, iter->second); + } + + url = _url + resource + params_str; + + string date_str; + get_gmt_date_str(date_str); + + new_env->set("HTTP_DATE", date_str.c_str()); + 
new_env->set("HTTP_HOST", host); + + method = _method; + new_info->method = method.c_str(); + new_info->host = host; + + new_info->script_uri = "/"; + new_info->script_uri.append(resource_prefix); + new_info->script_uri.append(resource); + new_info->request_uri = new_info->script_uri; +} + +static bool is_x_amz(const string& s) { + return boost::algorithm::starts_with(s, "x-amz-"); +} + +void RGWRESTGenerateHTTPHeaders::set_extra_headers(const map& extra_headers) +{ + for (auto iter : extra_headers) { + const string& name = lowercase_dash_http_attr(iter.first); + new_env->set(name, iter.second.c_str()); + if (is_x_amz(name)) { + new_info->x_meta_map[name] = iter.second; + } + } +} + +int RGWRESTGenerateHTTPHeaders::set_obj_attrs(const DoutPrefixProvider *dpp, map& rgw_attrs) +{ + map new_attrs; + + /* merge send headers */ + for (auto& attr: rgw_attrs) { + bufferlist& bl = attr.second; + const string& name = attr.first; + string val = bl.c_str(); + if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) { + string header_name = RGW_AMZ_META_PREFIX; + header_name.append(name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1)); + new_attrs[header_name] = val; + } + } + + RGWAccessControlPolicy policy; + int ret = rgw_policy_from_attrset(dpp, cct, rgw_attrs, &policy); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: couldn't get policy ret=" << ret << dendl; + return ret; + } + + set_http_attrs(new_attrs); + set_policy(policy); + + return 0; +} + +void RGWRESTGenerateHTTPHeaders::set_http_attrs(const map& http_attrs) +{ + /* merge send headers */ + for (auto& attr: http_attrs) { + const string& val = attr.second; + const string& name = lowercase_dash_http_attr(attr.first); + if (is_x_amz(name)) { + new_env->set(name, val); + new_info->x_meta_map[name] = val; + } else { + new_env->set(attr.first, val); /* Ugh, using the uppercase representation, + as the signing function calls info.env.get("CONTENT_TYPE"). + This needs to be cleaned up! 
*/ + } + } +} + +void RGWRESTGenerateHTTPHeaders::set_policy(RGWAccessControlPolicy& policy) +{ + /* update acl headers */ + RGWAccessControlList& acl = policy.get_acl(); + multimap& grant_map = acl.get_grant_map(); + multimap::iterator giter; + map grants_by_type; + for (giter = grant_map.begin(); giter != grant_map.end(); ++giter) { + ACLGrant& grant = giter->second; + ACLPermission& perm = grant.get_permission(); + grants_by_type_add_perm(grants_by_type, perm.get_permissions(), grant); + } + add_grants_headers(grants_by_type, *new_env, new_info->x_meta_map); +} + +int RGWRESTGenerateHTTPHeaders::sign(const DoutPrefixProvider *dpp, RGWAccessKey& key, const bufferlist *opt_content) +{ + int ret = sign_request(dpp, key, region, service, *new_env, *new_info, opt_content); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to sign request" << dendl; + return ret; + } + + return 0; +} + +void RGWRESTStreamS3PutObj::send_init(const rgw_obj& obj) +{ + string resource_str; + string resource; + string new_url = url; + string new_host = host; + + const auto& bucket_name = obj.bucket.name; + + if (host_style == VirtualStyle) { + resource_str = obj.get_oid(); + + new_url = bucket_name + "." + new_url; + new_host = bucket_name + "." 
+ new_host; + } else { + resource_str = bucket_name + "/" + obj.get_oid(); + } + + //do not encode slash in object key name + url_encode(resource_str, resource, false); + + if (new_url[new_url.size() - 1] != '/') + new_url.append("/"); + + method = "PUT"; + headers_gen.init(method, new_host, resource_prefix, new_url, resource, params, api_name); + + url = headers_gen.get_url(); +} + +void RGWRESTStreamS3PutObj::send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, map& rgw_attrs) +{ + headers_gen.set_obj_attrs(dpp, rgw_attrs); + + send_ready(dpp, key); +} + +void RGWRESTStreamS3PutObj::send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, const map& http_attrs, + RGWAccessControlPolicy& policy) +{ + headers_gen.set_http_attrs(http_attrs); + headers_gen.set_policy(policy); + + send_ready(dpp, key); +} + +void RGWRESTStreamS3PutObj::send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key) +{ + headers_gen.sign(dpp, key, nullptr); + + for (const auto& kv: new_env.get_map()) { + headers.emplace_back(kv); + } + + out_cb = new RGWRESTStreamOutCB(this); +} + +void RGWRESTStreamS3PutObj::put_obj_init(const DoutPrefixProvider *dpp, RGWAccessKey& key, const rgw_obj& obj, map& attrs) +{ + send_init(obj); + send_ready(dpp, key, attrs); +} + +void set_str_from_headers(map& out_headers, const string& header_name, string& str) +{ + map::iterator iter = out_headers.find(header_name); + if (iter != out_headers.end()) { + str = iter->second; + } else { + str.clear(); + } +} + +static int parse_rgwx_mtime(const DoutPrefixProvider *dpp, CephContext *cct, const string& s, ceph::real_time *rt) +{ + string err; + vector vec; + + get_str_vec(s, ".", vec); + + if (vec.empty()) { + return -EINVAL; + } + + long secs = strict_strtol(vec[0].c_str(), 10, &err); + long nsecs = 0; + if (!err.empty()) { + ldpp_dout(dpp, 0) << "ERROR: failed converting mtime (" << s << ") to real_time " << dendl; + return -EINVAL; + } + + if (vec.size() > 1) { + nsecs = 
strict_strtol(vec[1].c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(dpp, 0) << "ERROR: failed converting mtime (" << s << ") to real_time " << dendl; + return -EINVAL; + } + } + + *rt = utime_t(secs, nsecs).to_real_time(); + + return 0; +} + +static void send_prepare_convert(const rgw_obj& obj, string *resource) +{ + string urlsafe_bucket, urlsafe_object; + url_encode(obj.bucket.get_key(':', 0), urlsafe_bucket); + url_encode(obj.key.name, urlsafe_object); + *resource = urlsafe_bucket + "/" + urlsafe_object; +} + +int RGWRESTStreamRWRequest::send_request(const DoutPrefixProvider *dpp, RGWAccessKey& key, map& extra_headers, const rgw_obj& obj, RGWHTTPManager *mgr) +{ + string resource; + send_prepare_convert(obj, &resource); + + return send_request(dpp, &key, extra_headers, resource, mgr); +} + +int RGWRESTStreamRWRequest::send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey& key, map& extra_headers, const rgw_obj& obj) +{ + string resource; + send_prepare_convert(obj, &resource); + + return do_send_prepare(dpp, &key, extra_headers, resource); +} + +int RGWRESTStreamRWRequest::send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, map& extra_headers, const string& resource, + bufferlist *send_data) +{ + string new_resource; + //do not encode slash + url_encode(resource, new_resource, false); + + return do_send_prepare(dpp, key, extra_headers, new_resource, send_data); +} + +int RGWRESTStreamRWRequest::do_send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, map& extra_headers, const string& resource, + bufferlist *send_data) +{ + string new_url = url; + if (!new_url.empty() && new_url.back() != '/') + new_url.append("/"); + + string new_resource; + string bucket_name; + string old_resource = resource; + + if (resource[0] == '/') { + new_resource = resource.substr(1); + } else { + new_resource = resource; + } + + size_t pos = new_resource.find("/"); + bucket_name = new_resource.substr(0, pos); + + //when dest is a bucket with out other 
params, uri should end up with '/' + if(pos == string::npos && params.size() == 0 && host_style == VirtualStyle) { + new_resource.append("/"); + } + + if (host_style == VirtualStyle) { + new_url = protocol + "://" + bucket_name + "." + host; + if(pos == string::npos) { + new_resource = ""; + } else { + new_resource = new_resource.substr(pos+1); + } + } + + headers_gen.emplace(cct, &new_env, &new_info); + + headers_gen->init(method, host, resource_prefix, new_url, new_resource, params, api_name); + + headers_gen->set_http_attrs(extra_headers); + + if (key) { + sign_key = *key; + } + + if (send_data) { + set_send_length(send_data->length()); + set_outbl(*send_data); + set_send_data_hint(true); + } + + method = new_info.method; + url = headers_gen->get_url(); + + return 0; +} + +int RGWRESTStreamRWRequest::send_request(const DoutPrefixProvider *dpp, RGWAccessKey *key, map& extra_headers, const string& resource, + RGWHTTPManager *mgr, bufferlist *send_data) +{ + int ret = send_prepare(dpp, key, extra_headers, resource, send_data); + if (ret < 0) { + return ret; + } + + return send(mgr); +} + + +int RGWRESTStreamRWRequest::send(RGWHTTPManager *mgr) +{ + if (!headers_gen) { + ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): send_prepare() was not called: likey a bug!" 
<< dendl; + return -EINVAL; + } + + const bufferlist *outblp{nullptr}; + + if (send_len == outbl.length()) { + outblp = &outbl; + } + + if (sign_key) { + int r = headers_gen->sign(this, *sign_key, outblp); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to sign request" << dendl; + return r; + } + } + + for (const auto& kv: new_env.get_map()) { + headers.emplace_back(kv); + } + + return RGWHTTPStreamRWRequest::send(mgr); +} + +int RGWHTTPStreamRWRequest::complete_request(optional_yield y, + string *etag, + real_time *mtime, + uint64_t *psize, + map *pattrs, + map *pheaders) +{ + int ret = wait(y); + if (ret < 0) { + return ret; + } + + unique_lock guard(out_headers_lock); + + if (etag) { + set_str_from_headers(out_headers, "ETAG", *etag); + } + if (status >= 0) { + if (mtime) { + string mtime_str; + set_str_from_headers(out_headers, "RGWX_MTIME", mtime_str); + if (!mtime_str.empty()) { + int ret = parse_rgwx_mtime(this, cct, mtime_str, mtime); + if (ret < 0) { + return ret; + } + } else { + *mtime = real_time(); + } + } + if (psize) { + string size_str; + set_str_from_headers(out_headers, "RGWX_OBJECT_SIZE", size_str); + string err; + *psize = strict_strtoll(size_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 0) << "ERROR: failed parsing embedded metadata object size (" << size_str << ") to int " << dendl; + return -EIO; + } + } + } + + for (auto iter = out_headers.begin(); pattrs && iter != out_headers.end(); ++iter) { + const string& attr_name = iter->first; + if (attr_name.compare(0, sizeof(RGW_HTTP_RGWX_ATTR_PREFIX) - 1, RGW_HTTP_RGWX_ATTR_PREFIX) == 0) { + string name = attr_name.substr(sizeof(RGW_HTTP_RGWX_ATTR_PREFIX) - 1); + const char *src = name.c_str(); + char buf[name.size() + 1]; + char *dest = buf; + for (; *src; ++src, ++dest) { + switch(*src) { + case '_': + *dest = '-'; + break; + default: + *dest = tolower(*src); + } + } + *dest = '\0'; + (*pattrs)[buf] = iter->second; + } + } + + if (pheaders) { + *pheaders = 
std::move(out_headers); + } + return status; +} + +int RGWHTTPStreamRWRequest::handle_header(const string& name, const string& val) +{ + if (name == "RGWX_EMBEDDED_METADATA_LEN") { + string err; + long len = strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 0) << "ERROR: failed converting embedded metadata len (" << val << ") to int " << dendl; + return -EINVAL; + } + + cb->set_extra_data_len(len); + } + return 0; +} + +int RGWHTTPStreamRWRequest::receive_data(void *ptr, size_t len, bool *pause) +{ + size_t orig_len = len; + + if (cb) { + in_data.append((const char *)ptr, len); + + size_t orig_in_data_len = in_data.length(); + + int ret = cb->handle_data(in_data, pause); + if (ret < 0) + return ret; + if (ret == 0) { + in_data.clear(); + } else { + /* partial read */ + ceph_assert(in_data.length() <= orig_in_data_len); + len = ret; + bufferlist bl; + size_t left_to_read = orig_in_data_len - len; + if (in_data.length() > left_to_read) { + in_data.splice(0, in_data.length() - left_to_read, &bl); + } + } + } + ofs += len; + return orig_len; +} + +void RGWHTTPStreamRWRequest::set_stream_write(bool s) { + std::lock_guard wl{write_lock}; + stream_writes = s; +} + +void RGWHTTPStreamRWRequest::unpause_receive() +{ + std::lock_guard req_locker{get_req_lock()}; + if (!read_paused) { + _set_read_paused(false); + } +} + +void RGWHTTPStreamRWRequest::add_send_data(bufferlist& bl) +{ + std::scoped_lock locker{get_req_lock(), write_lock}; + outbl.claim_append(bl); + _set_write_paused(false); +} + +uint64_t RGWHTTPStreamRWRequest::get_pending_send_size() +{ + std::lock_guard wl{write_lock}; + return outbl.length(); +} + +void RGWHTTPStreamRWRequest::finish_write() +{ + std::scoped_lock locker{get_req_lock(), write_lock}; + write_stream_complete = true; + _set_write_paused(false); +} + +int RGWHTTPStreamRWRequest::send_data(void *ptr, size_t len, bool *pause) +{ + uint64_t out_len; + uint64_t send_size; + { + std::lock_guard wl{write_lock}; + + if 
(outbl.length() == 0) { + if ((stream_writes && !write_stream_complete) || + (write_ofs < send_len)) { + *pause = true; + } + return 0; + } + + len = std::min(len, (size_t)outbl.length()); + + bufferlist bl; + outbl.splice(0, len, &bl); + send_size = bl.length(); + if (send_size > 0) { + memcpy(ptr, bl.c_str(), send_size); + write_ofs += send_size; + } + + out_len = outbl.length(); + } + /* don't need to be under write_lock here, avoid deadlocks in case notify callback + * needs to lock */ + if (write_drain_cb) { + write_drain_cb->notify(out_len); + } + return send_size; +} + +int RGWHTTPStreamRWRequest::send(RGWHTTPManager *mgr) +{ + if (!mgr) { + return RGWHTTP::send(this); + } + + int r = mgr->add_request(this); + if (r < 0) + return r; + + return 0; +} diff --git a/src/rgw/rgw_rest_client.h b/src/rgw/rgw_rest_client.h new file mode 100644 index 000000000..97cf899fd --- /dev/null +++ b/src/rgw/rgw_rest_client.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_http_client.h" + +class RGWGetDataCB; + +class RGWHTTPSimpleRequest : public RGWHTTPClient { +protected: + int http_status; + int status; + + using unique_lock = std::unique_lock; + + std::mutex out_headers_lock; + std::map out_headers; + param_vec_t params; + + bufferlist::iterator *send_iter; + + size_t max_response; /* we need this as we don't stream out response */ + bufferlist response; + + virtual int handle_header(const std::string& name, const std::string& val); + void get_params_str(std::map& extra_args, std::string& dest); + +public: + RGWHTTPSimpleRequest(CephContext *_cct, const std::string& _method, const std::string& _url, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPClient(_cct, _method, _url), + http_status(0), status(0), + send_iter(NULL), + max_response(0) { + set_headers(_headers); + set_params(_params); + } + + void set_headers(param_vec_t *_headers) { + if (_headers) + 
headers = *_headers; + } + + void set_params(param_vec_t *_params) { + if (_params) + params = *_params; + } + + int receive_header(void *ptr, size_t len) override; + int receive_data(void *ptr, size_t len, bool *pause) override; + int send_data(void *ptr, size_t len, bool* pause=nullptr) override; + + bufferlist& get_response() { return response; } + + void get_out_headers(std::map *pheaders); /* modifies out_headers */ + + int get_http_status() { return http_status; } + int get_status(); +}; + +class RGWRESTSimpleRequest : public RGWHTTPSimpleRequest { + std::optional api_name; +public: + RGWRESTSimpleRequest(CephContext *_cct, const std::string& _method, const std::string& _url, + param_vec_t *_headers, param_vec_t *_params, + std::optional _api_name) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params), api_name(_api_name) {} + + int forward_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y, std::string service=""); +}; + +class RGWWriteDrainCB { +public: + RGWWriteDrainCB() = default; + virtual ~RGWWriteDrainCB() = default; + virtual void notify(uint64_t pending_size) = 0; +}; + +class RGWRESTGenerateHTTPHeaders : public DoutPrefix { + CephContext *cct; + RGWEnv *new_env; + req_info *new_info; + std::string region; + std::string service; + std::string method; + std::string url; + std::string resource; + +public: + RGWRESTGenerateHTTPHeaders(CephContext *_cct, RGWEnv *_env, req_info *_info); + void init(const std::string& method, const std::string& host, + const std::string& resource_prefix, const std::string& url, + const std::string& resource, const param_vec_t& params, + std::optional api_name); + void set_extra_headers(const std::map& extra_headers); + int set_obj_attrs(const DoutPrefixProvider *dpp, std::map& rgw_attrs); + void set_http_attrs(const std::map& http_attrs); + void set_policy(RGWAccessControlPolicy& policy); + int sign(const 
DoutPrefixProvider *dpp, RGWAccessKey& key, const bufferlist *opt_content); + + const std::string& get_url() { return url; } +}; + +class RGWHTTPStreamRWRequest : public RGWHTTPSimpleRequest { +public: + class ReceiveCB; + +private: + ceph::mutex lock = + ceph::make_mutex("RGWHTTPStreamRWRequest"); + ceph::mutex write_lock = + ceph::make_mutex("RGWHTTPStreamRWRequest::write_lock"); + ReceiveCB *cb{nullptr}; + RGWWriteDrainCB *write_drain_cb{nullptr}; + bufferlist in_data; + size_t chunk_ofs{0}; + size_t ofs{0}; + uint64_t write_ofs{0}; + bool read_paused{false}; + bool send_paused{false}; + bool stream_writes{false}; + bool write_stream_complete{false}; +protected: + bufferlist outbl; + + int handle_header(const std::string& name, const std::string& val) override; +public: + int send_data(void *ptr, size_t len, bool *pause) override; + int receive_data(void *ptr, size_t len, bool *pause) override; + + class ReceiveCB { + protected: + uint64_t extra_data_len{0}; + public: + ReceiveCB() = default; + virtual ~ReceiveCB() = default; + virtual int handle_data(bufferlist& bl, bool *pause = nullptr) = 0; + virtual void set_extra_data_len(uint64_t len) { + extra_data_len = len; + } + }; + + RGWHTTPStreamRWRequest(CephContext *_cct, const std::string& _method, const std::string& _url, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params) { + } + RGWHTTPStreamRWRequest(CephContext *_cct, const std::string& _method, const std::string& _url, ReceiveCB *_cb, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params), + cb(_cb) { + } + virtual ~RGWHTTPStreamRWRequest() override {} + + void set_outbl(bufferlist& _outbl) { + outbl.swap(_outbl); + } + + void set_in_cb(ReceiveCB *_cb) { cb = _cb; } + void set_write_drain_cb(RGWWriteDrainCB *_cb) { write_drain_cb = _cb; } + + void unpause_receive(); + + void add_send_data(bufferlist& bl); + + void set_stream_write(bool s); + + 
uint64_t get_pending_send_size(); + + /* finish streaming writes */ + void finish_write(); + + virtual int send(RGWHTTPManager *mgr); + + int complete_request(optional_yield y, + std::string *etag = nullptr, + real_time *mtime = nullptr, + uint64_t *psize = nullptr, + std::map *pattrs = nullptr, + std::map *pheaders = nullptr); +}; + +class RGWRESTStreamRWRequest : public RGWHTTPStreamRWRequest { + std::optional sign_key; + std::optional headers_gen; + RGWEnv new_env; + req_info new_info; + +protected: + std::optional api_name; + HostStyle host_style; +public: + RGWRESTStreamRWRequest(CephContext *_cct, const std::string& _method, const std::string& _url, RGWHTTPStreamRWRequest::ReceiveCB *_cb, + param_vec_t *_headers, param_vec_t *_params, + std::optional _api_name, HostStyle _host_style = PathStyle) : + RGWHTTPStreamRWRequest(_cct, _method, _url, _cb, _headers, _params), + new_info(_cct, &new_env), + api_name(_api_name), host_style(_host_style) { + } + virtual ~RGWRESTStreamRWRequest() override {} + + int send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, std::map& extra_headers, const std::string& resource, bufferlist *send_data = nullptr /* optional input data */); + int send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey& key, std::map& extra_headers, const rgw_obj& obj); + int send(RGWHTTPManager *mgr) override; + + int send_request(const DoutPrefixProvider *dpp, RGWAccessKey& key, std::map& extra_headers, const rgw_obj& obj, RGWHTTPManager *mgr); + int send_request(const DoutPrefixProvider *dpp, RGWAccessKey *key, std::map& extra_headers, const std::string& resource, RGWHTTPManager *mgr, bufferlist *send_data = nullptr /* optional input data */); + + void add_params(param_vec_t *params); + +private: + int do_send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, std::map& extra_headers, const std::string& resource, bufferlist *send_data = nullptr /* optional input data */); +}; + +class RGWRESTStreamReadRequest : public 
RGWRESTStreamRWRequest { +public: + RGWRESTStreamReadRequest(CephContext *_cct, const std::string& _url, ReceiveCB *_cb, param_vec_t *_headers, + param_vec_t *_params, std::optional _api_name, + HostStyle _host_style = PathStyle) : RGWRESTStreamRWRequest(_cct, "GET", _url, _cb, _headers, _params, _api_name, _host_style) {} +}; + +class RGWRESTStreamHeadRequest : public RGWRESTStreamRWRequest { +public: + RGWRESTStreamHeadRequest(CephContext *_cct, const std::string& _url, ReceiveCB *_cb, param_vec_t *_headers, + param_vec_t *_params, std::optional _api_name) : RGWRESTStreamRWRequest(_cct, "HEAD", _url, _cb, _headers, _params, _api_name) {} +}; + +class RGWRESTStreamSendRequest : public RGWRESTStreamRWRequest { +public: + RGWRESTStreamSendRequest(CephContext *_cct, const std::string& method, + const std::string& _url, + ReceiveCB *_cb, param_vec_t *_headers, param_vec_t *_params, + std::optional _api_name, + HostStyle _host_style = PathStyle) : RGWRESTStreamRWRequest(_cct, method, _url, _cb, _headers, _params, _api_name, _host_style) {} +}; + +class RGWRESTStreamS3PutObj : public RGWHTTPStreamRWRequest { + std::optional api_name; + HostStyle host_style; + RGWGetDataCB *out_cb; + RGWEnv new_env; + req_info new_info; + RGWRESTGenerateHTTPHeaders headers_gen; +public: + RGWRESTStreamS3PutObj(CephContext *_cct, const std::string& _method, const std::string& _url, param_vec_t *_headers, + param_vec_t *_params, std::optional _api_name, + HostStyle _host_style) : RGWHTTPStreamRWRequest(_cct, _method, _url, nullptr, _headers, _params), + api_name(_api_name), host_style(_host_style), + out_cb(NULL), new_info(cct, &new_env), headers_gen(_cct, &new_env, &new_info) {} + ~RGWRESTStreamS3PutObj() override; + + void send_init(const rgw_obj& obj); + void send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, std::map& rgw_attrs); + void send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, const std::map& http_attrs, + RGWAccessControlPolicy& policy); + void 
send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key); + + void put_obj_init(const DoutPrefixProvider *dpp, RGWAccessKey& key, const rgw_obj& obj, std::map& attrs); + + RGWGetDataCB *get_out_cb() { return out_cb; } +}; diff --git a/src/rgw/rgw_rest_config.cc b/src/rgw/rgw_rest_config.cc new file mode 100644 index 000000000..a3b93ea3a --- /dev/null +++ b/src/rgw/rgw_rest_config.cc @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/ceph_json.h" +#include "common/strtol.h" +#include "rgw_rest.h" +#include "rgw_op.h" +#include "rgw_rados.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_config.h" +#include "rgw_client_io.h" +#include "rgw_sal_rados.h" +#include "common/errno.h" +#include "include/ceph_assert.h" + +#include "services/svc_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWOp_ZoneConfig_Get::send_response() { + const RGWZoneParams& zone_params = static_cast(driver)->svc()->zone->get_zone_params(); + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + encode_json("zone_params", zone_params, s->formatter); + flusher.flush(); +} + +RGWOp* RGWHandler_Config::op_get() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (type.compare("zone") == 0) { + return new RGWOp_ZoneConfig_Get(); + } + return nullptr; +} diff --git a/src/rgw/rgw_rest_config.h b/src/rgw/rgw_rest_config.h new file mode 100644 index 000000000..1910cbe0b --- /dev/null +++ b/src/rgw/rgw_rest_config.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "rgw_auth_s3.h" +#include "rgw_rest.h" +#include "rgw_zone.h" + +class RGWOp_ZoneConfig_Get : public RGWRESTOp { + RGWZoneParams zone_params; +public: + RGWOp_ZoneConfig_Get() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield) override {} /* driver already has the info we need, just need to send response */ + void send_response() override ; + const char* name() const override { + return "get_zone_config"; + } +}; + +class RGWHandler_Config : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + + int read_permissions(RGWOp*, optional_yield) override { + return 0; + } +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Config() override = default; +}; + + +class RGWRESTMgr_Config : public RGWRESTMgr { +public: + RGWRESTMgr_Config() = default; + ~RGWRESTMgr_Config() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* , + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Config(auth_registry); + } +}; diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc new file mode 100644 index 000000000..ffb536ed9 --- /dev/null +++ b/src/rgw/rgw_rest_conn.cc @@ -0,0 +1,526 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_zone.h" +#include "rgw_rest_conn.h" +#include 
"rgw_sal.h" +#include "rgw_rados.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWRESTConn::RGWRESTConn(CephContext *_cct, rgw::sal::Driver* driver, + const string& _remote_id, + const list& remote_endpoints, + std::optional _api_name, + HostStyle _host_style) + : cct(_cct), + endpoints(remote_endpoints.begin(), remote_endpoints.end()), + remote_id(_remote_id), + api_name(_api_name), + host_style(_host_style) +{ + if (driver) { + key = driver->get_zone()->get_system_key(); + self_zone_group = driver->get_zone()->get_zonegroup().get_id(); + } +} + +RGWRESTConn::RGWRESTConn(CephContext *_cct, + const string& _remote_id, + const list& remote_endpoints, + RGWAccessKey _cred, + std::string _zone_group, + std::optional _api_name, + HostStyle _host_style) + : cct(_cct), + endpoints(remote_endpoints.begin(), remote_endpoints.end()), + key(_cred), + self_zone_group(_zone_group), + remote_id(_remote_id), + api_name(_api_name), + host_style(_host_style) +{ +} + +RGWRESTConn::RGWRESTConn(RGWRESTConn&& other) + : cct(other.cct), + endpoints(std::move(other.endpoints)), + key(std::move(other.key)), + self_zone_group(std::move(other.self_zone_group)), + remote_id(std::move(other.remote_id)), + counter(other.counter.load()) +{ +} + +RGWRESTConn& RGWRESTConn::operator=(RGWRESTConn&& other) +{ + cct = other.cct; + endpoints = std::move(other.endpoints); + key = std::move(other.key); + self_zone_group = std::move(other.self_zone_group); + remote_id = std::move(other.remote_id); + counter = other.counter.load(); + return *this; +} + +int RGWRESTConn::get_url(string& endpoint) +{ + if (endpoints.empty()) { + ldout(cct, 0) << "ERROR: endpoints not configured for upstream zone" << dendl; + return -EIO; + } + + int i = ++counter; + endpoint = endpoints[i % endpoints.size()]; + + return 0; +} + +string RGWRESTConn::get_url() +{ + string endpoint; + get_url(endpoint); + return endpoint; +} + +void RGWRESTConn::populate_params(param_vec_t& params, const rgw_user *uid, 
const string& zonegroup) +{ + populate_uid(params, uid); + populate_zonegroup(params, zonegroup); +} + +int RGWRESTConn::forward(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + param_vec_t params; + populate_params(params, &uid, self_zone_group); + if (objv) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "tag", objv->tag)); + char buf[16]; + snprintf(buf, sizeof(buf), "%lld", (long long)objv->ver); + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "ver", buf)); + } + RGWRESTSimpleRequest req(cct, info.method, url, NULL, ¶ms, api_name); + return req.forward_request(dpp, key, info, max_response, inbl, outbl, y); +} + +int RGWRESTConn::forward_iam_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + param_vec_t params; + if (objv) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "tag", objv->tag)); + char buf[16]; + snprintf(buf, sizeof(buf), "%lld", (long long)objv->ver); + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "ver", buf)); + } + std::string service = "iam"; + RGWRESTSimpleRequest req(cct, info.method, url, NULL, ¶ms, api_name); + return req.forward_request(dpp, key, info, max_response, inbl, outbl, y, service); +} + +int RGWRESTConn::put_obj_send_init(const rgw_obj& obj, const rgw_http_param_pair *extra_params, RGWRESTStreamS3PutObj **req) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + rgw_user uid; + param_vec_t params; + populate_params(params, &uid, self_zone_group); + + if (extra_params) { + append_param_list(params, extra_params); + } + + RGWRESTStreamS3PutObj *wr = new RGWRESTStreamS3PutObj(cct, "PUT", url, NULL, ¶ms, 
api_name, host_style); + wr->send_init(obj); + *req = wr; + return 0; +} + +int RGWRESTConn::put_obj_async_init(const DoutPrefixProvider *dpp, const rgw_user& uid, const rgw_obj& obj, + map& attrs, + RGWRESTStreamS3PutObj **req) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + populate_params(params, &uid, self_zone_group); + RGWRESTStreamS3PutObj *wr = new RGWRESTStreamS3PutObj(cct, "PUT", url, NULL, ¶ms, api_name, host_style); + wr->put_obj_init(dpp, key, obj, attrs); + *req = wr; + return 0; +} + +int RGWRESTConn::complete_request(RGWRESTStreamS3PutObj *req, string& etag, + real_time *mtime, optional_yield y) +{ + int ret = req->complete_request(y, &etag, mtime); + delete req; + + return ret; +} + +static void set_date_header(const real_time *t, map& headers, bool high_precision_time, const string& header_name) +{ + if (!t) { + return; + } + stringstream s; + utime_t tm = utime_t(*t); + if (high_precision_time) { + tm.gmtime_nsec(s); + } else { + tm.gmtime(s); + } + headers[header_name] = s.str(); +} + +template +static void set_header(T val, map& headers, const string& header_name) +{ + stringstream s; + s << val; + headers[header_name] = s.str(); +} + + +int RGWRESTConn::get_obj(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info *info /* optional */, const rgw_obj& obj, + const real_time *mod_ptr, const real_time *unmod_ptr, + uint32_t mod_zone_id, uint64_t mod_pg_ver, + bool prepend_metadata, bool get_op, bool rgwx_stat, + bool sync_manifest, bool skip_decrypt, + rgw_zone_set_entry *dst_zone_trace, bool sync_cloudtiered, + bool send, RGWHTTPStreamRWRequest::ReceiveCB *cb, RGWRESTStreamRWRequest **req) +{ + get_obj_params params; + params.uid = uid; + params.info = info; + params.mod_ptr = mod_ptr; + params.mod_pg_ver = mod_pg_ver; + params.prepend_metadata = prepend_metadata; + params.get_op = get_op; + params.rgwx_stat = rgwx_stat; + params.sync_manifest = sync_manifest; + params.skip_decrypt = 
skip_decrypt; + params.sync_cloudtiered = sync_cloudtiered; + params.dst_zone_trace = dst_zone_trace; + params.cb = cb; + return get_obj(dpp, obj, params, send, req); +} + +int RGWRESTConn::get_obj(const DoutPrefixProvider *dpp, const rgw_obj& obj, const get_obj_params& in_params, bool send, RGWRESTStreamRWRequest **req) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + populate_params(params, &in_params.uid, self_zone_group); + if (in_params.prepend_metadata) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "prepend-metadata", "true")); + } + if (in_params.rgwx_stat) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "stat", "true")); + } + if (in_params.sync_manifest) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-manifest", "")); + } + if (in_params.sync_cloudtiered) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-cloudtiered", "")); + } + if (in_params.skip_decrypt) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "skip-decrypt", "")); + } + if (in_params.dst_zone_trace) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "if-not-replicated-to", in_params.dst_zone_trace->to_str())); + } + if (!obj.key.instance.empty()) { + params.push_back(param_pair_t("versionId", obj.key.instance)); + } + if (in_params.get_op) { + *req = new RGWRESTStreamReadRequest(cct, url, in_params.cb, NULL, ¶ms, api_name, host_style); + } else { + *req = new RGWRESTStreamHeadRequest(cct, url, in_params.cb, NULL, ¶ms, api_name); + } + map extra_headers; + if (in_params.info) { + const auto& orig_map = in_params.info->env->get_map(); + + /* add original headers that start with HTTP_X_AMZ_ */ + static constexpr char SEARCH_AMZ_PREFIX[] = "HTTP_X_AMZ_"; + for (auto iter= orig_map.lower_bound(SEARCH_AMZ_PREFIX); iter != orig_map.end(); ++iter) { + const string& name = iter->first; + if (name == "HTTP_X_AMZ_DATE") /* don't forward date from original request */ + continue; + if (name.compare(0, 
strlen(SEARCH_AMZ_PREFIX), SEARCH_AMZ_PREFIX) != 0) + break; + extra_headers[iter->first] = iter->second; + } + } + + set_date_header(in_params.mod_ptr, extra_headers, in_params.high_precision_time, "HTTP_IF_MODIFIED_SINCE"); + set_date_header(in_params.unmod_ptr, extra_headers, in_params.high_precision_time, "HTTP_IF_UNMODIFIED_SINCE"); + if (!in_params.etag.empty()) { + set_header(in_params.etag, extra_headers, "HTTP_IF_MATCH"); + } + if (in_params.mod_zone_id != 0) { + set_header(in_params.mod_zone_id, extra_headers, "HTTP_DEST_ZONE_SHORT_ID"); + } + if (in_params.mod_pg_ver != 0) { + set_header(in_params.mod_pg_ver, extra_headers, "HTTP_DEST_PG_VER"); + } + if (in_params.range_is_set) { + char buf[64]; + snprintf(buf, sizeof(buf), "bytes=%lld-%lld", (long long)in_params.range_start, (long long)in_params.range_end); + set_header(buf, extra_headers, "RANGE"); + } + + int r = (*req)->send_prepare(dpp, key, extra_headers, obj); + if (r < 0) { + goto done_err; + } + + if (!send) { + return 0; + } + + r = (*req)->send(nullptr); + if (r < 0) { + goto done_err; + } + return 0; +done_err: + delete *req; + *req = nullptr; + return r; +} + +int RGWRESTConn::complete_request(RGWRESTStreamRWRequest *req, + string *etag, + real_time *mtime, + uint64_t *psize, + map *pattrs, + map *pheaders, + optional_yield y) +{ + int ret = req->complete_request(y, etag, mtime, psize, pattrs, pheaders); + delete req; + + return ret; +} + +int RGWRESTConn::get_resource(const DoutPrefixProvider *dpp, + const string& resource, + param_vec_t *extra_params, + map *extra_headers, + bufferlist& bl, + bufferlist *send_data, + RGWHTTPManager *mgr, + optional_yield y) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + + if (extra_params) { + params.insert(params.end(), extra_params->begin(), extra_params->end()); + } + + populate_params(params, nullptr, self_zone_group); + + RGWStreamIntoBufferlist cb(bl); + + RGWRESTStreamReadRequest req(cct, url, &cb, 
NULL, ¶ms, api_name, host_style); + + map headers; + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + ret = req.send_request(dpp, &key, headers, resource, mgr, send_data); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return req.complete_request(y); +} + +int RGWRESTConn::send_resource(const DoutPrefixProvider *dpp, const std::string& method, + const std::string& resource, rgw_http_param_pair *extra_params, + std::map *extra_headers, bufferlist& bl, + bufferlist *send_data, RGWHTTPManager *mgr, optional_yield y) +{ + std::string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + + if (extra_params) { + params = make_param_list(extra_params); + } + + populate_params(params, nullptr, self_zone_group); + + RGWStreamIntoBufferlist cb(bl); + + RGWRESTStreamSendRequest req(cct, method, url, &cb, NULL, ¶ms, api_name, host_style); + + std::map headers; + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + ret = req.send_request(dpp, &key, headers, resource, mgr, send_data); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + ret = req.complete_request(y); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": complete_request() resource=" << resource << " returned ret=" << ret << dendl; + } + + return ret; +} + +RGWRESTReadResource::RGWRESTReadResource(RGWRESTConn *_conn, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), resource(_resource), + params(make_param_list(pp)), cb(bl), mgr(_mgr), + req(cct, conn->get_url(), &cb, NULL, NULL, _conn->get_api_name()) +{ + init_common(extra_headers); +} + +RGWRESTReadResource::RGWRESTReadResource(RGWRESTConn 
*_conn, + const string& _resource, + param_vec_t& _params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), resource(_resource), params(_params), + cb(bl), mgr(_mgr), req(cct, conn->get_url(), &cb, NULL, NULL, _conn->get_api_name()) +{ + init_common(extra_headers); +} + +void RGWRESTReadResource::init_common(param_vec_t *extra_headers) +{ + conn->populate_params(params, nullptr, conn->get_self_zonegroup()); + + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + req.set_params(¶ms); +} + +int RGWRESTReadResource::read(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return req.complete_request(y); +} + +int RGWRESTReadResource::aio_read(const DoutPrefixProvider *dpp) +{ + int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +RGWRESTSendResource::RGWRESTSendResource(RGWRESTConn *_conn, + const string& _method, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), method(_method), resource(_resource), + params(make_param_list(pp)), cb(bl), mgr(_mgr), + req(cct, method.c_str(), conn->get_url(), &cb, NULL, NULL, _conn->get_api_name(), _conn->get_host_style()) +{ + init_common(extra_headers); +} + +RGWRESTSendResource::RGWRESTSendResource(RGWRESTConn *_conn, + const string& _method, + const string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), method(_method), resource(_resource), 
params(params), + cb(bl), mgr(_mgr), req(cct, method.c_str(), conn->get_url(), &cb, NULL, NULL, _conn->get_api_name(), _conn->get_host_style()) +{ + init_common(extra_headers); +} + +void RGWRESTSendResource::init_common(param_vec_t *extra_headers) +{ + conn->populate_params(params, nullptr, conn->get_self_zonegroup()); + + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + req.set_params(¶ms); +} + +int RGWRESTSendResource::send(const DoutPrefixProvider *dpp, bufferlist& outbl, optional_yield y) +{ + req.set_send_length(outbl.length()); + req.set_outbl(outbl); + + int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return req.complete_request(y); +} + +int RGWRESTSendResource::aio_send(const DoutPrefixProvider *dpp, bufferlist& outbl) +{ + req.set_send_length(outbl.length()); + req.set_outbl(outbl); + + int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return 0; +} diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h new file mode 100644 index 000000000..81f839f49 --- /dev/null +++ b/src/rgw/rgw_rest_conn.h @@ -0,0 +1,557 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest_client.h" +#include "common/ceph_json.h" +#include "common/RefCountedObj.h" +#include "include/common_fwd.h" +#include "rgw_sal_fwd.h" + +#include + +class RGWSI_Zone; + +template +inline int parse_decode_json(T& t, bufferlist& bl) +{ + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + return -EINVAL; + } + + try { + decode_json_obj(t, &p); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } 
+ return 0; +} + +struct rgw_http_param_pair { + const char *key; + const char *val; +}; + +// append a null-terminated rgw_http_param_pair list into a list of string pairs +inline void append_param_list(param_vec_t& params, const rgw_http_param_pair* pp) +{ + while (pp && pp->key) { + std::string k = pp->key; + std::string v = (pp->val ? pp->val : ""); + params.emplace_back(make_pair(std::move(k), std::move(v))); + ++pp; + } +} + +// copy a null-terminated rgw_http_param_pair list into a list of std::string pairs +inline param_vec_t make_param_list(const rgw_http_param_pair* pp) +{ + param_vec_t params; + append_param_list(params, pp); + return params; +} + +inline param_vec_t make_param_list(const std::map *pp) +{ + param_vec_t params; + if (!pp) { + return params; + } + for (auto iter : *pp) { + params.emplace_back(make_pair(iter.first, iter.second)); + } + return params; +} + +class RGWRESTConn +{ + CephContext *cct; + std::vector endpoints; + RGWAccessKey key; + std::string self_zone_group; + std::string remote_id; + std::optional api_name; + HostStyle host_style; + std::atomic counter = { 0 }; + +public: + + RGWRESTConn(CephContext *_cct, + rgw::sal::Driver* driver, + const std::string& _remote_id, + const std::list& endpoints, + std::optional _api_name, + HostStyle _host_style = PathStyle); + RGWRESTConn(CephContext *_cct, + const std::string& _remote_id, + const std::list& endpoints, + RGWAccessKey _cred, + std::string _zone_group, + std::optional _api_name, + HostStyle _host_style = PathStyle); + + // custom move needed for atomic + RGWRESTConn(RGWRESTConn&& other); + RGWRESTConn& operator=(RGWRESTConn&& other); + virtual ~RGWRESTConn() = default; + + int get_url(std::string& endpoint); + std::string get_url(); + const std::string& get_self_zonegroup() { + return self_zone_group; + } + const std::string& get_remote_id() { + return remote_id; + } + RGWAccessKey& get_key() { + return key; + } + + std::optional get_api_name() const { + return api_name; + } + 
+ HostStyle get_host_style() { + return host_style; + } + + CephContext *get_ctx() { + return cct; + } + size_t get_endpoint_count() const { return endpoints.size(); } + + virtual void populate_params(param_vec_t& params, const rgw_user *uid, const std::string& zonegroup); + + /* sync request */ + int forward(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y); + + /* sync request */ + int forward_iam_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y); + + + /* async requests */ + int put_obj_send_init(const rgw_obj& obj, const rgw_http_param_pair *extra_params, RGWRESTStreamS3PutObj **req); + int put_obj_async_init(const DoutPrefixProvider *dpp, const rgw_user& uid, const rgw_obj& obj, + std::map& attrs, RGWRESTStreamS3PutObj **req); + int complete_request(RGWRESTStreamS3PutObj *req, std::string& etag, + ceph::real_time *mtime, optional_yield y); + + struct get_obj_params { + rgw_user uid; + req_info *info{nullptr}; + const ceph::real_time *mod_ptr{nullptr}; + const ceph::real_time *unmod_ptr{nullptr}; + bool high_precision_time{true}; + + std::string etag; + + uint32_t mod_zone_id{0}; + uint64_t mod_pg_ver{0}; + + bool prepend_metadata{false}; + bool get_op{false}; + bool rgwx_stat{false}; + bool sync_manifest{false}; + bool sync_cloudtiered{false}; + + bool skip_decrypt{true}; + RGWHTTPStreamRWRequest::ReceiveCB *cb{nullptr}; + + bool range_is_set{false}; + uint64_t range_start{0}; + uint64_t range_end{0}; + rgw_zone_set_entry *dst_zone_trace{nullptr}; + }; + + int get_obj(const DoutPrefixProvider *dpp, const rgw_obj& obj, const get_obj_params& params, bool send, RGWRESTStreamRWRequest **req); + + int get_obj(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info *info /* optional */, const rgw_obj& obj, + const 
ceph::real_time *mod_ptr, const ceph::real_time *unmod_ptr, + uint32_t mod_zone_id, uint64_t mod_pg_ver, + bool prepend_metadata, bool get_op, bool rgwx_stat, bool sync_manifest, + bool skip_decrypt, rgw_zone_set_entry *dst_zone_trace, bool sync_cloudtiered, + bool send, RGWHTTPStreamRWRequest::ReceiveCB *cb, RGWRESTStreamRWRequest **req); + int complete_request(RGWRESTStreamRWRequest *req, + std::string *etag, + ceph::real_time *mtime, + uint64_t *psize, + std::map *pattrs, + std::map *pheaders, + optional_yield y); + + int get_resource(const DoutPrefixProvider *dpp, + const std::string& resource, + param_vec_t *extra_params, + std::map* extra_headers, + bufferlist& bl, + bufferlist *send_data, + RGWHTTPManager *mgr, + optional_yield y); + + int send_resource(const DoutPrefixProvider *dpp, + const std::string& method, + const std::string& resource, + rgw_http_param_pair *extra_params, + std::map* extra_headers, + bufferlist& bl, + bufferlist *send_data, + RGWHTTPManager *mgr, + optional_yield y); + + template + int get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params, + bufferlist *in_data, optional_yield y, T& t); + template + int get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params, + optional_yield y, T& t); + template + int get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, const rgw_http_param_pair *pp, + optional_yield y, T& t); + +private: + void populate_zonegroup(param_vec_t& params, const std::string& zonegroup) { + if (!zonegroup.empty()) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "zonegroup", zonegroup)); + } + } + void populate_uid(param_vec_t& params, const rgw_user *uid) { + if (uid) { + std::string uid_str = uid->to_str(); + if (!uid->empty()){ + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "uid", uid_str)); + } + } + } +}; + +class S3RESTConn : public RGWRESTConn { + +public: + + S3RESTConn(CephContext *_cct, 
rgw::sal::Driver* driver, const std::string& _remote_id, const std::list& endpoints, std::optional _api_name, HostStyle _host_style = PathStyle) : + RGWRESTConn(_cct, driver, _remote_id, endpoints, _api_name, _host_style) {} + S3RESTConn(CephContext *_cct, const std::string& _remote_id, const std::list& endpoints, RGWAccessKey _cred, std::string _zone_group, std::optional _api_name, HostStyle _host_style = PathStyle): + RGWRESTConn(_cct, _remote_id, endpoints, _cred, _zone_group, _api_name, _host_style) {} + ~S3RESTConn() override = default; + + void populate_params(param_vec_t& params, const rgw_user *uid, const std::string& zonegroup) override { + // do not populate any params in S3 REST Connection. + return; + } +}; + + +template +int RGWRESTConn::get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params, + bufferlist *in_data, optional_yield y, T& t) +{ + bufferlist bl; + int ret = get_resource(dpp, resource, params, nullptr, bl, in_data, nullptr, y); + if (ret < 0) { + return ret; + } + + ret = parse_decode_json(t, bl); + if (ret < 0) { + return ret; + } + + return 0; +} + +template +int RGWRESTConn::get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params, + optional_yield y, T& t) +{ + return get_json_resource(dpp, resource, params, nullptr, y, t); +} + +template +int RGWRESTConn::get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, const rgw_http_param_pair *pp, + optional_yield y, T& t) +{ + param_vec_t params = make_param_list(pp); + return get_json_resource(dpp, resource, ¶ms, y, t); +} + +class RGWStreamIntoBufferlist : public RGWHTTPStreamRWRequest::ReceiveCB { + bufferlist& bl; +public: + explicit RGWStreamIntoBufferlist(bufferlist& _bl) : bl(_bl) {} + int handle_data(bufferlist& inbl, bool *pause) override { + bl.claim_append(inbl); + return inbl.length(); + } +}; + +class RGWRESTReadResource : public RefCountedObject, public RGWIOProvider { + 
CephContext *cct; + RGWRESTConn *conn; + std::string resource; + param_vec_t params; + std::map headers; + bufferlist bl; + RGWStreamIntoBufferlist cb; + + RGWHTTPManager *mgr; + RGWRESTStreamReadRequest req; + + void init_common(param_vec_t *extra_headers); + +public: + RGWRESTReadResource(RGWRESTConn *_conn, + const std::string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + + RGWRESTReadResource(RGWRESTConn *_conn, + const std::string& _resource, + param_vec_t& _params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + ~RGWRESTReadResource() = default; + + rgw_io_id get_io_id(int io_type) { + return req.get_io_id(io_type); + } + + void set_io_user_info(void *user_info) override { + req.set_io_user_info(user_info); + } + + void *get_io_user_info() override { + return req.get_io_user_info(); + } + + template + int decode_resource(T *dest); + + int read(const DoutPrefixProvider *dpp, optional_yield y); + + int aio_read(const DoutPrefixProvider *dpp); + + std::string to_str() { + return req.to_str(); + } + + int get_http_status() { + return req.get_http_status(); + } + + int wait(bufferlist *pbl, optional_yield y) { + int ret = req.wait(y); + if (ret < 0) { + return ret; + } + + if (req.get_status() < 0) { + return req.get_status(); + } + *pbl = bl; + return 0; + } + + template + int wait(T *dest, optional_yield y); + + template + int fetch(const DoutPrefixProvider *dpp, T *dest, optional_yield y); +}; + + +template +int RGWRESTReadResource::decode_resource(T *dest) +{ + int ret = req.get_status(); + if (ret < 0) { + return ret; + } + ret = parse_decode_json(*dest, bl); + if (ret < 0) { + return ret; + } + return 0; +} + +template +int RGWRESTReadResource::fetch(const DoutPrefixProvider *dpp, T *dest, optional_yield y) +{ + int ret = read(dpp, y); + if (ret < 0) { + return ret; + } + + ret = decode_resource(dest); + if (ret < 0) { + return ret; + } + return 0; +} + +template +int 
RGWRESTReadResource::wait(T *dest, optional_yield y) +{ + int ret = req.wait(y); + if (ret < 0) { + return ret; + } + + ret = decode_resource(dest); + if (ret < 0) { + return ret; + } + return 0; +} + +class RGWRESTSendResource : public RefCountedObject, public RGWIOProvider { + CephContext *cct; + RGWRESTConn *conn; + std::string method; + std::string resource; + param_vec_t params; + std::map headers; + bufferlist bl; + RGWStreamIntoBufferlist cb; + + RGWHTTPManager *mgr; + RGWRESTStreamRWRequest req; + + void init_common(param_vec_t *extra_headers); + +public: + RGWRESTSendResource(RGWRESTConn *_conn, + const std::string& _method, + const std::string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + + RGWRESTSendResource(RGWRESTConn *_conn, + const std::string& _method, + const std::string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + + ~RGWRESTSendResource() = default; + + rgw_io_id get_io_id(int io_type) { + return req.get_io_id(io_type); + } + + void set_io_user_info(void *user_info) override { + req.set_io_user_info(user_info); + } + + void *get_io_user_info() override { + return req.get_io_user_info(); + } + + int send(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y); + + int aio_send(const DoutPrefixProvider *dpp, bufferlist& bl); + + std::string to_str() { + return req.to_str(); + } + + int get_http_status() { + return req.get_http_status(); + } + + template + int wait(bufferlist *pbl, optional_yield y, E *err_result = nullptr) { + int ret = req.wait(y); + *pbl = bl; + + if (ret < 0 && err_result ) { + ret = parse_decode_json(*err_result, bl); + } + + return req.get_status(); + } + + template + int wait(T *dest, optional_yield y, E *err_result = nullptr); +}; + +template +int RGWRESTSendResource::wait(T *dest, optional_yield y, E *err_result) +{ + int ret = req.wait(y); + if (ret >= 0) { + ret = req.get_status(); + } + + if (ret < 0 && 
err_result) { + ret = parse_decode_json(*err_result, bl); + } + + if (ret < 0) { + return ret; + } + + ret = parse_decode_json(*dest, bl); + if (ret < 0) { + return ret; + } + return 0; + +} + +class RGWRESTPostResource : public RGWRESTSendResource { +public: + RGWRESTPostResource(RGWRESTConn *_conn, + const std::string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "POST", _resource, + pp, extra_headers, _mgr) {} + + RGWRESTPostResource(RGWRESTConn *_conn, + const std::string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "POST", _resource, + params, extra_headers, _mgr) {} + +}; + +class RGWRESTPutResource : public RGWRESTSendResource { +public: + RGWRESTPutResource(RGWRESTConn *_conn, + const std::string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "PUT", _resource, + pp, extra_headers, _mgr) {} + + RGWRESTPutResource(RGWRESTConn *_conn, + const std::string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "PUT", _resource, + params, extra_headers, _mgr) {} + +}; + +class RGWRESTDeleteResource : public RGWRESTSendResource { +public: + RGWRESTDeleteResource(RGWRESTConn *_conn, + const std::string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "DELETE", _resource, + pp, extra_headers, _mgr) {} + + RGWRESTDeleteResource(RGWRESTConn *_conn, + const std::string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "DELETE", _resource, + params, extra_headers, _mgr) {} + +}; diff --git a/src/rgw/rgw_rest_iam.cc b/src/rgw/rgw_rest_iam.cc new file mode 100644 index 000000000..b9e8779c1 --- /dev/null +++ b/src/rgw/rgw_rest_iam.cc 
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include 
+
+#include "rgw_auth_s3.h"
+#include "rgw_rest_iam.h"
+
+#include "rgw_rest_role.h"
+#include "rgw_rest_user_policy.h"
+#include "rgw_rest_oidc_provider.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+// Maps each supported IAM "Action" value to a factory for its RGWOp; only some factories use the POST body.
+using op_generator = RGWOp*(*)(const bufferlist&);
+static const std::unordered_map op_generators = {
+  {"CreateRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWCreateRole(bl_post_body);}},
+  {"DeleteRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteRole(bl_post_body);}},
+  {"GetRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetRole;}},
+  {"UpdateAssumeRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWModifyRoleTrustPolicy(bl_post_body);}},
+  {"ListRoles", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListRoles;}},
+  {"PutRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWPutRolePolicy(bl_post_body);}},
+  {"GetRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetRolePolicy;}},
+  {"ListRolePolicies", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListRolePolicies;}},
+  {"DeleteRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteRolePolicy(bl_post_body);}},
+  {"PutUserPolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWPutUserPolicy;}},
+  {"GetUserPolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetUserPolicy;}},
+  {"ListUserPolicies", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListUserPolicies;}},
+  {"DeleteUserPolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteUserPolicy;}},
+  {"CreateOpenIDConnectProvider", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWCreateOIDCProvider;}},
+  {"ListOpenIDConnectProviders", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListOIDCProviders;}},
+  {"GetOpenIDConnectProvider", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetOIDCProvider;}},
+  {"DeleteOpenIDConnectProvider", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteOIDCProvider;}},
+  {"TagRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWTagRole(bl_post_body);}},
+  {"ListRoleTags", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListRoleTags;}},
+  {"UntagRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWUntagRole(bl_post_body);}},
+  {"UpdateRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWUpdateRole(bl_post_body);}}
+};
+// True when the request carries an "Action" argument whose value is a key of op_generators.
+bool RGWHandler_REST_IAM::action_exists(const req_state* s)
+{
+  if (s->info.args.exists("Action")) {
+    const std::string action_name = s->info.args.get("Action");
+    return op_generators.contains(action_name);
+  }
+  return false;
+}
+// Builds the op for the request's "Action" argument; logs at level 10 and returns nullptr when it is missing or unknown.
+RGWOp *RGWHandler_REST_IAM::op_post()
+{
+  if (s->info.args.exists("Action")) {
+    const std::string action_name = s->info.args.get("Action");
+    const auto action_it = op_generators.find(action_name);
+    if (action_it != op_generators.end()) {
+      return action_it->second(bl_post_body);
+    }
+    ldpp_dout(s, 10) << "unknown action '" << action_name << "' for IAM handler" << dendl;
+  } else {
+    ldpp_dout(s, 10) << "missing action argument in IAM handler" << dendl;
+  }
+  return nullptr;
+}
+// Tags the request with the "iam" dialect and the RGW_REST_IAM protocol flag, then defers to RGWHandler_REST::init().
+int RGWHandler_REST_IAM::init(rgw::sal::Driver* driver,
+                              req_state *s,
+                              rgw::io::BasicClient *cio)
+{
+  s->dialect = "iam";
+  s->prot_flags = RGW_REST_IAM;
+
+  return RGWHandler_REST::init(driver, s, cio);
+}
+// Authorization is delegated to RGW_Auth_S3::authorize().
+int RGWHandler_REST_IAM::authorize(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
+}
+// Returns a newly allocated IAM handler built with an empty POST body buffer; driver, s and frontend_prefix are not used here.
+RGWHandler_REST*
+RGWRESTMgr_IAM::get_handler(rgw::sal::Driver* driver,
+                            req_state* const s,
+                            const rgw::auth::StrategyRegistry& auth_registry,
+                            const std::string& frontend_prefix)
+{
+  bufferlist bl;
+  return new RGWHandler_REST_IAM(auth_registry, bl);
+}
diff --git a/src/rgw/rgw_rest_iam.h b/src/rgw/rgw_rest_iam.h
new file mode 100644
index 000000000..3e579ab35
--- /dev/null
+++ b/src/rgw/rgw_rest_iam.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_auth.h"
+#include "rgw_auth_filters.h"
+#include "rgw_rest.h"
+// REST handler for IAM requests; it overrides only op_post(), which selects the op to run.
+class RGWHandler_REST_IAM : public RGWHandler_REST {
+  const rgw::auth::StrategyRegistry& auth_registry;
+  bufferlist bl_post_body; // copy of the buffer passed to the constructor
+  RGWOp *op_post() override;
+
+public:
+  // Static, so it can be called without constructing a handler.
+  static bool action_exists(const req_state* s);
+
+  RGWHandler_REST_IAM(const rgw::auth::StrategyRegistry& auth_registry,
+                      bufferlist& bl_post_body)
+    : RGWHandler_REST(),
+      auth_registry(auth_registry),
+      bl_post_body(bl_post_body) {}
+  ~RGWHandler_REST_IAM() override = default;
+
+  int init(rgw::sal::Driver* driver,
+           req_state *s,
+           rgw::io::BasicClient *cio) override;
+  int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
+  int postauth_init(optional_yield y) override { return 0; }
+};
+// REST manager for IAM: resolves every URI to itself; get_handler() is defined out of line.
+class RGWRESTMgr_IAM : public RGWRESTMgr {
+public:
+  RGWRESTMgr_IAM() = default;
+  ~RGWRESTMgr_IAM() override = default;
+
+  RGWRESTMgr *get_resource_mgr(req_state* const s,
+                               const std::string& uri,
+                               std::string* const out_uri) override {
+    return this;
+  }
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry&,
+                               const std::string&) override;
+};
diff --git a/src/rgw/rgw_rest_info.cc b/src/rgw/rgw_rest_info.cc
new file mode 100644
index 000000000..65323dd00
--- /dev/null
+++ b/src/rgw/rgw_rest_info.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "rgw_rest_info.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+// Op that reports the storage backend's name and cluster id; check_caps() asks for the "info" read capability.
+class RGWOp_Info_Get : public RGWRESTOp {
+
+public:
+  RGWOp_Info_Get() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("info", RGW_CAP_READ);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_info"; }
+};
+// Writes the "info" document for the single accessible backend through the flusher's formatter, then flushes.
+void RGWOp_Info_Get::execute(optional_yield y) {
+  Formatter *formatter = flusher.get_formatter();
+  flusher.start(0);
+
+  /* extensible array of general info sections, currently only
+   * storage backend is defined:
+   * {"info":{"storage_backends":[{"name":"rados","cluster_id":"75d1938b-2949-4933-8386-fb2d1449ff03"}]}}
+   */
+  formatter->open_object_section("dummy");
+  formatter->open_object_section("info");
+  formatter->open_array_section("storage_backends");
+  // for now, just return the backend that is accessible
+  formatter->open_object_section("dummy");
+  formatter->dump_string("name", driver->get_name());
+  formatter->dump_string("cluster_id", driver->get_cluster_id(this, y));
+  formatter->close_section(); // backend entry
+  formatter->close_section(); // "storage_backends"
+  formatter->close_section(); // "info"
+  formatter->close_section(); // outer "dummy" section
+
+  flusher.flush();
+} /* RGWOp_Info_Get::execute */
+// GET requests are served by a new RGWOp_Info_Get.
+RGWOp *RGWHandler_Info::op_get()
+{
+  return new RGWOp_Info_Get;
+}
diff --git a/src/rgw/rgw_rest_info.h b/src/rgw/rgw_rest_info.h
new file mode 100644
index 000000000..0c4467073
--- /dev/null
+++ b/src/rgw/rgw_rest_info.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+// Handler for the info endpoint: overrides op_get() only, and read_permissions() is a no-op returning 0.
+class RGWHandler_Info : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Info() override = default;
+
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+};
+// REST manager that hands out newly allocated RGWHandler_Info handlers.
+class RGWRESTMgr_Info : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Info() = default;
+  ~RGWRESTMgr_Info() override = default;
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Info(auth_registry);
+  }
+};
diff --git a/src/rgw/rgw_rest_metadata.cc b/src/rgw/rgw_rest_metadata.cc
new file mode 100644
index 000000000..23f78819c
--- /dev/null
+++ b/src/rgw/rgw_rest_metadata.cc
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/page.h"
+
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_client_io.h"
+#include "rgw_mdlog_types.h"
+#include "rgw_sal_rados.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "rgw/rgw_b64.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+// Writes "<section>" or "<section>:<key>" to out: the section is the URL bucket when set, otherwise the "key" argument itself.
+static inline void frame_metadata_key(req_state *s, string& out) {
+  bool exists;
+  string key = s->info.args.get("key", &exists);
+
+  string section;
+  if (!s->init_state.url_bucket.empty()) {
+    section = s->init_state.url_bucket;
+  } else {
+    section = key; // no bucket in the URL: the argument names the section and no key part remains
+    key.clear();
+  }
+
+  out = section;
+
+  if (!key.empty()) {
+    out += string(":") + key;
+  }
+}
+// Fetches the entry named by the framed key through the driver's metadata manager, writing it to s->formatter.
+void RGWOp_Metadata_Get::execute(optional_yield y) {
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+
+  auto meta_mgr = static_cast(driver)->ctl()->meta.mgr;
+
+  /* Get keys */
+  op_ret = meta_mgr->get(metadata_key, s->formatter, s->yield, s);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "ERROR: can't get key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+
+  op_ret = 0;
+}
+
+void RGWOp_Metadata_Get_Myself::execute(optional_yield y) {
+  string owner_id;
+  // Inject the caller's owner id as the "key" argument, then reuse the plain GET path.
+  owner_id = s->owner.get_id().to_str();
+  s->info.args.append("key", owner_id);
+
+  return RGWOp_Metadata_Get::execute(y);
+}
+// Lists metadata keys starting at the base64 "marker"; with "max-entries" the reply also carries truncated/count/marker.
+void RGWOp_Metadata_List::execute(optional_yield y) {
+  string marker;
+  ldpp_dout(this, 16) << __func__
+                      << " raw marker " << s->info.args.get("marker")
+                      << dendl;
+
+  try {
+    marker = s->info.args.get("marker");
+    if (!marker.empty()) {
+      marker = rgw::from_base64(marker);
+    }
+    ldpp_dout(this, 16) << __func__
+                        << " marker " << marker << dendl;
+  } catch (...) {
+    marker = std::string(""); // a marker that fails to decode is treated as no marker
+  }
+
+  bool max_entries_specified;
+  string max_entries_str =
+    s->info.args.get("max-entries", &max_entries_specified);
+
+  bool extended_response = (max_entries_specified); /* for backward compatibility, if max-entries is not specified
+                                                       we will send the old response format */
+  uint64_t max_entries = 0;
+
+  if (max_entries_specified) {
+    string err;
+    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+  /* List keys */
+  void *handle;
+  int max = 1000;
+
+  /* example markers:
+     marker = "3:b55a9110:root::bu_9:head";
+     marker = "3:b9a8b2a6:root::sorry_janefonda_890:head";
+     marker = "3:bf885d8f:root::sorry_janefonda_665:head";
+  */
+
+  op_ret = driver->meta_list_keys_init(this, metadata_key, marker, &handle);
+  if (op_ret < 0) {
+    ldpp_dout(this, 5) << "ERROR: can't get key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+
+  bool truncated;
+  uint64_t count = 0;
+
+  if (extended_response) {
+    s->formatter->open_object_section("result");
+  }
+
+  s->formatter->open_array_section("keys");
+
+  uint64_t left;
+  do {
+    list keys;
+    left = (max_entries_specified ? max_entries - count : max);
+    op_ret = driver->meta_list_keys_next(this, handle, left, keys, &truncated);
+    if (op_ret < 0) {
+      ldpp_dout(this, 5) << "ERROR: lists_keys_next(): " << cpp_strerror(op_ret) << dendl;
+      driver->meta_list_keys_complete(handle); // the listing handle must be released on this exit path too
+      return;
+    }
+
+    for (list::iterator iter = keys.begin(); iter != keys.end();
+         ++iter) {
+      s->formatter->dump_string("key", *iter);
+      ++count;
+    }
+
+  } while (truncated && left > 0);
+
+  s->formatter->close_section();
+
+  if (extended_response) {
+    encode_json("truncated", truncated, s->formatter);
+    encode_json("count", count, s->formatter);
+    if (truncated) {
+      string esc_marker =
+        rgw::to_base64(driver->meta_get_marker(handle));
+      encode_json("marker", esc_marker, s->formatter);
+    }
+    s->formatter->close_section();
+  }
+  driver->meta_list_keys_complete(handle);
+
+  op_ret = 0;
+}
+// Reads the request body into bl: one read of up to Content-Length bytes when a length is given, otherwise page-sized reads of a chunked body until a short read.
+int RGWOp_Metadata_Put::get_data(bufferlist& bl) {
+  size_t cl = 0;
+  char *data;
+  int read_len;
+
+  if (s->length)
+    cl = atoll(s->length);
+  if (cl) {
+    data = (char *)malloc(cl + 1);
+    if (!data) {
+      return -ENOMEM;
+    }
+    read_len = recv_body(s, data, cl);
+    if (cl != (size_t)read_len) {
+      ldpp_dout(this, 10) << "recv_body incomplete" << dendl;
+    }
+    if (read_len < 0) {
+      free(data);
+      return read_len;
+    }
+    bl.append(data, read_len);
+  } else {
+    int chunk_size = CEPH_PAGE_SIZE;
+    const char *enc = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    if (!enc || strcmp(enc, "chunked")) {
+      return -ERR_LENGTH_REQUIRED; // nothing allocated yet, so no free() is needed here
+    }
+    data = (char *)malloc(chunk_size);
+    if (!data) {
+      return -ENOMEM;
+    }
+    do {
+      read_len = recv_body(s, data, chunk_size);
+      if (read_len < 0) {
+        free(data);
+        return read_len;
+      }
+      bl.append(data, read_len);
+    } while (read_len == chunk_size);
+  }
+
+  free(data);
+  return 0;
+}
+// Maps an "update-type" value to a sync type; returns false and leaves type untouched for an unknown value.
+static bool string_to_sync_type(const string& sync_string,
+                                RGWMDLogSyncType& type) {
+  if (sync_string.compare("update-by-version") == 0)
+    type = APPLY_UPDATES;
+  else if (sync_string.compare("update-by-timestamp") == 0)
+    type = APPLY_NEWER;
+  else if (sync_string.compare("always") == 0)
+    type = APPLY_ALWAYS;
+  else
+    return false;
+  return true;
+}
+// Stores the request body under the framed key; STATUS_NO_APPLY / STATUS_APPLIED from put() select the update-status text.
+void RGWOp_Metadata_Put::execute(optional_yield y) {
+  bufferlist bl;
+  string metadata_key;
+
+  op_ret = get_data(bl);
+  if (op_ret < 0) {
+    return;
+  }
+
+  op_ret = do_aws4_auth_completion();
+  if (op_ret < 0) {
+    return;
+  }
+
+  frame_metadata_key(s, metadata_key);
+
+  RGWMDLogSyncType sync_type = RGWMDLogSyncType::APPLY_ALWAYS;
+
+  bool mode_exists = false;
+  string mode_string = s->info.args.get("update-type", &mode_exists);
+  if (mode_exists) {
+    bool parsed = string_to_sync_type(mode_string,
+                                      sync_type);
+    if (!parsed) {
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  op_ret = static_cast(driver)->ctl()->meta.mgr->put(metadata_key, bl, s->yield, s, sync_type,
+                                                      false, &ondisk_version);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "ERROR: can't put key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+  // translate internal codes into return header
+  if (op_ret == STATUS_NO_APPLY)
+    update_status = "skipped";
+  else if (op_ret == STATUS_APPLIED)
+    update_status = "applied";
+}
+// Both apply statuses are reported as STATUS_NO_CONTENT; update status and on-disk version go out as RGWX_* headers.
+void RGWOp_Metadata_Put::send_response() {
+  int op_return_code = op_ret;
+  if ((op_ret == STATUS_NO_APPLY) || (op_ret == STATUS_APPLIED))
+    op_return_code = STATUS_NO_CONTENT;
+  set_req_state_err(s, op_return_code);
+  dump_errno(s);
+  stringstream ver_stream;
+  ver_stream << "ver:" << ondisk_version.ver
+             <<",tag:" << ondisk_version.tag;
+  dump_header_if_nonempty(s, "RGWX_UPDATE_STATUS", update_status);
+  dump_header_if_nonempty(s, "RGWX_UPDATE_VERSION", ver_stream.str());
+  end_header(s);
+}
+// Removes the entry named by the framed key.
+void RGWOp_Metadata_Delete::execute(optional_yield y) {
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+  op_ret = static_cast(driver)->ctl()->meta.mgr->remove(metadata_key, s->yield, s);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "ERROR: can't remove key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+  op_ret = 0;
+}
+// GET dispatch: "myself" -> the caller's own entry, "key" -> a single entry, otherwise a key listing.
+RGWOp *RGWHandler_Metadata::op_get() {
+  if (s->info.args.exists("myself"))
+    return new RGWOp_Metadata_Get_Myself;
+  if (s->info.args.exists("key"))
+    return new RGWOp_Metadata_Get;
+  else
+    return new RGWOp_Metadata_List;
+}
+
+RGWOp *RGWHandler_Metadata::op_put() {
+  return new RGWOp_Metadata_Put;
+}
+
+RGWOp *RGWHandler_Metadata::op_delete() {
+  return new RGWOp_Metadata_Delete;
+}
+
diff --git a/src/rgw/rgw_rest_metadata.h b/src/rgw/rgw_rest_metadata.h
new file mode 100644
index 000000000..ea7376a1b
--- /dev/null
+++ b/src/rgw/rgw_rest_metadata.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw/rgw_rest.h"
+#include "rgw/rgw_auth_s3.h"
+// Lists metadata keys; check_caps() asks for the "metadata" read capability.
+class RGWOp_Metadata_List : public RGWRESTOp {
+public:
+  RGWOp_Metadata_List() {}
+  ~RGWOp_Metadata_List() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_READ);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "list_metadata"; }
+};
+// Returns a single metadata entry; check_caps() asks for the "metadata" read capability.
+class RGWOp_Metadata_Get : public RGWRESTOp {
+public:
+  RGWOp_Metadata_Get() {}
+  ~RGWOp_Metadata_Get() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_READ);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "get_metadata"; }
+};
+// Variant of RGWOp_Metadata_Get that overrides execute() only.
+class RGWOp_Metadata_Get_Myself : public RGWOp_Metadata_Get {
+public:
+  RGWOp_Metadata_Get_Myself() {}
+  ~RGWOp_Metadata_Get_Myself() override {}
+
+  void execute(optional_yield y) override;
+};
+// Writes a metadata entry from the request body; check_caps() asks for the "metadata" write capability.
+class RGWOp_Metadata_Put : public RGWRESTOp {
+  int get_data(bufferlist& bl);
+  std::string update_status;  // "skipped" / "applied", set by execute()
+  obj_version ondisk_version; // filled in by the put() call in execute()
+public:
+  RGWOp_Metadata_Put() {}
+  ~RGWOp_Metadata_Put() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "set_metadata"; }
+  RGWOpType get_type() override { return RGW_OP_ADMIN_SET_METADATA; }
+};
+// Removes a metadata entry; check_caps() asks for the "metadata" write capability.
+class RGWOp_Metadata_Delete : public RGWRESTOp {
+public:
+  RGWOp_Metadata_Delete() {}
+  ~RGWOp_Metadata_Delete() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "remove_metadata"; }
+};
+// Handler that creates the metadata ops for GET/PUT/DELETE; read_permissions() is a no-op returning 0.
+class RGWHandler_Metadata : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_delete() override;
+
+  int read_permissions(RGWOp*, optional_yield y) override {
+    return 0;
+  }
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Metadata() override = default;
+};
+// REST manager that hands out newly allocated RGWHandler_Metadata handlers.
+class RGWRESTMgr_Metadata : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Metadata() = default;
+  ~RGWRESTMgr_Metadata() override = default;
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                               req_state* const s,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override {
+    return new RGWHandler_Metadata(auth_registry);
+  }
+};
diff --git a/src/rgw/rgw_rest_oidc_provider.cc b/src/rgw/rgw_rest_oidc_provider.cc
new file mode 100644
index 000000000..db4bc12fc
--- /dev/null
+++ b/src/rgw/rgw_rest_oidc_provider.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include 
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_common.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_role.h"
+#include "rgw_rest_oidc_provider.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+// Permission check for ops that take an OpenIDConnectProviderArn: passing check_caps() is enough, otherwise get_op() must be allowed on that ARN.
+int RGWRestOIDCProvider::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  provider_arn = s->info.args.get("OpenIDConnectProviderArn");
+  if (provider_arn.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Provider ARN is empty"<< dendl;
+    return -EINVAL;
+  }
+
+  auto ret = check_caps(s->user->get_caps());
+  if (ret == 0) {
+    return ret;
+  }
+
+  uint64_t op = get_op();
+  auto rgw_arn = rgw::ARN::parse(provider_arn, true);
+  if (rgw_arn) {
+    if (!verify_user_permission(this, s, *rgw_arn, op)) {
+      return -EACCES;
+    }
+  } else {
+    return -EACCES; // an ARN that does not parse is refused rather than reported as invalid
+  }
+
+  return 0;
+}
+// Records a non-zero op_ret as the request error, then calls dump_errno() and end_header().
+void RGWRestOIDCProvider::send_response()
+{
+  if (op_ret) {
+    set_req_state_err(s, op_ret);
+  }
+  dump_errno(s);
+  end_header(s, this);
+}
+// Read-side ops check the "oidc-provider" read capability.
+int RGWRestOIDCProviderRead::check_caps(const RGWUserCaps& caps)
+{
+  return caps.check_cap("oidc-provider", RGW_CAP_READ);
+}
+// Write-side ops check the "oidc-provider" write capability.
+int RGWRestOIDCProviderWrite::check_caps(const RGWUserCaps& caps)
+{
+  return caps.check_cap("oidc-provider", RGW_CAP_WRITE);
+}
+// NOTE(review): provider_url is only filled in by get_params(), which execute() calls; confirm it is populated before this check runs.
+int RGWCreateOIDCProvider::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  auto ret = check_caps(s->user->get_caps());
+  if (ret == 0) {
+    return ret;
+  }
+
+  string idp_url = url_remove_prefix(provider_url);
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(idp_url,
+                                       "oidc-provider",
+                                       s->user->get_tenant(), true),
+                              get_op())) {
+    return -EACCES;
+  }
+  return 0;
+}
+// Reads Url plus every argument whose name contains ClientIDList.member. / ThumbprintList.member.; -EINVAL unless a URL and a thumbprint were given.
+int RGWCreateOIDCProvider::get_params()
+{
+  provider_url = s->info.args.get("Url");
+
+  auto val_map = s->info.args.get_params();
+  for (auto& it : val_map) {
+    if (it.first.find("ClientIDList.member.") != string::npos) {
+      client_ids.emplace_back(it.second);
+    }
+    if (it.first.find("ThumbprintList.member.") != string::npos) {
+      thumbprints.emplace_back(it.second);
+    }
+  }
+
+  if (provider_url.empty() || thumbprints.empty()) {
+    ldpp_dout(this, 20) << "ERROR: one of url or thumbprints is empty" << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+// Creates the provider for the caller's tenant and, on success, writes the CreateOpenIDConnectProviderResponse document.
+void RGWCreateOIDCProvider::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr provider = driver->get_oidc_provider();
+  provider->set_url(provider_url);
+  provider->set_tenant(s->user->get_tenant());
+  provider->set_client_ids(client_ids);
+  provider->set_thumbprints(thumbprints);
+  op_ret = provider->create(s, true, y);
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("CreateOpenIDConnectProviderResponse");
+    s->formatter->open_object_section("CreateOpenIDConnectProviderResult");
+    provider->dump(s->formatter);
+    s->formatter->close_section();
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+
+}
+// Deletes the provider named by provider_arn; failures other than -ENOENT / -EINVAL are reported as an internal error.
+void RGWDeleteOIDCProvider::execute(optional_yield y)
+{
+  std::unique_ptr provider = driver->get_oidc_provider();
+  provider->set_arn(provider_arn);
+  provider->set_tenant(s->user->get_tenant());
+  op_ret = provider->delete_obj(s, y);
+
+  if (op_ret < 0 && op_ret != -ENOENT && op_ret != -EINVAL) {
+    op_ret = -ERR_INTERNAL_ERROR; // negated, like every other error code stored in op_ret
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("DeleteOpenIDConnectProviderResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+// Loads the provider named by provider_arn and dumps it; failures other than -ENOENT / -EINVAL are reported as an internal error.
+void RGWGetOIDCProvider::execute(optional_yield y)
+{
+  std::unique_ptr provider = driver->get_oidc_provider();
+  provider->set_arn(provider_arn);
+  provider->set_tenant(s->user->get_tenant());
+  op_ret = provider->get(s);
+
+  if (op_ret < 0 && op_ret != -ENOENT && op_ret != -EINVAL) {
+    op_ret = -ERR_INTERNAL_ERROR; // negated, like every other error code stored in op_ret
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("GetOpenIDConnectProviderResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("GetOpenIDConnectProviderResult");
+    provider->dump_all(s->formatter);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+// Listing takes no provider ARN: passing check_caps() is enough, otherwise get_op() is checked against a default-constructed ARN.
+int RGWListOIDCProviders::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(),
+                              get_op())) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+// Writes the ARN of every provider returned for the caller's tenant.
+void RGWListOIDCProviders::execute(optional_yield y)
+{
+  vector> result;
+  op_ret = driver->get_oidc_providers(s, s->user->get_tenant(), result);
+
+  if (op_ret == 0) {
+    s->formatter->open_array_section("ListOpenIDConnectProvidersResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("ListOpenIDConnectProvidersResult");
+    s->formatter->open_array_section("OpenIDConnectProviderList");
+    for (const auto& it : result) {
+      s->formatter->open_object_section("member");
+      auto& arn = it->get_arn();
+      ldpp_dout(s, 20) << "ARN: " << arn << dendl; // per-entry trace: level 20, like the other diagnostics in this file
+      s->formatter->dump_string("Arn", arn);
+      s->formatter->close_section();
+    }
+    s->formatter->close_section();
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
diff --git a/src/rgw/rgw_rest_oidc_provider.h b/src/rgw/rgw_rest_oidc_provider.h
new file mode 100644
index 000000000..33535c6b5
--- /dev/null
+++ b/src/rgw/rgw_rest_oidc_provider.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include 
"rgw_rest.h" +#include "rgw_oidc_provider.h" + +class RGWRestOIDCProvider : public RGWRESTOp { +protected: + std::vector client_ids; + std::vector thumbprints; + std::string provider_url; //'iss' field in JWT + std::string provider_arn; +public: + int verify_permission(optional_yield y) override; + void send_response() override; + virtual uint64_t get_op() = 0; +}; + +class RGWRestOIDCProviderRead : public RGWRestOIDCProvider { +public: + RGWRestOIDCProviderRead() = default; + int check_caps(const RGWUserCaps& caps) override; +}; + +class RGWRestOIDCProviderWrite : public RGWRestOIDCProvider { +public: + RGWRestOIDCProviderWrite() = default; + int check_caps(const RGWUserCaps& caps) override; +}; + +class RGWCreateOIDCProvider : public RGWRestOIDCProviderWrite { +public: + RGWCreateOIDCProvider() = default; + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + int get_params(); + const char* name() const override { return "create_oidc_provider"; } + RGWOpType get_type() override { return RGW_OP_CREATE_OIDC_PROVIDER; } + uint64_t get_op() override { return rgw::IAM::iamCreateOIDCProvider; } +}; + +class RGWDeleteOIDCProvider : public RGWRestOIDCProviderWrite { +public: + RGWDeleteOIDCProvider() = default; + void execute(optional_yield y) override; + const char* name() const override { return "delete_oidc_provider"; } + RGWOpType get_type() override { return RGW_OP_DELETE_OIDC_PROVIDER; } + uint64_t get_op() override { return rgw::IAM::iamDeleteOIDCProvider; } +}; + +class RGWGetOIDCProvider : public RGWRestOIDCProviderRead { +public: + RGWGetOIDCProvider() = default; + void execute(optional_yield y) override; + const char* name() const override { return "get_oidc_provider"; } + RGWOpType get_type() override { return RGW_OP_GET_OIDC_PROVIDER; } + uint64_t get_op() override { return rgw::IAM::iamGetOIDCProvider; } +}; + +class RGWListOIDCProviders : public RGWRestOIDCProviderRead { +public: + RGWListOIDCProviders() = default; 
+ int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + int get_params(); + const char* name() const override { return "list_oidc_providers"; } + RGWOpType get_type() override { return RGW_OP_LIST_OIDC_PROVIDERS; } + uint64_t get_op() override { return rgw::IAM::iamListOIDCProviders; } +}; diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc new file mode 100644 index 000000000..793232866 --- /dev/null +++ b/src/rgw/rgw_rest_pubsub.cc @@ -0,0 +1,954 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include "rgw_rest_pubsub.h" +#include "rgw_pubsub_push.h" +#include "rgw_pubsub.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_arn.h" +#include "rgw_auth_s3.h" +#include "rgw_notify.h" +#include "services/svc_zone.h" +#include "common/dout.h" +#include "rgw_url.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +static const char* AWS_SNS_NS("https://sns.amazonaws.com/doc/2010-03-31/"); + +bool verify_transport_security(CephContext *cct, const RGWEnv& env) { + const auto is_secure = rgw_transport_is_secure(cct, env); + if (!is_secure && g_conf().get_val("rgw_allow_notification_secrets_in_cleartext")) { + ldout(cct, 0) << "WARNING: bypassing endpoint validation, allows sending secrets over insecure transport" << dendl; + return true; + } + return is_secure; +} + +// make sure that endpoint is a valid URL +// make sure that if user/password are passed inside URL, it is over secure connection +// update rgw_pubsub_dest to indicate that a password is stored in the URL +bool validate_and_update_endpoint_secret(rgw_pubsub_dest& dest, CephContext *cct, const RGWEnv& env) { + if (dest.push_endpoint.empty()) { + return true; + } + std::string user; + std::string password; + if (!rgw::parse_url_userinfo(dest.push_endpoint, user, password)) { + ldout(cct, 1) << "endpoint 
validation error: malformed endpoint URL:" << dest.push_endpoint << dendl; + return false; + } + // this should be verified inside parse_url() + ceph_assert(user.empty() == password.empty()); + if (!user.empty()) { + dest.stored_secret = true; + if (!verify_transport_security(cct, env)) { + ldout(cct, 1) << "endpoint validation error: sending secrets over insecure transport" << dendl; + return false; + } + } + return true; +} + +bool topic_has_endpoint_secret(const rgw_pubsub_topic& topic) { + return topic.dest.stored_secret; +} + +bool topics_has_endpoint_secret(const rgw_pubsub_topics& topics) { + for (const auto& topic : topics.topics) { + if (topic_has_endpoint_secret(topic.second)) return true; + } + return false; +} + +// command (AWS compliant): +// POST +// Action=CreateTopic&Name=[&OpaqueData=data][&push-endpoint=[&persistent][&=]] +class RGWPSCreateTopicOp : public RGWOp { + private: + std::string topic_name; + rgw_pubsub_dest dest; + std::string topic_arn; + std::string opaque_data; + + int get_params() { + topic_name = s->info.args.get("Name"); + if (topic_name.empty()) { + ldpp_dout(this, 1) << "CreateTopic Action 'Name' argument is missing" << dendl; + return -EINVAL; + } + + opaque_data = s->info.args.get("OpaqueData"); + + dest.push_endpoint = s->info.args.get("push-endpoint"); + s->info.args.get_bool("persistent", &dest.persistent, false); + + if (!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) { + return -EINVAL; + } + for (const auto& param : s->info.args.get_params()) { + if (param.first == "Action" || param.first == "Name" || param.first == "PayloadHash") { + continue; + } + dest.push_endpoint_args.append(param.first+"="+param.second+"&"); + } + + if (!dest.push_endpoint_args.empty()) { + // remove last separator + dest.push_endpoint_args.pop_back(); + } + if (!dest.push_endpoint.empty() && dest.persistent) { + const auto ret = rgw::notify::add_persistent_topic(topic_name, s->yield); + if (ret < 0) { + ldpp_dout(this, 1) << 
"CreateTopic Action failed to create queue for persistent topics. error:" << ret << dendl; + return ret; + } + } + + // dest object only stores endpoint info + dest.arn_topic = topic_name; + // the topic ARN will be sent in the reply + const rgw::ARN arn(rgw::Partition::aws, rgw::Service::sns, + driver->get_zone()->get_zonegroup().get_name(), + s->user->get_tenant(), topic_name); + topic_arn = arn.to_string(); + return 0; + } + + public: + int verify_permission(optional_yield) override { + return 0; + } + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute(optional_yield) override; + + const char* name() const override { return "pubsub_topic_create"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_CREATE; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section_in_ns("CreateTopicResponse", AWS_SNS_NS); + f->open_object_section("CreateTopicResult"); + encode_xml("TopicArn", topic_arn, f); + f->close_section(); // CreateTopicResult + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); // ResponseMetadata + f->close_section(); // CreateTopicResponse + rgw_flush_formatter_and_reset(s, f); + } +}; + +void RGWPSCreateTopicOp::execute(optional_yield y) { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + const RGWPubSub ps(driver, s->owner.get_id().tenant); + op_ret = ps.create_topic(this, topic_name, dest, topic_arn, opaque_data, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to create topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + ldpp_dout(this, 20) << "successfully created topic '" << topic_name << "'" << dendl; +} + +// command (AWS compliant): +// POST +// Action=ListTopics +class 
RGWPSListTopicsOp : public RGWOp { +private: + rgw_pubsub_topics result; + +public: + int verify_permission(optional_yield) override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute(optional_yield) override; + + const char* name() const override { return "pubsub_topics_list"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPICS_LIST; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section_in_ns("ListTopicsResponse", AWS_SNS_NS); + f->open_object_section("ListTopicsResult"); + encode_xml("Topics", result, f); + f->close_section(); // ListTopicsResult + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); // ResponseMetadata + f->close_section(); // ListTopicsResponse + rgw_flush_formatter_and_reset(s, f); + } +}; + +void RGWPSListTopicsOp::execute(optional_yield y) { + const RGWPubSub ps(driver, s->owner.get_id().tenant); + op_ret = ps.get_topics(this, result, y); + // if there are no topics it is not considered an error + op_ret = op_ret == -ENOENT ?
0 : op_ret; + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get topics, ret=" << op_ret << dendl; + return; + } + if (topics_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) { + ldpp_dout(this, 1) << "topics contain secrets and cannot be sent over insecure transport" << dendl; + op_ret = -EPERM; + return; + } + ldpp_dout(this, 20) << "successfully got topics" << dendl; +} + +// command (extension to AWS): +// POST +// Action=GetTopic&TopicArn= +class RGWPSGetTopicOp : public RGWOp { + private: + std::string topic_name; + rgw_pubsub_topic result; + + int get_params() { + const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn"))); + + if (!topic_arn || topic_arn->resource.empty()) { + ldpp_dout(this, 1) << "GetTopic Action 'TopicArn' argument is missing or invalid" << dendl; + return -EINVAL; + } + + topic_name = topic_arn->resource; + return 0; + } + + public: + int verify_permission(optional_yield y) override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute(optional_yield y) override; + + const char* name() const override { return "pubsub_topic_get"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section("GetTopicResponse"); + f->open_object_section("GetTopicResult"); + encode_xml("Topic", result, f); + f->close_section(); + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); + f->close_section(); + rgw_flush_formatter_and_reset(s, f); + } +}; + +void RGWPSGetTopicOp::execute(optional_yield y) { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + const RGWPubSub ps(driver, 
s->owner.get_id().tenant); + op_ret = ps.get_topic(this, topic_name, result, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) { + ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl; + op_ret = -EPERM; + return; + } + ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl; +} + +// command (AWS compliant): +// POST +// Action=GetTopicAttributes&TopicArn= +class RGWPSGetTopicAttributesOp : public RGWOp { + private: + std::string topic_name; + rgw_pubsub_topic result; + + int get_params() { + const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn"))); + + if (!topic_arn || topic_arn->resource.empty()) { + ldpp_dout(this, 1) << "GetTopicAttribute Action 'TopicArn' argument is missing or invalid" << dendl; + return -EINVAL; + } + + topic_name = topic_arn->resource; + return 0; + } + + public: + int verify_permission(optional_yield y) override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute(optional_yield y) override; + + const char* name() const override { return "pubsub_topic_get"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section_in_ns("GetTopicAttributesResponse", AWS_SNS_NS); + f->open_object_section("GetTopicAttributesResult"); + result.dump_xml_as_attributes(f); + f->close_section(); // GetTopicAttributesResult + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); // 
ResponseMetadata + f->close_section(); // GetTopicAttributesResponse + rgw_flush_formatter_and_reset(s, f); + } +}; + +void RGWPSGetTopicAttributesOp::execute(optional_yield y) { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + const RGWPubSub ps(driver, s->owner.get_id().tenant); + op_ret = ps.get_topic(this, topic_name, result, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) { + ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl; + op_ret = -EPERM; + return; + } + ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl; +} + +// command (AWS compliant): +// POST +// Action=DeleteTopic&TopicArn= +class RGWPSDeleteTopicOp : public RGWOp { + private: + std::string topic_name; + + int get_params() { + const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn"))); + + if (!topic_arn || topic_arn->resource.empty()) { + ldpp_dout(this, 1) << "DeleteTopic Action 'TopicArn' argument is missing or invalid" << dendl; + return -EINVAL; + } + + topic_name = topic_arn->resource; + + // upon deletion it is not known if topic is persistent or not + // will try to delete the persistent topic anyway + const auto ret = rgw::notify::remove_persistent_topic(topic_name, s->yield); + if (ret == -ENOENT) { + // topic was not persistent, or already deleted + return 0; + } + if (ret < 0) { + ldpp_dout(this, 1) << "DeleteTopic Action failed to remove queue for persistent topics. 
error:" << ret << dendl; + return ret; + } + + return 0; + } + + public: + int verify_permission(optional_yield) override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute(optional_yield y) override; + + const char* name() const override { return "pubsub_topic_delete"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section_in_ns("DeleteTopicResponse", AWS_SNS_NS); + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); // ResponseMetadata + f->close_section(); // DeleteTopicResponse + rgw_flush_formatter_and_reset(s, f); + } +}; + +void RGWPSDeleteTopicOp::execute(optional_yield y) { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + const RGWPubSub ps(driver, s->owner.get_id().tenant); + op_ret = ps.remove_topic(this, topic_name, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to remove topic '" << topic_name << ", ret=" << op_ret << dendl; + return; + } + ldpp_dout(this, 1) << "successfully removed topic '" << topic_name << "'" << dendl; +} + +using op_generator = RGWOp*(*)(); +static const std::unordered_map op_generators = { + {"CreateTopic", []() -> RGWOp* {return new RGWPSCreateTopicOp;}}, + {"DeleteTopic", []() -> RGWOp* {return new RGWPSDeleteTopicOp;}}, + {"ListTopics", []() -> RGWOp* {return new RGWPSListTopicsOp;}}, + {"GetTopic", []() -> RGWOp* {return new RGWPSGetTopicOp;}}, + {"GetTopicAttributes", []() -> RGWOp* {return new RGWPSGetTopicAttributesOp;}} +}; + +bool RGWHandler_REST_PSTopic_AWS::action_exists(const req_state* s) +{ + if (s->info.args.exists("Action")) { + const std::string action_name = 
s->info.args.get("Action"); + return op_generators.contains(action_name); + } + return false; +} + +RGWOp *RGWHandler_REST_PSTopic_AWS::op_post() +{ + s->dialect = "sns"; + s->prot_flags = RGW_REST_STS; + + if (s->info.args.exists("Action")) { + const std::string action_name = s->info.args.get("Action"); + const auto action_it = op_generators.find(action_name); + if (action_it != op_generators.end()) { + return action_it->second(); + } + ldpp_dout(s, 10) << "unknown action '" << action_name << "' for Topic handler" << dendl; + } else { + ldpp_dout(s, 10) << "missing action argument in Topic handler" << dendl; + } + return nullptr; +} + +int RGWHandler_REST_PSTopic_AWS::authorize(const DoutPrefixProvider* dpp, optional_yield y) { + const auto rc = RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y); + if (rc < 0) { + return rc; + } + if (s->auth.identity->is_anonymous()) { + ldpp_dout(dpp, 1) << "anonymous user not allowed in topic operations" << dendl; + return -ERR_INVALID_REQUEST; + } + return 0; +} + +namespace { +// return a unique topic by prefixing with the notification name: [notification]_[topic] +std::string topic_to_unique(const std::string& topic, const std::string& notification) { + return notification + "_" + topic; +} + +// extract the topic from a unique topic of the form: [notification]_[topic] +[[maybe_unused]] std::string unique_to_topic(const std::string& unique_topic, const std::string& notification) { + if (unique_topic.find(notification + "_") == std::string::npos) { + return ""; + } + return unique_topic.substr(notification.length() + 1); +} + +// from list of bucket topics, find the one that was auto-generated by a notification +auto find_unique_topic(const rgw_pubsub_bucket_topics& bucket_topics, const std::string& notif_name) { + auto it = std::find_if(bucket_topics.topics.begin(), bucket_topics.topics.end(), [&](const auto& val) { return notif_name == val.second.s3_id; }); + return it != bucket_topics.topics.end() ?
+ std::optional>(it->second): + std::nullopt; +} +} + +int remove_notification_by_topic(const DoutPrefixProvider *dpp, const std::string& topic_name, const RGWPubSub::Bucket& b, optional_yield y, const RGWPubSub& ps) { + int op_ret = b.remove_notification(dpp, topic_name, y); + if (op_ret < 0) { + ldpp_dout(dpp, 1) << "failed to remove notification of topic '" << topic_name << "', ret=" << op_ret << dendl; + } + op_ret = ps.remove_topic(dpp, topic_name, y); + if (op_ret < 0) { + ldpp_dout(dpp, 1) << "failed to remove auto-generated topic '" << topic_name << "', ret=" << op_ret << dendl; + } + return op_ret; +} + +int delete_all_notifications(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& bucket_topics, const RGWPubSub::Bucket& b, optional_yield y, const RGWPubSub& ps) { + // delete all notifications on a bucket + for (const auto& topic : bucket_topics.topics) { + const auto op_ret = remove_notification_by_topic(dpp, topic.first, b, y, ps); + if (op_ret < 0) { + return op_ret; + } + } + return 0; +} + +// command (S3 compliant): PUT /?notification +// a "notification" and a subscription will be auto-generated +// actual configuration is XML encoded in the body of the message +class RGWPSCreateNotifOp : public RGWDefaultResponseOp { + int verify_params() override { + bool exists; + const auto no_value = s->info.args.get("notification", &exists); + if (!exists) { + ldpp_dout(this, 1) << "missing required param 'notification'" << dendl; + return -EINVAL; + } + if (no_value.length() > 0) { + ldpp_dout(this, 1) << "param 'notification' should not have any value" << dendl; + return -EINVAL; + } + if (s->bucket_name.empty()) { + ldpp_dout(this, 1) << "request must be on a bucket" << dendl; + return -EINVAL; + } + return 0; + } + + int get_params_from_body(rgw_pubsub_s3_notifications& configurations) { + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + int r; + bufferlist data; + std::tie(r, data) = read_all_input(s, max_size, false); + + if
(r < 0) { + ldpp_dout(this, 1) << "failed to read XML payload" << dendl; + return r; + } + if (data.length() == 0) { + ldpp_dout(this, 1) << "XML payload missing" << dendl; + return -EINVAL; + } + + RGWXMLDecoder::XMLParser parser; + + if (!parser.init()){ + ldpp_dout(this, 1) << "failed to initialize XML parser" << dendl; + return -EINVAL; + } + if (!parser.parse(data.c_str(), data.length(), 1)) { + ldpp_dout(this, 1) << "failed to parse XML payload" << dendl; + return -ERR_MALFORMED_XML; + } + try { + // NotificationConfigurations is mandatory + // It can be empty which means we delete all the notifications + RGWXMLDecoder::decode_xml("NotificationConfiguration", configurations, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 1) << "failed to parse XML payload. error: " << err << dendl; + return -ERR_MALFORMED_XML; + } + return 0; + } +public: + int verify_permission(optional_yield y) override; + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + + const char* name() const override { return "pubsub_notification_create_s3"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_CREATE; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + + + void execute(optional_yield) override; +}; + +void RGWPSCreateNotifOp::execute(optional_yield y) { + op_ret = verify_params(); + if (op_ret < 0) { + return; + } + + rgw_pubsub_s3_notifications configurations; + op_ret = get_params_from_body(configurations); + if (op_ret < 0) { + return; + } + + std::unique_ptr user = driver->get_user(s->owner.get_id()); + std::unique_ptr bucket; + op_ret = driver->get_bucket(this, user.get(), s->bucket_tenant, s->bucket_name, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get bucket '" << + (s->bucket_tenant.empty() ? 
s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) << + "' info, ret = " << op_ret << dendl; + return; + } + + const RGWPubSub ps(driver, s->owner.get_id().tenant); + const RGWPubSub::Bucket b(ps, bucket.get()); + + if(configurations.list.empty()) { + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + op_ret = b.get_topics(this, bucket_topics, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << s->bucket_name << "', ret=" << op_ret << dendl; + return; + } + + op_ret = delete_all_notifications(this, bucket_topics, b, y, ps); + return; + } + + for (const auto& c : configurations.list) { + const auto& notif_name = c.id; + if (notif_name.empty()) { + ldpp_dout(this, 1) << "missing notification id" << dendl; + op_ret = -EINVAL; + return; + } + if (c.topic_arn.empty()) { + ldpp_dout(this, 1) << "missing topic ARN in notification: '" << notif_name << "'" << dendl; + op_ret = -EINVAL; + return; + } + + const auto arn = rgw::ARN::parse(c.topic_arn); + if (!arn || arn->resource.empty()) { + ldpp_dout(this, 1) << "topic ARN has invalid format: '" << c.topic_arn << "' in notification: '" << notif_name << "'" << dendl; + op_ret = -EINVAL; + return; + } + + if (std::find(c.events.begin(), c.events.end(), rgw::notify::UnknownEvent) != c.events.end()) { + ldpp_dout(this, 1) << "unknown event type in notification: '" << notif_name << "'" << dendl; + op_ret = -EINVAL; + return; + } + + const auto topic_name = arn->resource; + + // get topic information. destination information is stored in the topic + rgw_pubsub_topic topic_info; + op_ret = ps.get_topic(this, topic_name, topic_info, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + // make sure that full topic configuration match + // TODO: use ARN match function + + // create unique topic name. 
this has 2 reasons: + // (1) topics cannot be shared between different S3 notifications because they hold the filter information + // (2) make topic cleanup easier, when notification is removed + const auto unique_topic_name = topic_to_unique(topic_name, notif_name); + // generate the internal topic. destination is stored here for the "push-only" case + // when no subscription exists + // ARN is cached to make the "GET" method faster + op_ret = ps.create_topic(this, unique_topic_name, topic_info.dest, topic_info.arn, topic_info.opaque_data, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to auto-generate unique topic '" << unique_topic_name << + "', ret=" << op_ret << dendl; + return; + } + ldpp_dout(this, 20) << "successfully auto-generated unique topic '" << unique_topic_name << "'" << dendl; + // generate the notification + rgw::notify::EventTypeList events; + op_ret = b.create_notification(this, unique_topic_name, c.events, std::make_optional(c.filter), notif_name, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to auto-generate notification for unique topic '" << unique_topic_name << + "', ret=" << op_ret << dendl; + // rollback generated topic (ignore return value) + ps.remove_topic(this, unique_topic_name, y); + return; + } + ldpp_dout(this, 20) << "successfully auto-generated notification for unique topic '" << unique_topic_name << "'" << dendl; + } +} + +int RGWPSCreateNotifOp::verify_permission(optional_yield y) { + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketNotification)) { + return -EACCES; + } + + return 0; +} + +// command (extension to S3): DELETE /bucket?notification[=] +class RGWPSDeleteNotifOp : public RGWDefaultResponseOp { + int get_params(std::string& notif_name) const { + bool exists; + notif_name = s->info.args.get("notification", &exists); + if (!exists) { + ldpp_dout(this, 1) << "missing required param 'notification'" << dendl; + return -EINVAL; + } + if (s->bucket_name.empty()) { + ldpp_dout(this, 1) <<
"request must be on a bucket" << dendl; + return -EINVAL; + } + return 0; + } + +public: + int verify_permission(optional_yield y) override; + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + + const char* name() const override { return "pubsub_notification_delete_s3"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + + void execute(optional_yield y) override; +}; + +void RGWPSDeleteNotifOp::execute(optional_yield y) { + std::string notif_name; + op_ret = get_params(notif_name); + if (op_ret < 0) { + return; + } + + std::unique_ptr user = driver->get_user(s->owner.get_id()); + std::unique_ptr bucket; + op_ret = driver->get_bucket(this, user.get(), s->bucket_tenant, s->bucket_name, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get bucket '" << + (s->bucket_tenant.empty() ? s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) << + "' info, ret = " << op_ret << dendl; + return; + } + + const RGWPubSub ps(driver, s->owner.get_id().tenant); + const RGWPubSub::Bucket b(ps, bucket.get()); + + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + op_ret = b.get_topics(this, bucket_topics, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << s->bucket_name << "', ret=" << op_ret << dendl; + return; + } + + if (!notif_name.empty()) { + // delete a specific notification + const auto unique_topic = find_unique_topic(bucket_topics, notif_name); + if (unique_topic) { + const auto unique_topic_name = unique_topic->get().topic.name; + op_ret = remove_notification_by_topic(this, unique_topic_name, b, y, ps); + return; + } + // notification to be removed is not found - considered success + ldpp_dout(this, 20) << "notification '" << notif_name << "' already removed" << dendl; + return; + } + + op_ret = delete_all_notifications(this, bucket_topics, b, y, ps); +} + +int 
RGWPSDeleteNotifOp::verify_permission(optional_yield y) { + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketNotification)) { + return -EACCES; + } + + return 0; +} + +// command (S3 compliant): GET /bucket?notification[=] +class RGWPSListNotifsOp : public RGWOp { + rgw_pubsub_s3_notifications notifications; + + int get_params(std::string& notif_name) const { + bool exists; + notif_name = s->info.args.get("notification", &exists); + if (!exists) { + ldpp_dout(this, 1) << "missing required param 'notification'" << dendl; + return -EINVAL; + } + if (s->bucket_name.empty()) { + ldpp_dout(this, 1) << "request must be on a bucket" << dendl; + return -EINVAL; + } + return 0; + } + +public: + int verify_permission(optional_yield y) override; + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + + const char* name() const override { return "pubsub_notifications_get_s3"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_LIST; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + + void execute(optional_yield y) override; + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + notifications.dump_xml(s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +void RGWPSListNotifsOp::execute(optional_yield y) { + std::string notif_name; + op_ret = get_params(notif_name); + if (op_ret < 0) { + return; + } + + std::unique_ptr user = driver->get_user(s->owner.get_id()); + std::unique_ptr bucket; + op_ret = driver->get_bucket(this, user.get(), s->bucket_tenant, s->bucket_name, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get bucket '" << + (s->bucket_tenant.empty() ? 
s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) << + "' info, ret = " << op_ret << dendl; + return; + } + + const RGWPubSub ps(driver, s->owner.get_id().tenant); + const RGWPubSub::Bucket b(ps, bucket.get()); + + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + op_ret = b.get_topics(this, bucket_topics, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << s->bucket_name << "', ret=" << op_ret << dendl; + return; + } + if (!notif_name.empty()) { + // get info of a specific notification + const auto unique_topic = find_unique_topic(bucket_topics, notif_name); + if (unique_topic) { + notifications.list.emplace_back(unique_topic->get()); + return; + } + op_ret = -ENOENT; + ldpp_dout(this, 1) << "failed to get notification info for '" << notif_name << "', ret=" << op_ret << dendl; + return; + } + // loop through all topics of the bucket + for (const auto& topic : bucket_topics.topics) { + if (topic.second.s3_id.empty()) { + // not an s3 notification + continue; + } + notifications.list.emplace_back(topic.second); + } +} + +int RGWPSListNotifsOp::verify_permission(optional_yield y) { + if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketNotification)) { + return -EACCES; + } + + return 0; +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::op_get() { + return new RGWPSListNotifsOp(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::op_put() { + return new RGWPSCreateNotifOp(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::op_delete() { + return new RGWPSDeleteNotifOp(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::create_get_op() { + return new RGWPSListNotifsOp(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::create_put_op() { + return new RGWPSCreateNotifOp(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::create_delete_op() { + return new RGWPSDeleteNotifOp(); +} + diff --git a/src/rgw/rgw_rest_ratelimit.cc b/src/rgw/rgw_rest_ratelimit.cc new file mode 100644 index 000000000..b482b4f82 --- /dev/null +++ 
b/src/rgw/rgw_rest_ratelimit.cc @@ -0,0 +1,349 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +#include "rgw_rest_ratelimit.h" +class RGWOp_Ratelimit_Info : public RGWRESTOp { +int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("ratelimit", RGW_CAP_READ); +} + + void execute(optional_yield y) override; + + const char* name() const override { return "get_ratelimit_info"; } +}; +void RGWOp_Ratelimit_Info::execute(optional_yield y) +{ + ldpp_dout(this, 20) << "" << dendl; + std::string uid_str; + std::string ratelimit_scope; + std::string bucket_name; + std::string tenant_name; + bool global = false; + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "ratelimit-scope", ratelimit_scope, &ratelimit_scope); + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); + RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name); + // RESTArgs::get_bool default value to true even if global is empty + bool exists; + std::string sval = s->info.args.get("global", &exists); + if (exists) { + if (!boost::iequals(sval,"true") && !boost::iequals(sval,"false")) { + op_ret = -EINVAL; + ldpp_dout(this, 20) << "global is not equal to true or false" << dendl; + return; + } + } + RESTArgs::get_bool(s, "global", false, &global); + + if (ratelimit_scope == "bucket" && !bucket_name.empty() && !global) { + std::unique_ptr bucket; + int r = driver->get_bucket(s, nullptr, tenant_name, bucket_name, &bucket, y); + if (r != 0) { + op_ret = r; + ldpp_dout(this, 0) << "Error on getting bucket info" << dendl; + return; + } + RGWRateLimitInfo ratelimit_info; + auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT); + if (iter != bucket->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "Error on decoding ratelimit info from bucket" << dendl; + op_ret 
= -EIO; + return; + } + } + flusher.start(0); + s->formatter->open_object_section("bucket_ratelimit"); + encode_json("bucket_ratelimit", ratelimit_info, s->formatter); + s->formatter->close_section(); + flusher.flush(); + return; + } + if (ratelimit_scope == "user" && !uid_str.empty() && !global) { + RGWRateLimitInfo ratelimit_info; + rgw_user user(uid_str); + std::unique_ptr user_sal; + user_sal = driver->get_user(user); + if (!rgw::sal::User::empty(user_sal)) { + op_ret = user_sal->load_user(this, y); + if (op_ret) { + ldpp_dout(this, 0) << "Cannot load user info" << dendl; + return; + } + } else { + ldpp_dout(this, 0) << "User does not exist" << dendl; + op_ret = -ENOENT; + return; + } + + auto iter = user_sal->get_attrs().find(RGW_ATTR_RATELIMIT); + if(iter != user_sal->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "Error on decoding ratelimit info from user" << dendl; + op_ret = -EIO; + return; + } + } + flusher.start(0); + s->formatter->open_object_section("user_ratelimit"); + encode_json("user_ratelimit", ratelimit_info, s->formatter); + s->formatter->close_section(); + flusher.flush(); + } + if (global) { + std::string realm_id = driver->get_zone()->get_realm_id(); + RGWPeriodConfig period_config; + op_ret = period_config.read(this, static_cast(driver)->svc()->sysobj, realm_id, y); + if (op_ret && op_ret != -ENOENT) { + ldpp_dout(this, 0) << "Error on period config read" << dendl; + return; + } + flusher.start(0); + s->formatter->open_object_section("period_config"); + encode_json("bucket_ratelimit", period_config.bucket_ratelimit, s->formatter); + encode_json("user_ratelimit", period_config.user_ratelimit, s->formatter); + encode_json("anonymous_ratelimit", period_config.anon_ratelimit, s->formatter); + s->formatter->close_section(); + flusher.flush(); + return; + } + op_ret = -EINVAL; + return; +} + +class 
RGWOp_Ratelimit_Set : public RGWRESTOp { + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("ratelimit", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "put_ratelimit_info"; } + + void set_ratelimit_info(bool have_max_read_ops, int64_t max_read_ops, bool have_max_write_ops, int64_t max_write_ops, + bool have_max_read_bytes, int64_t max_read_bytes, bool have_max_write_bytes, int64_t max_write_bytes, + bool have_enabled, bool enabled, bool& ratelimit_configured, RGWRateLimitInfo& ratelimit_info); +}; + + + void RGWOp_Ratelimit_Set::set_ratelimit_info(bool have_max_read_ops, int64_t max_read_ops, bool have_max_write_ops, int64_t max_write_ops, + bool have_max_read_bytes, int64_t max_read_bytes, bool have_max_write_bytes, int64_t max_write_bytes, + bool have_enabled, bool enabled, bool& ratelimit_configured, RGWRateLimitInfo& ratelimit_info) + { + if (have_max_read_ops) { + if (max_read_ops >= 0) { + ratelimit_info.max_read_ops = max_read_ops; + ratelimit_configured = true; + } + } + if (have_max_write_ops) { + if (max_write_ops >= 0) { + ratelimit_info.max_write_ops = max_write_ops; + ratelimit_configured = true; + } + } + if (have_max_read_bytes) { + if (max_read_bytes >= 0) { + ratelimit_info.max_read_bytes = max_read_bytes; + ratelimit_configured = true; + } + } + if (have_max_write_bytes) { + if (max_write_bytes >= 0) { + ratelimit_info.max_write_bytes = max_write_bytes; + ratelimit_configured = true; + } + } + if (have_enabled) { + ratelimit_info.enabled = enabled; + ratelimit_configured = true; + } + if (!ratelimit_configured) { + ldpp_dout(this, 0) << "No rate limit configuration arguments have been sent" << dendl; + op_ret = -EINVAL; + return; + } + + } + + +void RGWOp_Ratelimit_Set::execute(optional_yield y) +{ + std::string uid_str; + std::string ratelimit_scope; + std::string bucket_name; + std::string tenant_name; + RGWRateLimitInfo ratelimit_info; + bool 
ratelimit_configured = false; + bool enabled = false; + bool have_enabled = false; + bool global = false; + int64_t max_read_ops = 0; + bool have_max_read_ops = false; + int64_t max_write_ops = 0; + bool have_max_write_ops = false; + int64_t max_read_bytes = 0; + bool have_max_read_bytes = false; + int64_t max_write_bytes = 0; + bool have_max_write_bytes = false; + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "ratelimit-scope", ratelimit_scope, &ratelimit_scope); + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); + RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name); + // check there was no -EINVAL coming from get_int64 + op_ret = RESTArgs::get_int64(s, "max-read-ops", 0, &max_read_ops, &have_max_read_ops); + op_ret |= RESTArgs::get_int64(s, "max-write-ops", 0, &max_write_ops, &have_max_write_ops); + op_ret |= RESTArgs::get_int64(s, "max-read-bytes", 0, &max_read_bytes, &have_max_read_bytes); + op_ret |= RESTArgs::get_int64(s, "max-write-bytes", 0, &max_write_bytes, &have_max_write_bytes); + if (op_ret) { + ldpp_dout(this, 0) << "one of the maximum arguments could not be parsed" << dendl; + return; + } + // RESTArgs::get_bool default value to true even if enabled or global are empty + std::string sval = s->info.args.get("enabled", &have_enabled); + if (have_enabled) { + if (!boost::iequals(sval,"true") && !boost::iequals(sval,"false")) { + ldpp_dout(this, 20) << "enabled is not equal to true or false" << dendl; + op_ret = -EINVAL; + return; + } + } + RESTArgs::get_bool(s, "enabled", false, &enabled, &have_enabled); + bool exists; + sval = s->info.args.get("global", &exists); + if (exists) { + if (!boost::iequals(sval,"true") && !boost::iequals(sval,"false")) { + ldpp_dout(this, 20) << "global is not equal to true or faslse" << dendl; + op_ret = -EINVAL; + return; + } + } + RESTArgs::get_bool(s, "global", false, &global, nullptr); + set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, 
max_write_ops, + have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes, + have_enabled, enabled, ratelimit_configured, ratelimit_info); + if (op_ret) { + return; + } + if (ratelimit_scope == "user" && !uid_str.empty() && !global) { + rgw_user user(uid_str); + std::unique_ptr user_sal; + user_sal = driver->get_user(user); + if (!rgw::sal::User::empty(user_sal)) { + op_ret = user_sal->load_user(this, y); + if (op_ret) { + ldpp_dout(this, 0) << "Cannot load user info" << dendl; + return; + } + } else { + ldpp_dout(this, 0) << "User does not exist" << dendl; + op_ret = -ENOENT; + return; + } + auto iter = user_sal->get_attrs().find(RGW_ATTR_RATELIMIT); + if (iter != user_sal->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "Error on decoding ratelimit info from user" << dendl; + op_ret = -EIO; + return; + } + } + set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops, + have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes, + have_enabled, enabled, ratelimit_configured, ratelimit_info); + bufferlist bl; + ratelimit_info.encode(bl); + rgw::sal::Attrs attr; + attr[RGW_ATTR_RATELIMIT] = bl; + op_ret = user_sal->merge_and_store_attrs(this, attr, y); + return; + } + + if (ratelimit_scope == "bucket" && !bucket_name.empty() && !global) { + ldpp_dout(this, 0) << "getting bucket info" << dendl; + std::unique_ptr bucket; + op_ret = driver->get_bucket(this, nullptr, tenant_name, bucket_name, &bucket, y); + if (op_ret) { + ldpp_dout(this, 0) << "Error on getting bucket info" << dendl; + return; + } + auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT); + if (iter != bucket->get_attrs().end()) { + try { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + decode(ratelimit_info, biter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "Error on decoding 
ratelimit info from bucket" << dendl; + op_ret = -EIO; + return; + } + } + bufferlist bl; + set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops, + have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes, + have_enabled, enabled, ratelimit_configured, ratelimit_info); + ratelimit_info.encode(bl); + rgw::sal::Attrs attr; + attr[RGW_ATTR_RATELIMIT] = bl; + op_ret = bucket->merge_and_store_attrs(this, attr, y); + return; + } + if (global) { + std::string realm_id = driver->get_zone()->get_realm_id(); + RGWPeriodConfig period_config; + op_ret = period_config.read(s, static_cast(driver)->svc()->sysobj, realm_id, y); + if (op_ret && op_ret != -ENOENT) { + ldpp_dout(this, 0) << "Error on period config read" << dendl; + return; + } + if (ratelimit_scope == "bucket") { + ratelimit_info = period_config.bucket_ratelimit; + set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops, + have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes, + have_enabled, enabled, ratelimit_configured, ratelimit_info); + period_config.bucket_ratelimit = ratelimit_info; + op_ret = period_config.write(s, static_cast(driver)->svc()->sysobj, realm_id, y); + return; + } + if (ratelimit_scope == "anon") { + ratelimit_info = period_config.anon_ratelimit; + set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops, + have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes, + have_enabled, enabled, ratelimit_configured, ratelimit_info); + period_config.anon_ratelimit = ratelimit_info; + op_ret = period_config.write(s, static_cast(driver)->svc()->sysobj, realm_id, y); + return; + } + if (ratelimit_scope == "user") { + ratelimit_info = period_config.user_ratelimit; + set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops, + have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes, + have_enabled, enabled, 
ratelimit_configured, ratelimit_info); + period_config.user_ratelimit = ratelimit_info; + op_ret = period_config.write(s, static_cast(driver)->svc()->sysobj, realm_id, y); + return; + } + } + op_ret = -EINVAL; + return; +} +RGWOp* RGWHandler_Ratelimit::op_get() +{ + return new RGWOp_Ratelimit_Info; +} +RGWOp* RGWHandler_Ratelimit::op_post() +{ + return new RGWOp_Ratelimit_Set; +} diff --git a/src/rgw/rgw_rest_ratelimit.h b/src/rgw/rgw_rest_ratelimit.h new file mode 100644 index 000000000..c3a942b19 --- /dev/null +++ b/src/rgw/rgw_rest_ratelimit.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_sal_rados.h" + +class RGWHandler_Ratelimit : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_post() override; +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Ratelimit() override = default; + + int read_permissions(RGWOp*, optional_yield) override { + return 0; + } +}; + +class RGWRESTMgr_Ratelimit : public RGWRESTMgr { +public: + RGWRESTMgr_Ratelimit() = default; + ~RGWRESTMgr_Ratelimit() override = default; + + RGWHandler_REST *get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Ratelimit(auth_registry); + } +}; diff --git a/src/rgw/rgw_rest_role.cc b/src/rgw/rgw_rest_role.cc new file mode 100644 index 000000000..e71dff570 --- /dev/null +++ b/src/rgw/rgw_rest_role.cc @@ -0,0 +1,1022 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_role.h" +#include 
"rgw_rest_role.h" +#include "rgw_sal.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int RGWRestRole::verify_permission(optional_yield y) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + string role_name = s->info.args.get("RoleName"); + std::unique_ptr role = driver->get_role(role_name, + s->user->get_tenant()); + if (op_ret = role->get(s, y); op_ret < 0) { + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + } + return op_ret; + } + + if (int ret = check_caps(s->user->get_caps()); ret == 0) { + _role = std::move(role); + return ret; + } + + string resource_name = role->get_path() + role_name; + uint64_t op = get_op(); + if (!verify_user_permission(this, + s, + rgw::ARN(resource_name, + "role", + s->user->get_tenant(), true), + op)) { + return -EACCES; + } + + _role = std::move(role); + + return 0; +} + +int RGWRestRole::parse_tags() +{ + vector keys, vals; + auto val_map = s->info.args.get_params(); + const regex pattern_key("Tags.member.([0-9]+).Key"); + const regex pattern_value("Tags.member.([0-9]+).Value"); + for (auto& v : val_map) { + string key_index="", value_index=""; + for(sregex_iterator it = sregex_iterator( + v.first.begin(), v.first.end(), pattern_key); + it != sregex_iterator(); it++) { + smatch match; + match = *it; + key_index = match.str(1); + ldout(s->cct, 20) << "Key index: " << match.str(1) << dendl; + if (!key_index.empty()) { + int index = stoi(key_index); + auto pos = keys.begin() + (index-1); + keys.insert(pos, v.second); + } + } + for(sregex_iterator it = sregex_iterator( + v.first.begin(), v.first.end(), pattern_value); + it != sregex_iterator(); it++) { + smatch match; + match = *it; + value_index = match.str(1); + ldout(s->cct, 20) << "Value index: " << match.str(1) << dendl; + if (!value_index.empty()) { + int index = stoi(value_index); + auto pos = vals.begin() + (index-1); + vals.insert(pos, v.second); + } + } + } + if (keys.size() != vals.size()) { + ldout(s->cct, 0) << "No. 
of keys doesn't match with no. of values in tags" << dendl; + return -EINVAL; + } + for (size_t i = 0; i < keys.size(); i++) { + tags.emplace(keys[i], vals[i]); + ldout(s->cct, 0) << "Tag Key: " << keys[i] << " Tag Value is: " << vals[i] << dendl; + } + return 0; +} + +void RGWRestRole::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this); +} + +int RGWRoleRead::check_caps(const RGWUserCaps& caps) +{ + return caps.check_cap("roles", RGW_CAP_READ); +} + +int RGWRoleWrite::check_caps(const RGWUserCaps& caps) +{ + return caps.check_cap("roles", RGW_CAP_WRITE); +} + +int RGWCreateRole::verify_permission(optional_yield y) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (int ret = check_caps(s->user->get_caps()); ret == 0) { + return ret; + } + + string role_name = s->info.args.get("RoleName"); + string role_path = s->info.args.get("Path"); + + string resource_name = role_path + role_name; + if (!verify_user_permission(this, + s, + rgw::ARN(resource_name, + "role", + s->user->get_tenant(), true), + get_op())) { + return -EACCES; + } + return 0; +} + +int RGWCreateRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + role_path = s->info.args.get("Path"); + trust_policy = s->info.args.get("AssumeRolePolicyDocument"); + max_session_duration = s->info.args.get("MaxSessionDuration"); + + if (role_name.empty() || trust_policy.empty()) { + ldpp_dout(this, 20) << "ERROR: one of role name or assume role policy document is empty" + << dendl; + return -EINVAL; + } + + bufferlist bl = bufferlist::static_from_string(trust_policy); + try { + const rgw::IAM::Policy p( + s->cct, s->user->get_tenant(), bl, + s->cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + } + catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << dendl; + s->err.message = e.what(); + return -ERR_MALFORMED_DOC; + } + + int ret = parse_tags(); + if (ret < 
0) { + return ret; + } + + if (tags.size() > 50) { + ldout(s->cct, 0) << "No. tags is greater than 50" << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWCreateRole::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + std::string user_tenant = s->user->get_tenant(); + std::unique_ptr role = driver->get_role(role_name, + user_tenant, + role_path, + trust_policy, + max_session_duration, + tags); + if (!user_tenant.empty() && role->get_tenant() != user_tenant) { + ldpp_dout(this, 20) << "ERROR: the tenant provided in the role name does not match with the tenant of the user creating the role" + << dendl; + op_ret = -EINVAL; + return; + } + + std::string role_id; + + if (!driver->is_meta_master()) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("Path"); + s->info.args.remove("AssumeRolePolicyDocument"); + s->info.args.remove("MaxSessionDuration"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + auto& val_map = s->info.args.get_params(); + for (auto it = val_map.begin(); it!= val_map.end(); it++) { + if (it->first.find("Tags.member.") == 0) { + val_map.erase(it); + } + } + + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + + XMLObj* create_role_resp_obj = parser.find_first("CreateRoleResponse");; + if (!create_role_resp_obj) { + ldpp_dout(this, 5) << "ERROR: unexpected xml: CreateRoleResponse" 
<< dendl; + op_ret = -EINVAL; + return; + } + + XMLObj* create_role_res_obj = create_role_resp_obj->find_first("CreateRoleResult"); + if (!create_role_res_obj) { + ldpp_dout(this, 5) << "ERROR: unexpected xml: CreateRoleResult" << dendl; + op_ret = -EINVAL; + return; + } + + XMLObj* role_obj = nullptr; + if (create_role_res_obj) { + role_obj = create_role_res_obj->find_first("Role"); + } + if (!role_obj) { + ldpp_dout(this, 5) << "ERROR: unexpected xml: Role" << dendl; + op_ret = -EINVAL; + return; + } + + try { + if (role_obj) { + RGWXMLDecoder::decode_xml("RoleId", role_id, role_obj, true); + } + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "ERROR: unexpected xml: RoleId" << dendl; + op_ret = -EINVAL; + return; + } + ldpp_dout(this, 0) << "role_id decoded from master zonegroup response is" << role_id << dendl; + } + + op_ret = role->create(s, true, role_id, y); + if (op_ret == -EEXIST) { + op_ret = -ERR_ROLE_EXISTS; + return; + } + + if (op_ret == 0) { + s->formatter->open_object_section("CreateRoleResponse"); + s->formatter->open_object_section("CreateRoleResult"); + s->formatter->open_object_section("Role"); + role->dump(s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWDeleteRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl; + return -EINVAL; + } + + return 0; +} + +void RGWDeleteRole::execute(optional_yield y) +{ + bool is_master = true; + int master_op_ret = 0; + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + is_master = false; + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = 
-EINVAL;
+      // Do not go on to forward the request with a parser that failed to
+      // initialize.
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    master_op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (master_op_ret < 0) {
+      op_ret = master_op_ret;
+      ldpp_dout(this, 0) << "forward_iam_request_to_master returned ret=" << op_ret << dendl;
+      return;
+    }
+  }
+
+  op_ret = _role->delete_obj(s, y);
+
+  if (op_ret == -ENOENT) {
+    //Role has been deleted since metadata from master has synced up
+    if (!is_master && master_op_ret == 0) {
+      op_ret = 0;
+    } else {
+      op_ret = -ERR_NO_ROLE_FOUND;
+    }
+    return;
+  }
+  if (!op_ret) {
+    s->formatter->open_object_section("DeleteRoleResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// Always succeeds; the permission check for this op is done by
+// _verify_permission(), which takes the role as an argument.
+int RGWGetRole::verify_permission(optional_yield y)
+{
+  return 0;
+}
+
+// Returns 0 when the caller's caps pass check_caps() or when
+// verify_user_permission() allows get_op() on the role's ARN;
+// -EACCES for anonymous callers or when the policy check fails.
+int RGWGetRole::_verify_permission(const rgw::sal::RGWRole* role)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  string resource_name = role->get_path() + role->get_name();
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(resource_name,
+                                       "role",
+                                       s->user->get_tenant(), true),
+                              get_op())) {
+    return -EACCES;
+  }
+  return 0;
+}
+
+// Reads the RoleName request parameter; -EINVAL if it is empty.
+int RGWGetRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+void RGWGetRole::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  std::unique_ptr
role = driver->get_role(role_name, + s->user->get_tenant()); + op_ret = role->get(s, y); + + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + return; + } + + op_ret = _verify_permission(role.get()); + + if (op_ret == 0) { + s->formatter->open_object_section("GetRoleResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("GetRoleResult"); + s->formatter->open_object_section("Role"); + role->dump(s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWModifyRoleTrustPolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + trust_policy = s->info.args.get("PolicyDocument"); + + if (role_name.empty() || trust_policy.empty()) { + ldpp_dout(this, 20) << "ERROR: One of role name or trust policy is empty"<< dendl; + return -EINVAL; + } + JSONParser p; + if (!p.parse(trust_policy.c_str(), trust_policy.length())) { + ldpp_dout(this, 20) << "ERROR: failed to parse assume role policy doc" << dendl; + return -ERR_MALFORMED_DOC; + } + + return 0; +} + +void RGWModifyRoleTrustPolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("PolicyDocument"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, 
s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + } + + _role->update_trust_policy(trust_policy); + op_ret = _role->update(this, y); + + s->formatter->open_object_section("UpdateAssumeRolePolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWListRoles::verify_permission(optional_yield y) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (int ret = check_caps(s->user->get_caps()); ret == 0) { + return ret; + } + + if (!verify_user_permission(this, + s, + rgw::ARN(), + get_op())) { + return -EACCES; + } + + return 0; +} + +int RGWListRoles::get_params() +{ + path_prefix = s->info.args.get("PathPrefix"); + + return 0; +} + +void RGWListRoles::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + vector> result; + op_ret = driver->get_roles(s, y, path_prefix, s->user->get_tenant(), result); + + if (op_ret == 0) { + s->formatter->open_array_section("ListRolesResponse"); + s->formatter->open_array_section("ListRolesResult"); + s->formatter->open_object_section("Roles"); + for (const auto& it : result) { + s->formatter->open_object_section("member"); + it->dump(s->formatter); + s->formatter->close_section(); + } + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWPutRolePolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + policy_name = s->info.args.get("PolicyName"); + perm_policy = s->info.args.get("PolicyDocument"); + + if (role_name.empty() || policy_name.empty() || perm_policy.empty()) { + ldpp_dout(this, 20) << "ERROR: One of 
role name, policy name or perm policy is empty"<< dendl; + return -EINVAL; + } + bufferlist bl = bufferlist::static_from_string(perm_policy); + try { + const rgw::IAM::Policy p( + s->cct, s->user->get_tenant(), bl, + s->cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + } + catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 20) << "failed to parse policy: " << e.what() << dendl; + s->err.message = e.what(); + return -ERR_MALFORMED_DOC; + } + return 0; +} + +void RGWPutRolePolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("PolicyName"); + s->info.args.remove("PolicyDocument"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + } + + _role->set_perm_policy(policy_name, perm_policy); + op_ret = _role->update(this, y); + + if (op_ret == 0) { + s->formatter->open_object_section("PutRolePolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWGetRolePolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + policy_name = s->info.args.get("PolicyName"); + + if (role_name.empty() 
|| policy_name.empty()) { + ldpp_dout(this, 20) << "ERROR: One of role name or policy name is empty"<< dendl; + return -EINVAL; + } + return 0; +} + +void RGWGetRolePolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + string perm_policy; + op_ret = _role->get_role_policy(this, policy_name, perm_policy); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_ENTITY; + } + + if (op_ret == 0) { + s->formatter->open_object_section("GetRolePolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("GetRolePolicyResult"); + s->formatter->dump_string("PolicyName", policy_name); + s->formatter->dump_string("RoleName", role_name); + s->formatter->dump_string("PolicyDocument", perm_policy); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWListRolePolicies::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl; + return -EINVAL; + } + return 0; +} + +void RGWListRolePolicies::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + std::vector policy_names = _role->get_role_policy_names(); + s->formatter->open_object_section("ListRolePoliciesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("ListRolePoliciesResult"); + s->formatter->open_array_section("PolicyNames"); + for (const auto& it : policy_names) { + s->formatter->dump_string("member", it); + } + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWDeleteRolePolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + policy_name = s->info.args.get("PolicyName"); + + if 
(role_name.empty() || policy_name.empty()) { + ldpp_dout(this, 20) << "ERROR: One of role name or policy name is empty"<< dendl; + return -EINVAL; + } + return 0; +} + +void RGWDeleteRolePolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("PolicyName"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + } + + op_ret = _role->delete_policy(this, policy_name); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + return; + } + + if (op_ret == 0) { + op_ret = _role->update(this, y); + } + + s->formatter->open_object_section("DeleteRolePoliciesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWTagRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldout(s->cct, 0) << "ERROR: Role name is empty" << dendl; + return -EINVAL; + } + int ret = parse_tags(); + if (ret < 0) { + return ret; + } + + return 0; +} + +void RGWTagRole::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + 
RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + auto& val_map = s->info.args.get_params(); + for (auto it = val_map.begin(); it!= val_map.end(); it++) { + if (it->first.find("Tags.member.") == 0) { + val_map.erase(it); + } + } + + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + } + + op_ret = _role->set_tags(this, tags); + if (op_ret == 0) { + op_ret = _role->update(this, y); + } + + if (op_ret == 0) { + s->formatter->open_object_section("TagRoleResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWListRoleTags::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldout(s->cct, 0) << "ERROR: Role name is empty" << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWListRoleTags::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + boost::optional> tag_map = _role->get_tags(); + s->formatter->open_object_section("ListRoleTagsResponse"); + s->formatter->open_object_section("ListRoleTagsResult"); + if (tag_map) { + s->formatter->open_array_section("Tags"); + for (const auto& it : tag_map.get()) { + s->formatter->open_object_section("Key"); + encode_json("Key", it.first, 
s->formatter); + s->formatter->close_section(); + s->formatter->open_object_section("Value"); + encode_json("Value", it.second, s->formatter); + s->formatter->close_section(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWUntagRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldout(s->cct, 0) << "ERROR: Role name is empty" << dendl; + return -EINVAL; + } + + auto val_map = s->info.args.get_params(); + for (auto& it : val_map) { + if (it.first.find("TagKeys.member.") != string::npos) { + tagKeys.emplace_back(it.second); + } + } + return 0; +} + +void RGWUntagRole::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + auto& val_map = s->info.args.get_params(); + std::vector::iterator> iters; + for (auto it = val_map.begin(); it!= val_map.end(); it++) { + if (it->first.find("Tags.member.") == 0) { + iters.emplace_back(it); + } + } + + for (auto& it : iters) { + val_map.erase(it); + } + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + } 
+ + _role->erase_tags(tagKeys); + op_ret = _role->update(this, y); + + if (op_ret == 0) { + s->formatter->open_object_section("UntagRoleResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWUpdateRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + max_session_duration = s->info.args.get("MaxSessionDuration"); + + if (role_name.empty()) { + ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl; + return -EINVAL; + } + + return 0; +} + +void RGWUpdateRole::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + if (!driver->is_meta_master()) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl; + op_ret = -EINVAL; + return; + } + + bufferlist data; + s->info.args.remove("RoleName"); + s->info.args.remove("MaxSessionDuration"); + s->info.args.remove("Action"); + s->info.args.remove("Version"); + + RGWUserInfo info = s->user->get_info(); + const auto& it = info.access_keys.begin(); + RGWAccessKey key; + if (it != info.access_keys.end()) { + key.id = it->first; + RGWAccessKey cred = it->second; + key.key = cred.key; + } + op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl; + return; + } + } + + if (!_role->validate_max_session_duration(this)) { + op_ret = -EINVAL; + return; + } + + _role->update_max_session_duration(max_session_duration); + op_ret = _role->update(this, y); + + s->formatter->open_object_section("UpdateRoleResponse"); + s->formatter->open_object_section("UpdateRoleResult"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + 
s->formatter->close_section(); + s->formatter->close_section(); +} diff --git a/src/rgw/rgw_rest_role.h b/src/rgw/rgw_rest_role.h new file mode 100644 index 000000000..98a08833b --- /dev/null +++ b/src/rgw/rgw_rest_role.h @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/async/yield_context.h" + +#include "rgw_role.h" +#include "rgw_rest.h" + +class RGWRestRole : public RGWRESTOp { +protected: + std::string role_name; + std::string role_path; + std::string trust_policy; + std::string policy_name; + std::string perm_policy; + std::string path_prefix; + std::string max_session_duration; + std::multimap tags; + std::vector tagKeys; + std::unique_ptr _role; + int verify_permission(optional_yield y) override; + void send_response() override; + virtual uint64_t get_op() = 0; + int parse_tags(); +}; + +class RGWRoleRead : public RGWRestRole { +public: + RGWRoleRead() = default; + int check_caps(const RGWUserCaps& caps) override; +}; + +class RGWRoleWrite : public RGWRestRole { +public: + RGWRoleWrite() = default; + int check_caps(const RGWUserCaps& caps) override; +}; + +class RGWCreateRole : public RGWRoleWrite { + bufferlist bl_post_body; +public: + RGWCreateRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {}; + int verify_permission(optional_yield y) override; + void execute(optional_yield y) override; + int get_params(); + const char* name() const override { return "create_role"; } + RGWOpType get_type() override { return RGW_OP_CREATE_ROLE; } + uint64_t get_op() override { return rgw::IAM::iamCreateRole; } +}; + +class RGWDeleteRole : public RGWRoleWrite { + bufferlist bl_post_body; +public: + RGWDeleteRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {}; + void execute(optional_yield y) override; + int get_params(); + const char* name() const override { return "delete_role"; } + RGWOpType get_type() override { return 
// REST operation for the IAM GetRole action; derives from the read-only
// role operation base (RGWRoleRead).
class RGWGetRole : public RGWRoleRead {
  // Permission check against an already loaded role object; the
  // implementation lives outside this header.
  int _verify_permission(const rgw::sal::RGWRole* role);
public:
  RGWGetRole() = default;
  // Overrides the base-class permission check for this operation.
  int verify_permission(optional_yield y) override;
  // Runs the operation.
  void execute(optional_yield y) override;
  // Parses the request arguments; returns an int status code.
  int get_params();
  // Operation name string: "get_role".
  const char* name() const override { return "get_role"; }
  // Operation type tag: RGW_OP_GET_ROLE.
  RGWOpType get_type() override { return RGW_OP_GET_ROLE; }
  // IAM action identifier for this operation: iamGetRole.
  uint64_t get_op() override { return rgw::IAM::iamGetRole; }
};
// REST operation for the IAM ListRolePolicies action; derives from the
// read-only role operation base (RGWRoleRead).
class RGWListRolePolicies : public RGWRoleRead {
public:
  RGWListRolePolicies() = default;
  // Runs the operation.
  void execute(optional_yield y) override;
  // Parses the request arguments; returns an int status code.
  int get_params();
  // Operation name string: "list_role_policies".
  const char* name() const override { return "list_role_policies"; }
  // Operation type tag: RGW_OP_LIST_ROLE_POLICIES.
  RGWOpType get_type() override { return RGW_OP_LIST_ROLE_POLICIES; }
  // IAM action identifier for this operation: iamListRolePolicies.
  uint64_t get_op() override { return rgw::IAM::iamListRolePolicies; }
};
// REST operation for the IAM UpdateRole action; derives from the mutating
// role operation base (RGWRoleWrite).
class RGWUpdateRole : public RGWRoleWrite {
  // Copy of the raw POST body; execute() passes it along when the request
  // is forwarded to the metadata master.
  bufferlist bl_post_body;
public:
  // Stores a copy of the request body for later forwarding.
  RGWUpdateRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
  // Runs the operation; the result is reported through op_ret.
  void execute(optional_yield y) override;
  // Reads "RoleName" and "MaxSessionDuration" from the request arguments;
  // returns -EINVAL when the role name is empty, 0 otherwise.
  int get_params();
  // Operation name string: "update_role".
  const char* name() const override { return "update_role"; }
  // Operation type tag: RGW_OP_UPDATE_ROLE.
  RGWOpType get_type() override { return RGW_OP_UPDATE_ROLE; }
  // IAM action identifier for this operation: iamUpdateRole.
  uint64_t get_op() override { return rgw::IAM::iamUpdateRole; }
};
"rgw_auth_registry.h" + +#include "rgw_es_query.h" + +#include // for 'typeid' + +#include "rgw_ldap.h" +#include "rgw_token.h" +#include "rgw_rest_role.h" +#include "rgw_crypt.h" +#include "rgw_crypt_sanitize.h" +#include "rgw_rest_user_policy.h" +#include "rgw_zone.h" +#include "rgw_bucket_sync.h" + +#include "include/ceph_assert.h" +#include "rgw_role.h" +#include "rgw_rest_sts.h" +#include "rgw_rest_iam.h" +#include "rgw_sts.h" +#include "rgw_sal_rados.h" + +#include "rgw_s3select.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw; +using namespace ceph::crypto; + +void list_all_buckets_start(req_state *s) +{ + s->formatter->open_array_section_in_ns("ListAllMyBucketsResult", XMLNS_AWS_S3); +} + +void list_all_buckets_end(req_state *s) +{ + s->formatter->close_section(); +} + +void dump_bucket(req_state *s, rgw::sal::Bucket& obj) +{ + s->formatter->open_object_section("Bucket"); + s->formatter->dump_string("Name", obj.get_name()); + dump_time(s, "CreationDate", obj.get_creation_time()); + s->formatter->close_section(); +} + +void rgw_get_errno_s3(rgw_http_error *e , int err_no) +{ + rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no); + + if (r != rgw_http_s3_errors.end()) { + e->http_ret = r->second.first; + e->s3_code = r->second.second; + } else { + e->http_ret = 500; + e->s3_code = "UnknownError"; + } +} + +static inline std::string get_s3_expiration_header( + req_state* s, + const ceph::real_time& mtime) +{ + return rgw::lc::s3_expiration_header( + s, s->object->get_key(), s->tagset, mtime, s->bucket_attrs); +} + +static inline bool get_s3_multipart_abort_header( + req_state* s, const ceph::real_time& mtime, + ceph::real_time& date, std::string& rule_id) +{ + return rgw::lc::s3_multipart_abort_header( + s, s->object->get_key(), mtime, s->bucket_attrs, date, rule_id); +} + +struct response_attr_param { + const char *param; + const char *http_attr; +}; + +static struct 
response_attr_param resp_attr_params[] = { + {"response-content-type", "Content-Type"}, + {"response-content-language", "Content-Language"}, + {"response-expires", "Expires"}, + {"response-cache-control", "Cache-Control"}, + {"response-content-disposition", "Content-Disposition"}, + {"response-content-encoding", "Content-Encoding"}, + {NULL, NULL}, +}; + +#define SSE_C_GROUP 1 +#define KMS_GROUP 2 + +int get_encryption_defaults(req_state *s) +{ + int meta_sse_group = 0; + constexpr auto sse_c_prefix = "x-amz-server-side-encryption-customer-"; + constexpr auto encrypt_attr = "x-amz-server-side-encryption"; + constexpr auto context_attr = "x-amz-server-side-encryption-context"; + constexpr auto kms_attr = "x-amz-server-side-encryption-aws-kms-key-id"; + constexpr auto bucket_key_attr = "x-amz-server-side-encryption-bucket-key-enabled"; + bool bucket_configuration_found { false }; + bool rest_only { false }; + + for (auto& kv : s->info.crypt_attribute_map) { + if (kv.first.find(sse_c_prefix) == 0) + meta_sse_group |= SSE_C_GROUP; + else if (kv.first.find(encrypt_attr) == 0) + meta_sse_group |= KMS_GROUP; + } + if (meta_sse_group == (SSE_C_GROUP|KMS_GROUP)) { + s->err.message = "Server side error - can't do sse-c & sse-kms|sse-s3"; + return -EINVAL; + } + + const auto& buck_attrs = s->bucket_attrs; + auto aiter = buck_attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_POLICY); + RGWBucketEncryptionConfig bucket_encryption_conf; + if (aiter != buck_attrs.end()) { + ldpp_dout(s, 5) << "Found RGW_ATTR_BUCKET_ENCRYPTION_POLICY on " + << s->bucket_name << dendl; + + bufferlist::const_iterator iter{&aiter->second}; + + try { + bucket_encryption_conf.decode(iter); + bucket_configuration_found = true; + } catch (const buffer::error& e) { + s->err.message = "Server side error - can't decode bucket_encryption_conf"; + ldpp_dout(s, 5) << __func__ << "decode bucket_encryption_conf failed" << dendl; + return -EINVAL; + } + } + if (meta_sse_group & SSE_C_GROUP) { + ldpp_dout(s, 20) << 
"get_encryption_defaults: no defaults cause sse-c forced" + << dendl; + return 0; // sse-c: no defaults here + } + std::string sse_algorithm { bucket_encryption_conf.sse_algorithm() }; + auto kms_master_key_id { bucket_encryption_conf.kms_master_key_id() }; + bool bucket_key_enabled { bucket_encryption_conf.bucket_key_enabled() }; + bool kms_attr_seen = false; + if (bucket_configuration_found) { + ldpp_dout(s, 5) << "RGW_ATTR_BUCKET_ENCRYPTION ALGO: " + << sse_algorithm << dendl; + } + + auto iter = s->info.crypt_attribute_map.find(encrypt_attr); + if (iter != s->info.crypt_attribute_map.end()) { +ldpp_dout(s, 20) << "get_encryption_defaults: found encrypt_attr " << encrypt_attr << " = " << iter->second << ", setting sse_algorithm to that" << dendl; + rest_only = true; + sse_algorithm = iter->second; + } else if (sse_algorithm != "") { + rgw_set_amz_meta_header(s->info.crypt_attribute_map, encrypt_attr, sse_algorithm, OVERWRITE); + } + + iter = s->info.crypt_attribute_map.find(kms_attr); + if (iter != s->info.crypt_attribute_map.end()) { +ldpp_dout(s, 20) << "get_encryption_defaults: found kms_attr " << kms_attr << " = " << iter->second << ", setting kms_attr_seen" << dendl; + if (!rest_only) { + s->err.message = std::string("incomplete rest sse parms: ") + kms_attr + " not valid without kms"; + ldpp_dout(s, 5) << __func__ << "argument problem: " << s->err.message << dendl; + return -EINVAL; + } + kms_attr_seen = true; + } else if (!rest_only && kms_master_key_id != "") { +ldpp_dout(s, 20) << "get_encryption_defaults: no kms_attr, but kms_master_key_id = " << kms_master_key_id << ", settig kms_attr_seen" << dendl; + kms_attr_seen = true; + rgw_set_amz_meta_header(s->info.crypt_attribute_map, kms_attr, kms_master_key_id, OVERWRITE); + } + + iter = s->info.crypt_attribute_map.find(bucket_key_attr); + if (iter != s->info.crypt_attribute_map.end()) { +ldpp_dout(s, 20) << "get_encryption_defaults: found bucket_key_attr " << bucket_key_attr << " = " << iter->second << ", 
setting kms_attr_seen" << dendl; + if (!rest_only) { + s->err.message = std::string("incomplete rest sse parms: ") + bucket_key_attr + " not valid without kms"; + ldpp_dout(s, 5) << __func__ << "argument problem: " << s->err.message << dendl; + return -EINVAL; + } + kms_attr_seen = true; + } else if (!rest_only && bucket_key_enabled) { +ldpp_dout(s, 20) << "get_encryption_defaults: no bucket_key_attr, but bucket_key_enabled, setting kms_attr_seen" << dendl; + kms_attr_seen = true; + rgw_set_amz_meta_header(s->info.crypt_attribute_map, bucket_key_attr, "true", OVERWRITE); + } + + iter = s->info.crypt_attribute_map.find(context_attr); + if (iter != s->info.crypt_attribute_map.end()) { +ldpp_dout(s, 20) << "get_encryption_defaults: found context_attr " << context_attr << " = " << iter->second << ", setting kms_attr_seen" << dendl; + if (!rest_only) { + s->err.message = std::string("incomplete rest sse parms: ") + context_attr + " not valid without kms"; + ldpp_dout(s, 5) << __func__ << "argument problem: " << s->err.message << dendl; + return -EINVAL; + } + kms_attr_seen = true; + } + + if (kms_attr_seen && sse_algorithm == "") { +ldpp_dout(s, 20) << "get_encryption_defaults: kms_attr but no algorithm, defaulting to aws_kms" << dendl; + sse_algorithm = "aws:kms"; + } +for (const auto& kv: s->info.crypt_attribute_map) { +ldpp_dout(s, 20) << "get_encryption_defaults: final map: " << kv.first << " = " << kv.second << dendl; +} +ldpp_dout(s, 20) << "get_encryption_defaults: kms_attr_seen is " << kms_attr_seen << " and sse_algorithm is " << sse_algorithm << dendl; + if (kms_attr_seen && sse_algorithm != "aws:kms") { + s->err.message = "algorithm <" + sse_algorithm + "> but got sse-kms attributes"; + return -EINVAL; + } + + return 0; +} + +int RGWGetObj_ObjStore_S3Website::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + map::iterator iter; + iter = attrs.find(RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION); + if (iter != attrs.end()) { + bufferlist &bl = 
iter->second; + s->redirect = bl.c_str(); + s->err.http_ret = 301; + ldpp_dout(this, 20) << __CEPH_ASSERT_FUNCTION << " redirecting per x-amz-website-redirect-location=" << s->redirect << dendl; + op_ret = -ERR_WEBSITE_REDIRECT; + set_req_state_err(s, op_ret); + dump_errno(s); + dump_content_length(s, 0); + dump_redirect(s, s->redirect); + end_header(s, this); + return op_ret; + } else { + return RGWGetObj_ObjStore_S3::send_response_data(bl, bl_ofs, bl_len); + } +} + +int RGWGetObj_ObjStore_S3Website::send_response_data_error(optional_yield y) +{ + return RGWGetObj_ObjStore_S3::send_response_data_error(y); +} + +int RGWGetObj_ObjStore_S3::get_params(optional_yield y) +{ + // for multisite sync requests, only read the slo manifest itself, rather than + // all of the data from its parts. the parts will sync as separate objects + skip_manifest = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-manifest"); + + // multisite sync requests should fetch encrypted data, along with the + // attributes needed to support decryption on the other zone + if (s->system_request) { + skip_decrypt = s->info.args.exists(RGW_SYS_PARAM_PREFIX "skip-decrypt"); + } + + // multisite sync requests should fetch cloudtiered objects + sync_cloudtiered = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-cloudtiered"); + + dst_zone_trace = s->info.args.get(RGW_SYS_PARAM_PREFIX "if-not-replicated-to"); + + return RGWGetObj_ObjStore::get_params(y); +} + +int RGWGetObj_ObjStore_S3::send_response_data_error(optional_yield y) +{ + bufferlist bl; + return send_response_data(bl, 0 , 0); +} + +template +int decode_attr_bl_single_value(map& attrs, const char *attr_name, T *result, T def_val) +{ + map::iterator iter = attrs.find(attr_name); + if (iter == attrs.end()) { + *result = def_val; + return 0; + } + bufferlist& bl = iter->second; + if (bl.length() == 0) { + *result = def_val; + return 0; + } + auto bliter = bl.cbegin(); + try { + decode(*result, bliter); + } catch (buffer::error& err) { + return -EIO; 
/**
 * Report whether a string contains any control character.
 *
 * Each byte is converted to unsigned char before classification: passing a
 * negative plain `char` (any byte >= 0x80 where char is signed) to the
 * <cctype> functions is undefined behaviour.
 *
 * @param s  string to inspect (taken by reference, not copied)
 * @return true if at least one byte is a control character
 */
inline bool str_has_cntrl(const std::string& s) {
  return std::any_of(s.begin(), s.end(), [](unsigned char c) {
    return std::iscntrl(c) != 0;
  });
}

/**
 * C-string overload of str_has_cntrl().
 *
 * Scans the NUL-terminated string in place, without building a temporary
 * std::string. A null pointer is treated as an empty string.
 */
inline bool str_has_cntrl(const char* s) {
  if (s == nullptr) {
    return false;
  }
  for (const char* p = s; *p != '\0'; ++p) {
    if (std::iscntrl(static_cast<unsigned char>(*p))) {
      return true;
    }
  }
  return false;
}
uint32_t source_zone_short_id = 0; + r = decode_attr_bl_single_value(attrs, RGW_ATTR_SOURCE_ZONE, &source_zone_short_id, (uint32_t)0); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl; + } + if (source_zone_short_id != 0) { + dump_header(s, "Rgwx-Source-Zone-Short-Id", source_zone_short_id); + } + } + + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + dump_header_if_nonempty(s, "x-amz-expiration", expires); + + if (attrs.find(RGW_ATTR_APPEND_PART_NUM) != attrs.end()) { + dump_header(s, "x-rgw-object-type", "Appendable"); + dump_header(s, "x-rgw-next-append-position", s->obj_size); + } else { + dump_header(s, "x-rgw-object-type", "Normal"); + } + // replication status + if (auto i = attrs.find(RGW_ATTR_OBJ_REPLICATION_STATUS); + i != attrs.end()) { + dump_header(s, "x-amz-replication-status", i->second); + } + if (auto i = attrs.find(RGW_ATTR_OBJ_REPLICATION_TRACE); + i != attrs.end()) { + try { + std::vector zones; + auto p = i->second.cbegin(); + decode(zones, p); + for (const auto& zone : zones) { + dump_header(s, "x-rgw-replicated-from", zone.to_str()); + } + } catch (const buffer::error&) {} // omit x-rgw-replicated-from headers + } + + if (! op_ret) { + if (! lo_etag.empty()) { + /* Handle etag of Swift API's large objects (DLO/SLO). It's entirerly + * legit to perform GET on them through S3 API. In such situation, + * a client should receive the composited content with corresponding + * etag value. 
*/ + dump_etag(s, lo_etag); + } else { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + dump_etag(s, iter->second.to_str()); + } + } + + for (struct response_attr_param *p = resp_attr_params; p->param; p++) { + bool exists; + string val = s->info.args.get(p->param, &exists); + if (exists) { + /* reject unauthenticated response header manipulation, see + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */ + if (s->auth.identity->is_anonymous()) { + return -ERR_INVALID_REQUEST; + } + /* HTTP specification says no control characters should be present in + * header values: https://tools.ietf.org/html/rfc7230#section-3.2 + * field-vchar = VCHAR / obs-text + * + * Failure to validate this permits a CRLF injection in HTTP headers, + * whereas S3 GetObject only permits specific headers. + */ + if(str_has_cntrl(val)) { + /* TODO: return a more distinct error in future; + * stating what the problem is */ + return -ERR_INVALID_REQUEST; + } + + if (strcmp(p->param, "response-content-type") != 0) { + response_attrs[p->http_attr] = val; + } else { + content_type_str = val; + content_type = content_type_str.c_str(); + } + } + } + + for (auto iter = attrs.begin(); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::iterator aiter = rgw_to_http_attrs.find(name); + if (aiter != rgw_to_http_attrs.end()) { + if (response_attrs.count(aiter->second) == 0) { + /* Was not already overridden by a response param. */ + + size_t len = iter->second.length(); + string s(iter->second.c_str(), len); + while (len && !s[len - 1]) { + --len; + s.resize(len); + } + response_attrs[aiter->second] = s; + } + } else if (iter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) { + /* Special handling for content_type. 
*/ + if (!content_type) { + content_type_str = rgw_bl_str(iter->second); + content_type = content_type_str.c_str(); + } + } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) { + // this attr has an extra length prefix from encode() in prior versions + dump_header(s, "X-Object-Meta-Static-Large-Object", "True"); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, + sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + /* User custom metadata. */ + name += sizeof(RGW_ATTR_PREFIX) - 1; + dump_header(s, name, iter->second); + } else if (iter->first.compare(RGW_ATTR_TAGS) == 0) { + RGWObjTags obj_tags; + try{ + auto it = iter->second.cbegin(); + obj_tags.decode(it); + } catch (buffer::error &err) { + ldpp_dout(this,0) << "Error caught buffer::error couldn't decode TagSet " << dendl; + } + dump_header(s, RGW_AMZ_TAG_COUNT, obj_tags.count()); + } else if (iter->first.compare(RGW_ATTR_OBJECT_RETENTION) == 0 && get_retention){ + RGWObjectRetention retention; + try { + decode(retention, iter->second); + dump_header(s, "x-amz-object-lock-mode", retention.get_mode()); + string date = ceph::to_iso_8601(retention.get_retain_until_date()); + dump_header(s, "x-amz-object-lock-retain-until-date", date.c_str()); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + } + } else if (iter->first.compare(RGW_ATTR_OBJECT_LEGAL_HOLD) == 0 && get_legal_hold) { + RGWObjectLegalHold legal_hold; + try { + decode(legal_hold, iter->second); + dump_header(s, "x-amz-object-lock-legal-hold",legal_hold.get_status()); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl; + } + } + } + } + +done: + for (riter = response_attrs.begin(); riter != response_attrs.end(); + ++riter) { + dump_header(s, riter->first, riter->second); + } + + if (op_ret == -ERR_NOT_MODIFIED) { + end_header(s, this); + } else { + if (!content_type) + content_type = "binary/octet-stream"; + + end_header(s, this, content_type); 
+ } + + if (metadata_bl.length()) { + dump_body(s, metadata_bl); + } + sent_header = true; + +send_data: + if (get_data && !op_ret) { + int r = dump_body(s, bl.c_str() + bl_ofs, bl_len); + if (r < 0) + return r; + } + + return 0; +} + +int RGWGetObj_ObjStore_S3::get_decrypt_filter(std::unique_ptr *filter, RGWGetObj_Filter* cb, bufferlist* manifest_bl) +{ + if (skip_decrypt) { // bypass decryption for multisite sync requests + return 0; + } + + std::unique_ptr block_crypt; + int res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses); + if (res < 0) { + return res; + } + if (block_crypt == nullptr) { + return 0; + } + + // in case of a multipart upload, we need to know the part lengths to + // correctly decrypt across part boundaries + std::vector parts_len; + + // for replicated objects, the original part lengths are preserved in an xattr + if (auto i = attrs.find(RGW_ATTR_CRYPT_PARTS); i != attrs.end()) { + try { + auto p = i->second.cbegin(); + using ceph::decode; + decode(parts_len, p); + } catch (const buffer::error&) { + ldpp_dout(this, 1) << "failed to decode RGW_ATTR_CRYPT_PARTS" << dendl; + return -EIO; + } + } else if (manifest_bl) { + // otherwise, we read the part lengths from the manifest + res = RGWGetObj_BlockDecrypt::read_manifest_parts(this, *manifest_bl, + parts_len); + if (res < 0) { + return res; + } + } + + *filter = std::make_unique( + s, s->cct, cb, std::move(block_crypt), + std::move(parts_len)); + return 0; +} +int RGWGetObj_ObjStore_S3::verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) +{ + int ret = -EINVAL; + ret = RGWOp::verify_requester(auth_registry, y); + if(!s->user->get_caps().check_cap("amz-cache", RGW_CAP_READ) && !ret && s->info.env->exists("HTTP_X_AMZ_CACHE")) + ret = override_range_hdr(auth_registry, y); + return ret; +} + +int RGWGetObj_ObjStore_S3::override_range_hdr(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) +{ + int ret = -EINVAL; + 
ldpp_dout(this, 10) << "cache override headers" << dendl; + RGWEnv* rgw_env = const_cast(s->info.env); + const char* backup_range = rgw_env->get("HTTP_RANGE"); + const char hdrs_split[2] = {(char)178,'\0'}; + const char kv_split[2] = {(char)177,'\0'}; + const char* cache_hdr = rgw_env->get("HTTP_X_AMZ_CACHE"); + for (std::string_view hdr : ceph::split(cache_hdr, hdrs_split)) { + auto kv = ceph::split(hdr, kv_split); + auto k = kv.begin(); + if (std::distance(k, kv.end()) != 2) { + return -EINVAL; + } + auto v = std::next(k); + std::string key = "HTTP_"; + key.append(*k); + boost::replace_all(key, "-", "_"); + ldpp_dout(this, 10) << "after splitting cache kv key: " << key << " " << *v << dendl; + rgw_env->set(std::move(key), std::string(*v)); + } + ret = RGWOp::verify_requester(auth_registry, y); + if(!ret && backup_range) { + rgw_env->set("HTTP_RANGE",backup_range); + } else { + rgw_env->remove("HTTP_RANGE"); + } + return ret; +} + + +void RGWGetObjTags_ObjStore_S3::send_response_data(bufferlist& bl) +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (!op_ret){ + s->formatter->open_object_section_in_ns("Tagging", XMLNS_AWS_S3); + s->formatter->open_object_section("TagSet"); + if (has_tags){ + RGWObjTagSet_S3 tagset; + auto iter = bl.cbegin(); + try { + tagset.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(this,0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + op_ret= -EIO; + return; + } + tagset.dump_xml(s->formatter); + } + s->formatter->close_section(); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + + +int RGWPutObjTags_ObjStore_S3::get_params(optional_yield y) +{ + RGWXMLParser parser; + + if (!parser.init()){ + return -EINVAL; + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, max_size, false); + + if (r < 
0) + return r; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + + RGWObjTagging_S3 tagging; + + try { + RGWXMLDecoder::decode_xml("Tagging", tagging, &parser); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "Malformed tagging request: " << err << dendl; + return -ERR_MALFORMED_XML; + } + + RGWObjTags obj_tags; + r = tagging.rebuild(obj_tags); + if (r < 0) + return r; + + obj_tags.encode(tags_bl); + ldpp_dout(this, 20) << "Read " << obj_tags.count() << "tags" << dendl; + + return 0; +} + +void RGWPutObjTags_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + +} + +void RGWDeleteObjTags_ObjStore_S3::send_response() +{ + if (op_ret == 0){ + op_ret = STATUS_NO_CONTENT; + } + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWGetBucketTags_ObjStore_S3::send_response_data(bufferlist& bl) +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (!op_ret) { + s->formatter->open_object_section_in_ns("Tagging", XMLNS_AWS_S3); + s->formatter->open_object_section("TagSet"); + if (has_tags){ + RGWObjTagSet_S3 tagset; + auto iter = bl.cbegin(); + try { + tagset.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(this,0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + op_ret= -EIO; + return; + } + tagset.dump_xml(s->formatter); + } + s->formatter->close_section(); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWPutBucketTags_ObjStore_S3::get_params(const DoutPrefixProvider *dpp, optional_yield y) +{ + RGWXMLParser parser; + + if (!parser.init()){ + return -EINVAL; + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + int r = 0; + bufferlist data; + + 
std::tie(r, data) = read_all_input(s, max_size, false); + + if (r < 0) + return r; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + + RGWObjTagging_S3 tagging; + try { + RGWXMLDecoder::decode_xml("Tagging", tagging, &parser); + } catch (RGWXMLDecoder::err& err) { + + ldpp_dout(dpp, 5) << "Malformed tagging request: " << err << dendl; + return -ERR_MALFORMED_XML; + } + + RGWObjTags obj_tags(50); // A tag set can contain as many as 50 tags, or it can be empty. + r = tagging.rebuild(obj_tags); + if (r < 0) + return r; + + obj_tags.encode(tags_bl); + ldpp_dout(dpp, 20) << "Read " << obj_tags.count() << "tags" << dendl; + + // forward bucket tags requests to meta master zone + if (!driver->is_meta_master()) { + /* only need to keep this data around if we're not meta master */ + in_data = std::move(data); + } + + return 0; +} + +void RGWPutBucketTags_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWDeleteBucketTags_ObjStore_S3::send_response() +{ + // A successful DeleteBucketTagging should + // return a 204 status code. 
// True only for the two status strings accepted in a replication
// configuration: exactly "Enabled" or exactly "Disabled" (case-sensitive).
bool is_valid_status(const string& s) {
  if (s == "Enabled") {
    return true;
  }
  return s == "Disabled";
}
storage_class.reset(); + } + RGWXMLDecoder::decode_xml("Zone", zone_names, obj); /* rgw extension */ + } + + void dump_xml(Formatter *f) const { + encode_xml("AccessControlTranslation", acl_translation, f); + encode_xml("Account", account, f); + encode_xml("Bucket", bucket, f); + encode_xml("StorageClass", storage_class, f); + encode_xml("Zone", zone_names, f); + } + }; + + struct Filter { + struct Tag { + string key; + string value; + + bool empty() const { + return key.empty() && value.empty(); + } + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("Value", value, obj); + }; + + void dump_xml(Formatter *f) const { + encode_xml("Key", key, f); + encode_xml("Value", value, f); + } + }; + + struct AndElements { + std::optional prefix; + std::vector tags; + + bool empty() const { + return !prefix && + (tags.size() == 0); + } + + void decode_xml(XMLObj *obj) { + std::vector _tags; + RGWXMLDecoder::decode_xml("Prefix", prefix, obj); + if (prefix && prefix->empty()) { + prefix.reset(); + } + RGWXMLDecoder::decode_xml("Tag", _tags, obj); + for (auto& t : _tags) { + if (!t.empty()) { + tags.push_back(std::move(t)); + } + } + }; + + void dump_xml(Formatter *f) const { + encode_xml("Prefix", prefix, f); + encode_xml("Tag", tags, f); + } + }; + + std::optional prefix; + std::optional tag; + std::optional and_elements; + + bool empty() const { + return (!prefix && !tag && !and_elements); + } + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Prefix", prefix, obj); + if (prefix && prefix->empty()) { + prefix.reset(); + } + RGWXMLDecoder::decode_xml("Tag", tag, obj); + if (tag && tag->empty()) { + tag.reset(); + } + RGWXMLDecoder::decode_xml("And", and_elements, obj); + if (and_elements && and_elements->empty()) { + and_elements.reset(); + } + }; + + void dump_xml(Formatter *f) const { + encode_xml("Prefix", prefix, f); + encode_xml("Tag", tag, f); + encode_xml("And", and_elements, f); + } + + bool 
is_valid(CephContext *cct) const { + if (tag && prefix) { + ldout(cct, 5) << "NOTICE: both tag and prefix were provided in replication filter rule" << dendl; + return false; + } + + if (and_elements) { + if (prefix && and_elements->prefix) { + ldout(cct, 5) << "NOTICE: too many prefixes were provided in re" << dendl; + return false; + } + } + return true; + }; + + int to_sync_pipe_filter(CephContext *cct, + rgw_sync_pipe_filter *f) const { + if (!is_valid(cct)) { + return -EINVAL; + } + if (prefix) { + f->prefix = *prefix; + } + if (tag) { + f->tags.insert(rgw_sync_pipe_filter_tag(tag->key, tag->value)); + } + + if (and_elements) { + if (and_elements->prefix) { + f->prefix = *and_elements->prefix; + } + for (auto& t : and_elements->tags) { + f->tags.insert(rgw_sync_pipe_filter_tag(t.key, t.value)); + } + } + return 0; + } + + void from_sync_pipe_filter(const rgw_sync_pipe_filter& f) { + if (f.prefix && f.tags.empty()) { + prefix = f.prefix; + return; + } + if (f.prefix) { + and_elements.emplace(); + and_elements->prefix = f.prefix; + } else if (f.tags.size() == 1) { + auto iter = f.tags.begin(); + if (iter == f.tags.end()) { + /* should never happen */ + return; + } + auto& t = *iter; + tag.emplace(); + tag->key = t.key; + tag->value = t.value; + return; + } + + if (f.tags.empty()) { + return; + } + + if (!and_elements) { + and_elements.emplace(); + } + + for (auto& t : f.tags) { + auto& tag = and_elements->tags.emplace_back(); + tag.key = t.key; + tag.value = t.value; + } + } + }; + + set get_zone_ids_from_names(rgw::sal::Driver* driver, + const vector& zone_names) const { + set ids; + + for (auto& name : zone_names) { + std::unique_ptr zone; + int ret = driver->get_zone()->get_zonegroup().get_zone_by_name(name, &zone); + if (ret >= 0) { + rgw_zone_id id = zone->get_id(); + ids.insert(std::move(id)); + } + } + + return ids; + } + + vector get_zone_names_from_ids(rgw::sal::Driver* driver, + const set& zone_ids) const { + vector names; + + for (auto& id : zone_ids) 
{ + std::unique_ptr zone; + int ret = driver->get_zone()->get_zonegroup().get_zone_by_id(id.id, &zone); + if (ret >= 0) { + names.emplace_back(zone->get_name()); + } + } + + return names; + } + + std::optional delete_marker_replication; + std::optional source; + Destination destination; + std::optional filter; + string id; + int32_t priority; + string status; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("DeleteMarkerReplication", delete_marker_replication, obj); + RGWXMLDecoder::decode_xml("Source", source, obj); + RGWXMLDecoder::decode_xml("Destination", destination, obj); + RGWXMLDecoder::decode_xml("ID", id, obj); + + std::optional prefix; + RGWXMLDecoder::decode_xml("Prefix", prefix, obj); + if (prefix) { + filter.emplace(); + filter->prefix = prefix; + } + + if (!filter) { + RGWXMLDecoder::decode_xml("Filter", filter, obj); + } else { + /* don't want to have filter reset because it might have been initialized + * when decoding prefix + */ + RGWXMLDecoder::decode_xml("Filter", *filter, obj); + } + + RGWXMLDecoder::decode_xml("Priority", priority, obj); + RGWXMLDecoder::decode_xml("Status", status, obj); + } + + void dump_xml(Formatter *f) const { + encode_xml("DeleteMarkerReplication", delete_marker_replication, f); + encode_xml("Source", source, f); + encode_xml("Destination", destination, f); + encode_xml("Filter", filter, f); + encode_xml("ID", id, f); + encode_xml("Priority", priority, f); + encode_xml("Status", status, f); + } + + bool is_valid(CephContext *cct) const { + if (!is_valid_status(status)) { + ldout(cct, 5) << "NOTICE: bad status provided in rule (status=" << status << ")" << dendl; + return false; + } + if ((filter && !filter->is_valid(cct)) || + (delete_marker_replication && !delete_marker_replication->is_valid(cct))) { + return false; + } + return true; + } + + int to_sync_policy_pipe(req_state *s, rgw::sal::Driver* driver, + rgw_sync_bucket_pipes *pipe, + bool *enabled) const { + if (!is_valid(s->cct)) { + return -EINVAL; 
+ } + + pipe->id = id; + pipe->params.priority = priority; + + const auto& user_id = s->user->get_id(); + + rgw_bucket_key dest_bk(user_id.tenant, + destination.bucket); + + if (source && !source->zone_names.empty()) { + pipe->source.zones = get_zone_ids_from_names(driver, source->zone_names); + } else { + pipe->source.set_all_zones(true); + } + if (!destination.zone_names.empty()) { + pipe->dest.zones = get_zone_ids_from_names(driver, destination.zone_names); + } else { + pipe->dest.set_all_zones(true); + } + pipe->dest.bucket.emplace(dest_bk); + + if (filter) { + int r = filter->to_sync_pipe_filter(s->cct, &pipe->params.source.filter); + if (r < 0) { + return r; + } + } + if (destination.acl_translation) { + rgw_user u; + u.tenant = user_id.tenant; + u.from_str(destination.acl_translation->owner); /* explicit tenant will override tenant, + otherwise will inherit it from s->user */ + pipe->params.dest.acl_translation.emplace(); + pipe->params.dest.acl_translation->owner = u; + } + pipe->params.dest.storage_class = destination.storage_class; + + *enabled = (status == "Enabled"); + + pipe->params.mode = rgw_sync_pipe_params::Mode::MODE_USER; + pipe->params.user = user_id.to_str(); + + return 0; + } + + void from_sync_policy_pipe(rgw::sal::Driver* driver, + const rgw_sync_bucket_pipes& pipe, + bool enabled) { + id = pipe.id; + status = (enabled ? 
"Enabled" : "Disabled"); + priority = pipe.params.priority; + + if (pipe.source.all_zones) { + source.reset(); + } else if (pipe.source.zones) { + source.emplace(); + source->zone_names = get_zone_names_from_ids(driver, *pipe.source.zones); + } + + if (!pipe.dest.all_zones && + pipe.dest.zones) { + destination.zone_names = get_zone_names_from_ids(driver, *pipe.dest.zones); + } + + if (pipe.params.dest.acl_translation) { + destination.acl_translation.emplace(); + destination.acl_translation->owner = pipe.params.dest.acl_translation->owner.to_str(); + } + + if (pipe.params.dest.storage_class) { + destination.storage_class = *pipe.params.dest.storage_class; + } + + if (pipe.dest.bucket) { + destination.bucket = pipe.dest.bucket->get_key(); + } + + filter.emplace(); + filter->from_sync_pipe_filter(pipe.params.source.filter); + + if (filter->empty()) { + filter.reset(); + } + } + }; + + std::vector rules; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Role", role, obj); + RGWXMLDecoder::decode_xml("Rule", rules, obj); + } + + void dump_xml(Formatter *f) const { + encode_xml("Role", role, f); + encode_xml("Rule", rules, f); + } + + int to_sync_policy_groups(req_state *s, rgw::sal::Driver* driver, + vector *result) const { + result->resize(2); + + rgw_sync_policy_group& enabled_group = (*result)[0]; + rgw_sync_policy_group& disabled_group = (*result)[1]; + + enabled_group.id = enabled_group_id; + enabled_group.status = rgw_sync_policy_group::Status::ENABLED; + disabled_group.id = disabled_group_id; + disabled_group.status = rgw_sync_policy_group::Status::ALLOWED; /* not enabled, not forbidden */ + + for (auto& rule : rules) { + rgw_sync_bucket_pipes pipe; + bool enabled; + int r = rule.to_sync_policy_pipe(s, driver, &pipe, &enabled); + if (r < 0) { + ldpp_dout(s, 5) << "NOTICE: failed to convert replication configuration into sync policy pipe (rule.id=" << rule.id << "): " << cpp_strerror(-r) << dendl; + return r; + } + + if (enabled) { + 
enabled_group.pipes.emplace_back(std::move(pipe)); + } else { + disabled_group.pipes.emplace_back(std::move(pipe)); + } + } + return 0; + } + + void from_sync_policy_group(rgw::sal::Driver* driver, + const rgw_sync_policy_group& group) { + + bool enabled = (group.status == rgw_sync_policy_group::Status::ENABLED); + + for (auto& pipe : group.pipes) { + auto& rule = rules.emplace_back(); + rule.from_sync_policy_pipe(driver, pipe, enabled); + } + } +}; + +} + +void RGWGetBucketReplication_ObjStore_S3::send_response_data() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + ReplicationConfiguration conf; + + if (s->bucket->get_info().sync_policy) { + auto policy = s->bucket->get_info().sync_policy; + + auto iter = policy->groups.find(enabled_group_id); + if (iter != policy->groups.end()) { + conf.from_sync_policy_group(driver, iter->second); + } + iter = policy->groups.find(disabled_group_id); + if (iter != policy->groups.end()) { + conf.from_sync_policy_group(driver, iter->second); + } + } + + if (!op_ret) { + s->formatter->open_object_section_in_ns("ReplicationConfiguration", XMLNS_AWS_S3); + conf.dump_xml(s->formatter); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWPutBucketReplication_ObjStore_S3::get_params(optional_yield y) +{ + RGWXMLParser parser; + + if (!parser.init()){ + return -EINVAL; + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + int r = 0; + bufferlist data; + + std::tie(r, data) = read_all_input(s, max_size, false); + + if (r < 0) + return r; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + + ReplicationConfiguration conf; + try { + RGWXMLDecoder::decode_xml("ReplicationConfiguration", conf, &parser); + } catch (RGWXMLDecoder::err& err) { + + ldpp_dout(this, 5) << "Malformed tagging request: " << err << dendl; + return -ERR_MALFORMED_XML; + } + + r = 
conf.to_sync_policy_groups(s, driver, &sync_policy_groups); + if (r < 0) { + return r; + } + + // forward requests to meta master zone + if (!driver->is_meta_master()) { + /* only need to keep this data around if we're not meta master */ + in_data = std::move(data); + } + + return 0; +} + +void RGWPutBucketReplication_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWDeleteBucketReplication_ObjStore_S3::update_sync_policy(rgw_sync_policy_info *policy) +{ + policy->groups.erase(enabled_group_id); + policy->groups.erase(disabled_group_id); +} + +void RGWDeleteBucketReplication_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets) +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + dump_start(s); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, NULL, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + + if (! 
op_ret) { + list_all_buckets_start(s); + dump_owner(s, s->user->get_id(), s->user->get_display_name()); + s->formatter->open_array_section("Buckets"); + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_S3::send_response_data(rgw::sal::BucketList& buckets) +{ + if (!sent_data) + return; + + auto& m = buckets.get_buckets(); + + for (auto iter = m.begin(); iter != m.end(); ++iter) { + auto& bucket = iter->second; + dump_bucket(s, *bucket); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWListBuckets_ObjStore_S3::send_response_end() +{ + if (sent_data) { + s->formatter->close_section(); + list_all_buckets_end(s); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWGetUsage_ObjStore_S3::get_params(optional_yield y) +{ + start_date = s->info.args.get("start-date"); + end_date = s->info.args.get("end-date"); + return 0; +} + +static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log_entry& entry, map *categories) +{ + formatter->open_array_section("categories"); + map::const_iterator uiter; + for (uiter = entry.usage_map.begin(); uiter != entry.usage_map.end(); ++uiter) { + if (categories && !categories->empty() && !categories->count(uiter->first)) + continue; + const rgw_usage_data& usage = uiter->second; + formatter->open_object_section("Entry"); + encode_json("Category", uiter->first, formatter); + encode_json("BytesSent", usage.bytes_sent, formatter); + encode_json("BytesReceived", usage.bytes_received, formatter); + encode_json("Ops", usage.ops, formatter); + encode_json("SuccessfulOps", usage.successful_ops, formatter); + formatter->close_section(); // Entry + } + formatter->close_section(); // Category +} + +static void dump_usage_bucket_info(Formatter *formatter, const std::string& name, const bucket_meta_entry& entry) +{ + formatter->open_object_section("Entry"); + encode_json("Bucket", name, formatter); + encode_json("Bytes", entry.size, formatter); + encode_json("Bytes_Rounded", entry.size_rounded, 
formatter); + formatter->close_section(); // entry +} + +void RGWGetUsage_ObjStore_S3::send_response() +{ + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) + return; + + Formatter *formatter = s->formatter; + string last_owner; + bool user_section_open = false; + + formatter->open_object_section("Usage"); + if (show_log_entries) { + formatter->open_array_section("Entries"); + } + map::iterator iter; + for (iter = usage.begin(); iter != usage.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + const rgw_usage_log_entry& entry = iter->second; + + if (show_log_entries) { + if (ub.user.compare(last_owner) != 0) { + if (user_section_open) { + formatter->close_section(); + formatter->close_section(); + } + formatter->open_object_section("User"); + formatter->dump_string("Owner", ub.user); + formatter->open_array_section("Buckets"); + user_section_open = true; + last_owner = ub.user; + } + formatter->open_object_section("Bucket"); + formatter->dump_string("Bucket", ub.bucket); + utime_t ut(entry.epoch, 0); + ut.gmtime(formatter->dump_stream("Time")); + formatter->dump_int("Epoch", entry.epoch); + dump_usage_categories_info(formatter, entry, &categories); + formatter->close_section(); // bucket + } + + summary_map[ub.user].aggregate(entry, &categories); + } + + if (show_log_entries) { + if (user_section_open) { + formatter->close_section(); // buckets + formatter->close_section(); //user + } + formatter->close_section(); // entries + } + + if (show_log_sum) { + formatter->open_array_section("Summary"); + map::iterator siter; + for (siter = summary_map.begin(); siter != summary_map.end(); ++siter) { + const rgw_usage_log_entry& entry = siter->second; + formatter->open_object_section("User"); + 
formatter->dump_string("User", siter->first); + dump_usage_categories_info(formatter, entry, &categories); + rgw_usage_data total_usage; + entry.sum(total_usage, categories); + formatter->open_object_section("Total"); + encode_json("BytesSent", total_usage.bytes_sent, formatter); + encode_json("BytesReceived", total_usage.bytes_received, formatter); + encode_json("Ops", total_usage.ops, formatter); + encode_json("SuccessfulOps", total_usage.successful_ops, formatter); + formatter->close_section(); // total + formatter->close_section(); // user + } + + if (s->cct->_conf->rgw_rest_getusage_op_compat) { + formatter->open_object_section("Stats"); + } + + // send info about quota config + auto user_info = s->user->get_info(); + encode_json("QuotaMaxBytes", user_info.quota.user_quota.max_size, formatter); + encode_json("QuotaMaxBuckets", user_info.max_buckets, formatter); + encode_json("QuotaMaxObjCount", user_info.quota.user_quota.max_objects, formatter); + encode_json("QuotaMaxBytesPerBucket", user_info.quota.bucket_quota.max_objects, formatter); + encode_json("QuotaMaxObjCountPerBucket", user_info.quota.bucket_quota.max_size, formatter); + // send info about user's capacity utilization + encode_json("TotalBytes", stats.size, formatter); + encode_json("TotalBytesRounded", stats.size_rounded, formatter); + encode_json("TotalEntries", stats.num_objects, formatter); + + if (s->cct->_conf->rgw_rest_getusage_op_compat) { + formatter->close_section(); //Stats + } + + formatter->close_section(); // summary + } + + formatter->open_array_section("CapacityUsed"); + formatter->open_object_section("User"); + formatter->open_array_section("Buckets"); + for (const auto& biter : buckets_usage) { + const bucket_meta_entry& entry = biter.second; + dump_usage_bucket_info(formatter, biter.first, entry); + } + formatter->close_section(); // Buckets + formatter->close_section(); // User + formatter->close_section(); // CapacityUsed + + formatter->close_section(); // usage + 
rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWListBucket_ObjStore_S3::get_common_params() +{ + list_versions = s->info.args.exists("versions"); + prefix = s->info.args.get("prefix"); + + // non-standard + s->info.args.get_bool("allow-unordered", &allow_unordered, false); + delimiter = s->info.args.get("delimiter"); + max_keys = s->info.args.get("max-keys"); + op_ret = parse_max_keys(); + if (op_ret < 0) { + return op_ret; + } + encoding_type = s->info.args.get("encoding-type"); + if (s->system_request) { + s->info.args.get_bool("objs-container", &objs_container, false); + const char *shard_id_str = s->info.env->get("HTTP_RGWX_SHARD_ID"); + if (shard_id_str) { + string err; + shard_id = strict_strtol(shard_id_str, 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "bad shard id specified: " << shard_id_str << dendl; + return -EINVAL; + } + } else { + shard_id = s->bucket_instance_shard_id; + } + } + return 0; +} + +int RGWListBucket_ObjStore_S3::get_params(optional_yield y) +{ + int ret = get_common_params(); + if (ret < 0) { + return ret; + } + if (!list_versions) { + marker = s->info.args.get("marker"); + } else { + marker.name = s->info.args.get("key-marker"); + marker.instance = s->info.args.get("version-id-marker"); + } + return 0; +} + +int RGWListBucket_ObjStore_S3v2::get_params(optional_yield y) +{ +int ret = get_common_params(); +if (ret < 0) { + return ret; +} +s->info.args.get_bool("fetch-owner", &fetchOwner, false); +startAfter = s->info.args.get("start-after", &start_after_exist); +continuation_token = s->info.args.get("continuation-token", &continuation_token_exist); +if(!continuation_token_exist) { + marker = startAfter; +} else { + marker = continuation_token; +} +return 0; +} + +void RGWListBucket_ObjStore_S3::send_common_versioned_response() +{ + if (!s->bucket_tenant.empty()) { + s->formatter->dump_string("Tenant", s->bucket_tenant); + } + s->formatter->dump_string("Name", s->bucket_name); + s->formatter->dump_string("Prefix", 
prefix); + s->formatter->dump_int("MaxKeys", max); + if (!delimiter.empty()) { + s->formatter->dump_string("Delimiter", delimiter); + } + s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" + : "false")); + + if (!common_prefixes.empty()) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + if (encode_key) { + s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false)); + } else { + s->formatter->dump_string("Prefix", pref_iter->first); + } + + s->formatter->close_section(); + } + } + } + +void RGWListBucket_ObjStore_S3::send_versioned_response() +{ + s->formatter->open_object_section_in_ns("ListVersionsResult", XMLNS_AWS_S3); + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + RGWListBucket_ObjStore_S3::send_common_versioned_response(); + s->formatter->dump_string("KeyMarker", marker.name); + s->formatter->dump_string("VersionIdMarker", marker.instance); + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextKeyMarker", next_marker.name); + if (next_marker.instance.empty()) { + s->formatter->dump_string("NextVersionIdMarker", "null"); + } + else { + s->formatter->dump_string("NextVersionIdMarker", next_marker.instance); + } + } + + if (op_ret >= 0) { + if (objs_container) { + s->formatter->open_array_section("Entries"); + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + const char *section_name = (iter->is_delete_marker() ? 
"DeleteMarker" + : "Version"); + s->formatter->open_object_section(section_name); + if (objs_container) { + s->formatter->dump_bool("IsDeleteMarker", iter->is_delete_marker()); + } + rgw_obj_key key(iter->key); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } + else { + s->formatter->dump_string("Key", key.name); + } + string version_id = key.instance; + if (version_id.empty()) { + version_id = "null"; + } + if (s->system_request) { + if (iter->versioned_epoch > 0) { + s->formatter->dump_int("VersionedEpoch", iter->versioned_epoch); + } + s->formatter->dump_string("RgwxTag", iter->tag); + utime_t ut(iter->meta.mtime); + ut.gmtime_nsec(s->formatter->dump_stream("RgwxMtime")); + } + s->formatter->dump_string("VersionId", version_id); + s->formatter->dump_bool("IsLatest", iter->is_current()); + dump_time(s, "LastModified", iter->meta.mtime); + if (!iter->is_delete_marker()) { + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + } + dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name); + if (iter->meta.appendable) { + s->formatter->dump_string("Type", "Appendable"); + } else { + s->formatter->dump_string("Type", "Normal"); + } + s->formatter->close_section(); // Version/DeleteMarker + } + if (objs_container) { + s->formatter->close_section(); // Entries + } + s->formatter->close_section(); // ListVersionsResult + } + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +void RGWListBucket_ObjStore_S3::send_common_response() +{ + if (!s->bucket_tenant.empty()) { + s->formatter->dump_string("Tenant", s->bucket_tenant); + } + s->formatter->dump_string("Name", s->bucket_name); + s->formatter->dump_string("Prefix", prefix); + 
s->formatter->dump_int("MaxKeys", max); + if (!delimiter.empty()) { + if (encode_key) { + s->formatter->dump_string("Delimiter", url_encode(delimiter, false)); + } else { + s->formatter->dump_string("Delimiter", delimiter); + } + } + s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" + : "false")); + + if (!common_prefixes.empty()) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + if (encode_key) { + s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false)); + } else { + s->formatter->dump_string("Prefix", pref_iter->first); + } + s->formatter->close_section(); + } + } + } + +void RGWListBucket_ObjStore_S3::send_response() +{ + if (op_ret < 0) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. 
+ end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) { + return; + } + if (list_versions) { + send_versioned_response(); + return; + } + + s->formatter->open_object_section_in_ns("ListBucketResult", XMLNS_AWS_S3); + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + + RGWListBucket_ObjStore_S3::send_common_response(); + + if (op_ret >= 0) { + if (s->format == RGWFormat::JSON) { + s->formatter->open_array_section("Contents"); + } + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + + rgw_obj_key key(iter->key); + std::string key_name; + + if (encode_key) { + url_encode(key.name, key_name); + } else { + key_name = key.name; + } + /* conditionally format JSON in the obvious way--I'm unsure if + * AWS actually does this */ + if (s->format == RGWFormat::XML) { + s->formatter->open_array_section("Contents"); + } else { + // json + s->formatter->open_object_section("dummy"); + } + s->formatter->dump_string("Key", key_name); + dump_time(s, "LastModified", iter->meta.mtime); + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name); + if (s->system_request) { + s->formatter->dump_string("RgwxTag", iter->tag); + } + if (iter->meta.appendable) { + s->formatter->dump_string("Type", "Appendable"); + } else { + s->formatter->dump_string("Type", "Normal"); + } + // JSON has one extra section per element + s->formatter->close_section(); + } // foreach obj + if (s->format == RGWFormat::JSON) { + s->formatter->close_section(); + } + } + s->formatter->dump_string("Marker", marker.name); + if (is_truncated 
&& !next_marker.empty()) { + s->formatter->dump_string("NextMarker", next_marker.name); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} /* RGWListBucket_ObjStore_S3::send_response() */ + +void RGWListBucket_ObjStore_S3v2::send_versioned_response() +{ + s->formatter->open_object_section_in_ns("ListVersionsResult", XMLNS_AWS_S3); + RGWListBucket_ObjStore_S3v2::send_common_versioned_response(); + s->formatter->dump_string("KeyContinuationToken", marker.name); + s->formatter->dump_string("VersionIdContinuationToken", marker.instance); + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextKeyContinuationToken", next_marker.name); + s->formatter->dump_string("NextVersionIdContinuationToken", next_marker.instance); + } + + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + + if (op_ret >= 0) { + if (objs_container) { + s->formatter->open_array_section("Entries"); + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + const char *section_name = (iter->is_delete_marker() ? 
"DeleteContinuationToken" + : "Version"); + s->formatter->open_object_section(section_name); + if (objs_container) { + s->formatter->dump_bool("IsDeleteContinuationToken", iter->is_delete_marker()); + } + rgw_obj_key key(iter->key); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } + else { + s->formatter->dump_string("Key", key.name); + } + string version_id = key.instance; + if (version_id.empty()) { + version_id = "null"; + } + if (s->system_request) { + if (iter->versioned_epoch > 0) { + s->formatter->dump_int("VersionedEpoch", iter->versioned_epoch); + } + s->formatter->dump_string("RgwxTag", iter->tag); + utime_t ut(iter->meta.mtime); + ut.gmtime_nsec(s->formatter->dump_stream("RgwxMtime")); + } + s->formatter->dump_string("VersionId", version_id); + s->formatter->dump_bool("IsLatest", iter->is_current()); + dump_time(s, "LastModified", iter->meta.mtime); + if (!iter->is_delete_marker()) { + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + } + if (fetchOwner == true) { + dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name); + } + s->formatter->close_section(); + } + + + if (objs_container) { + s->formatter->close_section(); + } + + if (!common_prefixes.empty()) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + if (encode_key) { + s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false)); + } else { + s->formatter->dump_string("Prefix", pref_iter->first); + } + + s->formatter->dump_int("KeyCount",objs.size()); + if (start_after_exist) { + s->formatter->dump_string("StartAfter", 
startAfter); + } + s->formatter->close_section(); + } + } + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWListBucket_ObjStore_S3v2::send_response() +{ + if (op_ret < 0) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) { + return; + } + if (list_versions) { + send_versioned_response(); + return; + } + + s->formatter->open_object_section_in_ns("ListBucketResult", XMLNS_AWS_S3); + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + + RGWListBucket_ObjStore_S3::send_common_response(); + if (op_ret >= 0) { + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + rgw_obj_key key(iter->key); + s->formatter->open_array_section("Contents"); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } + else { + s->formatter->dump_string("Key", key.name); + } + dump_time(s, "LastModified", iter->meta.mtime); + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + if (fetchOwner == true) { + dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name); + } + if (s->system_request) { + s->formatter->dump_string("RgwxTag", iter->tag); + } + if (iter->meta.appendable) { + s->formatter->dump_string("Type", "Appendable"); + } else { + s->formatter->dump_string("Type", "Normal"); + } + s->formatter->close_section(); + } + } + if 
(continuation_token_exist) { + s->formatter->dump_string("ContinuationToken", continuation_token); + } + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextContinuationToken", next_marker.name); + } + s->formatter->dump_int("KeyCount", objs.size() + common_prefixes.size()); + if (start_after_exist) { + s->formatter->dump_string("StartAfter", startAfter); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketLogging_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + s->formatter->open_object_section_in_ns("BucketLoggingStatus", XMLNS_AWS_S3); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketLocation_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this); + dump_start(s); + + std::unique_ptr zonegroup; + string api_name; + + int ret = driver->get_zonegroup(s->bucket->get_info().zonegroup, &zonegroup); + if (ret >= 0) { + api_name = zonegroup->get_api_name(); + } else { + if (s->bucket->get_info().zonegroup != "default") { + api_name = s->bucket->get_info().zonegroup; + } + } + + s->formatter->dump_format_ns("LocationConstraint", XMLNS_AWS_S3, + "%s", api_name.c_str()); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketVersioning_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + s->formatter->open_object_section_in_ns("VersioningConfiguration", XMLNS_AWS_S3); + if (versioned) { + const char *status = (versioning_enabled ? "Enabled" : "Suspended"); + s->formatter->dump_string("Status", status); + const char *mfa_status = (mfa_enabled ? 
"Enabled" : "Disabled"); + s->formatter->dump_string("MfaDelete", mfa_status); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +struct ver_config_status { + int status{VersioningSuspended}; + + enum MFAStatus { + MFA_UNKNOWN, + MFA_DISABLED, + MFA_ENABLED, + } mfa_status{MFA_UNKNOWN}; + int retcode{0}; + + void decode_xml(XMLObj *obj) { + string status_str; + string mfa_str; + RGWXMLDecoder::decode_xml("Status", status_str, obj); + if (status_str == "Enabled") { + status = VersioningEnabled; + } else if (status_str != "Suspended") { + status = VersioningStatusInvalid; + } + + + if (RGWXMLDecoder::decode_xml("MfaDelete", mfa_str, obj)) { + if (mfa_str == "Enabled") { + mfa_status = MFA_ENABLED; + } else if (mfa_str == "Disabled") { + mfa_status = MFA_DISABLED; + } else { + retcode = -EINVAL; + } + } + } +}; + +int RGWSetBucketVersioning_ObjStore_S3::get_params(optional_yield y) +{ + int r = 0; + bufferlist data; + std::tie(r, data) = + read_all_input(s, s->cct->_conf->rgw_max_put_param_size, false); + if (r < 0) { + return r; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = data.c_str(); + if (!parser.parse(buf, data.length(), 1)) { + ldpp_dout(this, 10) << "NOTICE: failed to parse data: " << buf << dendl; + r = -EINVAL; + return r; + } + + ver_config_status status_conf; + + if (!RGWXMLDecoder::decode_xml("VersioningConfiguration", status_conf, &parser)) { + ldpp_dout(this, 10) << "NOTICE: bad versioning config input" << dendl; + return -EINVAL; + } + + if (!driver->is_meta_master()) { + /* only need to keep this data around if we're not meta master */ + in_data.append(data); + } + + versioning_status = status_conf.status; + if (versioning_status == VersioningStatusInvalid) { + r = -EINVAL; + } + + if (status_conf.mfa_status != ver_config_status::MFA_UNKNOWN) { + mfa_set_status = true; + switch 
(status_conf.mfa_status) { + case ver_config_status::MFA_DISABLED: + mfa_status = false; + break; + case ver_config_status::MFA_ENABLED: + mfa_status = true; + break; + default: + ldpp_dout(this, 0) << "ERROR: RGWSetBucketVersioning_ObjStore_S3::get_params(optional_yield y): unexpected switch case mfa_status=" << status_conf.mfa_status << dendl; + r = -EIO; + } + } else if (status_conf.retcode < 0) { + r = status_conf.retcode; + } + return r; +} + +void RGWSetBucketVersioning_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); +} + +int RGWSetBucketWebsite_ObjStore_S3::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, max_size, false); + + if (r < 0) { + return r; + } + + in_data.append(data); + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = data.c_str(); + if (!parser.parse(buf, data.length(), 1)) { + ldpp_dout(this, 5) << "failed to parse xml: " << buf << dendl; + return -EINVAL; + } + + try { + RGWXMLDecoder::decode_xml("WebsiteConfiguration", website_conf, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "unexpected xml: " << buf << dendl; + return -EINVAL; + } + + if (website_conf.is_redirect_all && website_conf.redirect_all.hostname.empty()) { + s->err.message = "A host name must be provided to redirect all requests (e.g. 
\"example.com\")."; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } else if (!website_conf.is_redirect_all && !website_conf.is_set_index_doc) { + s->err.message = "A value for IndexDocument Suffix must be provided if RedirectAllRequestsTo is empty"; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } else if (!website_conf.is_redirect_all && website_conf.is_set_index_doc && + website_conf.index_doc_suffix.empty()) { + s->err.message = "The IndexDocument Suffix is not well formed"; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } + +#define WEBSITE_ROUTING_RULES_MAX_NUM 50 + int max_num = s->cct->_conf->rgw_website_routing_rules_max_num; + if (max_num < 0) { + max_num = WEBSITE_ROUTING_RULES_MAX_NUM; + } + int routing_rules_num = website_conf.routing_rules.rules.size(); + if (routing_rules_num > max_num) { + ldpp_dout(this, 4) << "An website routing config can have up to " + << max_num + << " rules, request website routing rules num: " + << routing_rules_num << dendl; + op_ret = -ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR; + s->err.message = std::to_string(routing_rules_num) +" routing rules provided, the number of routing rules in a website configuration is limited to " + + std::to_string(max_num) + + "."; + return -ERR_INVALID_REQUEST; + } + + return 0; +} + +void RGWSetBucketWebsite_ObjStore_S3::send_response() +{ + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); +} + +void RGWDeleteBucketWebsite_ObjStore_S3::send_response() +{ + if (op_ret == 0) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); +} + +void RGWGetBucketWebsite_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (op_ret < 0) { + return; + } + + RGWBucketWebsiteConf& conf = 
s->bucket->get_info().website_conf; + + s->formatter->open_object_section_in_ns("WebsiteConfiguration", XMLNS_AWS_S3); + conf.dump_xml(s->formatter); + s->formatter->close_section(); // WebsiteConfiguration + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void dump_bucket_metadata(req_state *s, rgw::sal::Bucket* bucket) +{ + dump_header(s, "X-RGW-Object-Count", static_cast(bucket->get_count())); + dump_header(s, "X-RGW-Bytes-Used", static_cast(bucket->get_size())); + // only bucket's owner is allowed to get the quota settings of the account + if (bucket->is_owner(s->user.get())) { + auto user_info = s->user->get_info(); + auto bucket_quota = s->bucket->get_info().quota; // bucket quota + dump_header(s, "X-RGW-Quota-User-Size", static_cast(user_info.quota.user_quota.max_size)); + dump_header(s, "X-RGW-Quota-User-Objects", static_cast(user_info.quota.user_quota.max_objects)); + dump_header(s, "X-RGW-Quota-Max-Buckets", static_cast(user_info.max_buckets)); + dump_header(s, "X-RGW-Quota-Bucket-Size", static_cast(bucket_quota.max_size)); + dump_header(s, "X-RGW-Quota-Bucket-Objects", static_cast(bucket_quota.max_objects)); + } +} + +void RGWStatBucket_ObjStore_S3::send_response() +{ + if (op_ret >= 0) { + dump_bucket_metadata(s, bucket.get()); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, this); + dump_start(s); +} + +static int create_s3_policy(req_state *s, rgw::sal::Driver* driver, + RGWAccessControlPolicy_S3& s3policy, + ACLOwner& owner) +{ + if (s->has_acl_header) { + if (!s->canned_acl.empty()) + return -ERR_INVALID_REQUEST; + + return s3policy.create_from_headers(s, driver, s->info.env, owner); + } + + return s3policy.create_canned(owner, s->bucket_owner, s->canned_acl); +} + +class RGWLocationConstraint : public XMLObj +{ +public: + RGWLocationConstraint() {} + ~RGWLocationConstraint() override {} + bool xml_end(const char *el) override { + if (!el) + return false; + + location_constraint = get_data(); + + return true; + 
} + + string location_constraint; +}; + +class RGWCreateBucketConfig : public XMLObj +{ +public: + RGWCreateBucketConfig() {} + ~RGWCreateBucketConfig() override {} +}; + +class RGWCreateBucketParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override { + return new XMLObj; + } + +public: + RGWCreateBucketParser() {} + ~RGWCreateBucketParser() override {} + + bool get_location_constraint(string& zone_group) { + XMLObj *config = find_first("CreateBucketConfiguration"); + if (!config) + return false; + + XMLObj *constraint = config->find_first("LocationConstraint"); + if (!constraint) + return false; + + zone_group = constraint->get_data(); + + return true; + } +}; + +int RGWCreateBucket_ObjStore_S3::get_params(optional_yield y) +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names; + + int r; + if (!s->system_request) { + r = valid_s3_bucket_name(s->bucket_name, relaxed_names); + if (r) return r; + } + + r = create_s3_policy(s, driver, s3policy, s->owner); + if (r < 0) + return r; + + policy = s3policy; + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int op_ret = 0; + bufferlist data; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + + if ((op_ret < 0) && (op_ret != -ERR_LENGTH_REQUIRED)) + return op_ret; + + in_data.append(data); + + if (data.length()) { + RGWCreateBucketParser parser; + + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = data.c_str(); + bool success = parser.parse(buf, data.length(), 1); + ldpp_dout(this, 20) << "create bucket input data=" << buf << dendl; + + if (!success) { + ldpp_dout(this, 0) << "failed to parse input: " << buf << dendl; + return -EINVAL; + } + + if (!parser.get_location_constraint(location_constraint)) { + ldpp_dout(this, 0) << "provided input did not specify location constraint correctly" << dendl; + return -EINVAL; + } + + 
ldpp_dout(this, 10) << "create bucket location constraint: " + << location_constraint << dendl; + } + + size_t pos = location_constraint.find(':'); + if (pos != string::npos) { + placement_rule.init(location_constraint.substr(pos + 1), s->info.storage_class); + location_constraint = location_constraint.substr(0, pos); + } else { + placement_rule.storage_class = s->info.storage_class; + } + auto iter = s->info.x_meta_map.find("x-amz-bucket-object-lock-enabled"); + if (iter != s->info.x_meta_map.end()) { + if (!boost::algorithm::iequals(iter->second, "true") && !boost::algorithm::iequals(iter->second, "false")) { + return -EINVAL; + } + obj_lock_enabled = boost::algorithm::iequals(iter->second, "true"); + } + return 0; +} + +void RGWCreateBucket_ObjStore_S3::send_response() +{ + if (op_ret == -ERR_BUCKET_EXISTS) + op_ret = 0; + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + if (s->system_request) { + JSONFormatter f; /* use json formatter for system requests output */ + + f.open_object_section("info"); + encode_json("entry_point_object_ver", ep_objv, &f); + encode_json("object_ver", info.objv_tracker.read_version, &f); + encode_json("bucket_info", info, &f); + f.close_section(); + rgw_flush_formatter_and_reset(s, &f); + } +} + +void RGWDeleteBucket_ObjStore_S3::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} + +static inline void map_qs_metadata(req_state* s, bool crypto_too) +{ + /* merge S3 valid user metadata from the query-string into + * x_meta_map, which maps them to attributes */ + const auto& params = const_cast(s->info.args).get_params(); + for (const auto& elt : params) { + std::string k = boost::algorithm::to_lower_copy(elt.first); + if (k.find("x-amz-meta-") == /* offset */ 0) { + rgw_add_amz_meta_header(s->info.x_meta_map, k, elt.second); + } + if (crypto_too && 
k.find("x-amz-server-side-encryption") == /* offset */ 0) { + rgw_set_amz_meta_header(s->info.crypt_attribute_map, k, elt.second, OVERWRITE); + } + } +} + +int RGWPutObj_ObjStore_S3::get_params(optional_yield y) +{ + if (!s->length) { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + if (!encoding || strcmp(encoding, "chunked") != 0) { + ldout(s->cct, 20) << "neither length nor chunked encoding" << dendl; + return -ERR_LENGTH_REQUIRED; + } + + chunked_upload = true; + } + + int ret; + + map_qs_metadata(s, true); + ret = get_encryption_defaults(s); + if (ret < 0) { + ldpp_dout(this, 5) << __func__ << "(): get_encryption_defaults() returned ret=" << ret << dendl; + return ret; + } + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ret = create_s3_policy(s, driver, s3policy, s->owner); + if (ret < 0) + return ret; + + policy = s3policy; + + if_match = s->info.env->get("HTTP_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_IF_NONE_MATCH"); + + /* handle object tagging */ + auto tag_str = s->info.env->get("HTTP_X_AMZ_TAGGING"); + if (tag_str){ + obj_tags = std::make_unique(); + ret = obj_tags->set_from_string(tag_str); + if (ret < 0){ + ldpp_dout(this,0) << "setting obj tags failed with " << ret << dendl; + if (ret == -ERR_INVALID_TAG){ + ret = -EINVAL; //s3 returns only -EINVAL for PUT requests + } + + return ret; + } + } + + //handle object lock + auto obj_lock_mode_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_MODE"); + auto obj_lock_date_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_RETAIN_UNTIL_DATE"); + auto obj_legal_hold_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_LEGAL_HOLD"); + if (obj_lock_mode_str && obj_lock_date_str) { + boost::optional date = ceph::from_iso_8601(obj_lock_date_str); + if (boost::none == date || ceph::real_clock::to_time_t(*date) <= ceph_clock_now()) { + ret = -EINVAL; + ldpp_dout(this,0) << "invalid x-amz-object-lock-retain-until-date value" << dendl; + return ret; + } + if (strcmp(obj_lock_mode_str, "GOVERNANCE") 
!= 0 && strcmp(obj_lock_mode_str, "COMPLIANCE") != 0) { + ret = -EINVAL; + ldpp_dout(this,0) << "invalid x-amz-object-lock-mode value" << dendl; + return ret; + } + obj_retention = new RGWObjectRetention(obj_lock_mode_str, *date); + } else if ((obj_lock_mode_str && !obj_lock_date_str) || (!obj_lock_mode_str && obj_lock_date_str)) { + ret = -EINVAL; + ldpp_dout(this,0) << "need both x-amz-object-lock-mode and x-amz-object-lock-retain-until-date " << dendl; + return ret; + } + if (obj_legal_hold_str) { + if (strcmp(obj_legal_hold_str, "ON") != 0 && strcmp(obj_legal_hold_str, "OFF") != 0) { + ret = -EINVAL; + ldpp_dout(this,0) << "invalid x-amz-object-lock-legal-hold value" << dendl; + return ret; + } + obj_legal_hold = new RGWObjectLegalHold(obj_legal_hold_str); + } + if (!s->bucket->get_info().obj_lock_enabled() && (obj_retention || obj_legal_hold)) { + ldpp_dout(this, 0) << "ERROR: object retention or legal hold can't be set if bucket object lock not configured" << dendl; + ret = -ERR_INVALID_REQUEST; + return ret; + } + multipart_upload_id = s->info.args.get("uploadId"); + multipart_part_str = s->info.args.get("partNumber"); + if (!multipart_part_str.empty()) { + string err; + multipart_part_num = strict_strtol(multipart_part_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 10) << "bad part number: " << multipart_part_str << ": " << err << dendl; + return -EINVAL; + } + } else if (!multipart_upload_id.empty()) { + ldpp_dout(s, 10) << "part number with no multipart upload id" << dendl; + return -EINVAL; + } + + append = s->info.args.exists("append"); + if (append) { + string pos_str = s->info.args.get("position"); + string err; + long long pos_tmp = strict_strtoll(pos_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 10) << "bad position: " << pos_str << ": " << err << dendl; + return -EINVAL; + } else if (pos_tmp < 0) { + ldpp_dout(s, 10) << "bad position: " << pos_str << ": " << "position shouldn't be negative" << dendl; + return -EINVAL; + 
} + position = uint64_t(pos_tmp); + } + + return RGWPutObj_ObjStore::get_params(y); +} + +int RGWPutObj_ObjStore_S3::get_data(bufferlist& bl) +{ + const int ret = RGWPutObj_ObjStore::get_data(bl); + if (ret == 0) { + const int ret_auth = do_aws4_auth_completion(); + if (ret_auth < 0) { + return ret_auth; + } + } + + return ret; +} + +static int get_success_retcode(int code) +{ + switch (code) { + case 201: + return STATUS_CREATED; + case 204: + return STATUS_NO_CONTENT; + } + return 0; +} + +void RGWPutObj_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + dump_errno(s); + } else { + if (s->cct->_conf->rgw_s3_success_create_obj_status) { + op_ret = get_success_retcode( + s->cct->_conf->rgw_s3_success_create_obj_status); + set_req_state_err(s, op_ret); + } + + string expires = get_s3_expiration_header(s, mtime); + + if (copy_source.empty()) { + dump_errno(s); + dump_etag(s, etag); + dump_content_length(s, 0); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + dump_header_if_nonempty(s, "x-amz-expiration", expires); + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + } else { + dump_errno(s); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + dump_header_if_nonempty(s, "x-amz-expiration", expires); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + struct tm tmp; + utime_t ut(mtime); + time_t secs = (time_t)ut.sec(); + gmtime_r(&secs, &tmp); + char buf[TIME_BUF_SIZE]; + s->formatter->open_object_section_in_ns("CopyPartResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + if (strftime(buf, sizeof(buf), "%Y-%m-%dT%T.000Z", &tmp) > 0) { + s->formatter->dump_string("LastModified", buf); + } + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + return; + } + } + if (append) { + if (op_ret == 0 || op_ret == -ERR_POSITION_NOT_EQUAL_TO_LENGTH) { + dump_header(s, "x-rgw-next-append-position", 
cur_accounted_size); + } + } + if (s->system_request && !real_clock::is_zero(mtime)) { + dump_epoch_header(s, "Rgwx-Mtime", mtime); + } + end_header(s, this); +} + +static inline void set_attr(map& attrs, const char* key, const std::string& value) +{ + bufferlist bl; + encode(value,bl); + attrs.emplace(key, std::move(bl)); +} + +static inline void set_attr(map& attrs, const char* key, const char* value) +{ + bufferlist bl; + encode(value,bl); + attrs.emplace(key, std::move(bl)); +} + +int RGWPutObj_ObjStore_S3::get_decrypt_filter( + std::unique_ptr* filter, + RGWGetObj_Filter* cb, + map& attrs, + bufferlist* manifest_bl) +{ + std::map crypt_http_responses_unused; + + std::unique_ptr block_crypt; + int res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses_unused); + if (res < 0) { + return res; + } + if (block_crypt == nullptr) { + return 0; + } + + // in case of a multipart upload, we need to know the part lengths to + // correctly decrypt across part boundaries + std::vector parts_len; + + // for replicated objects, the original part lengths are preserved in an xattr + if (auto i = attrs.find(RGW_ATTR_CRYPT_PARTS); i != attrs.end()) { + try { + auto p = i->second.cbegin(); + using ceph::decode; + decode(parts_len, p); + } catch (const buffer::error&) { + ldpp_dout(this, 1) << "failed to decode RGW_ATTR_CRYPT_PARTS" << dendl; + return -EIO; + } + } else if (manifest_bl) { + // otherwise, we read the part lengths from the manifest + res = RGWGetObj_BlockDecrypt::read_manifest_parts(this, *manifest_bl, + parts_len); + if (res < 0) { + return res; + } + } + + *filter = std::make_unique( + s, s->cct, cb, std::move(block_crypt), + std::move(parts_len)); + return 0; +} + +int RGWPutObj_ObjStore_S3::get_encrypt_filter( + std::unique_ptr *filter, + rgw::sal::DataProcessor *cb) +{ + int res = 0; + if (!multipart_upload_id.empty()) { + std::unique_ptr upload = + s->bucket->get_multipart_upload(s->object->get_name(), + multipart_upload_id); + 
std::unique_ptr obj = upload->get_meta_obj(); + obj->set_in_extra_data(true); + res = obj->get_obj_attrs(s->yield, this); + if (res == 0) { + std::unique_ptr block_crypt; + /* We are adding to existing object. + * We use crypto mode that configured as if we were decrypting. */ + res = rgw_s3_prepare_decrypt(s, obj->get_attrs(), &block_crypt, crypt_http_responses); + if (res == 0 && block_crypt != nullptr) + filter->reset(new RGWPutObj_BlockEncrypt(s, s->cct, cb, std::move(block_crypt))); + } + /* it is ok, to not have encryption at all */ + } + else + { + std::unique_ptr block_crypt; + res = rgw_s3_prepare_encrypt(s, attrs, &block_crypt, crypt_http_responses); + if (res == 0 && block_crypt != nullptr) { + filter->reset(new RGWPutObj_BlockEncrypt(s, s->cct, cb, std::move(block_crypt))); + } + } + return res; +} + +void RGWPostObj_ObjStore_S3::rebuild_key(rgw::sal::Object* obj) +{ + string key = obj->get_name(); + static string var = "${filename}"; + int pos = key.find(var); + if (pos < 0) + return; + + string new_key = key.substr(0, pos); + new_key.append(filename); + new_key.append(key.substr(pos + var.size())); + + obj->set_key(new_key); +} + +std::string RGWPostObj_ObjStore_S3::get_current_filename() const +{ + return s->object->get_name(); +} + +std::string RGWPostObj_ObjStore_S3::get_current_content_type() const +{ + return content_type; +} + +int RGWPostObj_ObjStore_S3::get_params(optional_yield y) +{ + op_ret = RGWPostObj_ObjStore::get_params(y); + if (op_ret < 0) { + return op_ret; + } + + map_qs_metadata(s, false); + + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, done); + if (r < 0) + return r; + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 20) << "read part header -- part.name=" + << part.name << dendl; + + for (const auto& pair : part.fields) { + ldpp_dout(this, 20) << "field.name=" << pair.first << dendl; + ldpp_dout(this, 20) << "field.val=" << pair.second.val << dendl; + ldpp_dout(this, 20) 
<< "field.params:" << dendl; + + for (const auto& param_pair : pair.second.params) { + ldpp_dout(this, 20) << " " << param_pair.first + << " -> " << param_pair.second << dendl; + } + } + } + + if (done) { /* unexpected here */ + err_msg = "Malformed request"; + return -EINVAL; + } + + if (stringcasecmp(part.name, "file") == 0) { /* beginning of data transfer */ + struct post_part_field& field = part.fields["Content-Disposition"]; + map::iterator iter = field.params.find("filename"); + if (iter != field.params.end()) { + filename = iter->second; + } + parts[part.name] = part; + break; + } + + bool boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + r = read_data(part.data, chunk_size, boundary, done); + if (r < 0 || !boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + parts[part.name] = part; + string part_str(part.data.c_str(), part.data.length()); + env.add_var(part.name, part_str); + } while (!done); + + for (auto &p: parts) { + if (! boost::istarts_with(p.first, "x-amz-server-side-encryption")) { + continue; + } + bufferlist &d { p.second.data }; + std::string v { rgw_trim_whitespace(std::string_view(d.c_str(), d.length())) }; + rgw_set_amz_meta_header(s->info.crypt_attribute_map, p.first, v, OVERWRITE); + } + int r = get_encryption_defaults(s); + if (r < 0) { + ldpp_dout(this, 5) << __func__ << "(): get_encryption_defaults() returned ret=" << r << dendl; + return r; + } + + ldpp_dout(this, 20) << "adding bucket to policy env: " << s->bucket->get_name() + << dendl; + env.add_var("bucket", s->bucket->get_name()); + + string object_str; + if (!part_str(parts, "key", &object_str)) { + err_msg = "Key not specified"; + return -EINVAL; + } + + s->object = s->bucket->get_object(rgw_obj_key(object_str)); + + rebuild_key(s->object.get()); + + if (rgw::sal::Object::empty(s->object.get())) { + err_msg = "Empty object name"; + return -EINVAL; + } + + env.add_var("key", s->object->get_name()); + + part_str(parts, "Content-Type", 
&content_type); + + /* AWS permits POST without Content-Type: http://tracker.ceph.com/issues/20201 */ + if (! content_type.empty()) { + env.add_var("Content-Type", content_type); + } + + std::string storage_class; + part_str(parts, "x-amz-storage-class", &storage_class); + + if (! storage_class.empty()) { + s->dest_placement.storage_class = storage_class; + if (!driver->valid_placement(s->dest_placement)) { + ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << s->dest_placement.to_str() << dendl; + err_msg = "The storage class you specified is not valid"; + return -EINVAL; + } + } + + map::iterator piter = + parts.upper_bound(RGW_AMZ_META_PREFIX); + for (; piter != parts.end(); ++piter) { + string n = piter->first; + if (strncasecmp(n.c_str(), RGW_AMZ_META_PREFIX, + sizeof(RGW_AMZ_META_PREFIX) - 1) != 0) + break; + + string attr_name = RGW_ATTR_PREFIX; + attr_name.append(n); + + /* need to null terminate it */ + bufferlist& data = piter->second.data; + string str = string(data.c_str(), data.length()); + + bufferlist attr_bl; + attr_bl.append(str.c_str(), str.size() + 1); + + attrs[attr_name] = attr_bl; + } + // TODO: refactor this and the above loop to share code + piter = parts.find(RGW_AMZ_WEBSITE_REDIRECT_LOCATION); + if (piter != parts.end()) { + string n = piter->first; + string attr_name = RGW_ATTR_PREFIX; + attr_name.append(n); + /* need to null terminate it */ + bufferlist& data = piter->second.data; + string str = string(data.c_str(), data.length()); + + bufferlist attr_bl; + attr_bl.append(str.c_str(), str.size() + 1); + + attrs[attr_name] = attr_bl; + } + + r = get_policy(y); + if (r < 0) + return r; + + r = get_tags(); + if (r < 0) + return r; + + + min_len = post_policy.min_length; + max_len = post_policy.max_length; + + + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_tags() +{ + string tags_str; + if (part_str(parts, "tagging", &tags_str)) { + RGWXMLParser parser; + if (!parser.init()){ + ldpp_dout(this, 0) << "Couldn't init RGWObjTags XML 
parser" << dendl; + err_msg = "Server couldn't process the request"; + return -EINVAL; // TODO: This class of errors in rgw code should be a 5XX error + } + if (!parser.parse(tags_str.c_str(), tags_str.size(), 1)) { + ldpp_dout(this,0 ) << "Invalid Tagging XML" << dendl; + err_msg = "Invalid Tagging XML"; + return -EINVAL; + } + + RGWObjTagging_S3 tagging; + + try { + RGWXMLDecoder::decode_xml("Tagging", tagging, &parser); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "Malformed tagging request: " << err << dendl; + return -EINVAL; + } + + RGWObjTags obj_tags; + int r = tagging.rebuild(obj_tags); + if (r < 0) + return r; + + bufferlist tags_bl; + obj_tags.encode(tags_bl); + ldpp_dout(this, 20) << "Read " << obj_tags.count() << "tags" << dendl; + attrs[RGW_ATTR_TAGS] = tags_bl; + } + + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_policy(optional_yield y) +{ + if (part_bl(parts, "policy", &s->auth.s3_postobj_creds.encoded_policy)) { + bool aws4_auth = false; + + /* x-amz-algorithm handling */ + using rgw::auth::s3::AWS4_HMAC_SHA256_STR; + if ((part_str(parts, "x-amz-algorithm", &s->auth.s3_postobj_creds.x_amz_algorithm)) && + (s->auth.s3_postobj_creds.x_amz_algorithm == AWS4_HMAC_SHA256_STR)) { + ldpp_dout(this, 0) << "Signature verification algorithm AWS v4 (AWS4-HMAC-SHA256)" << dendl; + aws4_auth = true; + } else { + ldpp_dout(this, 0) << "Signature verification algorithm AWS v2" << dendl; + } + + // check that the signature matches the encoded policy + if (aws4_auth) { + /* AWS4 */ + + /* x-amz-credential handling */ + if (!part_str(parts, "x-amz-credential", + &s->auth.s3_postobj_creds.x_amz_credential)) { + ldpp_dout(this, 0) << "No S3 aws4 credential found!" << dendl; + err_msg = "Missing aws4 credential"; + return -EINVAL; + } + + /* x-amz-signature handling */ + if (!part_str(parts, "x-amz-signature", + &s->auth.s3_postobj_creds.signature)) { + ldpp_dout(this, 0) << "No aws4 signature found!" 
<< dendl; + err_msg = "Missing aws4 signature"; + return -EINVAL; + } + + /* x-amz-date handling */ + std::string received_date_str; + if (!part_str(parts, "x-amz-date", &received_date_str)) { + ldpp_dout(this, 0) << "No aws4 date found!" << dendl; + err_msg = "Missing aws4 date"; + return -EINVAL; + } + } else { + /* AWS2 */ + + // check that the signature matches the encoded policy + if (!part_str(parts, "AWSAccessKeyId", + &s->auth.s3_postobj_creds.access_key)) { + ldpp_dout(this, 0) << "No S3 aws2 access key found!" << dendl; + err_msg = "Missing aws2 access key"; + return -EINVAL; + } + + if (!part_str(parts, "signature", &s->auth.s3_postobj_creds.signature)) { + ldpp_dout(this, 0) << "No aws2 signature found!" << dendl; + err_msg = "Missing aws2 signature"; + return -EINVAL; + } + } + + if (part_str(parts, "x-amz-security-token", &s->auth.s3_postobj_creds.x_amz_security_token)) { + if (s->auth.s3_postobj_creds.x_amz_security_token.size() == 0) { + err_msg = "Invalid token"; + return -EINVAL; + } + } + + /* FIXME: this is a makeshift solution. The browser upload authentication will be + * handled by an instance of rgw::auth::Completer spawned in Handler's authorize() + * method. */ + const int ret = rgw::auth::Strategy::apply(this, auth_registry_ptr->get_s3_post(), s, y); + if (ret != 0) { + return -EACCES; + } else { + /* Populate the owner info. */ + s->owner.set_id(s->user->get_id()); + s->owner.set_name(s->user->get_display_name()); + ldpp_dout(this, 20) << "Successful Signature Verification!" 
<< dendl; + } + + ceph::bufferlist decoded_policy; + try { + decoded_policy.decode_base64(s->auth.s3_postobj_creds.encoded_policy); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "failed to decode_base64 policy" << dendl; + err_msg = "Could not decode policy"; + return -EINVAL; + } + + decoded_policy.append('\0'); // NULL terminate + ldpp_dout(this, 20) << "POST policy: " << decoded_policy.c_str() << dendl; + + + int r = post_policy.from_json(decoded_policy, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Failed to parse policy"; + } + ldpp_dout(this, 0) << "failed to parse policy" << dendl; + return -EINVAL; + } + + if (aws4_auth) { + /* AWS4 */ + post_policy.set_var_checked("x-amz-signature"); + } else { + /* AWS2 */ + post_policy.set_var_checked("AWSAccessKeyId"); + post_policy.set_var_checked("signature"); + } + post_policy.set_var_checked("policy"); + + r = post_policy.check(&env, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Policy check failed"; + } + ldpp_dout(this, 0) << "policy check failed" << dendl; + return r; + } + + } else { + ldpp_dout(this, 0) << "No attached policy found!" << dendl; + } + + string canned_acl; + part_str(parts, "acl", &canned_acl); + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ldpp_dout(this, 20) << "canned_acl=" << canned_acl << dendl; + if (s3policy.create_canned(s->owner, s->bucket_owner, canned_acl) < 0) { + err_msg = "Bad canned ACLs"; + return -EINVAL; + } + + policy = s3policy; + + return 0; +} + +int RGWPostObj_ObjStore_S3::complete_get_params() +{ + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, done); + if (r < 0) { + return r; + } + + ceph::bufferlist part_data; + bool boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + r = read_data(part.data, chunk_size, boundary, done); + if (r < 0 || !boundary) { + return -EINVAL; + } + + /* Just reading the data but not storing any results of that. 
*/ + } while (!done); + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_data(ceph::bufferlist& bl, bool& again) +{ + bool boundary; + bool done; + + const uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + int r = read_data(bl, chunk_size, boundary, done); + if (r < 0) { + return r; + } + + if (boundary) { + if (!done) { + /* Reached end of data, let's drain the rest of the params */ + r = complete_get_params(); + if (r < 0) { + return r; + } + } + } + + again = !boundary; + return bl.length(); +} + +void RGWPostObj_ObjStore_S3::send_response() +{ + if (op_ret == 0 && parts.count("success_action_redirect")) { + string redirect; + + part_str(parts, "success_action_redirect", &redirect); + + string tenant; + string bucket; + string key; + string etag_str = "\""; + + etag_str.append(etag); + etag_str.append("\""); + + string etag_url; + + url_encode(s->bucket_tenant, tenant); /* surely overkill, but cheap */ + url_encode(s->bucket_name, bucket); + url_encode(s->object->get_name(), key); + url_encode(etag_str, etag_url); + + if (!s->bucket_tenant.empty()) { + /* + * What we really would like is to quaily the bucket name, so + * that the client could simply copy it and paste into next request. + * Unfortunately, in S3 we cannot know if the client will decide + * to come through DNS, with "bucket.tenant" sytanx, or through + * URL with "tenant\bucket" syntax. Therefore, we provide the + * tenant separately. 
+ */ + redirect.append("?tenant="); + redirect.append(tenant); + redirect.append("&bucket="); + redirect.append(bucket); + } else { + redirect.append("?bucket="); + redirect.append(bucket); + } + redirect.append("&key="); + redirect.append(key); + redirect.append("&etag="); + redirect.append(etag_url); + + int r = check_utf8(redirect.c_str(), redirect.size()); + if (r < 0) { + op_ret = r; + goto done; + } + dump_redirect(s, redirect); + op_ret = STATUS_REDIRECT; + } else if (op_ret == 0 && parts.count("success_action_status")) { + string status_string; + uint32_t status_int; + + part_str(parts, "success_action_status", &status_string); + + int r = stringtoul(status_string, &status_int); + if (r < 0) { + op_ret = r; + goto done; + } + + switch (status_int) { + case 200: + break; + case 201: + op_ret = STATUS_CREATED; + break; + default: + op_ret = STATUS_NO_CONTENT; + break; + } + } else if (! op_ret) { + op_ret = STATUS_NO_CONTENT; + } + +done: + if (op_ret == STATUS_CREATED) { + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + s->formatter->open_object_section("PostResponse"); + std::string base_uri = compute_domain_uri(s); + if (!s->bucket_tenant.empty()){ + s->formatter->dump_format("Location", "%s/%s:%s/%s", + base_uri.c_str(), + url_encode(s->bucket_tenant).c_str(), + url_encode(s->bucket_name).c_str(), + url_encode(s->object->get_name()).c_str()); + s->formatter->dump_string("Tenant", s->bucket_tenant); + } else { + s->formatter->dump_format("Location", "%s/%s/%s", + base_uri.c_str(), + url_encode(s->bucket_name).c_str(), + url_encode(s->object->get_name()).c_str()); + } + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object->get_name()); + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + } + s->err.message = err_msg; + set_req_state_err(s, op_ret); + dump_errno(s); + if (op_ret >= 0) { + dump_content_length(s, s->formatter->get_len()); + } + end_header(s, 
this); + if (op_ret != STATUS_CREATED) + return; + + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPostObj_ObjStore_S3::get_encrypt_filter( + std::unique_ptr *filter, + rgw::sal::DataProcessor *cb) +{ + std::unique_ptr block_crypt; + int res = rgw_s3_prepare_encrypt(s, attrs, &block_crypt, + crypt_http_responses); + if (res == 0 && block_crypt != nullptr) { + filter->reset(new RGWPutObj_BlockEncrypt(s, s->cct, cb, std::move(block_crypt))); + } + return res; +} + +int RGWDeleteObj_ObjStore_S3::get_params(optional_yield y) +{ + const char *if_unmod = s->info.env->get("HTTP_X_AMZ_DELETE_IF_UNMODIFIED_SINCE"); + + if (s->system_request) { + s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "no-precondition-error", &no_precondition_error, false); + } + + if (if_unmod) { + std::string if_unmod_decoded = url_decode(if_unmod); + uint64_t epoch; + uint64_t nsec; + if (utime_t::parse_date(if_unmod_decoded, &epoch, &nsec) < 0) { + ldpp_dout(this, 10) << "failed to parse time: " << if_unmod_decoded << dendl; + return -EINVAL; + } + unmod_since = utime_t(epoch, nsec).to_real_time(); + } + + const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION"); + if (bypass_gov_header) { + std::string bypass_gov_decoded = url_decode(bypass_gov_header); + bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true"); + } + + return 0; +} + +void RGWDeleteObj_ObjStore_S3::send_response() +{ + int r = op_ret; + if (r == -ENOENT) + r = 0; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + if (delete_marker) { + dump_header(s, "x-amz-delete-marker", "true"); + } + end_header(s, this); +} + +int RGWCopyObj_ObjStore_S3::init_dest_policy() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + /* build a policy for the target object */ + int r = create_s3_policy(s, driver, s3policy, s->owner); + if (r < 0) + return r; + + dest_policy = s3policy; + + 
return 0; +} + +int RGWCopyObj_ObjStore_S3::get_params(optional_yield y) +{ + //handle object lock + auto obj_lock_mode_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_MODE"); + auto obj_lock_date_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_RETAIN_UNTIL_DATE"); + auto obj_legal_hold_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_LEGAL_HOLD"); + if (obj_lock_mode_str && obj_lock_date_str) { + boost::optional date = ceph::from_iso_8601(obj_lock_date_str); + if (boost::none == date || ceph::real_clock::to_time_t(*date) <= ceph_clock_now()) { + s->err.message = "invalid x-amz-object-lock-retain-until-date value"; + ldpp_dout(this,0) << s->err.message << dendl; + return -EINVAL; + } + if (strcmp(obj_lock_mode_str, "GOVERNANCE") != 0 && strcmp(obj_lock_mode_str, "COMPLIANCE") != 0) { + s->err.message = "invalid x-amz-object-lock-mode value"; + ldpp_dout(this,0) << s->err.message << dendl; + return -EINVAL; + } + obj_retention = new RGWObjectRetention(obj_lock_mode_str, *date); + } else if (obj_lock_mode_str || obj_lock_date_str) { + s->err.message = "need both x-amz-object-lock-mode and x-amz-object-lock-retain-until-date "; + ldpp_dout(this,0) << s->err.message << dendl; + return -EINVAL; + } + if (obj_legal_hold_str) { + if (strcmp(obj_legal_hold_str, "ON") != 0 && strcmp(obj_legal_hold_str, "OFF") != 0) { + s->err.message = "invalid x-amz-object-lock-legal-hold value"; + ldpp_dout(this,0) << s->err.message << dendl; + return -EINVAL; + } + obj_legal_hold = new RGWObjectLegalHold(obj_legal_hold_str); + } + + if_mod = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_NONE_MATCH"); + + if (s->system_request) { + source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone"); + s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "copy-if-newer", ©_if_newer, false); 
+ } + + const char *copy_source_temp = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE"); + if (copy_source_temp) { + copy_source = copy_source_temp; + } + auto tmp_md_d = s->info.env->get("HTTP_X_AMZ_METADATA_DIRECTIVE"); + if (tmp_md_d) { + if (strcasecmp(tmp_md_d, "COPY") == 0) { + attrs_mod = rgw::sal::ATTRSMOD_NONE; + } else if (strcasecmp(tmp_md_d, "REPLACE") == 0) { + attrs_mod = rgw::sal::ATTRSMOD_REPLACE; + } else if (!source_zone.empty()) { + attrs_mod = rgw::sal::ATTRSMOD_NONE; // default for intra-zone_group copy + } else { + s->err.message = "Unknown metadata directive."; + ldpp_dout(this, 0) << s->err.message << dendl; + return -EINVAL; + } + md_directive = tmp_md_d; + } + + if (source_zone.empty() && + (s->bucket->get_tenant() == s->src_tenant_name) && + (s->bucket->get_name() == s->src_bucket_name) && + (s->object->get_name() == s->src_object->get_name()) && + s->src_object->get_instance().empty() && + (attrs_mod != rgw::sal::ATTRSMOD_REPLACE)) { + need_to_check_storage_class = true; + } + + return 0; +} + +int RGWCopyObj_ObjStore_S3::check_storage_class(const rgw_placement_rule& src_placement) +{ + if (src_placement == s->dest_placement) { + /* can only copy object into itself if replacing attrs */ + s->err.message = "This copy request is illegal because it is trying to copy " + "an object to itself without changing the object's metadata, " + "storage class, website redirect location or encryption attributes."; + ldpp_dout(this, 0) << s->err.message << dendl; + return -ERR_INVALID_REQUEST; + } + return 0; +} + +void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) +{ + if (! sent_header) { + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. 
+ end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret == 0) { + s->formatter->open_object_section_in_ns("CopyObjectResult", XMLNS_AWS_S3); + } + sent_header = true; + } else { + /* Send progress field. Note that this diverge from the original S3 + * spec. We do this in order to keep connection alive. + */ + s->formatter->dump_int("Progress", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_S3::send_response() +{ + if (!sent_header) + send_partial_response(0); + + if (op_ret == 0) { + dump_time(s, "LastModified", mtime); + if (!etag.empty()) { + s->formatter->dump_format("ETag", "\"%s\"",etag.c_str()); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWGetACLs_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + rgw_flush_formatter(s, s->formatter); + dump_body(s, acls); +} + +int RGWPutACLs_ObjStore_S3::get_params(optional_yield y) +{ + int ret = RGWPutACLs_ObjStore::get_params(y); + if (ret >= 0) { + const int ret_auth = do_aws4_auth_completion(); + if (ret_auth < 0) { + return ret_auth; + } + } else { + /* a request body is not required an S3 PutACLs request--n.b., + * s->length is non-null iff a content length was parsed (the + * ACP or canned ACL could be in any of 3 headers, don't worry + * about that here) */ + if ((ret == -ERR_LENGTH_REQUIRED) && + !!(s->length)) { + return 0; + } + } + return ret; +} + +int RGWPutACLs_ObjStore_S3::get_policy_from_state(rgw::sal::Driver* driver, + req_state *s, + stringstream& ss) +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + // bucket-* canned acls do not apply to bucket + if (rgw::sal::Object::empty(s->object.get())) { + if (s->canned_acl.find("bucket") != string::npos) + s->canned_acl.clear(); + } + + int r = create_s3_policy(s, driver, s3policy, owner); + if (r < 
0) + return r; + + s3policy.to_xml(ss); + + return 0; +} + +void RGWPutACLs_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWGetLC_ObjStore_S3::execute(optional_yield y) +{ + config.set_ctx(s->cct); + + map::iterator aiter = s->bucket_attrs.find(RGW_ATTR_LC); + if (aiter == s->bucket_attrs.end()) { + op_ret = -ENOENT; + return; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 0) << __func__ << "decode life cycle config failed" << dendl; + op_ret = -EIO; + return; + } +} + +void RGWGetLC_ObjStore_S3::send_response() +{ + if (op_ret) { + if (op_ret == -ENOENT) { + set_req_state_err(s, ERR_NO_SUCH_LC); + } else { + set_req_state_err(s, op_ret); + } + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (op_ret < 0) + return; + + encode_xml("LifecycleConfiguration", XMLNS_AWS_S3, config, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWPutLC_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWDeleteLC_ObjStore_S3::send_response() +{ + if (op_ret == 0) + op_ret = STATUS_NO_CONTENT; + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); +} + +void RGWGetCORS_ObjStore_S3::send_response() +{ + if (op_ret) { + if (op_ret == -ENOENT) + set_req_state_err(s, ERR_NO_SUCH_CORS_CONFIGURATION); + else + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, NULL, to_mime_type(s->format)); + dump_start(s); + if (! 
op_ret) { + string cors; + RGWCORSConfiguration_S3 *s3cors = + static_cast(&bucket_cors); + stringstream ss; + + s3cors->to_xml(ss); + cors = ss.str(); + dump_body(s, cors); + } +} + +int RGWPutCORS_ObjStore_S3::get_params(optional_yield y) +{ + RGWCORSXMLParser_S3 parser(this, s->cct); + RGWCORSConfiguration_S3 *cors_config; + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, max_size, false); + if (r < 0) { + return r; + } + + if (!parser.init()) { + return -EINVAL; + } + + char* buf = data.c_str(); + if (!buf || !parser.parse(buf, data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + cors_config = + static_cast(parser.find_first( + "CORSConfiguration")); + if (!cors_config) { + return -ERR_MALFORMED_XML; + } + +#define CORS_RULES_MAX_NUM 100 + int max_num = s->cct->_conf->rgw_cors_rules_max_num; + if (max_num < 0) { + max_num = CORS_RULES_MAX_NUM; + } + int cors_rules_num = cors_config->get_rules().size(); + if (cors_rules_num > max_num) { + ldpp_dout(this, 4) << "An cors config can have up to " + << max_num + << " rules, request cors rules num: " + << cors_rules_num << dendl; + op_ret = -ERR_INVALID_CORS_RULES_ERROR; + s->err.message = "The number of CORS rules should not exceed allowed limit of " + + std::to_string(max_num) + " rules."; + return -ERR_INVALID_REQUEST; + } + + // forward bucket cors requests to meta master zone + if (!driver->is_meta_master()) { + /* only need to keep this data around if we're not meta master */ + in_data.append(data); + } + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 15) << "CORSConfiguration"; + cors_config->to_xml(*_dout); + *_dout << dendl; + } + + cors_config->encode(cors_bl); + + return 0; +} + +void RGWPutCORS_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL, to_mime_type(s->format)); + dump_start(s); +} + +void 
RGWDeleteCORS_ObjStore_S3::send_response() +{ + int r = op_ret; + if (!r || r == -ENOENT) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, NULL); +} + +void RGWOptionsCORS_ObjStore_S3::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (op_ret == -ENOENT) + op_ret = -EACCES; + if (op_ret < 0) { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL); + return; + } + get_response_params(hdrs, exp_hdrs, &max_age); + + dump_errno(s); + dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), + max_age); + end_header(s, NULL); +} + +void RGWPutBucketEncryption_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetBucketEncryption_ObjStore_S3::send_response() +{ + if (op_ret) { + if (op_ret == -ENOENT) + set_req_state_err(s, ERR_NO_SUCH_BUCKET_ENCRYPTION_CONFIGURATION); + else + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (!op_ret) { + encode_xml("ServerSideEncryptionConfiguration", XMLNS_AWS_S3, + bucket_encryption_conf, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWDeleteBucketEncryption_ObjStore_S3::send_response() +{ + if (op_ret == 0) { + op_ret = STATUS_NO_CONTENT; + } + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); +} + +void RGWGetRequestPayment_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + s->formatter->open_object_section_in_ns("RequestPaymentConfiguration", XMLNS_AWS_S3); + const char *payer = requester_pays ? 
"Requester" : "BucketOwner"; + s->formatter->dump_string("Payer", payer); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +class RGWSetRequestPaymentParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override { + return new XMLObj; + } + +public: + RGWSetRequestPaymentParser() {} + ~RGWSetRequestPaymentParser() override {} + + int get_request_payment_payer(bool *requester_pays) { + XMLObj *config = find_first("RequestPaymentConfiguration"); + if (!config) + return -EINVAL; + + *requester_pays = false; + + XMLObj *field = config->find_first("Payer"); + if (!field) + return 0; + + auto& s = field->get_data(); + + if (stringcasecmp(s, "Requester") == 0) { + *requester_pays = true; + } else if (stringcasecmp(s, "BucketOwner") != 0) { + return -EINVAL; + } + + return 0; + } +}; + +int RGWSetRequestPayment_ObjStore_S3::get_params(optional_yield y) +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + std::tie(r, in_data) = read_all_input(s, max_size, false); + + if (r < 0) { + return r; + } + + + RGWSetRequestPaymentParser parser; + + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = in_data.c_str(); + if (!parser.parse(buf, in_data.length(), 1)) { + ldpp_dout(this, 10) << "failed to parse data: " << buf << dendl; + return -EINVAL; + } + + return parser.get_request_payment_payer(&requester_pays); +} + +void RGWSetRequestPayment_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); +} + +int RGWInitMultipart_ObjStore_S3::get_params(optional_yield y) +{ + int ret; + + ret = get_encryption_defaults(s); + if (ret < 0) { + ldpp_dout(this, 5) << __func__ << "(): get_encryption_defaults() returned ret=" << ret << dendl; + return ret; + } + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ret = create_s3_policy(s, driver, s3policy, s->owner); + if (ret < 0) + 
return ret; + + policy = s3policy; + + return 0; +} + +void RGWInitMultipart_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + ceph::real_time abort_date; + string rule_id; + bool exist_multipart_abort = get_s3_multipart_abort_header(s, mtime, abort_date, rule_id); + if (exist_multipart_abort) { + dump_time_header(s, "x-amz-abort-date", abort_date); + dump_header_if_nonempty(s, "x-amz-abort-rule-id", rule_id); + } + end_header(s, this, to_mime_type(s->format)); + if (op_ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult", XMLNS_AWS_S3); + if (!s->bucket_tenant.empty()) + s->formatter->dump_string("Tenant", s->bucket_tenant); + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object->get_name()); + s->formatter->dump_string("UploadId", upload_id); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWInitMultipart_ObjStore_S3::prepare_encryption(map& attrs) +{ + int res = 0; + res = rgw_s3_prepare_encrypt(s, attrs, nullptr, crypt_http_responses); + return res; +} + +int RGWCompleteMultipart_ObjStore_S3::get_params(optional_yield y) +{ + int ret = RGWCompleteMultipart_ObjStore::get_params(y); + if (ret < 0) { + return ret; + } + + map_qs_metadata(s, true); + + return do_aws4_auth_completion(); +} + +void RGWCompleteMultipart_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + end_header(s, this, to_mime_type(s->format)); + if (op_ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult", XMLNS_AWS_S3); + std::string base_uri = compute_domain_uri(s); + if (!s->bucket_tenant.empty()) { + s->formatter->dump_format("Location", "%s/%s:%s/%s", + base_uri.c_str(), 
+ s->bucket_tenant.c_str(), + s->bucket_name.c_str(), + s->object->get_name().c_str() + ); + s->formatter->dump_string("Tenant", s->bucket_tenant); + } else { + s->formatter->dump_format("Location", "%s/%s/%s", + base_uri.c_str(), + s->bucket_name.c_str(), + s->object->get_name().c_str() + ); + } + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object->get_name()); + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWAbortMultipart_ObjStore_S3::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} + +void RGWListMultipart_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + + if (op_ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("ListPartsResult", XMLNS_AWS_S3); + map>::iterator iter; + map>::reverse_iterator test_iter; + int cur_max = 0; + + iter = upload->get_parts().begin(); + test_iter = upload->get_parts().rbegin(); + if (test_iter != upload->get_parts().rend()) { + cur_max = test_iter->first; + } + if (!s->bucket_tenant.empty()) + s->formatter->dump_string("Tenant", s->bucket_tenant); + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object->get_name()); + s->formatter->dump_string("UploadId", upload_id); + s->formatter->dump_string("StorageClass", placement->get_storage_class()); + s->formatter->dump_int("PartNumberMarker", marker); + s->formatter->dump_int("NextPartNumberMarker", cur_max); + s->formatter->dump_int("MaxParts", max_parts); + s->formatter->dump_string("IsTruncated", (truncated ? 
"true" : "false")); + + ACLOwner& owner = policy.get_owner(); + dump_owner(s, owner.get_id(), owner.get_display_name()); + + for (; iter != upload->get_parts().end(); ++iter) { + rgw::sal::MultipartPart* part = iter->second.get(); + + s->formatter->open_object_section("Part"); + + dump_time(s, "LastModified", part->get_mtime()); + + s->formatter->dump_unsigned("PartNumber", part->get_num()); + s->formatter->dump_format("ETag", "\"%s\"", part->get_etag().c_str()); + s->formatter->dump_unsigned("Size", part->get_size()); + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWListBucketMultiparts_ObjStore_S3::send_response() +{ + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) + return; + + s->formatter->open_object_section_in_ns("ListMultipartUploadsResult", XMLNS_AWS_S3); + if (!s->bucket_tenant.empty()) + s->formatter->dump_string("Tenant", s->bucket_tenant); + s->formatter->dump_string("Bucket", s->bucket_name); + if (!prefix.empty()) + s->formatter->dump_string("Prefix", prefix); + if (!marker_key.empty()) + s->formatter->dump_string("KeyMarker", marker_key); + if (!marker_upload_id.empty()) + s->formatter->dump_string("UploadIdMarker", marker_upload_id); + if (!next_marker_key.empty()) + s->formatter->dump_string("NextKeyMarker", next_marker_key); + if (!next_marker_upload_id.empty()) + s->formatter->dump_string("NextUploadIdMarker", next_marker_upload_id); + s->formatter->dump_int("MaxUploads", max_uploads); + if (!delimiter.empty()) + s->formatter->dump_string("Delimiter", delimiter); + s->formatter->dump_string("IsTruncated", (is_truncated ? 
"true" : "false")); + + if (op_ret >= 0) { + vector>::iterator iter; + for (iter = uploads.begin(); iter != uploads.end(); ++iter) { + rgw::sal::MultipartUpload* upload = iter->get(); + s->formatter->open_array_section("Upload"); + if (encode_url) { + s->formatter->dump_string("Key", url_encode(upload->get_key(), false)); + } else { + s->formatter->dump_string("Key", upload->get_key()); + } + s->formatter->dump_string("UploadId", upload->get_upload_id()); + const ACLOwner& owner = upload->get_owner(); + dump_owner(s, owner.get_id(), owner.get_display_name(), "Initiator"); + dump_owner(s, owner.get_id(), owner.get_display_name()); // Owner + s->formatter->dump_string("StorageClass", "STANDARD"); + dump_time(s, "Initiated", upload->get_mtime()); + s->formatter->close_section(); + } + if (!common_prefixes.empty()) { + s->formatter->open_array_section("CommonPrefixes"); + for (const auto& kv : common_prefixes) { + if (encode_url) { + s->formatter->dump_string("Prefix", url_encode(kv.first, false)); + } else { + s->formatter->dump_string("Prefix", kv.first); + } + } + s->formatter->close_section(); + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWDeleteMultiObj_ObjStore_S3::get_params(optional_yield y) +{ + int ret = RGWDeleteMultiObj_ObjStore::get_params(y); + if (ret < 0) { + return ret; + } + + const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION"); + if (bypass_gov_header) { + std::string bypass_gov_decoded = url_decode(bypass_gov_header); + bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true"); + } + + return do_aws4_auth_completion(); +} + +void RGWDeleteMultiObj_ObjStore_S3::send_status() +{ + if (! 
status_dumped) { + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + status_dumped = true; + } +} + +void RGWDeleteMultiObj_ObjStore_S3::begin_response() +{ + + if (!status_dumped) { + send_status(); + } + + dump_start(s); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING); + s->formatter->open_object_section_in_ns("DeleteResult", XMLNS_AWS_S3); + + rgw_flush_formatter(s, s->formatter); +} + +void RGWDeleteMultiObj_ObjStore_S3::send_partial_response(const rgw_obj_key& key, + bool delete_marker, + const string& marker_version_id, + int ret, + boost::asio::deadline_timer *formatter_flush_cond) +{ + if (!key.empty()) { + delete_multi_obj_entry ops_log_entry; + ops_log_entry.key = key.name; + ops_log_entry.version_id = key.instance; + if (ret == 0) { + ops_log_entry.error = false; + ops_log_entry.http_status = 200; + ops_log_entry.delete_marker = delete_marker; + if (delete_marker) { + ops_log_entry.marker_version_id = marker_version_id; + } + if (!quiet) { + s->formatter->open_object_section("Deleted"); + s->formatter->dump_string("Key", key.name); + if (!key.instance.empty()) { + s->formatter->dump_string("VersionId", key.instance); + } + if (delete_marker) { + s->formatter->dump_bool("DeleteMarker", true); + s->formatter->dump_string("DeleteMarkerVersionId", marker_version_id); + } + s->formatter->close_section(); + } + } else if (ret < 0) { + struct rgw_http_error r; + int err_no; + + s->formatter->open_object_section("Error"); + + err_no = -ret; + rgw_get_errno_s3(&r, err_no); + + ops_log_entry.error = true; + ops_log_entry.http_status = r.http_ret; + ops_log_entry.error_message = r.s3_code; + + s->formatter->dump_string("Key", key.name); + s->formatter->dump_string("VersionId", key.instance); + s->formatter->dump_string("Code", r.s3_code); + s->formatter->dump_string("Message", 
r.s3_code); + s->formatter->close_section(); + } + + ops_log_entries.push_back(std::move(ops_log_entry)); + if (formatter_flush_cond) { + formatter_flush_cond->cancel(); + } else { + rgw_flush_formatter(s, s->formatter); + } + } +} + +void RGWDeleteMultiObj_ObjStore_S3::end_response() +{ + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetObjLayout_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/json"); + + JSONFormatter f; + + if (op_ret < 0) { + return; + } + + f.open_object_section("result"); + s->object->dump_obj_layout(this, s->yield, &f); + f.close_section(); + rgw_flush_formatter(s, &f); +} + +int RGWConfigBucketMetaSearch_ObjStore_S3::get_params(optional_yield y) +{ + auto iter = s->info.x_meta_map.find("x-amz-meta-search"); + if (iter == s->info.x_meta_map.end()) { + s->err.message = "X-Rgw-Meta-Search header not provided"; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } + + list expressions; + get_str_list(iter->second, ",", expressions); + + for (auto& expression : expressions) { + vector args; + get_str_vec(expression, ";", args); + + if (args.empty()) { + s->err.message = "invalid empty expression"; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } + if (args.size() > 2) { + s->err.message = string("invalid expression: ") + expression; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } + + string key = boost::algorithm::to_lower_copy(rgw_trim_whitespace(args[0])); + string val; + if (args.size() > 1) { + val = boost::algorithm::to_lower_copy(rgw_trim_whitespace(args[1])); + } + + if (!boost::algorithm::starts_with(key, RGW_AMZ_META_PREFIX)) { + s->err.message = string("invalid expression, key must start with '" RGW_AMZ_META_PREFIX "' : ") + expression; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } + + key = 
key.substr(sizeof(RGW_AMZ_META_PREFIX) - 1); + + ESEntityTypeMap::EntityType entity_type; + + if (val.empty() || val == "str" || val == "string") { + entity_type = ESEntityTypeMap::ES_ENTITY_STR; + } else if (val == "int" || val == "integer") { + entity_type = ESEntityTypeMap::ES_ENTITY_INT; + } else if (val == "date" || val == "datetime") { + entity_type = ESEntityTypeMap::ES_ENTITY_DATE; + } else { + s->err.message = string("invalid entity type: ") + val; + ldpp_dout(this, 5) << s->err.message << dendl; + return -EINVAL; + } + + mdsearch_config[key] = entity_type; + } + + return 0; +} + +void RGWConfigBucketMetaSearch_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); +} + +void RGWGetBucketMetaSearch_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL, to_mime_type(s->format)); + + Formatter *f = s->formatter; + f->open_array_section("GetBucketMetaSearchResult"); + for (auto& e : s->bucket->get_info().mdsearch_config) { + f->open_object_section("Entry"); + string k = string("x-amz-meta-") + e.first; + f->dump_string("Key", k.c_str()); + const char *type; + switch (e.second) { + case ESEntityTypeMap::ES_ENTITY_INT: + type = "int"; + break; + case ESEntityTypeMap::ES_ENTITY_DATE: + type = "date"; + break; + default: + type = "str"; + } + f->dump_string("Type", type); + f->close_section(); + } + f->close_section(); + rgw_flush_formatter(s, f); +} + +void RGWDelBucketMetaSearch_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); +} + +void RGWPutBucketObjectLock_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetBucketObjectLock_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + 
dump_start(s); + + if (op_ret) { + return; + } + encode_xml("ObjectLockConfiguration", s->bucket->get_info().obj_lock, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +int RGWPutObjRetention_ObjStore_S3::get_params(optional_yield y) +{ + const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION"); + if (bypass_gov_header) { + std::string bypass_gov_decoded = url_decode(bypass_gov_header); + bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true"); + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = read_all_input(s, max_size, false); + return op_ret; +} + +void RGWPutObjRetention_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetObjRetention_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (op_ret) { + return; + } + encode_xml("Retention", obj_retention, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWPutObjLegalHold_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetObjLegalHold_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (op_ret) { + return; + } + encode_xml("LegalHold", obj_legal_hold, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketPolicyStatus_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + s->formatter->open_object_section_in_ns("PolicyStatus", XMLNS_AWS_S3); + // 
https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETPolicyStatus.html + // mentions TRUE and FALSE, but boto/aws official clients seem to want lower + // case which is returned by AWS as well; so let's be bug to bug compatible + // with the API + s->formatter->dump_bool("IsPublic", isPublic); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + +} + +void RGWPutBucketPublicAccessBlock_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetBucketPublicAccessBlock_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + access_conf.dump_xml(s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +RGWOp *RGWHandler_REST_Service_S3::op_get() +{ + if (is_usage_op()) { + return new RGWGetUsage_ObjStore_S3; + } else { + return new RGWListBuckets_ObjStore_S3; + } +} + +RGWOp *RGWHandler_REST_Service_S3::op_head() +{ + return new RGWListBuckets_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::get_obj_op(bool get_data) const +{ + // Non-website mode + if (get_data) { + int list_type = 1; + s->info.args.get_int("list-type", &list_type, 1); + switch (list_type) { + case 1: + return new RGWListBucket_ObjStore_S3; + case 2: + return new RGWListBucket_ObjStore_S3v2; + default: + ldpp_dout(s, 5) << __func__ << ": unsupported list-type " << list_type << dendl; + return new RGWListBucket_ObjStore_S3; + } + } else { + return new RGWStatBucket_ObjStore_S3; + } +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_get() +{ + if (s->info.args.sub_resource_exists("encryption")) + return nullptr; + + if (s->info.args.sub_resource_exists("logging")) + return new RGWGetBucketLogging_ObjStore_S3; + + if (s->info.args.sub_resource_exists("location")) + return new RGWGetBucketLocation_ObjStore_S3; + + if (s->info.args.sub_resource_exists("versioning")) + 
return new RGWGetBucketVersioning_ObjStore_S3; + + if (s->info.args.sub_resource_exists("website")) { + if (!s->cct->_conf->rgw_enable_static_website) { + return NULL; + } + return new RGWGetBucketWebsite_ObjStore_S3; + } + + if (s->info.args.exists("mdsearch")) { + return new RGWGetBucketMetaSearch_ObjStore_S3; + } + + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWGetCORS_ObjStore_S3; + } else if (is_request_payment_op()) { + return new RGWGetRequestPayment_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } else if(is_lc_op()) { + return new RGWGetLC_ObjStore_S3; + } else if(is_policy_op()) { + return new RGWGetBucketPolicy; + } else if (is_tagging_op()) { + return new RGWGetBucketTags_ObjStore_S3; + } else if (is_object_lock_op()) { + return new RGWGetBucketObjectLock_ObjStore_S3; + } else if (is_notification_op()) { + return RGWHandler_REST_PSNotifs_S3::create_get_op(); + } else if (is_replication_op()) { + return new RGWGetBucketReplication_ObjStore_S3; + } else if (is_policy_status_op()) { + return new RGWGetBucketPolicyStatus_ObjStore_S3; + } else if (is_block_public_access_op()) { + return new RGWGetBucketPublicAccessBlock_ObjStore_S3; + } else if (is_bucket_encryption_op()) { + return new RGWGetBucketEncryption_ObjStore_S3; + } + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_put() +{ + if (s->info.args.sub_resource_exists("logging") || + s->info.args.sub_resource_exists("encryption")) + return nullptr; + if (s->info.args.sub_resource_exists("versioning")) + return new RGWSetBucketVersioning_ObjStore_S3; + if (s->info.args.sub_resource_exists("website")) { + if 
(!s->cct->_conf->rgw_enable_static_website) { + return NULL; + } + return new RGWSetBucketWebsite_ObjStore_S3; + } + if (is_tagging_op()) { + return new RGWPutBucketTags_ObjStore_S3; + } else if (is_acl_op()) { + return new RGWPutACLs_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWPutCORS_ObjStore_S3; + } else if (is_request_payment_op()) { + return new RGWSetRequestPayment_ObjStore_S3; + } else if(is_lc_op()) { + return new RGWPutLC_ObjStore_S3; + } else if(is_policy_op()) { + return new RGWPutBucketPolicy; + } else if (is_object_lock_op()) { + return new RGWPutBucketObjectLock_ObjStore_S3; + } else if (is_notification_op()) { + return RGWHandler_REST_PSNotifs_S3::create_put_op(); + } else if (is_replication_op()) { + RGWBucketSyncPolicyHandlerRef sync_policy_handler; + int ret = driver->get_sync_policy_handler(s, nullopt, nullopt, + &sync_policy_handler, null_yield); + if (ret < 0 || !sync_policy_handler || + sync_policy_handler->is_legacy_config()) { + return nullptr; + } + + return new RGWPutBucketReplication_ObjStore_S3; + } else if (is_block_public_access_op()) { + return new RGWPutBucketPublicAccessBlock_ObjStore_S3; + } else if (is_bucket_encryption_op()) { + return new RGWPutBucketEncryption_ObjStore_S3; + } + return new RGWCreateBucket_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_delete() +{ + if (s->info.args.sub_resource_exists("logging") || + s->info.args.sub_resource_exists("encryption")) + return nullptr; + + if (is_tagging_op()) { + return new RGWDeleteBucketTags_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWDeleteCORS_ObjStore_S3; + } else if(is_lc_op()) { + return new RGWDeleteLC_ObjStore_S3; + } else if(is_policy_op()) { + return new RGWDeleteBucketPolicy; + } else if (is_notification_op()) { + return RGWHandler_REST_PSNotifs_S3::create_delete_op(); + } else if (is_replication_op()) { + return new RGWDeleteBucketReplication_ObjStore_S3; + } else if (is_block_public_access_op()) { + return new 
RGWDeleteBucketPublicAccessBlock; + } else if (is_bucket_encryption_op()) { + return new RGWDeleteBucketEncryption_ObjStore_S3; + } + + if (s->info.args.sub_resource_exists("website")) { + if (!s->cct->_conf->rgw_enable_static_website) { + return NULL; + } + return new RGWDeleteBucketWebsite_ObjStore_S3; + } + + if (s->info.args.exists("mdsearch")) { + return new RGWDelBucketMetaSearch_ObjStore_S3; + } + + return new RGWDeleteBucket_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_post() +{ + if (s->info.args.exists("delete")) { + return new RGWDeleteMultiObj_ObjStore_S3; + } + + if (s->info.args.exists("mdsearch")) { + return new RGWConfigBucketMetaSearch_ObjStore_S3; + } + + return new RGWPostObj_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_options() +{ + return new RGWOptionsCORS_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::get_obj_op(bool get_data) +{ + RGWGetObj_ObjStore_S3 *get_obj_op = new RGWGetObj_ObjStore_S3; + get_obj_op->set_get_data(get_data); + return get_obj_op; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_get() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploadId")) { + return new RGWListMultipart_ObjStore_S3; + } else if (s->info.args.exists("layout")) { + return new RGWGetObjLayout_ObjStore_S3; + } else if (is_tagging_op()) { + return new RGWGetObjTags_ObjStore_S3; + } else if (is_obj_retention_op()) { + return new RGWGetObjRetention_ObjStore_S3; + } else if (is_obj_legal_hold_op()) { + return new RGWGetObjLegalHold_ObjStore_S3; + } + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Obj_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploadId")) { + return new RGWListMultipart_ObjStore_S3; + } + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Obj_S3::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_S3; + } else if (is_tagging_op()) { + return new RGWPutObjTags_ObjStore_S3; + } else if 
(is_obj_retention_op()) { + return new RGWPutObjRetention_ObjStore_S3; + } else if (is_obj_legal_hold_op()) { + return new RGWPutObjLegalHold_ObjStore_S3; + } + + if (s->init_state.src_bucket.empty()) + return new RGWPutObj_ObjStore_S3; + else + return new RGWCopyObj_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_delete() +{ + if (is_tagging_op()) { + return new RGWDeleteObjTags_ObjStore_S3; + } + string upload_id = s->info.args.get("uploadId"); + + if (upload_id.empty()) + return new RGWDeleteObj_ObjStore_S3; + else + return new RGWAbortMultipart_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_post() +{ + if (s->info.args.exists("uploadId")) + return new RGWCompleteMultipart_ObjStore_S3; + + if (s->info.args.exists("uploads")) + return new RGWInitMultipart_ObjStore_S3; + + if (is_select_op()) + return rgw::s3select::create_s3select_op(); + + return new RGWPostObj_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_options() +{ + return new RGWOptionsCORS_ObjStore_S3; +} + +int RGWHandler_REST_S3::init_from_header(rgw::sal::Driver* driver, + req_state* s, + RGWFormat default_formatter, + bool configurable_format) +{ + string req; + string first; + + const char *req_name = s->relative_uri.c_str(); + const char *p; + + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(s); + + /* must be called after the args parsing */ + int ret = allocate_formatter(s, default_formatter, configurable_format); + if (ret < 0) + return ret; + + if (*req_name != '/') + return 0; + + req_name++; + + if (!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } + + /* + * XXX The intent of the check for empty is apparently to let the bucket + * name from DNS to be set ahead. However, we currently take the DNS + * bucket and re-insert it into URL in rgw_rest.cc:RGWREST::preprocess(). 
+ * So, this check is meaningless. + * + * Rather than dropping this, the code needs to be changed into putting + * the bucket (and its tenant) from DNS and Host: header (HTTP_HOST) + * into req_status.bucket_name directly. + */ + if (s->init_state.url_bucket.empty()) { + // Save bucket to tide us over until token is parsed. + s->init_state.url_bucket = first; + string encoded_obj_str; + if (pos >= 0) { + encoded_obj_str = req.substr(pos+1); + } + + /* dang: s->bucket is never set here, since it's created with permissions. + * These calls will always create an object with no bucket. */ + if (!encoded_obj_str.empty()) { + if (s->bucket) { + s->object = s->bucket->get_object(rgw_obj_key(encoded_obj_str, s->info.args.get("versionId"))); + } else { + s->object = driver->get_object(rgw_obj_key(encoded_obj_str, s->info.args.get("versionId"))); + } + } + } else { + if (s->bucket) { + s->object = s->bucket->get_object(rgw_obj_key(req_name, s->info.args.get("versionId"))); + } else { + s->object = driver->get_object(rgw_obj_key(req_name, s->info.args.get("versionId"))); + } + } + return 0; +} + +int RGWHandler_REST_S3::postauth_init(optional_yield y) +{ + struct req_init_state *t = &s->init_state; + + int ret = rgw_parse_url_bucket(t->url_bucket, s->user->get_tenant(), + s->bucket_tenant, s->bucket_name); + if (ret) { + return ret; + } + if (s->auth.identity->get_identity_type() == TYPE_ROLE) { + s->bucket_tenant = s->auth.identity->get_role_tenant(); + } + + ldpp_dout(s, 10) << "s->object=" << s->object + << " s->bucket=" << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) << dendl; + + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + if (!s->bucket_name.empty() && !rgw::sal::Object::empty(s->object.get())) { + ret = validate_object_name(s->object->get_name()); + if (ret) + return ret; + } + + if (!t->src_bucket.empty()) { + string auth_tenant; + if (s->auth.identity->get_identity_type() == TYPE_ROLE) { + auth_tenant = 
s->auth.identity->get_role_tenant(); + } else { + auth_tenant = s->user->get_tenant(); + } + ret = rgw_parse_url_bucket(t->src_bucket, auth_tenant, + s->src_tenant_name, s->src_bucket_name); + if (ret) { + return ret; + } + ret = rgw_validate_tenant_name(s->src_tenant_name); + if (ret) + return ret; + } + + const char *mfa = s->info.env->get("HTTP_X_AMZ_MFA"); + if (mfa) { + ret = s->user->verify_mfa(string(mfa), &s->mfa_verified, s, y); + } + + return 0; +} + +int RGWHandler_REST_S3::init(rgw::sal::Driver* driver, req_state *s, + rgw::io::BasicClient *cio) +{ + int ret; + + s->dialect = "s3"; + + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + if (!s->bucket_name.empty()) { + ret = validate_object_name(s->object->get_name()); + if (ret) + return ret; + } + + const char *cacl = s->info.env->get("HTTP_X_AMZ_ACL"); + if (cacl) + s->canned_acl = cacl; + + s->has_acl_header = s->info.env->exists_prefix("HTTP_X_AMZ_GRANT"); + + const char *copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE"); + if (copy_source && + (! s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE")) && + (! s->info.args.exists("uploadId"))) { + rgw_obj_key key; + + ret = RGWCopyObj::parse_copy_location(copy_source, + s->init_state.src_bucket, + key, + s); + if (!ret) { + ldpp_dout(s, 0) << "failed to parse copy location" << dendl; + return -EINVAL; // XXX why not -ERR_INVALID_BUCKET_NAME or -ERR_BAD_URL? 
+ } + s->src_object = driver->get_object(key); + } + + const char *sc = s->info.env->get("HTTP_X_AMZ_STORAGE_CLASS"); + if (sc) { + s->info.storage_class = sc; + } + + return RGWHandler_REST::init(driver, s, cio); +} + +int RGWHandler_REST_S3::authorize(const DoutPrefixProvider *dpp, optional_yield y) +{ + if (s->info.args.exists("Action") && s->info.args.get("Action") == "AssumeRoleWithWebIdentity") { + return RGW_Auth_STS::authorize(dpp, driver, auth_registry, s, y); + } + return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y); +} + +enum class AwsVersion { + UNKNOWN, + V2, + V4 +}; + +enum class AwsRoute { + UNKNOWN, + QUERY_STRING, + HEADERS +}; + +static inline std::pair +discover_aws_flavour(const req_info& info) +{ + using rgw::auth::s3::AWS4_HMAC_SHA256_STR; + + AwsVersion version = AwsVersion::UNKNOWN; + AwsRoute route = AwsRoute::UNKNOWN; + + const char* http_auth = info.env->get("HTTP_AUTHORIZATION"); + if (http_auth && http_auth[0]) { + /* Authorization in Header */ + route = AwsRoute::HEADERS; + + if (!strncmp(http_auth, AWS4_HMAC_SHA256_STR, + strlen(AWS4_HMAC_SHA256_STR))) { + /* AWS v4 */ + version = AwsVersion::V4; + } else if (!strncmp(http_auth, "AWS ", 4)) { + /* AWS v2 */ + version = AwsVersion::V2; + } + } else { + route = AwsRoute::QUERY_STRING; + + if (info.args.get("x-amz-algorithm") == AWS4_HMAC_SHA256_STR) { + /* AWS v4 */ + version = AwsVersion::V4; + } else if (!info.args.get("AWSAccessKeyId").empty()) { + /* AWS v2 */ + version = AwsVersion::V2; + } + } + + return std::make_pair(version, route); +} + +/* + * verify that a signed request comes from the keyholder + * by checking the signature against our locally-computed version + * + * it tries AWS v4 before AWS v2 + */ +int RGW_Auth_S3::authorize(const DoutPrefixProvider *dpp, + rgw::sal::Driver* const driver, + const rgw::auth::StrategyRegistry& auth_registry, + req_state* const s, optional_yield y) +{ + + /* neither keystone and rados enabled; warn and exit! 
*/ + if (!driver->ctx()->_conf->rgw_s3_auth_use_rados && + !driver->ctx()->_conf->rgw_s3_auth_use_keystone && + !driver->ctx()->_conf->rgw_s3_auth_use_ldap) { + ldpp_dout(dpp, 0) << "WARNING: no authorization backend enabled! Users will never authenticate." << dendl; + return -EPERM; + } + + const auto ret = rgw::auth::Strategy::apply(dpp, auth_registry.get_s3_main(), s, y); + if (ret == 0) { + /* Populate the owner info. */ + s->owner.set_id(s->user->get_id()); + s->owner.set_name(s->user->get_display_name()); + } + return ret; +} + +int RGWHandler_Auth_S3::init(rgw::sal::Driver* driver, req_state *state, + rgw::io::BasicClient *cio) +{ + int ret = RGWHandler_REST_S3::init_from_header(driver, state, RGWFormat::JSON, true); + if (ret < 0) + return ret; + + return RGWHandler_REST::init(driver, state, cio); +} + +namespace { +// utility classes and functions for handling parameters with the following format: +// Attributes.entry.{N}.{key|value}={VALUE} +// N - any unsigned number +// VALUE - url encoded string + +// and Attribute is holding key and value +// ctor and set are done according to the "type" argument +// if type is not "key" or "value" its a no-op +class Attribute { + std::string key; + std::string value; +public: + Attribute(const std::string& type, const std::string& key_or_value) { + set(type, key_or_value); + } + void set(const std::string& type, const std::string& key_or_value) { + if (type == "key") { + key = key_or_value; + } else if (type == "value") { + value = key_or_value; + } + } + const std::string& get_key() const { return key; } + const std::string& get_value() const { return value; } +}; + +using AttributeMap = std::map; + +// aggregate the attributes into a map +// the key and value are associated by the index (N) +// no assumptions are made on the order in which these parameters are added +void update_attribute_map(const std::string& input, AttributeMap& map) { + const boost::char_separator sep("."); + const boost::tokenizer 
tokens(input, sep); + auto token = tokens.begin(); + if (*token != "Attributes") { + return; + } + ++token; + + if (*token != "entry") { + return; + } + ++token; + + unsigned idx; + try { + idx = std::stoul(*token); + } catch (const std::invalid_argument&) { + return; + } + ++token; + + std::string key_or_value = ""; + // get the rest of the string regardless of dots + // this is to allow dots in the value + while (token != tokens.end()) { + key_or_value.append(*token+"."); + ++token; + } + // remove last separator + key_or_value.pop_back(); + + auto pos = key_or_value.find("="); + if (pos != std::string::npos) { + const auto key_or_value_lhs = key_or_value.substr(0, pos); + const auto key_or_value_rhs = url_decode(key_or_value.substr(pos + 1, key_or_value.size() - 1)); + const auto map_it = map.find(idx); + if (map_it == map.end()) { + // new entry + map.emplace(std::make_pair(idx, Attribute(key_or_value_lhs, key_or_value_rhs))); + } else { + // existing entry + map_it->second.set(key_or_value_lhs, key_or_value_rhs); + } + } +} +} + +void parse_post_action(const std::string& post_body, req_state* s) +{ + if (post_body.size() > 0) { + ldpp_dout(s, 10) << "Content of POST: " << post_body << dendl; + + if (post_body.find("Action") != string::npos) { + const boost::char_separator sep("&"); + const boost::tokenizer> tokens(post_body, sep); + AttributeMap map; + for (const auto& t : tokens) { + const auto pos = t.find("="); + if (pos != string::npos) { + const auto key = t.substr(0, pos); + if (boost::starts_with(key, "Attributes.")) { + update_attribute_map(t, map); + } else { + s->info.args.append(t.substr(0, pos), + url_decode(t.substr(pos+1, t.size() -1))); + } + } + } + // update the regular args with the content of the attribute map + for (const auto& attr : map) { + s->info.args.append(attr.second.get_key(), attr.second.get_value()); + } + } + } + const auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body); + s->info.args.append("PayloadHash", 
payload_hash); +} + +RGWHandler_REST* RGWRESTMgr_S3::get_handler(rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + bool is_s3website = enable_s3website && (s->prot_flags & RGW_REST_WEBSITE); + int ret = + RGWHandler_REST_S3::init_from_header(driver, s, + is_s3website ? RGWFormat::HTML : + RGWFormat::XML, true); + if (ret < 0) { + return nullptr; + } + + if (is_s3website) { + if (s->init_state.url_bucket.empty()) { + return new RGWHandler_REST_Service_S3Website(auth_registry); + } + if (rgw::sal::Object::empty(s->object.get())) { + return new RGWHandler_REST_Bucket_S3Website(auth_registry); + } + return new RGWHandler_REST_Obj_S3Website(auth_registry); + } + + if (s->init_state.url_bucket.empty()) { + // no bucket + if (s->op == OP_POST) { + // POST will be one of: IAM, STS or topic service + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + int ret; + bufferlist data; + std::tie(ret, data) = rgw_rest_read_all_input(s, max_size, false); + if (ret < 0) { + return nullptr; + } + parse_post_action(data.to_str(), s); + if (enable_sts && RGWHandler_REST_STS::action_exists(s)) { + return new RGWHandler_REST_STS(auth_registry); + } + if (enable_iam && RGWHandler_REST_IAM::action_exists(s)) { + return new RGWHandler_REST_IAM(auth_registry, data); + } + if (enable_pubsub && RGWHandler_REST_PSTopic_AWS::action_exists(s)) { + return new RGWHandler_REST_PSTopic_AWS(auth_registry); + } + return nullptr; + } + // non-POST S3 service without a bucket + return new RGWHandler_REST_Service_S3(auth_registry); + } + if (!rgw::sal::Object::empty(s->object.get())) { + // has object + return new RGWHandler_REST_Obj_S3(auth_registry); + } + if (s->info.args.exist_obj_excl_sub_resource()) { + return nullptr; + } + // has bucket + return new RGWHandler_REST_Bucket_S3(auth_registry, enable_pubsub); +} + +bool RGWHandler_REST_S3Website::web_dir() const { + std::string subdir_name; + if 
(!rgw::sal::Object::empty(s->object.get())) { + subdir_name = url_decode(s->object->get_name()); + } + + if (subdir_name.empty()) { + return false; + } else if (subdir_name.back() == '/' && subdir_name.size() > 1) { + subdir_name.pop_back(); + } + + std::unique_ptr obj = s->bucket->get_object(rgw_obj_key(subdir_name)); + + obj->set_atomic(); + + RGWObjState* state = nullptr; + if (obj->get_obj_state(s, &state, s->yield) < 0) { + return false; + } + if (! state->exists) { + return false; + } + return state->exists; +} + +int RGWHandler_REST_S3Website::init(rgw::sal::Driver* driver, req_state *s, + rgw::io::BasicClient* cio) +{ + // save the original object name before retarget() replaces it with the + // result of get_effective_key(). the error_handler() needs the original + // object name for redirect handling + if (!rgw::sal::Object::empty(s->object.get())) { + original_object_name = s->object->get_name(); + } else { + original_object_name = ""; + } + + return RGWHandler_REST_S3::init(driver, s, cio); +} + +int RGWHandler_REST_S3Website::retarget(RGWOp* op, RGWOp** new_op, optional_yield y) { + *new_op = op; + ldpp_dout(s, 10) << __func__ << " Starting retarget" << dendl; + + if (!(s->prot_flags & RGW_REST_WEBSITE)) + return 0; + + if (rgw::sal::Bucket::empty(s->bucket.get())) { + // TODO-FUTURE: if the bucket does not exist, maybe expose it here? 
+ return -ERR_NO_SUCH_BUCKET; + } + + if (!s->bucket->get_info().has_website) { + // TODO-FUTURE: if the bucket has no WebsiteConfig, expose it here + return -ERR_NO_SUCH_WEBSITE_CONFIGURATION; + } + + rgw_obj_key new_obj; + string key_name; + if (!rgw::sal::Object::empty(s->object.get())) { + key_name = s->object->get_name(); + } + bool get_res = s->bucket->get_info().website_conf.get_effective_key(key_name, &new_obj.name, web_dir()); + if (!get_res) { + s->err.message = "The IndexDocument Suffix is not configurated or not well formed!"; + ldpp_dout(s, 5) << s->err.message << dendl; + return -EINVAL; + } + + ldpp_dout(s, 10) << "retarget get_effective_key " << s->object << " -> " + << new_obj << dendl; + + RGWBWRoutingRule rrule; + bool should_redirect = + s->bucket->get_info().website_conf.should_redirect(new_obj.name, 0, &rrule); + + if (should_redirect) { + const string& hostname = s->info.env->get("HTTP_HOST", ""); + const string& protocol = + (s->info.env->get("SERVER_PORT_SECURE") ? "https" : "http"); + int redirect_code = 0; + rrule.apply_rule(protocol, hostname, key_name, &s->redirect, + &redirect_code); + // APply a custom HTTP response code + if (redirect_code > 0) + s->err.http_ret = redirect_code; // Apply a custom HTTP response code + ldpp_dout(s, 10) << "retarget redirect code=" << redirect_code + << " proto+host:" << protocol << "://" << hostname + << " -> " << s->redirect << dendl; + return -ERR_WEBSITE_REDIRECT; + } + + /* + * FIXME: if s->object != new_obj, drop op and create a new op to handle + * operation. Or remove this comment if it's not applicable anymore + * dang: This could be problematic, since we're not actually replacing op, but + * we are replacing s->object. Something might have a pointer to it. 
+ */ + s->object = s->bucket->get_object(new_obj); + + return 0; +} + +RGWOp* RGWHandler_REST_S3Website::op_get() +{ + return get_obj_op(true); +} + +RGWOp* RGWHandler_REST_S3Website::op_head() +{ + return get_obj_op(false); +} + +int RGWHandler_REST_S3Website::serve_errordoc(const DoutPrefixProvider *dpp, int http_ret, const string& errordoc_key, optional_yield y) { + int ret = 0; + s->formatter->reset(); /* Try to throw it all away */ + + std::shared_ptr getop( static_cast(op_get())); + if (getop.get() == NULL) { + return -1; // Trigger double error handler + } + getop->init(driver, s, this); + getop->range_str = NULL; + getop->if_mod = NULL; + getop->if_unmod = NULL; + getop->if_match = NULL; + getop->if_nomatch = NULL; + /* This is okay. It's an error, so nothing will run after this, and it can be + * called by abort_early(), which can be called before s->object or s->bucket + * are set up. Note, it won't have bucket. */ + s->object = driver->get_object(errordoc_key); + + ret = init_permissions(getop.get(), y); + if (ret < 0) { + ldpp_dout(s, 20) << "serve_errordoc failed, init_permissions ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = read_permissions(getop.get(), y); + if (ret < 0) { + ldpp_dout(s, 20) << "serve_errordoc failed, read_permissions ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + if (http_ret) { + getop->set_custom_http_response(http_ret); + } + + ret = getop->init_processing(y); + if (ret < 0) { + ldpp_dout(s, 20) << "serve_errordoc failed, init_processing ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = getop->verify_op_mask(); + if (ret < 0) { + ldpp_dout(s, 20) << "serve_errordoc failed, verify_op_mask ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = getop->verify_permission(y); + if (ret < 0) { + ldpp_dout(s, 20) << "serve_errordoc failed, verify_permission ret=" << ret << dendl; + return -1; // Trigger double 
error handler + } + + ret = getop->verify_params(); + if (ret < 0) { + ldpp_dout(s, 20) << "serve_errordoc failed, verify_params ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + // No going back now + getop->pre_exec(); + /* + * FIXME Missing headers: + * With a working errordoc, the s3 error fields are rendered as HTTP headers, + * x-amz-error-code: NoSuchKey + * x-amz-error-message: The specified key does not exist. + * x-amz-error-detail-Key: foo + */ + getop->execute(y); + getop->complete(); + return 0; +} + +int RGWHandler_REST_S3Website::error_handler(int err_no, + string* error_content, + optional_yield y) { + int new_err_no = -1; + rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no > 0 ? err_no : -err_no); + int http_error_code = -1; + + if (r != rgw_http_s3_errors.end()) { + http_error_code = r->second.first; + } + ldpp_dout(s, 10) << "RGWHandler_REST_S3Website::error_handler err_no=" << err_no << " http_ret=" << http_error_code << dendl; + + RGWBWRoutingRule rrule; + bool have_bucket = !rgw::sal::Bucket::empty(s->bucket.get()); + bool should_redirect = false; + if (have_bucket) { + should_redirect = + s->bucket->get_info().website_conf.should_redirect(original_object_name, + http_error_code, &rrule); + } + + if (should_redirect) { + const string& hostname = s->info.env->get("HTTP_HOST", ""); + const string& protocol = + (s->info.env->get("SERVER_PORT_SECURE") ? 
"https" : "http"); + int redirect_code = 0; + rrule.apply_rule(protocol, hostname, original_object_name, + &s->redirect, &redirect_code); + // Apply a custom HTTP response code + if (redirect_code > 0) + s->err.http_ret = redirect_code; // Apply a custom HTTP response code + ldpp_dout(s, 10) << "error handler redirect code=" << redirect_code + << " proto+host:" << protocol << "://" << hostname + << " -> " << s->redirect << dendl; + return -ERR_WEBSITE_REDIRECT; + } else if (err_no == -ERR_WEBSITE_REDIRECT) { + // Do nothing here, this redirect will be handled in abort_early's ERR_WEBSITE_REDIRECT block + // Do NOT fire the ErrorDoc handler + } else if (have_bucket && !s->bucket->get_info().website_conf.error_doc.empty()) { + /* This serves an entire page! + On success, it will return zero, and no further content should be sent to the socket + On failure, we need the double-error handler + */ + new_err_no = RGWHandler_REST_S3Website::serve_errordoc(s, http_error_code, s->bucket->get_info().website_conf.error_doc, y); + if (new_err_no != -1) { + err_no = new_err_no; + } + } else { + ldpp_dout(s, 20) << "No special error handling today!" << dendl; + } + + return err_no; +} + +RGWOp* RGWHandler_REST_Obj_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + +RGWOp* RGWHandler_REST_Bucket_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! 
+ */ + RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + +RGWOp* RGWHandler_REST_Service_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + + +namespace rgw::auth::s3 { + +static rgw::auth::Completer::cmplptr_t +null_completer_factory(const boost::optional& secret_key) +{ + return nullptr; +} + + +AWSEngine::VersionAbstractor::auth_data_t +AWSGeneralAbstractor::get_auth_data(const req_state* const s) const +{ + AwsVersion version; + AwsRoute route; + std::tie(version, route) = discover_aws_flavour(s->info); + + if (version == AwsVersion::V2) { + return get_auth_data_v2(s); + } else if (version == AwsVersion::V4) { + return get_auth_data_v4(s, route == AwsRoute::QUERY_STRING); + } else { + /* FIXME(rzarzynski): handle anon user. 
*/ + throw -EINVAL; + } +} + +/* Forwards to rgw::auth::s3::get_v4_canonical_headers() with the final + * argument fixed to false. */ +boost::optional +AWSGeneralAbstractor::get_v4_canonical_headers( + const req_info& info, + const std::string_view& signedheaders, + const bool using_qs) const +{ + return rgw::auth::s3::get_v4_canonical_headers(info, signedheaders, + using_qs, false); +} + +/* Prepares an outgoing request for AWS Signature v4 signing, stopping short + * of the step that needs the secret key. + * + * The request is stamped with the current time ("x-amz-date"); when + * opt_content is given its hash is also sent as "x-amz-content-sha256". + * For non-S3 ops (s3_op == false) the payload hash is read from the + * "PayloadHash" request argument instead. + * + * Returns the access key id, date, credential scope, signed header list, + * string-to-sign and extra headers, plus gen_v4_signature as the signature + * factory. */ +AWSSignerV4::prepare_result_t +AWSSignerV4::prepare(const DoutPrefixProvider *dpp, + const std::string& access_key_id, + const string& region, + const string& service, + const req_info& info, + const bufferlist *opt_content, + bool s3_op) +{ + std::string signed_hdrs; + + ceph::real_time timestamp = ceph::real_clock::now(); + + map extra_headers; + + std::string date = ceph::to_iso_8601_no_separators(timestamp, ceph::iso_8601_format::YMDhms); + + std::string credential_scope = gen_v4_scope(timestamp, region, service); + + extra_headers["x-amz-date"] = date; + + string content_hash; + + if (opt_content) { + content_hash = rgw::auth::s3::calc_v4_payload_hash(opt_content->to_str()); + extra_headers["x-amz-content-sha256"] = content_hash; + + } + + /* craft canonical headers */ + std::string canonical_headers = \ + gen_v4_canonical_headers(info, extra_headers, &signed_hdrs); + + using sanitize = rgw::crypt_sanitize::log_content; + ldpp_dout(dpp, 10) << "canonical headers format = " + << sanitize{canonical_headers} << dendl; + + bool is_non_s3_op = !s3_op; + + const char* exp_payload_hash = nullptr; + string payload_hash; + if (is_non_s3_op) { + //For non s3 ops, the payload hash is taken from the "PayloadHash" argument + payload_hash = info.args.get("PayloadHash"); + exp_payload_hash = payload_hash.c_str(); + } else { + /* Get the expected hash. */ + if (content_hash.empty()) { + exp_payload_hash = rgw::auth::s3::get_v4_exp_payload_hash(info); + } else { + exp_payload_hash = content_hash.c_str(); + } + } + + /* Craft canonical URI. Using std::move later so let it be non-const. */ + auto canonical_uri = rgw::auth::s3::gen_v4_canonical_uri(info); + + + /* Craft canonical query string.
std::moving later so non-const here. */ + auto canonical_qs = rgw::auth::s3::gen_v4_canonical_qs(info, is_non_s3_op); + + auto cct = dpp->get_cct(); + + /* Craft canonical request. */ + auto canonical_req_hash = \ + rgw::auth::s3::get_v4_canon_req_hash(cct, + info.method, + std::move(canonical_uri), + std::move(canonical_qs), + std::move(canonical_headers), + signed_hdrs, + exp_payload_hash, + dpp); + + auto string_to_sign = \ + rgw::auth::s3::get_v4_string_to_sign(cct, + AWS4_HMAC_SHA256_STR, + date, + credential_scope, + std::move(canonical_req_hash), + dpp); + + /* The signature is computed later: gen_v4_signature takes the secret key + * as an argument. */ + const auto sig_factory = gen_v4_signature; + + /* Requests authenticated with the Query Parameters are treated as unsigned. + * From "Authenticating Requests: Using Query Parameters (AWS Signature + * Version 4)": + * + * You don't include a payload hash in the Canonical Request, because + * when you create a presigned URL, you don't know the payload content + * because the URL is used to upload an arbitrary payload. Instead, you + * use a constant string UNSIGNED-PAYLOAD. + * + * This means we have absolutely no business in spawning completer. Both + * aws4_auth_needs_complete and aws4_auth_streaming_mode are set to false + * by default. We don't need to change that.
*/ + return { + access_key_id, + date, + credential_scope, + std::move(signed_hdrs), + std::move(string_to_sign), + std::move(extra_headers), + sig_factory, + }; +} + +/* Computes the v4 signature for a prepared request and returns the headers + * to send with it: every extra header from sig_info, "x-amz-content-sha256" + * (set to AWS4_UNSIGNED_PAYLOAD_HASH when it would otherwise be empty) and + * the assembled "Authorization" header. */ +AWSSignerV4::signature_headers_t +gen_v4_signature(const DoutPrefixProvider *dpp, + const std::string_view& secret_key, + const AWSSignerV4::prepare_result_t& sig_info) +{ + auto signature = rgw::auth::s3::get_v4_signature(sig_info.scope, + dpp->get_cct(), + secret_key, + sig_info.string_to_sign, + dpp); + AWSSignerV4::signature_headers_t result; + + for (auto& entry : sig_info.extra_headers) { + result[entry.first] = entry.second; + } + auto& payload_hash = result["x-amz-content-sha256"]; + if (payload_hash.empty()) { + payload_hash = AWS4_UNSIGNED_PAYLOAD_HASH; + } + string auth_header = string("AWS4-HMAC-SHA256 Credential=").append(sig_info.access_key_id) + "/"; + auth_header.append(sig_info.scope + ",SignedHeaders=") + .append(sig_info.signed_headers + ",Signature=") + .append(signature); + result["Authorization"] = auth_header; + + return result; +} + + +/* Parses the v4 credentials of an incoming request and rebuilds its + * string-to-sign. + * + * Throws the negative error returned by parse_v4_credentials(), -EPERM when + * the canonical headers cannot be built, and -ERR_NOT_IMPLEMENTED for ops + * that have no completer support. The returned auth data carries a + * signature factory and, depending on how the payload is signed, the null, + * the single-chunk or the multi-chunk completer factory. */ +AWSEngine::VersionAbstractor::auth_data_t +AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s, + const bool using_qs) const +{ + std::string_view access_key_id; + std::string_view signed_hdrs; + + std::string_view date; + std::string_view credential_scope; + std::string_view client_signature; + std::string_view session_token; + + int ret = rgw::auth::s3::parse_v4_credentials(s->info, + access_key_id, + credential_scope, + signed_hdrs, + client_signature, + date, + session_token, + using_qs, + s); + if (ret < 0) { + throw ret; + } + + /* craft canonical headers */ + boost::optional canonical_headers = \ + get_v4_canonical_headers(s->info, signed_hdrs, using_qs); + if (canonical_headers) { + using sanitize = rgw::crypt_sanitize::log_content; + ldpp_dout(s, 10) << "canonical headers format = " + << sanitize{*canonical_headers} << dendl; + } else { + throw -EPERM; + } + + bool is_non_s3_op = rgw::auth::s3::is_non_s3_op(s->op_type); + +
const char* exp_payload_hash = nullptr; + string payload_hash; + if (is_non_s3_op) { + //For non s3 ops, the payload hash is taken from the "PayloadHash" argument + payload_hash = s->info.args.get("PayloadHash"); + exp_payload_hash = payload_hash.c_str(); + } else { + /* Get the expected hash. */ + exp_payload_hash = rgw::auth::s3::get_v4_exp_payload_hash(s->info); + } + + /* Craft canonical URI. Using std::move later so let it be non-const. */ + auto canonical_uri = rgw::auth::s3::get_v4_canonical_uri(s->info); + + /* Craft canonical query string. std::moving later so non-const here. */ + auto canonical_qs = rgw::auth::s3::get_v4_canonical_qs(s->info, using_qs); + + /* Craft canonical method. */ + auto canonical_method = rgw::auth::s3::get_v4_canonical_method(s); + + /* Craft canonical request. */ + auto canonical_req_hash = \ + rgw::auth::s3::get_v4_canon_req_hash(s->cct, + std::move(canonical_method), + std::move(canonical_uri), + std::move(canonical_qs), + std::move(*canonical_headers), + signed_hdrs, + exp_payload_hash, + s); + + auto string_to_sign = \ + rgw::auth::s3::get_v4_string_to_sign(s->cct, + AWS4_HMAC_SHA256_STR, + date, + credential_scope, + std::move(canonical_req_hash), + s); + + /* The credential scope and the request are bound now; the remaining three + * arguments of get_v4_signature are supplied when the factory is called. */ + const auto sig_factory = std::bind(rgw::auth::s3::get_v4_signature, + credential_scope, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3, + s); + + /* Requests authenticated with the Query Parameters are treated as unsigned. + * From "Authenticating Requests: Using Query Parameters (AWS Signature + * Version 4)": + * + * You don't include a payload hash in the Canonical Request, because + * when you create a presigned URL, you don't know the payload content + * because the URL is used to upload an arbitrary payload. Instead, you + * use a constant string UNSIGNED-PAYLOAD. + * + * This means we have absolutely no business in spawning completer. Both + * aws4_auth_needs_complete and aws4_auth_streaming_mode are set to false + * by default. We don't need to change that.
*/ + /* Unsigned or empty payloads and non-S3 ops get the null completer. */ + if (is_v4_payload_unsigned(exp_payload_hash) || is_v4_payload_empty(s) || is_non_s3_op) { + return { + access_key_id, + client_signature, + session_token, + std::move(string_to_sign), + sig_factory, + null_completer_factory + }; + } else { + /* We're going to handle a signed payload. Be aware that even empty HTTP + * body (no payload) requires verification: + * + * The x-amz-content-sha256 header is required for all AWS Signature + * Version 4 requests. It provides a hash of the request payload. If + * there is no payload, you must provide the hash of an empty string. */ + if (!is_v4_payload_streamed(exp_payload_hash)) { + ldpp_dout(s, 10) << "delaying v4 auth" << dendl; + + /* payload in a single chunk */ + /* Allow-list of op types accepted on this path; any other op type throws + * -ERR_NOT_IMPLEMENTED. */ + switch (s->op_type) + { + case RGW_OP_CREATE_BUCKET: + case RGW_OP_PUT_OBJ: + case RGW_OP_PUT_ACLS: + case RGW_OP_PUT_CORS: + case RGW_OP_PUT_BUCKET_ENCRYPTION: + case RGW_OP_GET_BUCKET_ENCRYPTION: + case RGW_OP_DELETE_BUCKET_ENCRYPTION: + case RGW_OP_INIT_MULTIPART: // in case that Init Multipart uses CHUNK encoding + case RGW_OP_COMPLETE_MULTIPART: + case RGW_OP_SET_BUCKET_VERSIONING: + case RGW_OP_DELETE_MULTI_OBJ: + case RGW_OP_ADMIN_SET_METADATA: + case RGW_OP_SYNC_DATALOG_NOTIFY: + case RGW_OP_SYNC_DATALOG_NOTIFY2: + case RGW_OP_SYNC_MDLOG_NOTIFY: + case RGW_OP_PERIOD_POST: + case RGW_OP_SET_BUCKET_WEBSITE: + case RGW_OP_PUT_BUCKET_POLICY: + case RGW_OP_PUT_OBJ_TAGGING: + case RGW_OP_PUT_BUCKET_TAGGING: + case RGW_OP_PUT_BUCKET_REPLICATION: + case RGW_OP_PUT_LC: + case RGW_OP_SET_REQUEST_PAYMENT: + case RGW_OP_PUBSUB_NOTIF_CREATE: + case RGW_OP_PUBSUB_NOTIF_DELETE: + case RGW_OP_PUBSUB_NOTIF_LIST: + case RGW_OP_PUT_BUCKET_OBJ_LOCK: + case RGW_OP_PUT_OBJ_RETENTION: + case RGW_OP_PUT_OBJ_LEGAL_HOLD: + case RGW_STS_GET_SESSION_TOKEN: + case RGW_STS_ASSUME_ROLE: + case RGW_OP_PUT_BUCKET_PUBLIC_ACCESS_BLOCK: + case RGW_OP_GET_BUCKET_PUBLIC_ACCESS_BLOCK: + case RGW_OP_DELETE_BUCKET_PUBLIC_ACCESS_BLOCK: + case RGW_OP_GET_OBJ://s3select its
post-method(payload contain the query) , the request is get-object + break; + default: + ldpp_dout(s, 10) << "ERROR: AWS4 completion for operation: " << s->op_type << ", NOT IMPLEMENTED" << dendl; + throw -ERR_NOT_IMPLEMENTED; + } + + /* Only the request is bound here; the factory's single argument is + * supplied by whoever invokes it. */ + const auto cmpl_factory = std::bind(AWSv4ComplSingle::create, + s, + std::placeholders::_1); + return { + access_key_id, + client_signature, + session_token, + std::move(string_to_sign), + sig_factory, + cmpl_factory + }; + } else { + /* IMHO "streamed" doesn't fit too good here. I would prefer to call + * it "chunked" but let's be coherent with Amazon's terminology. */ + + ldpp_dout(s, 10) << "body content detected in multiple chunks" << dendl; + + /* payload in multiple chunks */ + + switch(s->op_type) + { + case RGW_OP_PUT_OBJ: + break; + default: + ldpp_dout(s, 10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED (streaming mode)" << dendl; + throw -ERR_NOT_IMPLEMENTED; + } + + ldpp_dout(s, 10) << "aws4 seed signature ok... delaying v4 auth" << dendl; + + /* In the case of streamed payload client sets the x-amz-content-sha256 + * to "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" but uses "UNSIGNED-PAYLOAD" + * when constructing the Canonical Request. */ + + /* In the case of single-chunk upload client set the header's value is + * coherent with the one used for Canonical Request crafting. */ + + /* In the case of query string-based authentication there should be no + * x-amz-content-sha256 header and the value "UNSIGNED-PAYLOAD" is used + * for CanonReq.
*/ + const auto cmpl_factory = std::bind(AWSv4ComplMulti::create, + s, + date, + credential_scope, + client_signature, + std::placeholders::_1); + return { + access_key_id, + client_signature, + session_token, + std::move(string_to_sign), + sig_factory, + cmpl_factory + }; + } + } +} + + +/* Same forwarding wrapper as the AWSGeneralAbstractor one, but passes true + * as the final argument of rgw::auth::s3::get_v4_canonical_headers(). */ +boost::optional +AWSGeneralBoto2Abstractor::get_v4_canonical_headers( + const req_info& info, + const std::string_view& signedheaders, + const bool using_qs) const +{ + return rgw::auth::s3::get_v4_canonical_headers(info, signedheaders, + using_qs, true); +} + + +/* Extracts AWS v2 credentials and builds the v2 string-to-sign. + * + * Without an Authorization header the credentials come from the query + * string, and "Expires" must be present and still in the future. Otherwise + * the header value after the "AWS " prefix is split at its last ':' into + * access key id and signature. + * + * Throws -EPERM for a missing or elapsed "Expires", an empty session token + * or a canonicalization failure, and -ERR_REQUEST_TIME_SKEWED when a + * header-authenticated request fails the time skew check. */ +AWSEngine::VersionAbstractor::auth_data_t +AWSGeneralAbstractor::get_auth_data_v2(const req_state* const s) const +{ + std::string_view access_key_id; + std::string_view signature; + std::string_view session_token; + bool qsr = false; + + const char* http_auth = s->info.env->get("HTTP_AUTHORIZATION"); + if (! http_auth || http_auth[0] == '\0') { + /* Credentials are provided in query string. We also need to verify + * the "Expires" parameter now. */ + access_key_id = s->info.args.get("AWSAccessKeyId"); + signature = s->info.args.get("Signature"); + qsr = true; + + std::string_view expires = s->info.args.get("Expires"); + if (expires.empty()) { + throw -EPERM; + } + + /* It looks we have the guarantee that expires is a null-terminated, + * and thus string_view::data() can be safely used. */ + const time_t exp = atoll(expires.data()); + time_t now; + time(&now); + + if (now >= exp) { + throw -EPERM; + } + if (s->info.args.exists("x-amz-security-token")) { + session_token = s->info.args.get("x-amz-security-token"); + if (session_token.size() == 0) { + throw -EPERM; + } + } + + } else { + /* The "Authorization" HTTP header is being used.
*/ + /* NOTE(review): this skips strlen("AWS ") bytes unconditionally, so it + * assumes the header starts with that prefix; presumably + * discover_aws_flavour() only classifies such headers as v2 -- confirm. */ + const std::string_view auth_str(http_auth + strlen("AWS ")); + const size_t pos = auth_str.rfind(':'); + /* Without a ':' separator both views stay empty. */ + if (pos != std::string_view::npos) { + access_key_id = auth_str.substr(0, pos); + signature = auth_str.substr(pos + 1); + } + + auto token = s->info.env->get_optional("HTTP_X_AMZ_SECURITY_TOKEN"); + if (token) { + session_token = *token; + if (session_token.size() == 0) { + throw -EPERM; + } + } + } + + /* Let's canonize the HTTP headers that are covered by the AWS auth v2. */ + std::string string_to_sign; + utime_t header_time; + if (! rgw_create_s3_canonical_header(s, s->info, &header_time, string_to_sign, + qsr)) { + ldpp_dout(s, 10) << "failed to create the canonized auth header\n" + << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl; + throw -EPERM; + } + + ldpp_dout(s, 10) << "string_to_sign:\n" + << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl; + + /* The skew check is applied to header-authenticated requests only. */ + if (!qsr && !is_time_skew_ok(header_time)) { + throw -ERR_REQUEST_TIME_SKEWED; + } + + return { + std::move(access_key_id), + std::move(signature), + std::move(session_token), + std::move(string_to_sign), + rgw::auth::s3::get_v2_signature, + null_completer_factory + }; +} + + +/* Browser-upload (POST object) v2 variant: every field is copied from + * s->auth.s3_postobj_creds and the encoded policy serves as the + * string-to-sign. No completer is needed. */ +AWSEngine::VersionAbstractor::auth_data_t +AWSBrowserUploadAbstractor::get_auth_data_v2(const req_state* const s) const +{ + return { + s->auth.s3_postobj_creds.access_key, + s->auth.s3_postobj_creds.signature, + s->auth.s3_postobj_creds.x_amz_security_token, + s->auth.s3_postobj_creds.encoded_policy.to_str(), + rgw::auth::s3::get_v2_signature, + null_completer_factory + }; +} + +/* Browser-upload (POST object) v4 variant: x_amz_credential is split at its + * first '/' into the access key id and the credential scope; the encoded + * policy serves as the string-to-sign. + * NOTE(review): when the credential holds no '/', pos is npos, so both the + * access key id and the scope end up being the whole credential string -- + * confirm this malformed input is rejected further down the line. */ +AWSEngine::VersionAbstractor::auth_data_t +AWSBrowserUploadAbstractor::get_auth_data_v4(const req_state* const s) const +{ + const std::string_view credential = s->auth.s3_postobj_creds.x_amz_credential; + + /* grab access key id */ + const size_t pos = credential.find("/"); + const std::string_view access_key_id = credential.substr(0, pos); + ldpp_dout(s, 10) << "access key id = " << access_key_id << dendl; + + /* grab credential scope */ +
const std::string_view credential_scope = credential.substr(pos + 1); + ldpp_dout(s, 10) << "credential scope = " << credential_scope << dendl; + + const auto sig_factory = std::bind(rgw::auth::s3::get_v4_signature, + credential_scope, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3, + s); + + return { + access_key_id, + s->auth.s3_postobj_creds.signature, + s->auth.s3_postobj_creds.x_amz_security_token, + s->auth.s3_postobj_creds.encoded_policy.to_str(), + sig_factory, + null_completer_factory + }; +} + +/* Chooses the v4 path when the POST form names AWS4_HMAC_SHA256_STR as its + * algorithm and the v2 path otherwise. */ +AWSEngine::VersionAbstractor::auth_data_t +AWSBrowserUploadAbstractor::get_auth_data(const req_state* const s) const +{ + if (s->auth.s3_postobj_creds.x_amz_algorithm == AWS4_HMAC_SHA256_STR) { + ldpp_dout(s, 0) << "Signature verification algorithm AWS v4" + << " (AWS4-HMAC-SHA256)" << dendl; + return get_auth_data_v4(s); + } else { + ldpp_dout(s, 0) << "Signature verification algorithm AWS v2" << dendl; + return get_auth_data_v2(s); + } +} + +/* Entry point of the engine: obtains the auth data from the version + * abstractor, denies with -EINVAL when the access key id or the client + * signature is empty, and otherwise hands everything to the nine-argument + * authenticate() overload. */ +AWSEngine::result_t +AWSEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const +{ + /* Small reminder: an ver_abstractor is allowed to throw! */ + const auto auth_data = ver_abstractor.get_auth_data(s); + + if (auth_data.access_key_id.empty() || auth_data.client_signature.empty()) { + return result_t::deny(-EINVAL); + } else { + return authenticate(dpp, + auth_data.access_key_id, + auth_data.client_signature, + auth_data.session_token, + auth_data.string_to_sign, + auth_data.signature_factory, + auth_data.completer_factory, + s, y); + } +} + +} // namespace rgw::auth::s3 + +rgw::LDAPHelper* rgw::auth::s3::LDAPEngine::ldh = nullptr; +std::mutex rgw::auth::s3::LDAPEngine::mtx; + +/* One-time setup of the process-wide LDAP helper. + * + * Does nothing unless rgw_s3_auth_use_ldap is enabled and rgw_ldap_uri is + * non-empty. Otherwise the helper is created from the rgw_ldap_* settings, + * initialised and bound, unless a helper already exists. */ +void rgw::auth::s3::LDAPEngine::init(CephContext* const cct) +{ + if (! cct->_conf->rgw_s3_auth_use_ldap || + cct->_conf->rgw_ldap_uri.empty()) { + return; + } + + /* ldh is a plain pointer, so testing it outside the lock would race with + * a concurrent init(). Take the mutex first, exactly as valid() does, and + * only then look at the pointer. */ + std::lock_guard lck(mtx); + if (ldh) {
return; + } + + const string& ldap_uri = cct->_conf->rgw_ldap_uri; + const string& ldap_binddn = cct->_conf->rgw_ldap_binddn; + const string& ldap_searchdn = cct->_conf->rgw_ldap_searchdn; + const string& ldap_searchfilter = cct->_conf->rgw_ldap_searchfilter; + const string& ldap_dnattr = cct->_conf->rgw_ldap_dnattr; + std::string ldap_bindpw = parse_rgw_ldap_bindpw(cct); + + ldh = new rgw::LDAPHelper(ldap_uri, ldap_binddn, ldap_bindpw, + ldap_searchdn, ldap_searchfilter, ldap_dnattr); + + ldh->init(); + ldh->bind(); +} + +/* True once an LDAP helper exists; the pointer is read under mtx. */ +bool rgw::auth::s3::LDAPEngine::valid() { + std::lock_guard lck(mtx); + return (!!ldh); +} + +rgw::auth::RemoteApplier::acl_strategy_t +rgw::auth::s3::LDAPEngine::get_acl_strategy() const +{ + //This is based on the assumption that the default acl strategy in + // get_perms_from_aclspec, will take care. Extra acl spec is not required. + return nullptr; +} + +rgw::auth::RemoteApplier::AuthInfo +rgw::auth::s3::LDAPEngine::get_creds_info(const rgw::RGWToken& token) const noexcept +{ + /* The short form of "using" can't be used here -- we're aliasing a class' + * member.
*/ + using acct_privilege_t = \ + rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + /* The token id serves as both the user id and the account name; the + * account is a plain one with full control and no access key or subuser. */ + return rgw::auth::RemoteApplier::AuthInfo { + rgw_user(token.id), + token.id, + RGW_PERM_FULL_CONTROL, + acct_privilege_t::IS_PLAIN_ACCT, + rgw::auth::RemoteApplier::AuthInfo::NO_ACCESS_KEY, + rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER, + TYPE_LDAP + }; +} + +/* Authenticates a request whose access key id is a base64-encoded RGWToken. + * + * Denies without an error code when the token cannot be decoded or is not + * valid, denies with -ERR_INVALID_ACCESS_KEY when the LDAP helper refuses + * the id/key pair, and otherwise grants with a remote applier. The request + * signature, session token, string-to-sign and signature factory are not + * consulted. + * NOTE(review): ldh is dereferenced without a null check and without + * holding mtx; presumably the engine is only used after init() succeeded + * -- confirm. */ +rgw::auth::Engine::result_t +rgw::auth::s3::LDAPEngine::authenticate( + const DoutPrefixProvider* dpp, + const std::string_view& access_key_id, + const std::string_view& signature, + const std::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t&, + const completer_factory_t& completer_factory, + const req_state* const s, + optional_yield y) const +{ + /* boost filters and/or string_ref may throw on invalid input */ + rgw::RGWToken base64_token; + try { + base64_token = rgw::from_base64(access_key_id); + } catch (...) { + base64_token = std::string(""); + } + + if (! base64_token.valid()) { + return result_t::deny(); + } + + //TODO: Uncomment, when we have a migration plan in place. + //Check if a user of type other than 'ldap' is already present, if yes, then + //return error.
+ /*RGWUserInfo user_info; + user_info.user_id = base64_token.id; + if (rgw_get_user_info_by_uid(driver, user_info.user_id, user_info) >= 0) { + if (user_info.type != TYPE_LDAP) { + ldpp_dout(dpp, 10) << "ERROR: User id of type: " << user_info.type << " is already present" << dendl; + return nullptr; + } + }*/ + + if (ldh->auth(base64_token.id, base64_token.key) != 0) { + return result_t::deny(-ERR_INVALID_ACCESS_KEY); + } + + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(), + get_creds_info(base64_token)); + return result_t::grant(std::move(apl), completer_factory(boost::none)); +} /* rgw::auth::s3::LDAPEngine::authenticate */ + +/* Destroys the shared LDAP helper and resets the pointer. + * NOTE(review): unlike valid(), this touches ldh without holding mtx -- + * confirm callers guarantee nothing else is using the helper. */ +void rgw::auth::s3::LDAPEngine::shutdown() { + if (ldh) { + delete ldh; + ldh = nullptr; + } +} + +/* LocalEngine */ +/* Authenticates against locally stored users: looks the user up by access + * key, recomputes the signature with that key's secret and compares it with + * the client's signature. + * + * Denies with -ERR_INVALID_ACCESS_KEY when the lookup fails, -EPERM when the + * key is missing from the user's key map and -ERR_SIGNATURE_NO_MATCH when + * the signatures differ; otherwise grants with a local applier. */ +rgw::auth::Engine::result_t +rgw::auth::s3::LocalEngine::authenticate( + const DoutPrefixProvider* dpp, + const std::string_view& _access_key_id, + const std::string_view& signature, + const std::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* const s, + optional_yield y) const +{ + /* get the user info */ + std::unique_ptr user; + const std::string access_key_id(_access_key_id); + /* TODO(rzarzynski): we need to have string-view taking variant. */ + if (driver->get_user_by_access_key(dpp, access_key_id, y, &user) < 0) { + ldpp_dout(dpp, 5) << "error reading user info, uid=" << access_key_id + << " can't authenticate" << dendl; + return result_t::deny(-ERR_INVALID_ACCESS_KEY); + } + //TODO: Uncomment, when we have a migration plan in place.
+ /*else { + if (s->user->type != TYPE_RGW) { + ldpp_dout(dpp, 10) << "ERROR: User id of type: " << s->user->type + << " is present" << dendl; + throw -EPERM; + } + }*/ + + const auto iter = user->get_info().access_keys.find(access_key_id); + if (iter == std::end(user->get_info().access_keys)) { + ldpp_dout(dpp, 0) << "ERROR: access key not encoded in user info" << dendl; + return result_t::deny(-EPERM); + } + const RGWAccessKey& k = iter->second; + + const VersionAbstractor::server_signature_t server_signature = \ + signature_factory(cct, k.key, string_to_sign); + auto compare = signature.compare(server_signature); + + ldpp_dout(dpp, 15) << "string_to_sign=" + << rgw::crypt_sanitize::log_content{string_to_sign} + << dendl; + ldpp_dout(dpp, 15) << "server signature=" << server_signature << dendl; + ldpp_dout(dpp, 15) << "client signature=" << signature << dendl; + ldpp_dout(dpp, 15) << "compare=" << compare << dendl; + + if (compare != 0) { + return result_t::deny(-ERR_SIGNATURE_NO_MATCH); + } + + auto apl = apl_factory->create_apl_local(cct, s, user->get_info(), + k.subuser, std::nullopt, access_key_id); + return result_t::grant(std::move(apl), completer_factory(k.key)); +} + +rgw::auth::RemoteApplier::AuthInfo +rgw::auth::s3::STSEngine::get_creds_info(const STS::SessionToken& token) const noexcept +{ + using acct_privilege_t = \ + rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + return rgw::auth::RemoteApplier::AuthInfo { + token.user, + token.acct_name, + token.perm_mask, + (token.is_admin) ? acct_privilege_t::IS_ADMIN_ACCT: acct_privilege_t::IS_PLAIN_ACCT, + token.access_key_id, + rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER, + token.acct_type + }; +} + +int +rgw::auth::s3::STSEngine::get_session_token(const DoutPrefixProvider* dpp, const std::string_view& session_token, + STS::SessionToken& token) const +{ + string decodedSessionToken; + try { + decodedSessionToken = rgw::from_base64(session_token); + } catch (...) 
{ + ldpp_dout(dpp, 0) << "ERROR: Invalid session token, not base64 encoded." << dendl; + return -EINVAL; + } + + auto* cryptohandler = cct->get_crypto_handler(CEPH_CRYPTO_AES); + if (! cryptohandler) { + return -EINVAL; + } + string secret_s = cct->_conf->rgw_sts_key; + buffer::ptr secret(secret_s.c_str(), secret_s.length()); + int ret = 0; + if (ret = cryptohandler->validate_secret(secret); ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: Invalid secret key" << dendl; + return -EINVAL; + } + string error; + std::unique_ptr keyhandler(cryptohandler->get_key_handler(secret, error)); + if (! keyhandler) { + return -EINVAL; + } + error.clear(); + + string decrypted_str; + buffer::list en_input, dec_output; + en_input = buffer::list::static_from_string(decodedSessionToken); + + ret = keyhandler->decrypt(en_input, dec_output, &error); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: Decryption failed: " << error << dendl; + return -EPERM; + } else { + try { + dec_output.append('\0'); + auto iter = dec_output.cbegin(); + decode(token, iter); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 0) << "ERROR: decode SessionToken failed: " << error << dendl; + return -EINVAL; + } + } + return 0; +} + +rgw::auth::Engine::result_t +rgw::auth::s3::STSEngine::authenticate( + const DoutPrefixProvider* dpp, + const std::string_view& _access_key_id, + const std::string_view& signature, + const std::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* const s, + optional_yield y) const +{ + if (! s->info.args.exists("x-amz-security-token") && + ! 
s->info.env->exists("HTTP_X_AMZ_SECURITY_TOKEN") && + s->auth.s3_postobj_creds.x_amz_security_token.empty()) { + return result_t::deny(); + } + + STS::SessionToken token; + if (int ret = get_session_token(dpp, session_token, token); ret < 0) { + return result_t::reject(ret); + } + //Authentication + //Check if access key is not the same passed in by client + if (token.access_key_id != _access_key_id) { + ldpp_dout(dpp, 0) << "Invalid access key" << dendl; + return result_t::reject(-EPERM); + } + //Check if the token has expired + if (! token.expiration.empty()) { + std::string expiration = token.expiration; + if (! expiration.empty()) { + boost::optional exp = ceph::from_iso_8601(expiration, false); + if (exp) { + real_clock::time_point now = real_clock::now(); + if (now >= *exp) { + ldpp_dout(dpp, 0) << "ERROR: Token expired" << dendl; + return result_t::reject(-EPERM); + } + } else { + ldpp_dout(dpp, 0) << "ERROR: Invalid expiration: " << expiration << dendl; + return result_t::reject(-EPERM); + } + } + } + //Check for signature mismatch + const VersionAbstractor::server_signature_t server_signature = \ + signature_factory(cct, token.secret_access_key, string_to_sign); + auto compare = signature.compare(server_signature); + + ldpp_dout(dpp, 15) << "string_to_sign=" + << rgw::crypt_sanitize::log_content{string_to_sign} + << dendl; + ldpp_dout(dpp, 15) << "server signature=" << server_signature << dendl; + ldpp_dout(dpp, 15) << "client signature=" << signature << dendl; + ldpp_dout(dpp, 15) << "compare=" << compare << dendl; + + if (compare != 0) { + return result_t::reject(-ERR_SIGNATURE_NO_MATCH); + } + + // Get all the authorization info + std::unique_ptr user; + rgw_user user_id; + string role_id; + rgw::auth::RoleApplier::Role r; + rgw::auth::RoleApplier::TokenAttrs t_attrs; + if (! 
token.roleId.empty()) { + std::unique_ptr role = driver->get_role(token.roleId); + if (role->get_by_id(dpp, y) < 0) { + return result_t::deny(-EPERM); + } + r.id = token.roleId; + r.name = role->get_name(); + r.tenant = role->get_tenant(); + + vector role_policy_names = role->get_role_policy_names(); + for (auto& policy_name : role_policy_names) { + string perm_policy; + if (int ret = role->get_role_policy(dpp, policy_name, perm_policy); ret == 0) { + r.role_policies.push_back(std::move(perm_policy)); + } + } + } + + user = driver->get_user(token.user); + if (! token.user.empty() && token.acct_type != TYPE_ROLE) { + // get user info + int ret = user->load_user(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 5) << "ERROR: failed reading user info: uid=" << token.user << dendl; + return result_t::reject(-EPERM); + } + } + + if (token.acct_type == TYPE_KEYSTONE || token.acct_type == TYPE_LDAP) { + auto apl = remote_apl_factory->create_apl_remote(cct, s, get_acl_strategy(), + get_creds_info(token)); + return result_t::grant(std::move(apl), completer_factory(token.secret_access_key)); + } else if (token.acct_type == TYPE_ROLE) { + t_attrs.user_id = std::move(token.user); // This is mostly needed to assign the owner of a bucket during its creation + t_attrs.token_policy = std::move(token.policy); + t_attrs.role_session_name = std::move(token.role_session); + t_attrs.token_claims = std::move(token.token_claims); + t_attrs.token_issued_at = std::move(token.issued_at); + t_attrs.principal_tags = std::move(token.principal_tags); + auto apl = role_apl_factory->create_apl_role(cct, s, r, t_attrs); + return result_t::grant(std::move(apl), completer_factory(token.secret_access_key)); + } else { // This is for all local users of type TYPE_RGW or TYPE_NONE + string subuser; + auto apl = local_apl_factory->create_apl_local(cct, s, user->get_info(), subuser, token.perm_mask, std::string(_access_key_id)); + return result_t::grant(std::move(apl), 
completer_factory(token.secret_access_key)); + } +} + +bool rgw::auth::s3::S3AnonymousEngine::is_applicable( + const req_state* s +) const noexcept { + AwsVersion version; + AwsRoute route; + std::tie(version, route) = discover_aws_flavour(s->info); + + /* If HTTP OPTIONS and no authentication provided using the + * anonymous engine is applicable */ + if (s->op == OP_OPTIONS && version == AwsVersion::UNKNOWN) { + return true; + } + + return route == AwsRoute::QUERY_STRING && version == AwsVersion::UNKNOWN; +} diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h new file mode 100644 index 000000000..20237166b --- /dev/null +++ b/src/rgw/rgw_rest_s3.h @@ -0,0 +1,1215 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#define TIME_BUF_SIZE 128 + +#include +#include + +#include +#include + +#include "common/sstring.hh" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_http_errors.h" +#include "rgw_acl_s3.h" +#include "rgw_policy_s3.h" +#include "rgw_lc_s3.h" +#include "rgw_keystone.h" +#include "rgw_rest_conn.h" +#include "rgw_ldap.h" + +#include "rgw_token.h" +#include "include/ceph_assert.h" + +#include "rgw_auth.h" +#include "rgw_auth_filters.h" +#include "rgw_sts.h" + +struct rgw_http_error { + int http_ret; + const char *s3_code; +}; + +void rgw_get_errno_s3(struct rgw_http_error *e, int err_no); + +class RGWGetObj_ObjStore_S3 : public RGWGetObj_ObjStore +{ +protected: + // Serving a custom error page from an object is really a 200 response with + // just the status line altered. 
+ int custom_http_ret = 0; + std::map crypt_http_responses; + int override_range_hdr(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y); +public: + RGWGetObj_ObjStore_S3() {} + ~RGWGetObj_ObjStore_S3() override {} + + int verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) override; + int get_params(optional_yield y) override; + int send_response_data_error(optional_yield y) override; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override; + void set_custom_http_response(int http_ret) { custom_http_ret = http_ret; } /* stored in custom_http_ret */ + int get_decrypt_filter(std::unique_ptr* filter, + RGWGetObj_Filter* cb, + bufferlist* manifest_bl) override; +}; + +class RGWGetObjTags_ObjStore_S3 : public RGWGetObjTags_ObjStore +{ +public: + RGWGetObjTags_ObjStore_S3() {} + ~RGWGetObjTags_ObjStore_S3() override {} /* marked override like the sibling classes */ + + void send_response_data(bufferlist &bl) override; +}; + +class RGWPutObjTags_ObjStore_S3 : public RGWPutObjTags_ObjStore +{ +public: + RGWPutObjTags_ObjStore_S3() {} + ~RGWPutObjTags_ObjStore_S3() override {} /* marked override like the sibling classes */ + + int get_params(optional_yield y) override; + void send_response() override; +}; + +class RGWDeleteObjTags_ObjStore_S3 : public RGWDeleteObjTags +{ +public: + ~RGWDeleteObjTags_ObjStore_S3() override {} + void send_response() override; +}; + +class RGWGetBucketTags_ObjStore_S3 : public RGWGetBucketTags_ObjStore +{ + bufferlist tags_bl; +public: + void send_response_data(bufferlist &bl) override; +}; + +class RGWPutBucketTags_ObjStore_S3 : public RGWPutBucketTags_ObjStore +{ +public: + int get_params(const DoutPrefixProvider *dpp, optional_yield y) override; + void send_response() override; +}; + +class RGWDeleteBucketTags_ObjStore_S3 : public RGWDeleteBucketTags +{ +public: + void send_response() override; +}; + +class RGWGetBucketReplication_ObjStore_S3 : public RGWGetBucketReplication_ObjStore +{ +public: + void send_response_data() override; +}; + +class RGWPutBucketReplication_ObjStore_S3 : public
RGWPutBucketReplication_ObjStore +{ +public: + int get_params(optional_yield y) override; + void send_response() override; +}; + +class RGWDeleteBucketReplication_ObjStore_S3 : public RGWDeleteBucketReplication_ObjStore +{ +protected: + void update_sync_policy(rgw_sync_policy_info *policy) override; +public: + void send_response() override; +}; + +class RGWListBuckets_ObjStore_S3 : public RGWListBuckets_ObjStore { +public: + RGWListBuckets_ObjStore_S3() {} + ~RGWListBuckets_ObjStore_S3() override {} + + int get_params(optional_yield y) override { + limit = -1; /* no limit */ + return 0; + } + void send_response_begin(bool has_buckets) override; + void send_response_data(rgw::sal::BucketList& buckets) override; + void send_response_end() override; +}; + +class RGWGetUsage_ObjStore_S3 : public RGWGetUsage_ObjStore { +public: + RGWGetUsage_ObjStore_S3() {} + ~RGWGetUsage_ObjStore_S3() override {} + + int get_params(optional_yield y) override ; + void send_response() override; +}; + +class RGWListBucket_ObjStore_S3 : public RGWListBucket_ObjStore { +protected: + bool objs_container; + bool encode_key {false}; + int get_common_params(); + void send_common_response(); + void send_common_versioned_response(); + public: + RGWListBucket_ObjStore_S3() : objs_container(false) { + default_max = 1000; + } + ~RGWListBucket_ObjStore_S3() override {} + + int get_params(optional_yield y) override; + void send_response() override; + void send_versioned_response(); +}; + +class RGWListBucket_ObjStore_S3v2 : public RGWListBucket_ObjStore_S3 { + bool fetchOwner; + bool start_after_exist {false}; /* was left uninitialised by the constructor */ + bool continuation_token_exist {false}; /* was left uninitialised by the constructor */ + std::string startAfter; + std::string continuation_token; +public: + RGWListBucket_ObjStore_S3v2() : fetchOwner(false) { + } + ~RGWListBucket_ObjStore_S3v2() override {} + + int get_params(optional_yield y) override; + void send_response() override; + void send_versioned_response(); +}; + +class RGWGetBucketLogging_ObjStore_S3 : public RGWGetBucketLogging { +public: +
RGWGetBucketLogging_ObjStore_S3() {}
  ~RGWGetBucketLogging_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWGetBucketLocation; only send_response() is overridden.
class RGWGetBucketLocation_ObjStore_S3 : public RGWGetBucketLocation {
public:
  RGWGetBucketLocation_ObjStore_S3() {}
  ~RGWGetBucketLocation_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWGetBucketVersioning; only send_response() is overridden.
class RGWGetBucketVersioning_ObjStore_S3 : public RGWGetBucketVersioning {
public:
  RGWGetBucketVersioning_ObjStore_S3() {}
  ~RGWGetBucketVersioning_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWSetBucketVersioning; overrides parameter parsing and
// the response writer.
class RGWSetBucketVersioning_ObjStore_S3 : public RGWSetBucketVersioning {
public:
  RGWSetBucketVersioning_ObjStore_S3() {}
  ~RGWSetBucketVersioning_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWGetBucketWebsite; only send_response() is overridden.
class RGWGetBucketWebsite_ObjStore_S3 : public RGWGetBucketWebsite {
public:
  RGWGetBucketWebsite_ObjStore_S3() {}
  ~RGWGetBucketWebsite_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWSetBucketWebsite; overrides parameter parsing and
// the response writer.
class RGWSetBucketWebsite_ObjStore_S3 : public RGWSetBucketWebsite {
public:
  RGWSetBucketWebsite_ObjStore_S3() {}
  ~RGWSetBucketWebsite_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWDeleteBucketWebsite; only send_response() is overridden.
class RGWDeleteBucketWebsite_ObjStore_S3 : public RGWDeleteBucketWebsite {
public:
  RGWDeleteBucketWebsite_ObjStore_S3() {}
  ~RGWDeleteBucketWebsite_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWStatBucket_ObjStore; only send_response() is overridden.
class RGWStatBucket_ObjStore_S3 : public RGWStatBucket_ObjStore {
public:
  RGWStatBucket_ObjStore_S3() {}
  ~RGWStatBucket_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWCreateBucket_ObjStore; overrides parameter parsing and
// the response writer.
class RGWCreateBucket_ObjStore_S3 : public RGWCreateBucket_ObjStore {
public:
  RGWCreateBucket_ObjStore_S3() {}
  ~RGWCreateBucket_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWDeleteBucket_ObjStore.
class RGWDeleteBucket_ObjStore_S3 : public
RGWDeleteBucket_ObjStore {
public:
  RGWDeleteBucket_ObjStore_S3() {}
  ~RGWDeleteBucket_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWPutObj_ObjStore. Besides parameter/data/response hooks it
// supplies the encrypt and decrypt filter factories.
class RGWPutObj_ObjStore_S3 : public RGWPutObj_ObjStore {
private:
  // NOTE(review): the template arguments of std::map / std::unique_ptr in
  // this class appear to have been lost during extraction — restore them
  // from the upstream header before compiling.
  std::map crypt_http_responses;

public:
  RGWPutObj_ObjStore_S3() {}
  ~RGWPutObj_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  int get_data(bufferlist& bl) override;
  void send_response() override;

  int get_encrypt_filter(std::unique_ptr *filter,
                         rgw::sal::DataProcessor *cb) override;
  int get_decrypt_filter(std::unique_ptr* filter,
                         RGWGetObj_Filter* cb,
                         std::map& attrs,
                         bufferlist* manifest_bl) override;
};

// S3 variant of RGWPostObj_ObjStore.
class RGWPostObj_ObjStore_S3 : public RGWPostObj_ObjStore {
  parts_collection_t parts;
  std::string filename;
  std::string content_type;
  RGWPolicyEnv env;
  RGWPolicy post_policy;
  // NOTE(review): template arguments of std::map / std::unique_ptr below
  // appear to have been stripped during extraction — restore from upstream.
  std::map crypt_http_responses;

  // Address of the registry handed to verify_requester(); null until then.
  const rgw::auth::StrategyRegistry* auth_registry_ptr = nullptr;

  int get_policy(optional_yield y);
  int get_tags();
  void rebuild_key(rgw::sal::Object* obj);

  std::string get_current_filename() const override;
  std::string get_current_content_type() const override;

public:
  RGWPostObj_ObjStore_S3() {}
  ~RGWPostObj_ObjStore_S3() override {}

  // Remembers the registry for later use, then defers to the base class.
  int verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) override {
    auth_registry_ptr = &auth_registry;
    return RGWPostObj_ObjStore::verify_requester(auth_registry, y);
  }

  int get_params(optional_yield y) override;
  int complete_get_params();

  void send_response() override;
  int get_data(ceph::bufferlist& bl, bool& again) override;
  int get_encrypt_filter(std::unique_ptr *filter,
                         rgw::sal::DataProcessor *cb) override;
};

// S3 variant of RGWDeleteObj_ObjStore; overrides parameter parsing and
// the response writer.
class RGWDeleteObj_ObjStore_S3 : public RGWDeleteObj_ObjStore {
public:
  RGWDeleteObj_ObjStore_S3() {}
  ~RGWDeleteObj_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

class
RGWCopyObj_ObjStore_S3 : public RGWCopyObj_ObjStore {
  // Starts out false (see constructor); presumably tracks whether the
  // response header has already gone out — confirm in the .cc file.
  bool sent_header;
public:
  RGWCopyObj_ObjStore_S3() : sent_header(false) {}
  ~RGWCopyObj_ObjStore_S3() override {}

  int init_dest_policy() override;
  int get_params(optional_yield y) override;
  int check_storage_class(const rgw_placement_rule& src_placement) override;
  void send_partial_response(off_t ofs) override;
  void send_response() override;
};

// S3 variant of RGWGetACLs_ObjStore; only send_response() is overridden.
class RGWGetACLs_ObjStore_S3 : public RGWGetACLs_ObjStore {
public:
  RGWGetACLs_ObjStore_S3() {}
  ~RGWGetACLs_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWPutACLs_ObjStore.
class RGWPutACLs_ObjStore_S3 : public RGWPutACLs_ObjStore {
public:
  RGWPutACLs_ObjStore_S3() {}
  ~RGWPutACLs_ObjStore_S3() override {}

  int get_policy_from_state(rgw::sal::Driver* driver, req_state *s, std::stringstream& ss) override;
  void send_response() override;
  int get_params(optional_yield y) override;
};

// S3 variant of RGWGetLC_ObjStore; holds the S3 lifecycle configuration
// object and overrides execute() as well as send_response().
class RGWGetLC_ObjStore_S3 : public RGWGetLC_ObjStore {
protected:
  RGWLifecycleConfiguration_S3  config;
public:
  RGWGetLC_ObjStore_S3() {}
  ~RGWGetLC_ObjStore_S3() override {}
  void execute(optional_yield y) override;

  void send_response() override;
};

// S3 variant of RGWPutLC_ObjStore; only send_response() is overridden.
class RGWPutLC_ObjStore_S3 : public RGWPutLC_ObjStore {
public:
  RGWPutLC_ObjStore_S3() {}
  ~RGWPutLC_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWDeleteLC_ObjStore; only send_response() is overridden.
class RGWDeleteLC_ObjStore_S3 : public RGWDeleteLC_ObjStore {
public:
  RGWDeleteLC_ObjStore_S3() {}
  ~RGWDeleteLC_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWGetCORS_ObjStore; only send_response() is overridden.
class RGWGetCORS_ObjStore_S3 : public RGWGetCORS_ObjStore {
public:
  RGWGetCORS_ObjStore_S3() {}
  ~RGWGetCORS_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWPutCORS_ObjStore; overrides parameter parsing and
// the response writer.
class RGWPutCORS_ObjStore_S3 : public RGWPutCORS_ObjStore {
public:
  RGWPutCORS_ObjStore_S3() {}
  ~RGWPutCORS_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

class RGWDeleteCORS_ObjStore_S3 :
public RGWDeleteCORS_ObjStore {
public:
  RGWDeleteCORS_ObjStore_S3() {}
  ~RGWDeleteCORS_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWOptionsCORS_ObjStore; only send_response() is overridden.
class RGWOptionsCORS_ObjStore_S3 : public RGWOptionsCORS_ObjStore {
public:
  RGWOptionsCORS_ObjStore_S3() {}
  ~RGWOptionsCORS_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWGetBucketEncryption_ObjStore; only send_response() is
// overridden.
class RGWGetBucketEncryption_ObjStore_S3 : public RGWGetBucketEncryption_ObjStore {
public:
  RGWGetBucketEncryption_ObjStore_S3() {}
  ~RGWGetBucketEncryption_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWPutBucketEncryption_ObjStore; only send_response() is
// overridden.
class RGWPutBucketEncryption_ObjStore_S3 : public RGWPutBucketEncryption_ObjStore {
public:
  RGWPutBucketEncryption_ObjStore_S3() {}
  ~RGWPutBucketEncryption_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWDeleteBucketEncryption_ObjStore; only send_response() is
// overridden.
class RGWDeleteBucketEncryption_ObjStore_S3 : public RGWDeleteBucketEncryption_ObjStore {
public:
  RGWDeleteBucketEncryption_ObjStore_S3() {}
  ~RGWDeleteBucketEncryption_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWGetRequestPayment; only send_response() is overridden.
class RGWGetRequestPayment_ObjStore_S3 : public RGWGetRequestPayment {
public:
  RGWGetRequestPayment_ObjStore_S3() {}
  ~RGWGetRequestPayment_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWSetRequestPayment; overrides parameter parsing and
// the response writer.
class RGWSetRequestPayment_ObjStore_S3 : public RGWSetRequestPayment {
public:
  RGWSetRequestPayment_ObjStore_S3() {}
  ~RGWSetRequestPayment_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWInitMultipart_ObjStore; additionally overrides
// prepare_encryption().
class RGWInitMultipart_ObjStore_S3 : public RGWInitMultipart_ObjStore {
private:
  // NOTE(review): the std::map template arguments here and in
  // prepare_encryption() appear to have been stripped during extraction —
  // restore them from the upstream header.
  std::map crypt_http_responses;
public:
  RGWInitMultipart_ObjStore_S3() {}
  ~RGWInitMultipart_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
  int prepare_encryption(std::map& attrs) override;
};

// S3 variant of RGWCompleteMultipart_ObjStore.
class RGWCompleteMultipart_ObjStore_S3 : public RGWCompleteMultipart_ObjStore {
public:
  RGWCompleteMultipart_ObjStore_S3() {}
~RGWCompleteMultipart_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWAbortMultipart_ObjStore; only send_response() is overridden.
class RGWAbortMultipart_ObjStore_S3 : public RGWAbortMultipart_ObjStore {
public:
  RGWAbortMultipart_ObjStore_S3() {}
  ~RGWAbortMultipart_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWListMultipart_ObjStore; only send_response() is overridden.
class RGWListMultipart_ObjStore_S3 : public RGWListMultipart_ObjStore {
public:
  RGWListMultipart_ObjStore_S3() {}
  ~RGWListMultipart_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWListBucketMultiparts_ObjStore; the constructor sets
// default_max to 1000.
class RGWListBucketMultiparts_ObjStore_S3 : public RGWListBucketMultiparts_ObjStore {
public:
  RGWListBucketMultiparts_ObjStore_S3() {
    default_max = 1000;
  }
  ~RGWListBucketMultiparts_ObjStore_S3() override {}

  void send_response() override;
};

// S3 variant of RGWDeleteMultiObj_ObjStore; the response is produced in
// stages (status, begin, one partial response per key, end).
class RGWDeleteMultiObj_ObjStore_S3 : public RGWDeleteMultiObj_ObjStore {
public:
  RGWDeleteMultiObj_ObjStore_S3() {}
  ~RGWDeleteMultiObj_ObjStore_S3() override {}

  int get_params(optional_yield y) override;
  void send_status() override;
  void begin_response() override;
  void send_partial_response(const rgw_obj_key& key, bool delete_marker,
                             const std::string& marker_version_id, int ret,
                             boost::asio::deadline_timer *formatter_flush_cond) override;
  void end_response() override;
};

// S3 variant of RGWPutBucketObjectLock_ObjStore; only send_response() is
// overridden.
class RGWPutBucketObjectLock_ObjStore_S3 : public RGWPutBucketObjectLock_ObjStore {
public:
  RGWPutBucketObjectLock_ObjStore_S3() {}
  ~RGWPutBucketObjectLock_ObjStore_S3() override {}
  void send_response() override;
};

// S3 variant of RGWGetBucketObjectLock_ObjStore; only send_response() is
// overridden.
class RGWGetBucketObjectLock_ObjStore_S3 : public RGWGetBucketObjectLock_ObjStore {
public:
  RGWGetBucketObjectLock_ObjStore_S3() {}
  // NOTE(review): unlike most siblings this destructor is not marked
  // `override` — presumably an oversight; confirm the base destructor is
  // virtual before adding it.
  ~RGWGetBucketObjectLock_ObjStore_S3() {}
  void send_response() override;
};

// S3 variant of RGWPutObjRetention_ObjStore; overrides parameter parsing and
// the response writer.
class RGWPutObjRetention_ObjStore_S3 : public RGWPutObjRetention_ObjStore {
public:
  RGWPutObjRetention_ObjStore_S3() {}
  ~RGWPutObjRetention_ObjStore_S3() {}
  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWGetObjRetention_ObjStore; only send_response() is overridden.
class RGWGetObjRetention_ObjStore_S3 : public RGWGetObjRetention_ObjStore {
public:
  RGWGetObjRetention_ObjStore_S3() {}
  ~RGWGetObjRetention_ObjStore_S3() {}
  void send_response() override;
};

// S3 variant of RGWPutObjLegalHold_ObjStore; only send_response() is overridden.
class RGWPutObjLegalHold_ObjStore_S3 : public RGWPutObjLegalHold_ObjStore {
public:
  RGWPutObjLegalHold_ObjStore_S3() {}
  ~RGWPutObjLegalHold_ObjStore_S3() {}
  void send_response() override;
};

// S3 variant of RGWGetObjLegalHold_ObjStore; only send_response() is overridden.
class RGWGetObjLegalHold_ObjStore_S3 : public RGWGetObjLegalHold_ObjStore {
public:
  RGWGetObjLegalHold_ObjStore_S3() {}
  ~RGWGetObjLegalHold_ObjStore_S3() {}
  void send_response() override;
};

// S3 variant of RGWGetObjLayout; only send_response() is overridden.
class RGWGetObjLayout_ObjStore_S3 : public RGWGetObjLayout {
public:
  RGWGetObjLayout_ObjStore_S3() {}
  ~RGWGetObjLayout_ObjStore_S3() {}

  void send_response() override;
};

// S3 variant of RGWConfigBucketMetaSearch; overrides parameter parsing and
// the response writer.
class RGWConfigBucketMetaSearch_ObjStore_S3 : public RGWConfigBucketMetaSearch {
public:
  RGWConfigBucketMetaSearch_ObjStore_S3() {}
  ~RGWConfigBucketMetaSearch_ObjStore_S3() {}

  int get_params(optional_yield y) override;
  void send_response() override;
};

// S3 variant of RGWGetBucketMetaSearch; only send_response() is overridden.
class RGWGetBucketMetaSearch_ObjStore_S3 : public RGWGetBucketMetaSearch {
public:
  RGWGetBucketMetaSearch_ObjStore_S3() {}
  ~RGWGetBucketMetaSearch_ObjStore_S3() {}

  void send_response() override;
};

// S3 variant of RGWDelBucketMetaSearch; only send_response() is overridden.
class RGWDelBucketMetaSearch_ObjStore_S3 : public RGWDelBucketMetaSearch {
public:
  RGWDelBucketMetaSearch_ObjStore_S3() {}
  ~RGWDelBucketMetaSearch_ObjStore_S3() {}

  void send_response() override;
};

// S3 variant of RGWGetBucketPolicyStatus; relies on implicit special members.
class RGWGetBucketPolicyStatus_ObjStore_S3 : public RGWGetBucketPolicyStatus {
public:
  void send_response() override;
};

// S3 variant of RGWPutBucketPublicAccessBlock; relies on implicit special
// members.
class RGWPutBucketPublicAccessBlock_ObjStore_S3 : public RGWPutBucketPublicAccessBlock {
public:
  void send_response() override;
};

// S3 variant of RGWGetBucketPublicAccessBlock; relies on implicit special
// members.
class RGWGetBucketPublicAccessBlock_ObjStore_S3 : public RGWGetBucketPublicAccessBlock {
public:
  void send_response() override;
};

// Static entry point used by the S3 handlers' authorize() overrides.
class RGW_Auth_S3 {
public:
  static int authorize(const DoutPrefixProvider *dpp,
                       rgw::sal::Driver* driver,
const rgw::auth::StrategyRegistry& auth_registry,
                       req_state *s, optional_yield y);
};

// REST handler that keeps a reference to the auth strategy registry and
// forwards authorize() to RGW_Auth_S3::authorize().
class RGWHandler_Auth_S3 : public RGWHandler_REST {
  friend class RGWRESTMgr_S3;

  // Held by reference: the registry must outlive this handler.
  const rgw::auth::StrategyRegistry& auth_registry;

public:
  explicit RGWHandler_Auth_S3(const rgw::auth::StrategyRegistry& auth_registry)
    : RGWHandler_REST(),
      auth_registry(auth_registry) {
  }
  ~RGWHandler_Auth_S3() override = default;

  static int validate_bucket_name(const std::string& bucket);
  // NOTE(review): the parameter is named `bucket` although the function
  // name refers to an object — presumably a copy/paste leftover.
  static int validate_object_name(const std::string& bucket);

  int init(rgw::sal::Driver* driver,
           req_state *s,
           rgw::io::BasicClient *cio) override;
  int authorize(const DoutPrefixProvider *dpp, optional_yield y) override {
    return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
  }
  // Nothing to do after authentication for this handler.
  int postauth_init(optional_yield) override { return 0; }
};

// Common base of the S3 service/bucket/object handlers declared in this
// header; stores the auth strategy registry for its subclasses.
class RGWHandler_REST_S3 : public RGWHandler_REST {
  friend class RGWRESTMgr_S3;
protected:
  // Held by reference: the registry must outlive this handler.
  const rgw::auth::StrategyRegistry& auth_registry;
public:
  static int init_from_header(rgw::sal::Driver* driver, req_state *s, RGWFormat default_formatter,
			      bool configurable_format);

  explicit RGWHandler_REST_S3(const rgw::auth::StrategyRegistry& auth_registry)
    : RGWHandler_REST(),
      auth_registry(auth_registry) {
    }
  ~RGWHandler_REST_S3() override = default;

  int init(rgw::sal::Driver* driver,
           req_state *s,
           rgw::io::BasicClient *cio) override;
  int authorize(const DoutPrefixProvider *dpp, optional_yield y) override;
  int postauth_init(optional_yield y) override;
};

// Handler that provides only GET and HEAD operation factories.
class RGWHandler_REST_Service_S3 : public RGWHandler_REST_S3 {
protected:
  // True when the request args contain "usage".
  bool is_usage_op() const {
    return s->info.args.exists("usage");
  }
  RGWOp *op_get() override;
  RGWOp *op_head() override;
public:
  // NOTE(review): unlike the sibling handlers this single-argument
  // constructor is not `explicit`.
  RGWHandler_REST_Service_S3(const rgw::auth::StrategyRegistry& auth_registry) :
      RGWHandler_REST_S3(auth_registry) {}
  ~RGWHandler_REST_Service_S3() override = default;
};

class RGWHandler_REST_Bucket_S3 : public
RGWHandler_REST_S3 { + const bool enable_pubsub; +protected: + bool is_acl_op() const { + return s->info.args.exists("acl"); + } + bool is_cors_op() const { + return s->info.args.exists("cors"); + } + bool is_lc_op() const { + return s->info.args.exists("lifecycle"); + } + bool is_obj_update_op() const override { + return is_acl_op() || is_cors_op(); + } + bool is_tagging_op() const { + return s->info.args.exists("tagging"); + } + bool is_request_payment_op() const { + return s->info.args.exists("requestPayment"); + } + bool is_policy_op() const { + return s->info.args.exists("policy"); + } + bool is_object_lock_op() const { + return s->info.args.exists("object-lock"); + } + bool is_notification_op() const { + if (enable_pubsub) { + return s->info.args.exists("notification"); + } + return false; + } + bool is_replication_op() const { + return s->info.args.exists("replication"); + } + bool is_policy_status_op() { + return s->info.args.exists("policyStatus"); + } + bool is_block_public_access_op() { + return s->info.args.exists("publicAccessBlock"); + } + bool is_bucket_encryption_op() { + return s->info.args.exists("encryption"); + } + + RGWOp *get_obj_op(bool get_data) const; + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + RGWOp *op_options() override; +public: + RGWHandler_REST_Bucket_S3(const rgw::auth::StrategyRegistry& auth_registry, bool _enable_pubsub) : + RGWHandler_REST_S3(auth_registry), enable_pubsub(_enable_pubsub) {} + ~RGWHandler_REST_Bucket_S3() override = default; +}; + +class RGWHandler_REST_Obj_S3 : public RGWHandler_REST_S3 { +protected: + bool is_acl_op() const { + return s->info.args.exists("acl"); + } + bool is_tagging_op() const { + return s->info.args.exists("tagging"); + } + bool is_obj_retention_op() const { + return s->info.args.exists("retention"); + } + bool is_obj_legal_hold_op() const { + return s->info.args.exists("legal-hold"); + } + + 
bool is_select_op() const { + return s->info.args.exists("select-type"); + } + + bool is_obj_update_op() const override { + return is_acl_op() || is_tagging_op() || is_obj_retention_op() || is_obj_legal_hold_op() || is_select_op(); + } + RGWOp *get_obj_op(bool get_data); + + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + RGWOp *op_options() override; +public: + using RGWHandler_REST_S3::RGWHandler_REST_S3; + ~RGWHandler_REST_Obj_S3() override = default; +}; + +class RGWRESTMgr_S3 : public RGWRESTMgr { +private: + const bool enable_s3website; + const bool enable_sts; + const bool enable_iam; + const bool enable_pubsub; +public: + explicit RGWRESTMgr_S3(bool _enable_s3website=false, bool _enable_sts=false, bool _enable_iam=false, bool _enable_pubsub=false) + : enable_s3website(_enable_s3website), + enable_sts(_enable_sts), + enable_iam(_enable_iam), + enable_pubsub(_enable_pubsub) { + } + + ~RGWRESTMgr_S3() override = default; + + RGWHandler_REST *get_handler(rgw::sal::Driver* driver, + req_state* s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override; +}; + +class RGWHandler_REST_Obj_S3Website; + +static inline bool looks_like_ip_address(const char *bucket) +{ + struct in6_addr a; + if (inet_pton(AF_INET6, bucket, static_cast(&a)) == 1) { + return true; + } + int num_periods = 0; + bool expect_period = false; + for (const char *b = bucket; *b; ++b) { + if (*b == '.') { + if (!expect_period) + return false; + ++num_periods; + if (num_periods > 3) + return false; + expect_period = false; + } + else if (isdigit(*b)) { + expect_period = true; + } + else { + return false; + } + } + return (num_periods == 3); +} + +inline int valid_s3_object_name(const std::string& name) { + if (name.size() > 1024) { + return -ERR_INVALID_OBJECT_NAME; + } + if (check_utf8(name.c_str(), name.size())) { + return -ERR_INVALID_OBJECT_NAME; + } + 
return 0; +} + +inline int valid_s3_bucket_name(const std::string& name, bool relaxed=false) +{ + // This function enforces Amazon's spec for bucket names. + // (The requirements, not the recommendations.) + int len = name.size(); + int max = (relaxed ? 255 : 63); + + if (len < 3) { + // Name too short + return -ERR_INVALID_BUCKET_NAME; + } else if (len > max) { + // Name too long + return -ERR_INVALID_BUCKET_NAME; + } + + // bucket names must start with a number or letter + if (!(isalpha(name[0]) || isdigit(name[0]))) { + if (!relaxed) + return -ERR_INVALID_BUCKET_NAME; + else if (!(name[0] == '_' || name[0] == '.' || name[0] == '-')) + return -ERR_INVALID_BUCKET_NAME; + } + + // bucket names must end with a number or letter + if (!(isalpha(name[len-1]) || isdigit(name[len-1]))) + if (!relaxed) + return -ERR_INVALID_BUCKET_NAME; + + for (const char *s = name.c_str(); *s; ++s) { + char c = *s; + if (isdigit(c)) + continue; + + if (isalpha(c)) { + // name cannot contain uppercase letters + if (relaxed || islower(c)) + continue; + } + + if (c == '_') + // name cannot contain underscore + if (relaxed) + continue; + + if (c == '-') + continue; + + if (c == '.') { + if (!relaxed && s && *s) { + // name cannot have consecutive periods or dashes + // adjacent to periods + // ensure s is neither the first nor the last character + char p = *(s-1); + char n = *(s+1); + if ((p != '-') && (n != '.') && (n != '-')) + continue; + } else { + continue; + } + } + + // Invalid character + return -ERR_INVALID_BUCKET_NAME; + } + + if (looks_like_ip_address(name.c_str())) + return -ERR_INVALID_BUCKET_NAME; + + return 0; +} + +namespace rgw::auth::s3 { + +class AWSEngine : public rgw::auth::Engine { +public: + class VersionAbstractor { + static constexpr size_t DIGEST_SIZE_V2 = CEPH_CRYPTO_HMACSHA1_DIGESTSIZE; + static constexpr size_t DIGEST_SIZE_V4 = CEPH_CRYPTO_HMACSHA256_DIGESTSIZE; + + /* Knowing the signature max size allows us to employ the sstring, and thus + * avoid dynamic 
allocations. The multiplier comes from representing digest
     * in the base64-encoded form. */
    static constexpr size_t SIGNATURE_MAX_SIZE = \
      std::max(DIGEST_SIZE_V2, DIGEST_SIZE_V4) * 2 + sizeof('\0');

  public:
    virtual ~VersionAbstractor() {};

    using access_key_id_t = std::string_view;
    using client_signature_t = std::string_view;
    using session_token_t = std::string_view;
    // NOTE(review): the template arguments of basic_sstring and of the two
    // std::function aliases below appear to have been stripped during
    // extraction — restore them from the upstream header.
    using server_signature_t = basic_sstring;
    using string_to_sign_t = std::string;

    /* Transformation for crafting the AWS signature at server side which is
     * used later to compare with the user-provided one. The methodology for
     * doing that depends on AWS auth version. */
    using signature_factory_t = \
      std::function;

    /* Return an instance of Completer for verifying the payload's fingerprint
     * if necessary. Otherwise caller gets nullptr. Caller may provide secret
     * key */
    using completer_factory_t = \
      std::function& secret_key)>;

    // Everything get_auth_data() extracts from a request in one bundle.
    // The first three members are string views, i.e. non-owning.
    struct auth_data_t {
      access_key_id_t access_key_id;
      client_signature_t client_signature;
      session_token_t session_token;
      string_to_sign_t string_to_sign;
      signature_factory_t signature_factory;
      completer_factory_t completer_factory;
    };

    virtual auth_data_t get_auth_data(const req_state* s) const = 0;
  };

protected:
  CephContext* cct;
  // Held by reference: the abstractor must outlive the engine.
  const VersionAbstractor& ver_abstractor;

  AWSEngine(CephContext* const cct, const VersionAbstractor& ver_abstractor)
    : cct(cct),
      ver_abstractor(ver_abstractor) {
  }

  using result_t = rgw::auth::Engine::result_t;
  using string_to_sign_t = VersionAbstractor::string_to_sign_t;
  using signature_factory_t = VersionAbstractor::signature_factory_t;
  using completer_factory_t = VersionAbstractor::completer_factory_t;

  /* TODO(rzarzynski): clean up. We've too many input parameters here. Also
   * the signature get_auth_data() of VersionAbstractor is too complicated.
   * Replace these things with a simple, dedicated structure.
*/
  virtual result_t authenticate(const DoutPrefixProvider* dpp,
                                const std::string_view& access_key_id,
                                const std::string_view& signature,
                                const std::string_view& session_token,
                                const string_to_sign_t& string_to_sign,
                                const signature_factory_t& signature_factory,
                                const completer_factory_t& completer_factory,
                                const req_state* s,
				optional_yield y) const = 0;

public:
  // Declared final, so concrete engines customise only the protected
  // overload above.
  result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s,
			optional_yield y) const final;
};


// VersionAbstractor with separate v2 and v4 extraction helpers.
class AWSGeneralAbstractor : public AWSEngine::VersionAbstractor {
  CephContext* const cct;

  // NOTE(review): the boost::optional template argument appears to have
  // been stripped during extraction — restore from upstream.
  virtual boost::optional
  get_v4_canonical_headers(const req_info& info,
                           const std::string_view& signedheaders,
                           const bool using_qs) const;

  auth_data_t get_auth_data_v2(const req_state* s) const;
  auth_data_t get_auth_data_v4(const req_state* s, const bool using_qs) const;

public:
  explicit AWSGeneralAbstractor(CephContext* const cct)
    : cct(cct) {
  }

  auth_data_t get_auth_data(const req_state* s) const override;
};

// AWSGeneralAbstractor that overrides only the v4 canonical-header helper.
class AWSGeneralBoto2Abstractor : public AWSGeneralAbstractor {
  // NOTE(review): boost::optional template argument stripped, as above.
  boost::optional
  get_v4_canonical_headers(const req_info& info,
                           const std::string_view& signedheaders,
                           const bool using_qs) const override;

public:
  using AWSGeneralAbstractor::AWSGeneralAbstractor;
};

// VersionAbstractor with its own v2/v4 helpers; the constructor ignores
// its CephContext argument.
class AWSBrowserUploadAbstractor : public AWSEngine::VersionAbstractor {
  // Copies the bufferlist contents into a std::string. The parameter is
  // taken by value.
  // NOTE(review): the static_cast target type appears to have been stripped
  // during extraction — restore from upstream.
  static std::string to_string(ceph::bufferlist bl) {
    return std::string(bl.c_str(),
                       static_cast(bl.length()));
  }

  auth_data_t get_auth_data_v2(const req_state* s) const;
  auth_data_t get_auth_data_v4(const req_state* s) const;

public:
  explicit AWSBrowserUploadAbstractor(CephContext*) {
  }

  auth_data_t get_auth_data(const req_state* s) const override;
};

// Declares the types and the static prepare() entry point used together
// with gen_v4_signature().
class AWSSignerV4 {
  const DoutPrefixProvider *dpp;
  CephContext *cct;

public:
  // _dpp is dereferenced immediately, so it must not be null.
  AWSSignerV4(const DoutPrefixProvider *_dpp) : dpp(_dpp),
                                                cct(_dpp->get_cct()) {}

  using access_key_id_t = std::string_view;
using string_to_sign_t = AWSEngine::VersionAbstractor::string_to_sign_t;
  // NOTE(review): the template arguments of std::map and std::function in
  // this class appear to have been stripped during extraction — restore
  // them from the upstream header.
  using signature_headers_t = std::map;

  struct prepare_result_t;

  using signature_factory_t = \
    std::function;

  // Output of prepare(); consumed by gen_v4_signature().
  // access_key_id is a string view, i.e. non-owning.
  struct prepare_result_t {
    access_key_id_t access_key_id;
    std::string date;
    std::string scope;
    std::string signed_headers;
    string_to_sign_t string_to_sign;
    std::map extra_headers;
    signature_factory_t signature_factory;
  };

  // NOTE(review): `string` is unqualified here while access_key_id uses
  // std::string — presumably relies on a using-declaration elsewhere.
  static prepare_result_t prepare(const DoutPrefixProvider *dpp,
                                  const std::string& access_key_id,
                                  const string& region,
                                  const string& service,
                                  const req_info& info,
                                  const bufferlist *opt_content,
                                  bool s3_op);
};


// Takes a secret key and the result of AWSSignerV4::prepare(); returns a
// signature_headers_t. Defined in another translation unit.
extern AWSSignerV4::signature_headers_t
gen_v4_signature(const DoutPrefixProvider *dpp,
                 const std::string_view& secret_key,
                 const AWSSignerV4::prepare_result_t& sig_info);

// AWSEngine implementation built on a RemoteApplier factory and a static
// LDAP helper.
class LDAPEngine : public AWSEngine {
  // Class-wide state; `mtx` presumably guards `ldh` — confirm in the .cc.
  static rgw::LDAPHelper* ldh;
  static std::mutex mtx;

  static void init(CephContext* const cct);

  using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
  using auth_info_t = rgw::auth::RemoteApplier::AuthInfo;
  using result_t = rgw::auth::Engine::result_t;

protected:
  rgw::sal::Driver* driver;
  const rgw::auth::RemoteApplier::Factory* const apl_factory;

  acl_strategy_t get_acl_strategy() const;
  auth_info_t get_creds_info(const rgw::RGWToken& token) const noexcept;

  // The signature factory parameter is left unnamed in this declaration.
  result_t authenticate(const DoutPrefixProvider* dpp,
                        const std::string_view& access_key_id,
                        const std::string_view& signature,
                        const std::string_view& session_token,
                        const string_to_sign_t& string_to_sign,
                        const signature_factory_t&,
                        const completer_factory_t& completer_factory,
                        const req_state* s,
			optional_yield y) const override;
public:
  LDAPEngine(CephContext* const cct,
             rgw::sal::Driver* driver,
             const VersionAbstractor& ver_abstractor,
             const rgw::auth::RemoteApplier::Factory* const apl_factory)
    : AWSEngine(cct, ver_abstractor),
      driver(driver),
      apl_factory(apl_factory) {
init(cct);
  }

  // Re-expose the public two-argument overload hidden by the override above.
  using AWSEngine::authenticate;

  const char* get_name() const noexcept override {
    return "rgw::auth::s3::LDAPEngine";
  }

  static bool valid();
  static void shutdown();
};

// AWSEngine implementation built on a LocalApplier factory.
class LocalEngine : public AWSEngine {
  rgw::sal::Driver* driver;
  const rgw::auth::LocalApplier::Factory* const apl_factory;

  result_t authenticate(const DoutPrefixProvider* dpp,
                        const std::string_view& access_key_id,
                        const std::string_view& signature,
                        const std::string_view& session_token,
                        const string_to_sign_t& string_to_sign,
                        const signature_factory_t& signature_factory,
                        const completer_factory_t& completer_factory,
                        const req_state* s,
			optional_yield y) const override;
public:
  LocalEngine(CephContext* const cct,
              rgw::sal::Driver* driver,
              const VersionAbstractor& ver_abstractor,
              const rgw::auth::LocalApplier::Factory* const apl_factory)
    : AWSEngine(cct, ver_abstractor),
      driver(driver),
      apl_factory(apl_factory) {
  }

  // Re-expose the public two-argument overload hidden by the override above.
  using AWSEngine::authenticate;

  const char* get_name() const noexcept override {
    return "rgw::auth::s3::LocalEngine";
  }
};

// AWSEngine implementation that holds local, remote and role applier
// factories and works with STS::SessionToken.
class STSEngine : public AWSEngine {
  rgw::sal::Driver* driver;
  const rgw::auth::LocalApplier::Factory* const local_apl_factory;
  const rgw::auth::RemoteApplier::Factory* const remote_apl_factory;
  const rgw::auth::RoleApplier::Factory* const role_apl_factory;

  using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
  using auth_info_t = rgw::auth::RemoteApplier::AuthInfo;

  // Always returns nullptr.
  acl_strategy_t get_acl_strategy() const { return nullptr; };
  auth_info_t get_creds_info(const STS::SessionToken& token) const noexcept;

  int get_session_token(const DoutPrefixProvider* dpp, const std::string_view& session_token,
                        STS::SessionToken& token) const;

  result_t authenticate(const DoutPrefixProvider* dpp,
                        const std::string_view& access_key_id,
                        const std::string_view& signature,
                        const std::string_view& session_token,
                        const string_to_sign_t& string_to_sign,
const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* s, + optional_yield y) const override; +public: + STSEngine(CephContext* const cct, + rgw::sal::Driver* driver, + const VersionAbstractor& ver_abstractor, + const rgw::auth::LocalApplier::Factory* const local_apl_factory, + const rgw::auth::RemoteApplier::Factory* const remote_apl_factory, + const rgw::auth::RoleApplier::Factory* const role_apl_factory) + : AWSEngine(cct, ver_abstractor), + driver(driver), + local_apl_factory(local_apl_factory), + remote_apl_factory(remote_apl_factory), + role_apl_factory(role_apl_factory) { + } + + using AWSEngine::authenticate; + + const char* get_name() const noexcept override { + return "rgw::auth::s3::STSEngine"; + } +}; + +class S3AnonymousEngine : public rgw::auth::AnonymousEngine { + bool is_applicable(const req_state* s) const noexcept override; + +public: + /* Let's reuse the parent class' constructor. */ + using rgw::auth::AnonymousEngine::AnonymousEngine; + + const char* get_name() const noexcept override { + return "rgw::auth::s3::S3AnonymousEngine"; + } +}; + + +} // namespace rgw::auth::s3 diff --git a/src/rgw/rgw_rest_s3website.h b/src/rgw/rgw_rest_s3website.h new file mode 100644 index 000000000..3030926a7 --- /dev/null +++ b/src/rgw/rgw_rest_s3website.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Robin H. Johnson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_rest_s3.h" + +class RGWHandler_REST_S3Website : public RGWHandler_REST_S3 { + std::string original_object_name; // object name before retarget() + bool web_dir() const; +protected: + int retarget(RGWOp *op, RGWOp **new_op, optional_yield y) override; + // TODO: this should be virtual I think, and ensure that it's always + // overridden, but that conflates that op_get/op_head are defined in this + // class and call this; and don't need to be overridden later. + virtual RGWOp *get_obj_op(bool get_data) { return NULL; } + RGWOp *op_get() override; + RGWOp *op_head() override; + // Only allowed to use GET+HEAD + RGWOp *op_put() override { return NULL; } + RGWOp *op_delete() override { return NULL; } + RGWOp *op_post() override { return NULL; } + RGWOp *op_copy() override { return NULL; } + RGWOp *op_options() override { return NULL; } + + int serve_errordoc(const DoutPrefixProvider *dpp, int http_ret, const std::string &errordoc_key, optional_yield y); +public: + using RGWHandler_REST_S3::RGWHandler_REST_S3; + ~RGWHandler_REST_S3Website() override = default; + + int init(rgw::sal::Driver* driver, req_state *s, rgw::io::BasicClient* cio) override; + int error_handler(int err_no, std::string *error_content, optional_yield y) override; +}; + +class RGWHandler_REST_Service_S3Website : public RGWHandler_REST_S3Website { +protected: + RGWOp *get_obj_op(bool get_data) override; +public: + using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website; + ~RGWHandler_REST_Service_S3Website() override = default; +}; + +class RGWHandler_REST_Obj_S3Website : public RGWHandler_REST_S3Website { +protected: + RGWOp *get_obj_op(bool get_data) override; +public: + using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website; + ~RGWHandler_REST_Obj_S3Website() override = default; +}; + +/* The cross-inheritance from Obj to Bucket is deliberate! 
+ * S3Websites do NOT support any bucket operations + */ +class RGWHandler_REST_Bucket_S3Website : public RGWHandler_REST_S3Website { +protected: + RGWOp *get_obj_op(bool get_data) override; +public: + using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website; + ~RGWHandler_REST_Bucket_S3Website() override = default; +}; + +// TODO: do we actually need this? +class RGWGetObj_ObjStore_S3Website : public RGWGetObj_ObjStore_S3 +{ + friend class RGWHandler_REST_S3Website; +private: + bool is_errordoc_request; +public: + RGWGetObj_ObjStore_S3Website() : is_errordoc_request(false) {} + explicit RGWGetObj_ObjStore_S3Website(bool is_errordoc_request) : is_errordoc_request(false) { this->is_errordoc_request = is_errordoc_request; } + ~RGWGetObj_ObjStore_S3Website() override {} + int send_response_data_error(optional_yield y) override; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override; + // We override RGWGetObj_ObjStore::get_params here, to allow ignoring all + // conditional params for error pages. 
+ int get_params(optional_yield y) override { + if (is_errordoc_request) { + range_str = NULL; + if_mod = NULL; + if_unmod = NULL; + if_match = NULL; + if_nomatch = NULL; + return 0; + } else { + return RGWGetObj_ObjStore_S3::get_params(y); + } + } +}; diff --git a/src/rgw/rgw_rest_sts.cc b/src/rgw/rgw_rest_sts.cc new file mode 100644 index 000000000..09f77f61d --- /dev/null +++ b/src/rgw/rgw_rest_sts.cc @@ -0,0 +1,819 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + + +#include "ceph_ver.h" +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rest.h" +#include "rgw_auth.h" +#include "rgw_auth_registry.h" +#include "jwt-cpp/jwt.h" +#include "rgw_rest_sts.h" + +#include "rgw_formats.h" +#include "rgw_client_io.h" + +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_iam_policy.h" +#include "rgw_iam_policy_keywords.h" + +#include "rgw_sts.h" +#include "rgw_rest_oidc_provider.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw::auth::sts { + +bool +WebTokenEngine::is_applicable(const std::string& token) const noexcept +{ + return ! 
token.empty();
+}
+
+// Returns the account component of role_arn, which this engine uses as the
+// role's tenant. Yields an empty string when the ARN cannot be parsed.
+std::string
+WebTokenEngine::get_role_tenant(const string& role_arn) const
+{
+  string tenant;
+  auto r_arn = rgw::ARN::parse(role_arn);
+  if (r_arn) {
+    tenant = r_arn->account;
+  }
+  return tenant;
+}
+
+// Returns the role name taken from the resource component of role_arn: the
+// text after the last '/', or the whole resource when it contains no '/'.
+// Yields an empty string when the ARN cannot be parsed.
+std::string
+WebTokenEngine::get_role_name(const string& role_arn) const
+{
+  string role_name;
+  auto r_arn = rgw::ARN::parse(role_arn);
+  if (r_arn) {
+    role_name = r_arn->resource;
+  }
+  if (!role_name.empty()) {
+    auto pos = role_name.find_last_of('/');
+    if(pos != string::npos) {
+      role_name = role_name.substr(pos + 1);
+    }
+  }
+  return role_name;
+}
+
+// Loads the OIDC provider registered for issuer `iss` under the tenant named
+// by role_arn. The provider ARN is built from the issuer with its first
+// "http://" removed, else its first "https://", else its first "www."
+// (the search is not anchored to the start of the string).
+// Returns nullptr when provider->get() reports an error.
+// NOTE(review): the element type of the returned pointer was lost in this
+// copy of the file; rgw::sal::RGWOIDCProvider is assumed - confirm against
+// the declaration of Driver::get_oidc_provider().
+std::unique_ptr<rgw::sal::RGWOIDCProvider>
+WebTokenEngine::get_provider(const DoutPrefixProvider *dpp, const string& role_arn, const string& iss) const
+{
+  string tenant = get_role_tenant(role_arn);
+
+  string idp_url = iss;
+  auto pos = idp_url.find("http://");
+  if (pos == std::string::npos) {
+    pos = idp_url.find("https://");
+    if (pos != std::string::npos) {
+      idp_url.erase(pos, 8);
+    } else {
+      pos = idp_url.find("www.");
+      if (pos != std::string::npos) {
+        idp_url.erase(pos, 4);
+      }
+    }
+  } else {
+    idp_url.erase(pos, 7);
+  }
+  auto provider_arn = rgw::ARN(idp_url, "oidc-provider", tenant);
+  string p_arn = provider_arn.to_string();
+  auto provider = driver->get_oidc_provider();
+  provider->set_arn(p_arn);
+  provider->set_tenant(tenant);
+  auto ret = provider->get(dpp);
+  if (ret < 0) {
+    return nullptr;
+  }
+  return provider;
+}
+
+// Returns true when client_id is exactly equal to one of client_ids.
+bool
+WebTokenEngine::is_client_id_valid(vector<string>& client_ids, const string& client_id) const
+{
+  // iterate by reference: the comparison does not need a copy of each id
+  for (const auto& it : client_ids) {
+    if (it == client_id) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true when the SHA-1 fingerprint of the PEM certificate `cert`
+// equals (case-insensitively, as lowercase hex) one of `thumbprints`.
+// Returns false when the certificate cannot be parsed or digested.
+bool
+WebTokenEngine::is_cert_valid(const vector<string>& thumbprints, const string& cert) const
+{
+  //calculate thumbprint of cert
+  std::unique_ptr<BIO, decltype(&BIO_free_all)> certbio(BIO_new_mem_buf(cert.data(), cert.size()), BIO_free_all);
+  if (!certbio) {
+    return false;
+  }
+  string pw="";
+  std::unique_ptr<X509, decltype(&X509_free)> x_509cert(PEM_read_bio_X509(certbio.get(), nullptr, nullptr, const_cast<char*>(pw.c_str())), X509_free);
+  if (!x_509cert) {
+    // The text is not a parsable PEM certificate. X509_digest() must not be
+    // handed a null certificate, so treat this as "no match".
+    return false;
+  }
+  const EVP_MD* fprint_type = EVP_sha1();
+  unsigned int fprint_size = 0;
+  unsigned char fprint[EVP_MAX_MD_SIZE];
+
+  if (!X509_digest(x_509cert.get(), fprint_type, fprint, &fprint_size)) {
+    return false;
+  }
+  // render the digest as two lowercase hex characters per byte
+  stringstream ss;
+  for (unsigned int i = 0; i < fprint_size; i++) {
+    ss << std::setfill('0') << std::setw(2) << std::hex << (0xFF & (unsigned int)fprint[i]);
+  }
+  std::string digest = ss.str();
+
+  for (auto& it : thumbprints) {
+    if (boost::iequals(it,digest)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Flattens claim `c` into container `t` as (key, string) entries.
+// Booleans and numbers are stored in their JSON-serialized form, strings as
+// their text, and null claims are dropped. Every element of an array is
+// inserted under the array's own key; every member of an object is inserted
+// under the member's name (the object's own key is not used).
+template <typename T>
+void
+WebTokenEngine::recurse_and_insert(const string& key, const jwt::claim& c, T& t) const
+{
+  string s_val;
+  jwt::claim::type c_type = c.get_type();
+  switch(c_type) {
+    case jwt::claim::type::null:
+      break;
+    case jwt::claim::type::boolean:
+    case jwt::claim::type::number:
+    case jwt::claim::type::int64:
+    {
+      s_val = c.to_json().serialize();
+      t.emplace(std::make_pair(key, s_val));
+      break;
+    }
+    case jwt::claim::type::string:
+    {
+      s_val = c.to_json().to_str();
+      t.emplace(std::make_pair(key, s_val));
+      break;
+    }
+    case jwt::claim::type::array:
+    {
+      const picojson::array& arr = c.as_array();
+      for (auto& a : arr) {
+        recurse_and_insert(key, jwt::claim(a), t);
+      }
+      break;
+    }
+    case jwt::claim::type::object:
+    {
+      const picojson::object& obj = c.as_object();
+      for (auto& m : obj) {
+        recurse_and_insert(m.first, jwt::claim(m.second), t);
+      }
+      break;
+    }
+  }
+  return;
+}
+
+//Extract all token claims so that they can be later used in the Condition element of Role's trust policy
+//The claim named by princTagsNamespace is skipped here; get_from_jwt() collects it separately.
+WebTokenEngine::token_t
+WebTokenEngine::get_token_claims(const jwt::decoded_jwt& decoded) const
+{
+  WebTokenEngine::token_t token;
+  const auto& claims = decoded.get_payload_claims();
+
+  for (auto& c : claims) {
+    if (c.first == string(princTagsNamespace)) {
+      continue;
+    }
+    recurse_and_insert(c.first, c.second, token);
+  }
+  return token;
+}
+
+//Offline validation of incoming Web Token which is a signed JWT (JSON Web
Token) +std::tuple, boost::optional> +WebTokenEngine::get_from_jwt(const DoutPrefixProvider* dpp, const std::string& token, const req_state* const s, + optional_yield y) const +{ + WebTokenEngine::token_t t; + WebTokenEngine::principal_tags_t principal_tags; + try { + const auto& decoded = jwt::decode(token); + + auto& payload = decoded.get_payload(); + ldpp_dout(dpp, 20) << " payload = " << payload << dendl; + + t = get_token_claims(decoded); + + string iss; + if (decoded.has_issuer()) { + iss = decoded.get_issuer(); + } + + set aud; + if (decoded.has_audience()) { + aud = decoded.get_audience(); + } + + string client_id; + if (decoded.has_payload_claim("client_id")) { + client_id = decoded.get_payload_claim("client_id").as_string(); + } + if (client_id.empty() && decoded.has_payload_claim("clientId")) { + client_id = decoded.get_payload_claim("clientId").as_string(); + } + string azp; + if (decoded.has_payload_claim("azp")) { + azp = decoded.get_payload_claim("azp").as_string(); + } + + string role_arn = s->info.args.get("RoleArn"); + auto provider = get_provider(dpp, role_arn, iss); + if (! provider) { + ldpp_dout(dpp, 0) << "Couldn't get oidc provider info using input iss" << iss << dendl; + throw -EACCES; + } + if (decoded.has_payload_claim(string(princTagsNamespace))) { + auto& cl = decoded.get_payload_claim(string(princTagsNamespace)); + if (cl.get_type() == jwt::claim::type::object || cl.get_type() == jwt::claim::type::array) { + recurse_and_insert("dummy", cl, principal_tags); + for (auto it : principal_tags) { + ldpp_dout(dpp, 5) << "Key: " << it.first << " Value: " << it.second << dendl; + } + } else { + ldpp_dout(dpp, 0) << "Malformed principal tags" << cl.as_string() << dendl; + throw -EINVAL; + } + } + vector client_ids = provider->get_client_ids(); + vector thumbprints = provider->get_thumbprints(); + if (! 
client_ids.empty()) { + bool found = false; + for (auto& it : aud) { + if (is_client_id_valid(client_ids, it)) { + found = true; + break; + } + } + if (! found && ! is_client_id_valid(client_ids, client_id) && ! is_client_id_valid(client_ids, azp)) { + ldpp_dout(dpp, 0) << "Client id in token doesn't match with that registered with oidc provider" << dendl; + throw -EACCES; + } + } + //Validate signature + if (decoded.has_algorithm()) { + auto& algorithm = decoded.get_algorithm(); + try { + validate_signature(dpp, decoded, algorithm, iss, thumbprints, y); + } catch (...) { + throw -EACCES; + } + } else { + return {boost::none, boost::none}; + } + } catch (int error) { + if (error == -EACCES) { + throw -EACCES; + } + ldpp_dout(dpp, 5) << "Invalid JWT token" << dendl; + return {boost::none, boost::none}; + } + catch (...) { + ldpp_dout(dpp, 5) << "Invalid JWT token" << dendl; + return {boost::none, boost::none}; + } + return {t, principal_tags}; +} + +std::string +WebTokenEngine::get_cert_url(const string& iss, const DoutPrefixProvider *dpp, optional_yield y) const +{ + string cert_url; + string openidc_wellknown_url = iss; + bufferlist openidc_resp; + + if (openidc_wellknown_url.back() == '/') { + openidc_wellknown_url.pop_back(); + } + openidc_wellknown_url.append("/.well-known/openid-configuration"); + + RGWHTTPTransceiver openidc_req(cct, "GET", openidc_wellknown_url, &openidc_resp); + + //Headers + openidc_req.append_header("Content-Type", "application/x-www-form-urlencoded"); + + int res = openidc_req.process(y); + if (res < 0) { + ldpp_dout(dpp, 10) << "HTTP request res: " << res << dendl; + throw -EINVAL; + } + + //Debug only + ldpp_dout(dpp, 20) << "HTTP status: " << openidc_req.get_http_status() << dendl; + ldpp_dout(dpp, 20) << "JSON Response is: " << openidc_resp.c_str() << dendl; + + JSONParser parser; + if (parser.parse(openidc_resp.c_str(), openidc_resp.length())) { + JSONObj::data_val val; + if (parser.get_data("jwks_uri", &val)) { + cert_url = 
val.str.c_str(); + ldpp_dout(dpp, 20) << "Cert URL is: " << cert_url.c_str() << dendl; + } else { + ldpp_dout(dpp, 0) << "Malformed json returned while fetching openidc url" << dendl; + } + } + return cert_url; +} + +void +WebTokenEngine::validate_signature(const DoutPrefixProvider* dpp, const jwt::decoded_jwt& decoded, const string& algorithm, const string& iss, const vector& thumbprints, optional_yield y) const +{ + if (algorithm != "HS256" && algorithm != "HS384" && algorithm != "HS512") { + string cert_url = get_cert_url(iss, dpp, y); + if (cert_url.empty()) { + throw -EINVAL; + } + + // Get certificate + bufferlist cert_resp; + RGWHTTPTransceiver cert_req(cct, "GET", cert_url, &cert_resp); + //Headers + cert_req.append_header("Content-Type", "application/x-www-form-urlencoded"); + + int res = cert_req.process(y); + if (res < 0) { + ldpp_dout(dpp, 10) << "HTTP request res: " << res << dendl; + throw -EINVAL; + } + //Debug only + ldpp_dout(dpp, 20) << "HTTP status: " << cert_req.get_http_status() << dendl; + ldpp_dout(dpp, 20) << "JSON Response is: " << cert_resp.c_str() << dendl; + + JSONParser parser; + if (parser.parse(cert_resp.c_str(), cert_resp.length())) { + JSONObj::data_val val; + if (parser.get_data("keys", &val)) { + if (val.str[0] == '[') { + val.str.erase(0, 1); + } + if (val.str[val.str.size() - 1] == ']') { + val.str = val.str.erase(val.str.size() - 1, 1); + } + if (parser.parse(val.str.c_str(), val.str.size())) { + vector x5c; + if (JSONDecoder::decode_json("x5c", x5c, &parser)) { + string cert; + bool found_valid_cert = false; + for (auto& it : x5c) { + cert = "-----BEGIN CERTIFICATE-----\n" + it + "\n-----END CERTIFICATE-----"; + ldpp_dout(dpp, 20) << "Certificate is: " << cert.c_str() << dendl; + if (is_cert_valid(thumbprints, cert)) { + found_valid_cert = true; + break; + } + found_valid_cert = true; + } + if (! 
found_valid_cert) { + ldpp_dout(dpp, 0) << "Cert doesn't match that with the thumbprints registered with oidc provider: " << cert.c_str() << dendl; + throw -EINVAL; + } + try { + //verify method takes care of expired tokens also + if (algorithm == "RS256") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::rs256{cert}); + + verifier.verify(decoded); + } else if (algorithm == "RS384") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::rs384{cert}); + + verifier.verify(decoded); + } else if (algorithm == "RS512") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::rs512{cert}); + + verifier.verify(decoded); + } else if (algorithm == "ES256") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::es256{cert}); + + verifier.verify(decoded); + } else if (algorithm == "ES384") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::es384{cert}); + + verifier.verify(decoded); + } else if (algorithm == "ES512") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::es512{cert}); + + verifier.verify(decoded); + } else if (algorithm == "PS256") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::ps256{cert}); + + verifier.verify(decoded); + } else if (algorithm == "PS384") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::ps384{cert}); + + verifier.verify(decoded); + } else if (algorithm == "PS512") { + auto verifier = jwt::verify() + .allow_algorithm(jwt::algorithm::ps512{cert}); + + verifier.verify(decoded); + } + } catch (std::runtime_error& e) { + ldpp_dout(dpp, 0) << "Signature validation failed: " << e.what() << dendl; + throw; + } + catch (...) 
{ + ldpp_dout(dpp, 0) << "Signature validation failed" << dendl; + throw; + } + } else { + ldpp_dout(dpp, 0) << "x5c not present" << dendl; + throw -EINVAL; + } + } else { + ldpp_dout(dpp, 0) << "Malformed JSON object for keys" << dendl; + throw -EINVAL; + } + } else { + ldpp_dout(dpp, 0) << "keys not present in JSON" << dendl; + throw -EINVAL; + } //if-else get-data + } else { + ldpp_dout(dpp, 0) << "Malformed json returned while fetching cert" << dendl; + throw -EINVAL; + } //if-else parser cert_resp + } else { + ldpp_dout(dpp, 0) << "JWT signed by HMAC algos are currently not supported" << dendl; + throw -EINVAL; + } +} + +WebTokenEngine::result_t +WebTokenEngine::authenticate( const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s, + optional_yield y) const +{ + if (! is_applicable(token)) { + return result_t::deny(); + } + + try { + auto [t, princ_tags] = get_from_jwt(dpp, token, s, y); + if (t) { + string role_session = s->info.args.get("RoleSessionName"); + if (role_session.empty()) { + ldout(s->cct, 0) << "Role Session Name is empty " << dendl; + return result_t::deny(-EACCES); + } + string role_arn = s->info.args.get("RoleArn"); + string role_tenant = get_role_tenant(role_arn); + string role_name = get_role_name(role_arn); + std::unique_ptr role = driver->get_role(role_name, role_tenant); + int ret = role->get(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Role not found: name:" << role_name << " tenant: " << role_tenant << dendl; + return result_t::deny(-EACCES); + } + boost::optional> role_tags = role->get_tags(); + auto apl = apl_factory->create_apl_web_identity(cct, s, role_session, role_tenant, *t, role_tags, princ_tags); + return result_t::grant(std::move(apl)); + } + return result_t::deny(-EACCES); + } + catch (...) 
{ + return result_t::deny(-EACCES); + } +} + +} // namespace rgw::auth::sts + +int RGWREST_STS::verify_permission(optional_yield y) +{ + STS::STSService _sts(s->cct, driver, s->user->get_id(), s->auth.identity.get()); + sts = std::move(_sts); + + string rArn = s->info.args.get("RoleArn"); + const auto& [ret, role] = sts.getRoleInfo(s, rArn, y); + if (ret < 0) { + ldpp_dout(this, 0) << "failed to get role info using role arn: " << rArn << dendl; + return ret; + } + string policy = role->get_assume_role_policy(); + buffer::list bl = buffer::list::static_from_string(policy); + + //Parse the policy + //TODO - This step should be part of Role Creation + try { + const rgw::IAM::Policy p(s->cct, s->user->get_tenant(), bl, false); + if (!s->principal_tags.empty()) { + auto res = p.eval(s->env, *s->auth.identity, rgw::IAM::stsTagSession, boost::none); + if (res != rgw::IAM::Effect::Allow) { + ldout(s->cct, 0) << "evaluating policy for stsTagSession returned deny/pass" << dendl; + return -EPERM; + } + } + uint64_t op; + if (get_type() == RGW_STS_ASSUME_ROLE_WEB_IDENTITY) { + op = rgw::IAM::stsAssumeRoleWithWebIdentity; + } else { + op = rgw::IAM::stsAssumeRole; + } + + auto res = p.eval(s->env, *s->auth.identity, op, boost::none); + if (res != rgw::IAM::Effect::Allow) { + ldout(s->cct, 0) << "evaluating policy for op: " << op << " returned deny/pass" << dendl; + return -EPERM; + } + } catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 0) << "failed to parse policy: " << e.what() << dendl; + return -EPERM; + } + + return 0; +} + +void RGWREST_STS::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWSTSGetSessionToken::verify_permission(optional_yield y) +{ + rgw::Partition partition = rgw::Partition::aws; + rgw::Service service = rgw::Service::s3; + if (!verify_user_permission(this, + s, + rgw::ARN(partition, service, "", s->user->get_tenant(), ""), + rgw::IAM::stsGetSessionToken)) { + 
ldpp_dout(this, 0) << "User does not have permssion to perform GetSessionToken" << dendl; + return -EACCES; + } + + return 0; +} + +int RGWSTSGetSessionToken::get_params() +{ + duration = s->info.args.get("DurationSeconds"); + serialNumber = s->info.args.get("SerialNumber"); + tokenCode = s->info.args.get("TokenCode"); + + if (! duration.empty()) { + string err; + uint64_t duration_in_secs = strict_strtoll(duration.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 0) << "Invalid value of input duration: " << duration << dendl; + return -EINVAL; + } + + if (duration_in_secs < STS::GetSessionTokenRequest::getMinDuration() || + duration_in_secs > s->cct->_conf->rgw_sts_max_session_duration) { + ldpp_dout(this, 0) << "Invalid duration in secs: " << duration_in_secs << dendl; + return -EINVAL; + } + } + + return 0; +} + +void RGWSTSGetSessionToken::execute(optional_yield y) +{ + if (op_ret = get_params(); op_ret < 0) { + return; + } + + STS::STSService sts(s->cct, driver, s->user->get_id(), s->auth.identity.get()); + + STS::GetSessionTokenRequest req(duration, serialNumber, tokenCode); + const auto& [ret, creds] = sts.getSessionToken(this, req); + op_ret = std::move(ret); + //Dump the output + if (op_ret == 0) { + s->formatter->open_object_section("GetSessionTokenResponse"); + s->formatter->open_object_section("GetSessionTokenResult"); + s->formatter->open_object_section("Credentials"); + creds.dump(s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWSTSAssumeRoleWithWebIdentity::get_params() +{ + duration = s->info.args.get("DurationSeconds"); + providerId = s->info.args.get("ProviderId"); + policy = s->info.args.get("Policy"); + roleArn = s->info.args.get("RoleArn"); + roleSessionName = s->info.args.get("RoleSessionName"); + iss = s->info.args.get("provider_id"); + sub = s->info.args.get("sub"); + aud = s->info.args.get("aud"); + + if (roleArn.empty() || roleSessionName.empty() || 
sub.empty() || aud.empty()) { + ldpp_dout(this, 0) << "ERROR: one of role arn or role session name or token is empty" << dendl; + return -EINVAL; + } + + if (! policy.empty()) { + bufferlist bl = bufferlist::static_from_string(policy); + try { + const rgw::IAM::Policy p( + s->cct, s->user->get_tenant(), bl, + s->cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + } + catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << "policy" << policy << dendl; + s->err.message = e.what(); + return -ERR_MALFORMED_DOC; + } + } + + return 0; +} + +void RGWSTSAssumeRoleWithWebIdentity::execute(optional_yield y) +{ + if (op_ret = get_params(); op_ret < 0) { + return; + } + + STS::AssumeRoleWithWebIdentityRequest req(s->cct, duration, providerId, policy, roleArn, + roleSessionName, iss, sub, aud, s->principal_tags); + STS::AssumeRoleWithWebIdentityResponse response = sts.assumeRoleWithWebIdentity(this, req); + op_ret = std::move(response.assumeRoleResp.retCode); + + //Dump the output + if (op_ret == 0) { + s->formatter->open_object_section("AssumeRoleWithWebIdentityResponse"); + s->formatter->open_object_section("AssumeRoleWithWebIdentityResult"); + encode_json("SubjectFromWebIdentityToken", response.sub , s->formatter); + encode_json("Audience", response.aud , s->formatter); + s->formatter->open_object_section("AssumedRoleUser"); + response.assumeRoleResp.user.dump(s->formatter); + s->formatter->close_section(); + s->formatter->open_object_section("Credentials"); + response.assumeRoleResp.creds.dump(s->formatter); + s->formatter->close_section(); + encode_json("Provider", response.providerId , s->formatter); + encode_json("PackedPolicySize", response.assumeRoleResp.packedPolicySize , s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWSTSAssumeRole::get_params() +{ + duration = s->info.args.get("DurationSeconds"); + externalId = s->info.args.get("ExternalId"); + policy = 
s->info.args.get("Policy"); + roleArn = s->info.args.get("RoleArn"); + roleSessionName = s->info.args.get("RoleSessionName"); + serialNumber = s->info.args.get("SerialNumber"); + tokenCode = s->info.args.get("TokenCode"); + + if (roleArn.empty() || roleSessionName.empty()) { + ldpp_dout(this, 0) << "ERROR: one of role arn or role session name is empty" << dendl; + return -EINVAL; + } + + if (! policy.empty()) { + bufferlist bl = bufferlist::static_from_string(policy); + try { + const rgw::IAM::Policy p( + s->cct, s->user->get_tenant(), bl, + s->cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + } + catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 0) << "failed to parse policy: " << e.what() << "policy" << policy << dendl; + s->err.message = e.what(); + return -ERR_MALFORMED_DOC; + } + } + + return 0; +} + +void RGWSTSAssumeRole::execute(optional_yield y) +{ + if (op_ret = get_params(); op_ret < 0) { + return; + } + + STS::AssumeRoleRequest req(s->cct, duration, externalId, policy, roleArn, + roleSessionName, serialNumber, tokenCode); + STS::AssumeRoleResponse response = sts.assumeRole(s, req, y); + op_ret = std::move(response.retCode); + //Dump the output + if (op_ret == 0) { + s->formatter->open_object_section("AssumeRoleResponse"); + s->formatter->open_object_section("AssumeRoleResult"); + s->formatter->open_object_section("Credentials"); + response.creds.dump(s->formatter); + s->formatter->close_section(); + s->formatter->open_object_section("AssumedRoleUser"); + response.user.dump(s->formatter); + s->formatter->close_section(); + encode_json("PackedPolicySize", response.packedPolicySize , s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGW_Auth_STS::authorize(const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const rgw::auth::StrategyRegistry& auth_registry, + req_state *s, optional_yield y) +{ + return rgw::auth::Strategy::apply(dpp, auth_registry.get_sts(), s, y); +} + +using 
op_generator = RGWOp*(*)(); +static const std::unordered_map op_generators = { + {"AssumeRole", []() -> RGWOp* {return new RGWSTSAssumeRole;}}, + {"GetSessionToken", []() -> RGWOp* {return new RGWSTSGetSessionToken;}}, + {"AssumeRoleWithWebIdentity", []() -> RGWOp* {return new RGWSTSAssumeRoleWithWebIdentity;}} +}; + +bool RGWHandler_REST_STS::action_exists(const req_state* s) +{ + if (s->info.args.exists("Action")) { + const std::string action_name = s->info.args.get("Action"); + return op_generators.contains(action_name); + } + return false; +} + +RGWOp *RGWHandler_REST_STS::op_post() +{ + if (s->info.args.exists("Action")) { + const std::string action_name = s->info.args.get("Action"); + const auto action_it = op_generators.find(action_name); + if (action_it != op_generators.end()) { + return action_it->second(); + } + ldpp_dout(s, 10) << "unknown action '" << action_name << "' for STS handler" << dendl; + } else { + ldpp_dout(s, 10) << "missing action argument in STS handler" << dendl; + } + return nullptr; +} + +int RGWHandler_REST_STS::init(rgw::sal::Driver* driver, + req_state *s, + rgw::io::BasicClient *cio) +{ + s->dialect = "sts"; + s->prot_flags = RGW_REST_STS; + + return RGWHandler_REST::init(driver, s, cio); +} + +int RGWHandler_REST_STS::authorize(const DoutPrefixProvider* dpp, optional_yield y) +{ + if (s->info.args.exists("Action") && s->info.args.get("Action") == "AssumeRoleWithWebIdentity") { + return RGW_Auth_STS::authorize(dpp, driver, auth_registry, s, y); + } + return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y); +} + +RGWHandler_REST* +RGWRESTMgr_STS::get_handler(rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + return new RGWHandler_REST_STS(auth_registry); +} diff --git a/src/rgw/rgw_rest_sts.h b/src/rgw/rgw_rest_sts.h new file mode 100644 index 000000000..ec15de245 --- /dev/null +++ b/src/rgw/rgw_rest_sts.h @@ -0,0 +1,235 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_auth.h" +#include "rgw_auth_filters.h" +#include "rgw_rest.h" +#include "rgw_sts.h" +#include "rgw_web_idp.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#include "jwt-cpp/jwt.h" +#pragma clang diagnostic pop +#pragma GCC diagnostic pop +#include "rgw_oidc_provider.h" + + +namespace rgw::auth::sts { + +class WebTokenEngine : public rgw::auth::Engine { + static constexpr std::string_view princTagsNamespace = "https://aws.amazon.com/tags"; + CephContext* const cct; + rgw::sal::Driver* driver; + + using result_t = rgw::auth::Engine::result_t; + using Pair = std::pair; + using token_t = std::unordered_multimap; + using principal_tags_t = std::set; + + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::WebIdentityApplier::Factory* const apl_factory; + + bool is_applicable(const std::string& token) const noexcept; + + bool is_client_id_valid(std::vector& client_ids, const std::string& client_id) const; + + bool is_cert_valid(const std::vector& thumbprints, const std::string& cert) const; + + std::unique_ptr get_provider(const DoutPrefixProvider *dpp, const std::string& role_arn, const std::string& iss) const; + + std::string get_role_tenant(const std::string& role_arn) const; + + std::string get_role_name(const string& role_arn) const; + + std::string get_cert_url(const std::string& iss, const DoutPrefixProvider *dpp,optional_yield y) const; + + std::tuple, boost::optional> + get_from_jwt(const DoutPrefixProvider* dpp, const std::string& token, const req_state* const s, optional_yield y) const; + + void validate_signature (const DoutPrefixProvider* dpp, const jwt::decoded_jwt& decoded, const std::string& algorithm, const std::string& iss, const std::vector& thumbprints, optional_yield 
y) const; + + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s, optional_yield y) const; + + template + void recurse_and_insert(const string& key, const jwt::claim& c, T& t) const; + WebTokenEngine::token_t get_token_claims(const jwt::decoded_jwt& decoded) const; + +public: + WebTokenEngine(CephContext* const cct, + rgw::sal::Driver* driver, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::WebIdentityApplier::Factory* const apl_factory) + : cct(cct), + driver(driver), + extractor(extractor), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::sts::WebTokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const override { + return authenticate(dpp, extractor->get_token(s), s, y); + } +}; /* class WebTokenEngine */ + +class DefaultStrategy : public rgw::auth::Strategy, + public rgw::auth::TokenExtractor, + public rgw::auth::WebIdentityApplier::Factory { + rgw::sal::Driver* driver; + const ImplicitTenants& implicit_tenant_context; + + /* The engine. */ + const WebTokenEngine web_token_engine; + + using aplptr_t = rgw::auth::IdentityApplier::aplptr_t; + + /* The method implements TokenExtractor for Web Token in req_state. 
*/ + std::string get_token(const req_state* const s) const override { + return s->info.args.get("WebIdentityToken"); + } + + aplptr_t create_apl_web_identity( CephContext* cct, + const req_state* s, + const std::string& role_session, + const std::string& role_tenant, + const std::unordered_multimap& token, + boost::optional> role_tags, + boost::optional>> principal_tags) const override { + auto apl = rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::WebIdentityApplier(cct, driver, role_session, role_tenant, token, role_tags, principal_tags)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + DefaultStrategy(CephContext* const cct, + const ImplicitTenants& implicit_tenant_context, + rgw::sal::Driver* driver) + : driver(driver), + implicit_tenant_context(implicit_tenant_context), + web_token_engine(cct, driver, + static_cast(this), + static_cast(this)) { + /* When the constructor's body is being executed, all member engines + * should be initialized. Thus, we can safely add them. 
*/ + using Control = rgw::auth::Strategy::Control; + add_engine(Control::SUFFICIENT, web_token_engine); + } + + const char* get_name() const noexcept override { + return "rgw::auth::sts::DefaultStrategy"; + } +}; + +} // namespace rgw::auth::sts + +class RGWREST_STS : public RGWRESTOp { +protected: + STS::STSService sts; +public: + RGWREST_STS() = default; + int verify_permission(optional_yield y) override; + void send_response() override; +}; + +class RGWSTSAssumeRoleWithWebIdentity : public RGWREST_STS { +protected: + std::string duration; + std::string providerId; + std::string policy; + std::string roleArn; + std::string roleSessionName; + std::string sub; + std::string aud; + std::string iss; +public: + RGWSTSAssumeRoleWithWebIdentity() = default; + void execute(optional_yield y) override; + int get_params(); + const char* name() const override { return "assume_role_web_identity"; } + RGWOpType get_type() override { return RGW_STS_ASSUME_ROLE_WEB_IDENTITY; } +}; + +class RGWSTSAssumeRole : public RGWREST_STS { +protected: + std::string duration; + std::string externalId; + std::string policy; + std::string roleArn; + std::string roleSessionName; + std::string serialNumber; + std::string tokenCode; +public: + RGWSTSAssumeRole() = default; + void execute(optional_yield y) override; + int get_params(); + const char* name() const override { return "assume_role"; } + RGWOpType get_type() override { return RGW_STS_ASSUME_ROLE; } +}; + +class RGWSTSGetSessionToken : public RGWREST_STS { +protected: + std::string duration; + std::string serialNumber; + std::string tokenCode; +public: + RGWSTSGetSessionToken() = default; + void execute(optional_yield y) override; + int verify_permission(optional_yield y) override; + int get_params(); + const char* name() const override { return "get_session_token"; } + RGWOpType get_type() override { return RGW_STS_GET_SESSION_TOKEN; } +}; + +class RGW_Auth_STS { +public: + static int authorize(const DoutPrefixProvider *dpp, + 
rgw::sal::Driver* driver, + const rgw::auth::StrategyRegistry& auth_registry, + req_state *s, optional_yield y); +}; + +class RGWHandler_REST_STS : public RGWHandler_REST { + const rgw::auth::StrategyRegistry& auth_registry; + RGWOp *op_post() override; +public: + + static bool action_exists(const req_state* s); + + RGWHandler_REST_STS(const rgw::auth::StrategyRegistry& auth_registry) + : RGWHandler_REST(), + auth_registry(auth_registry) {} + ~RGWHandler_REST_STS() override = default; + + int init(rgw::sal::Driver* driver, + req_state *s, + rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider* dpp, optional_yield y) override; + int postauth_init(optional_yield y) override { return 0; } +}; + +class RGWRESTMgr_STS : public RGWRESTMgr { +public: + RGWRESTMgr_STS() = default; + ~RGWRESTMgr_STS() override = default; + + RGWRESTMgr *get_resource_mgr(req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry&, + const std::string&) override; +}; + diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc new file mode 100644 index 000000000..ee943ea44 --- /dev/null +++ b/src/rgw/rgw_rest_swift.cc @@ -0,0 +1,3114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include "include/ceph_assert.h" +#include "ceph_ver.h" + +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rest_swift.h" +#include "rgw_acl_swift.h" +#include "rgw_cors_swift.h" +#include "rgw_formats.h" +#include "rgw_client_io.h" +#include "rgw_compression.h" + +#include "rgw_auth.h" +#include "rgw_auth_registry.h" +#include "rgw_swift_auth.h" + +#include "rgw_request.h" +#include "rgw_process.h" + +#include "rgw_zone.h" +#include "rgw_sal.h" + +#include 
"services/svc_zone.h" + +#include +#include +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int RGWListBuckets_ObjStore_SWIFT::get_params(optional_yield y) +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + end_marker = s->info.args.get("end_marker"); + wants_reversed = s->info.args.exists("reverse"); + + if (wants_reversed) { + std::swap(marker, end_marker); + } + + std::string limit_str = s->info.args.get("limit"); + if (!limit_str.empty()) { + std::string err; + long l = strict_strtol(limit_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + + if (l > (long)limit_max || l < 0) { + return -ERR_PRECONDITION_FAILED; + } + + limit = (uint64_t)l; + } + + if (s->cct->_conf->rgw_swift_need_stats) { + bool stats, exists; + int r = s->info.args.get_bool("stats", &stats, &exists); + + if (r < 0) { + return r; + } + + if (exists) { + need_stats = stats; + } + } else { + need_stats = false; + } + + return 0; +} + +static void dump_account_metadata(req_state * const s, + const RGWUsageStats& global_stats, + const std::map &policies_stats, + /* const */map& attrs, + const RGWQuotaInfo& quota, + const RGWAccessControlPolicy_SWIFTAcct &policy) +{ + /* Adding X-Timestamp to keep align with Swift API */ + dump_header(s, "X-Timestamp", ceph_clock_now()); + + dump_header(s, "X-Account-Container-Count", global_stats.buckets_count); + dump_header(s, "X-Account-Object-Count", global_stats.objects_count); + dump_header(s, "X-Account-Bytes-Used", global_stats.bytes_used); + dump_header(s, "X-Account-Bytes-Used-Actual", global_stats.bytes_used_rounded); + + for (const auto& kv : policies_stats) { + const auto& policy_name = camelcase_dash_http_attr(kv.first); + const auto& policy_stats = kv.second; + + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Container-Count", policy_stats.buckets_count); + dump_header_infixed(s, 
"X-Account-Storage-Policy-", policy_name, + "-Object-Count", policy_stats.objects_count); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Bytes-Used", policy_stats.bytes_used); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Bytes-Used-Actual", policy_stats.bytes_used_rounded); + } + + /* Dump TempURL-related stuff */ + if (s->perm_mask == RGW_PERM_FULL_CONTROL) { + auto iter = s->user->get_info().temp_url_keys.find(0); + if (iter != std::end(s->user->get_info().temp_url_keys) && ! iter->second.empty()) { + dump_header(s, "X-Account-Meta-Temp-Url-Key", iter->second); + } + + iter = s->user->get_info().temp_url_keys.find(1); + if (iter != std::end(s->user->get_info().temp_url_keys) && ! iter->second.empty()) { + dump_header(s, "X-Account-Meta-Temp-Url-Key-2", iter->second); + } + } + + /* Dump quota headers. */ + if (quota.enabled) { + if (quota.max_size >= 0) { + dump_header(s, "X-Account-Meta-Quota-Bytes", quota.max_size); + } + + /* Limit on the number of objects in a given account is a RadosGW's + * extension. Swift's account quota WSGI filter doesn't support it. */ + if (quota.max_objects >= 0) { + dump_header(s, "X-Account-Meta-Quota-Count", quota.max_objects); + } + } + + /* Dump user-defined metadata items and generic attrs. 
*/ + const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1; + map::iterator iter; + for (iter = attrs.lower_bound(RGW_ATTR_PREFIX); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::const_iterator geniter = rgw_to_http_attrs.find(name); + + if (geniter != rgw_to_http_attrs.end()) { + dump_header(s, geniter->second, iter->second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) { + dump_header_prefixed(s, "X-Account-Meta-", + camelcase_dash_http_attr(name + PREFIX_LEN), + iter->second); + } + } + + /* Dump account ACLs */ + auto account_acls = policy.to_str(); + if (account_acls) { + dump_header(s, "X-Account-Access-Control", std::move(*account_acls)); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets) +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } else if (!has_buckets && s->format == RGWFormat::PLAIN) { + op_ret = STATUS_NO_CONTENT; + set_req_state_err(s, op_ret); + } + + if (! s->cct->_conf->rgw_swift_enforce_content_length) { + /* Adding account stats in the header to keep align with Swift API */ + dump_account_metadata(s, + global_stats, + policies_stats, + s->user->get_attrs(), + s->user->get_info().quota.user_quota, + static_cast(*s->user_acl)); + dump_errno(s); + dump_header(s, "Accept-Ranges", "bytes"); + end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true); + } + + if (! op_ret) { + dump_start(s); + s->formatter->open_array_section_with_attrs("account", + FormatterAttrs("name", s->user->get_display_name().c_str(), NULL)); + + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_SWIFT::handle_listing_chunk(rgw::sal::BucketList&& buckets) +{ + if (wants_reversed) { + /* Just store in the reversal buffer. Its content will be handled later, + * in send_response_end(). 
*/ + reverse_buffer.emplace(std::begin(reverse_buffer), std::move(buckets)); + } else { + return send_response_data(buckets); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_data(rgw::sal::BucketList& buckets) +{ + if (! sent_data) { + return; + } + + /* Take care of the prefix parameter of Swift API. There is no business + * in applying the filter earlier as we really need to go through all + * entries regardless of it (the headers like X-Account-Container-Count + * aren't affected by specifying prefix). */ + const auto& m = buckets.get_buckets(); + for (auto iter = m.lower_bound(prefix); + iter != m.end() && boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + dump_bucket_entry(*iter->second); + } +} + +void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const rgw::sal::Bucket& bucket) +{ + s->formatter->open_object_section("container"); + s->formatter->dump_string("name", bucket.get_name()); + + if (need_stats) { + s->formatter->dump_int("count", bucket.get_count()); + s->formatter->dump_int("bytes", bucket.get_size()); + } + + s->formatter->close_section(); + + if (! s->cct->_conf->rgw_swift_enforce_content_length) { + rgw_flush_formatter(s, s->formatter); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_data_reversed(rgw::sal::BucketList& buckets) +{ + if (! sent_data) { + return; + } + + /* Take care of the prefix parameter of Swift API. There is no business + * in applying the filter earlier as we really need to go through all + * entries regardless of it (the headers like X-Account-Container-Count + * aren't affected by specifying prefix). 
*/ + auto& m = buckets.get_buckets(); + + auto iter = m.rbegin(); + for (/* initialized above */; + iter != m.rend() && !boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + /* NOP */; + } + + for (/* iter carried */; + iter != m.rend() && boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + dump_bucket_entry(*iter->second); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_end() +{ + if (wants_reversed) { + for (auto& buckets : reverse_buffer) { + send_response_data_reversed(buckets); + } + } + + if (sent_data) { + s->formatter->close_section(); + } + + if (s->cct->_conf->rgw_swift_enforce_content_length) { + /* Adding account stats in the header to keep align with Swift API */ + dump_account_metadata(s, + global_stats, + policies_stats, + s->user->get_attrs(), + s->user->get_info().quota.user_quota, + static_cast(*s->user_acl)); + dump_errno(s); + end_header(s, nullptr, nullptr, s->formatter->get_len(), true); + } + + if (sent_data || s->cct->_conf->rgw_swift_enforce_content_length) { + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWListBucket_ObjStore_SWIFT::get_params(optional_yield y) +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + end_marker = s->info.args.get("end_marker"); + max_keys = s->info.args.get("limit"); + + // non-standard + s->info.args.get_bool("allow_unordered", &allow_unordered, false); + + delimiter = s->info.args.get("delimiter"); + + op_ret = parse_max_keys(); + if (op_ret < 0) { + return op_ret; + } + // S3 behavior is to silently cap the max-keys. + // Swift behavior is to abort. 
+ if (max > default_max) + return -ERR_PRECONDITION_FAILED; + + string path_args; + if (s->info.args.exists("path")) { // should handle empty path + path_args = s->info.args.get("path"); + if (!delimiter.empty() || !prefix.empty()) { + return -EINVAL; + } + prefix = path_args; + delimiter="/"; + + path = prefix; + if (path.size() && path[path.size() - 1] != '/') + path.append("/"); + + int len = prefix.size(); + int delim_size = delimiter.size(); + + if (len >= delim_size) { + if (prefix.substr(len - delim_size).compare(delimiter) != 0) + prefix.append(delimiter); + } + } + + return 0; +} + +static void dump_container_metadata(req_state *, + const rgw::sal::Bucket*, + const RGWQuotaInfo&, + const RGWBucketWebsiteConf&); + +void RGWListBucket_ObjStore_SWIFT::send_response() +{ + vector::iterator iter = objs.begin(); + map::iterator pref_iter = common_prefixes.begin(); + + dump_start(s); + dump_container_metadata(s, s->bucket.get(), quota.bucket_quota, + s->bucket->get_info().website_conf); + + s->formatter->open_array_section_with_attrs("container", + FormatterAttrs("name", + s->bucket->get_name().c_str(), + NULL)); + + while (iter != objs.end() || pref_iter != common_prefixes.end()) { + bool do_pref = false; + bool do_objs = false; + rgw_obj_key key; + if (iter != objs.end()) { + key = iter->key; + } + if (pref_iter == common_prefixes.end()) + do_objs = true; + else if (iter == objs.end()) + do_pref = true; + else if (!key.empty() && key.name.compare(pref_iter->first) == 0) { + do_objs = true; + ++pref_iter; + } else if (!key.empty() && key.name.compare(pref_iter->first) <= 0) + do_objs = true; + else + do_pref = true; + + if (do_objs && (allow_unordered || marker.empty() || marker < key)) { + if (key.name.compare(path) == 0) + goto next; + + s->formatter->open_object_section("object"); + s->formatter->dump_string("name", key.name); + s->formatter->dump_string("hash", iter->meta.etag); + s->formatter->dump_int("bytes", iter->meta.accounted_size); + if 
(!iter->meta.user_data.empty()) + s->formatter->dump_string("user_custom_data", iter->meta.user_data); + string single_content_type = iter->meta.content_type; + if (iter->meta.content_type.size()) { + // content type might hold multiple values, just dump the last one + ssize_t pos = iter->meta.content_type.rfind(','); + if (pos > 0) { + ++pos; + while (single_content_type[pos] == ' ') + ++pos; + single_content_type = single_content_type.substr(pos); + } + s->formatter->dump_string("content_type", single_content_type); + } + dump_time(s, "last_modified", iter->meta.mtime); + s->formatter->close_section(); + } + + if (do_pref && (marker.empty() || pref_iter->first.compare(marker.name) > 0)) { + const string& name = pref_iter->first; + if (name.compare(delimiter) == 0) + goto next; + + s->formatter->open_object_section_with_attrs("subdir", FormatterAttrs("name", name.c_str(), NULL)); + + /* swift is a bit inconsistent here */ + switch (s->format) { + case RGWFormat::XML: + s->formatter->dump_string("name", name); + break; + default: + s->formatter->dump_string("subdir", name); + } + s->formatter->close_section(); + } +next: + if (do_objs) + ++iter; + else + ++pref_iter; + } + + s->formatter->close_section(); + + int64_t content_len = 0; + if (! 
op_ret) { + content_len = s->formatter->get_len(); + if (content_len == 0) { + op_ret = STATUS_NO_CONTENT; + } + } else if (op_ret > 0) { + op_ret = 0; + } + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, NULL, content_len); + if (op_ret < 0) { + return; + } + + rgw_flush_formatter_and_reset(s, s->formatter); +} // RGWListBucket_ObjStore_SWIFT::send_response + +static void dump_container_metadata(req_state *s, + const rgw::sal::Bucket* bucket, + const RGWQuotaInfo& quota, + const RGWBucketWebsiteConf& ws_conf) +{ + /* Adding X-Timestamp to keep align with Swift API */ + dump_header(s, "X-Timestamp", utime_t(s->bucket->get_info().creation_time)); + + dump_header(s, "X-Container-Object-Count", bucket->get_count()); + dump_header(s, "X-Container-Bytes-Used", bucket->get_size()); + dump_header(s, "X-Container-Bytes-Used-Actual", bucket->get_size_rounded()); + + if (rgw::sal::Object::empty(s->object.get())) { + auto swift_policy = \ + static_cast(s->bucket_acl.get()); + std::string read_acl, write_acl; + swift_policy->to_str(read_acl, write_acl); + + if (read_acl.size()) { + dump_header(s, "X-Container-Read", read_acl); + } + if (write_acl.size()) { + dump_header(s, "X-Container-Write", write_acl); + } + if (!s->bucket->get_placement_rule().name.empty()) { + dump_header(s, "X-Storage-Policy", s->bucket->get_placement_rule().name); + } + dump_header(s, "X-Storage-Class", s->bucket->get_placement_rule().get_storage_class()); + + /* Dump user-defined metadata items and generic attrs. 
*/ + const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1; + map::iterator iter; + for (iter = s->bucket_attrs.lower_bound(RGW_ATTR_PREFIX); + iter != s->bucket_attrs.end(); + ++iter) { + const char *name = iter->first.c_str(); + map::const_iterator geniter = rgw_to_http_attrs.find(name); + + if (geniter != rgw_to_http_attrs.end()) { + dump_header(s, geniter->second, iter->second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) { + dump_header_prefixed(s, "X-Container-Meta-", + camelcase_dash_http_attr(name + PREFIX_LEN), + iter->second); + } + } + } + + /* Dump container versioning info. */ + if (! s->bucket->get_info().swift_ver_location.empty()) { + dump_header(s, "X-Versions-Location", + url_encode(s->bucket->get_info().swift_ver_location)); + } + + /* Dump quota headers. */ + if (quota.enabled) { + if (quota.max_size >= 0) { + dump_header(s, "X-Container-Meta-Quota-Bytes", quota.max_size); + } + + if (quota.max_objects >= 0) { + dump_header(s, "X-Container-Meta-Quota-Count", quota.max_objects); + } + } + + /* Dump Static Website headers. */ + if (! ws_conf.index_doc_suffix.empty()) { + dump_header(s, "X-Container-Meta-Web-Index", ws_conf.index_doc_suffix); + } + + if (! ws_conf.error_doc.empty()) { + dump_header(s, "X-Container-Meta-Web-Error", ws_conf.error_doc); + } + + if (! ws_conf.subdir_marker.empty()) { + dump_header(s, "X-Container-Meta-Web-Directory-Type", + ws_conf.subdir_marker); + } + + if (! ws_conf.listing_css_doc.empty()) { + dump_header(s, "X-Container-Meta-Web-Listings-CSS", + ws_conf.listing_css_doc); + } + + if (ws_conf.listing_enabled) { + dump_header(s, "X-Container-Meta-Web-Listings", "true"); + } + + /* Dump bucket's modification time. Compliance with the Swift API really + * needs that. 
*/ + dump_last_modified(s, s->bucket_mtime); +} + +void RGWStatAccount_ObjStore_SWIFT::execute(optional_yield y) +{ + RGWStatAccount_ObjStore::execute(y); + op_ret = s->user->read_attrs(s, s->yield); + attrs = s->user->get_attrs(); +} + +void RGWStatAccount_ObjStore_SWIFT::send_response() +{ + if (op_ret >= 0) { + op_ret = STATUS_NO_CONTENT; + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + s->user->get_info().quota.user_quota, + static_cast(*s->user_acl)); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, NULL, NULL, 0, true); + + dump_start(s); +} + +void RGWStatBucket_ObjStore_SWIFT::send_response() +{ + if (op_ret >= 0) { + op_ret = STATUS_NO_CONTENT; + dump_container_metadata(s, bucket.get(), quota.bucket_quota, + s->bucket->get_info().website_conf); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, this, NULL, 0, true); + dump_start(s); +} + +static int get_swift_container_settings(req_state * const s, + rgw::sal::Driver* const driver, + RGWAccessControlPolicy * const policy, + bool * const has_policy, + uint32_t * rw_mask, + RGWCORSConfiguration * const cors_config, + bool * const has_cors) +{ + const char * const read_list = s->info.env->get("HTTP_X_CONTAINER_READ"); + const char * const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE"); + + *has_policy = false; + + if (read_list || write_list) { + RGWAccessControlPolicy_SWIFT swift_policy(s->cct); + const auto r = swift_policy.create(s, driver, + s->user->get_id(), + s->user->get_display_name(), + read_list, + write_list, + *rw_mask); + if (r < 0) { + return r; + } + + *policy = swift_policy; + *has_policy = true; + } + + *has_cors = false; + + /*Check and update CORS configuration*/ + const char *allow_origins = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_ORIGIN"); + const char *allow_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_HEADERS"); + const char *expose_headers = 
s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_EXPOSE_HEADERS"); + const char *max_age = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_MAX_AGE"); + if (allow_origins) { + RGWCORSConfiguration_SWIFT *swift_cors = new RGWCORSConfiguration_SWIFT; + int r = swift_cors->create_update(allow_origins, allow_headers, expose_headers, max_age); + if (r < 0) { + ldpp_dout(s, 0) << "Error creating/updating the cors configuration" << dendl; + delete swift_cors; + return r; + } + *has_cors = true; + *cors_config = *swift_cors; + cors_config->dump(); + delete swift_cors; + } + + return 0; +} + +#define ACCT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_ACCOUNT_META_" +#define ACCT_PUT_ATTR_PREFIX "HTTP_X_ACCOUNT_META_" +#define CONT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_CONTAINER_META_" +#define CONT_PUT_ATTR_PREFIX "HTTP_X_CONTAINER_META_" + +static void get_rmattrs_from_headers(const req_state * const s, + const char * const put_prefix, + const char * const del_prefix, + set& rmattr_names) +{ + const size_t put_prefix_len = strlen(put_prefix); + const size_t del_prefix_len = strlen(del_prefix); + + for (const auto& kv : s->info.env->get_map()) { + size_t prefix_len = 0; + const char * const p = kv.first.c_str(); + + if (strncasecmp(p, del_prefix, del_prefix_len) == 0) { + /* Explicitly requested removal. */ + prefix_len = del_prefix_len; + } else if ((strncasecmp(p, put_prefix, put_prefix_len) == 0) + && kv.second.empty()) { + /* Removal requested by putting an empty value. */ + prefix_len = put_prefix_len; + } + + if (prefix_len > 0) { + string name(RGW_ATTR_META_PREFIX); + name.append(lowercase_dash_http_attr(p + prefix_len)); + rmattr_names.insert(name); + } + } +} + +static int get_swift_versioning_settings( + req_state * const s, + boost::optional& swift_ver_location) +{ + /* Removing the Swift's versions location has lower priority than setting + * a new one. That's the reason why we're handling it first. 
*/ + const std::string vlocdel = + s->info.env->get("HTTP_X_REMOVE_VERSIONS_LOCATION", ""); + if (vlocdel.size()) { + swift_ver_location = boost::in_place(std::string()); + } + + if (s->info.env->exists("HTTP_X_VERSIONS_LOCATION")) { + /* If the Swift's versioning is globally disabled but someone wants to + * enable it for a given container, new version of Swift will generate + * the precondition failed error. */ + if (! s->cct->_conf->rgw_swift_versioning_enabled) { + return -ERR_PRECONDITION_FAILED; + } + + swift_ver_location = s->info.env->get("HTTP_X_VERSIONS_LOCATION", ""); + } + + return 0; +} + +int RGWCreateBucket_ObjStore_SWIFT::get_params(optional_yield y) +{ + bool has_policy; + uint32_t policy_rw_mask = 0; + + int r = get_swift_container_settings(s, driver, &policy, &has_policy, + &policy_rw_mask, &cors_config, &has_cors); + if (r < 0) { + return r; + } + + if (!has_policy) { + policy.create_default(s->user->get_id(), s->user->get_display_name()); + } + + location_constraint = driver->get_zone()->get_zonegroup().get_api_name(); + get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, + CONT_REMOVE_ATTR_PREFIX, rmattr_names); + placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class); + + return get_swift_versioning_settings(s, swift_ver_location); +} + +static inline int handle_metadata_errors(req_state* const s, const int op_ret) +{ + if (op_ret == -EFBIG) { + /* Handle the custom error message of exceeding maximum custom attribute + * (stored as xattr) size. 
*/ + const auto error_message = boost::str( + boost::format("Metadata value longer than %lld") + % s->cct->_conf.get_val("rgw_max_attr_size")); + set_req_state_err(s, EINVAL, error_message); + return -EINVAL; + } else if (op_ret == -E2BIG) { + const auto error_message = boost::str( + boost::format("Too many metadata items; max %lld") + % s->cct->_conf.get_val("rgw_max_attrs_num_in_req")); + set_req_state_err(s, EINVAL, error_message); + return -EINVAL; + } + + return op_ret; +} + +void RGWCreateBucket_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_CREATED; + } else if (op_ret == -ERR_BUCKET_EXISTS) { + op_ret = STATUS_ACCEPTED; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + /* Propose ending HTTP header with 0 Content-Length header. */ + end_header(s, NULL, NULL, 0); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWDeleteBucket_ObjStore_SWIFT::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this, NULL, 0); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static int get_delete_at_param(req_state *s, boost::optional &delete_at) +{ + /* Handle Swift object expiration. */ + real_time delat_proposal; + string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", ""); + + if (x_delete.empty()) { + x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + } else { + /* X-Delete-After HTTP is present. It means we need add its value + * to the current time. 
*/ + delat_proposal = real_clock::now(); + } + + if (x_delete.empty()) { + delete_at = boost::none; + if (s->info.env->exists("HTTP_X_REMOVE_DELETE_AT")) { + delete_at = boost::in_place(real_time()); + } + return 0; + } + string err; + long ts = strict_strtoll(x_delete.c_str(), 10, &err); + + if (!err.empty()) { + return -EINVAL; + } + + delat_proposal += make_timespan(ts); + if (delat_proposal < real_clock::now()) { + return -EINVAL; + } + + delete_at = delat_proposal; + + return 0; +} + +int RGWPutObj_ObjStore_SWIFT::verify_permission(optional_yield y) +{ + op_ret = RGWPutObj_ObjStore::verify_permission(y); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWPutObj_ObjStore_SWIFT::update_slo_segment_size(rgw_slo_entry& entry) { + + int r = 0; + const string& path = entry.path; + + /* If the path starts with slashes, strip them all. 
*/ + const size_t pos_init = path.find_first_not_of('/'); + + if (pos_init == string::npos) { + return -EINVAL; + } + + const size_t pos_sep = path.find('/', pos_init); + if (pos_sep == string::npos) { + return -EINVAL; + } + + string bucket_name = path.substr(pos_init, pos_sep - pos_init); + string obj_name = path.substr(pos_sep + 1); + + std::unique_ptr bucket; + + if (bucket_name.compare(s->bucket->get_name()) != 0) { + r = driver->get_bucket(s, s->user.get(), s->user->get_id().tenant, bucket_name, &bucket, s->yield); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + } else { + bucket = s->bucket->clone(); + } + + /* fetch the stored size of the seg (or error if not valid) */ + std::unique_ptr slo_seg = bucket->get_object(rgw_obj_key(obj_name)); + + /* no prefetch */ + slo_seg->set_atomic(); + + bool compressed; + RGWCompressionInfo cs_info; + uint64_t size_bytes{0}; + + r = slo_seg->get_obj_attrs(s->yield, s); + if (r < 0) { + return r; + } + + size_bytes = slo_seg->get_obj_size(); + + r = rgw_compression_info_from_attrset(slo_seg->get_attrs(), compressed, cs_info); + if (r < 0) { + return -EIO; + } + + if (compressed) { + size_bytes = cs_info.orig_size; + } + + /* "When the PUT operation sees the multipart-manifest=put query + * parameter, it reads the request body and verifies that each + * segment object exists and that the sizes and ETags match. If + * there is a mismatch, the PUT operation fails." 
+ */ + if (entry.size_bytes && + (entry.size_bytes != size_bytes)) { + return -EINVAL; + } + + entry.size_bytes = size_bytes; + + return 0; +} /* RGWPutObj_ObjStore_SWIFT::update_slo_segment_sizes */ + +int RGWPutObj_ObjStore_SWIFT::get_params(optional_yield y) +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + if (!s->length) { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + if (!encoding || strcmp(encoding, "chunked") != 0) { + ldpp_dout(this, 20) << "neither length nor chunked encoding" << dendl; + return -ERR_LENGTH_REQUIRED; + } + + chunked_upload = true; + } + + supplied_etag = s->info.env->get("HTTP_ETAG"); + + if (!s->generic_attrs.count(RGW_ATTR_CONTENT_TYPE)) { + ldpp_dout(this, 5) << "content type wasn't provided, trying to guess" << dendl; + const char *suffix = strrchr(s->object->get_name().c_str(), '.'); + if (suffix) { + suffix++; + if (*suffix) { + string suffix_str(suffix); + const char *mime = rgw_find_mime_by_ext(suffix_str); + if (mime) { + s->generic_attrs[RGW_ATTR_CONTENT_TYPE] = mime; + } + } + } + } + + policy.create_default(s->user->get_id(), s->user->get_display_name()); + + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldpp_dout(this, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + if (!s->cct->_conf->rgw_swift_custom_header.empty()) { + string custom_header = s->cct->_conf->rgw_swift_custom_header; + auto data = s->info.env->get_optional(custom_header); + if (data) { + user_data = *data; + } + } + + dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + bool exists; + string multipart_manifest = s->info.args.get("multipart-manifest", &exists); + if (exists) { + if (multipart_manifest != "put") { + ldpp_dout(this, 5) << "invalid multipart-manifest http param: " << multipart_manifest << dendl; + return -EINVAL; + } + +#define MAX_SLO_ENTRY_SIZE (1024 + 128) // 1024 - max obj name, 128 - enough extra for other info + uint64_t max_len = 
s->cct->_conf->rgw_max_slo_entries * MAX_SLO_ENTRY_SIZE; + + slo_info = new RGWSLOInfo; + + int r = 0; + std::tie(r, slo_info->raw_data) = rgw_rest_get_json_input_keep_data(s->cct, s, slo_info->entries, max_len); + if (r < 0) { + ldpp_dout(this, 5) << "failed to read input for slo r=" << r << dendl; + return r; + } + + if ((int64_t)slo_info->entries.size() > s->cct->_conf->rgw_max_slo_entries) { + ldpp_dout(this, 5) << "too many entries in slo request: " << slo_info->entries.size() << dendl; + return -EINVAL; + } + + MD5 etag_sum; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + etag_sum.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + uint64_t total_size = 0; + for (auto& entry : slo_info->entries) { + etag_sum.Update((const unsigned char *)entry.etag.c_str(), + entry.etag.length()); + + /* if size_bytes == 0, it should be replaced with the + * real segment size (which could be 0); this follows from the + * fact that Swift requires all segments to exist, but permits + * the size_bytes element to be omitted from the SLO manifest, see + * https://docs.openstack.org/swift/latest/api/large_objects.html + */ + r = update_slo_segment_size(entry); + if (r < 0) { + return r; + } + + total_size += entry.size_bytes; + + ldpp_dout(this, 20) << "slo_part: " << entry.path + << " size=" << entry.size_bytes + << " etag=" << entry.etag + << dendl; + } + complete_etag(etag_sum, &lo_etag); + slo_info->total_size = total_size; + + ofs = slo_info->raw_data.length(); + } + + return RGWPutObj_ObjStore::get_params(y); +} + +void RGWPutObj_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_CREATED; + } + set_req_state_err(s, op_ret); + } + + if (! 
lo_etag.empty()) { + /* Static Large Object of Swift API has two etags represented by + * following members: + * - etag - for the manifest itself (it will be stored in xattrs), + * - lo_etag - for the content composited from SLO's segments. + * The value is calculated basing on segments' etags. + * In response for PUT request we have to expose the second one. + * The first one may be obtained by GET with "multipart-manifest=get" + * in query string on a given SLO. */ + dump_etag(s, lo_etag, true /* quoted */); + } else { + dump_etag(s, etag); + } + + dump_last_modified(s, mtime); + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static int get_swift_account_settings(req_state * const s, + rgw::sal::Driver* const driver, + RGWAccessControlPolicy_SWIFTAcct* const policy, + bool * const has_policy) +{ + *has_policy = false; + + const char * const acl_attr = s->info.env->get("HTTP_X_ACCOUNT_ACCESS_CONTROL"); + if (acl_attr) { + RGWAccessControlPolicy_SWIFTAcct swift_acct_policy(s->cct); + const bool r = swift_acct_policy.create(s, driver, + s->user->get_id(), + s->user->get_display_name(), + string(acl_attr)); + if (r != true) { + return -EINVAL; + } + + *policy = swift_acct_policy; + *has_policy = true; + } + + return 0; +} + +int RGWPutMetadataAccount_ObjStore_SWIFT::get_params(optional_yield y) +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + int ret = get_swift_account_settings(s, + driver, + // FIXME: we need to carry unique_ptr in generic class + // and allocate appropriate ACL class in the ctor + static_cast(&policy), + &has_policy); + if (ret < 0) { + return ret; + } + + get_rmattrs_from_headers(s, ACCT_PUT_ATTR_PREFIX, ACCT_REMOVE_ATTR_PREFIX, + rmattr_names); + return 0; +} + +void RGWPutMetadataAccount_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + 
op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPutMetadataBucket_ObjStore_SWIFT::get_params(optional_yield y) +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + int r = get_swift_container_settings(s, driver, &policy, &has_policy, + &policy_rw_mask, &cors_config, &has_cors); + if (r < 0) { + return r; + } + + get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, CONT_REMOVE_ATTR_PREFIX, + rmattr_names); + placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class); + + return get_swift_versioning_settings(s, swift_ver_location); +} + +void RGWPutMetadataBucket_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret && (op_ret != -EINVAL)) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPutMetadataObject_ObjStore_SWIFT::get_params(optional_yield y) +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + /* Handle Swift object expiration. 
*/ + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldpp_dout(this, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + + return 0; +} + +void RGWPutMetadataObject_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_ACCEPTED; + } + set_req_state_err(s, op_ret); + } + + if (!s->is_err()) { + dump_content_length(s, 0); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void bulkdelete_respond(const unsigned num_deleted, + const unsigned int num_unfound, + const std::list& failures, + const int prot_flags, /* in */ + ceph::Formatter& formatter) /* out */ +{ + formatter.open_object_section("delete"); + + string resp_status; + string resp_body; + + if (!failures.empty()) { + int reason = ERR_INVALID_REQUEST; + for (const auto& fail_desc : failures) { + if (-ENOENT != fail_desc.err && -EACCES != fail_desc.err) { + reason = fail_desc.err; + } + } + rgw_err err; + set_req_state_err(err, reason, prot_flags); + dump_errno(err, resp_status); + } else if (0 == num_deleted && 0 == num_unfound) { + /* 400 Bad Request */ + dump_errno(400, resp_status); + resp_body = "Invalid bulk delete."; + } else { + /* 200 OK */ + dump_errno(200, resp_status); + } + + encode_json("Number Deleted", num_deleted, &formatter); + encode_json("Number Not Found", num_unfound, &formatter); + encode_json("Response Body", resp_body, &formatter); + encode_json("Response Status", resp_status, &formatter); + + formatter.open_array_section("Errors"); + for (const auto& fail_desc : failures) { + formatter.open_array_section("object"); + + stringstream ss_name; + ss_name << fail_desc.path; + encode_json("Name", ss_name.str(), &formatter); + + rgw_err err; + set_req_state_err(err, fail_desc.err, prot_flags); + string 
status; + dump_errno(err, status); + encode_json("Status", status, &formatter); + formatter.close_section(); + } + formatter.close_section(); + + formatter.close_section(); +} + +int RGWDeleteObj_ObjStore_SWIFT::verify_permission(optional_yield y) +{ + op_ret = RGWDeleteObj_ObjStore::verify_permission(y); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWDeleteObj_ObjStore_SWIFT::get_params(optional_yield y) +{ + const string& mm = s->info.args.get("multipart-manifest"); + multipart_delete = (mm.compare("delete") == 0); + + return RGWDeleteObj_ObjStore::get_params(y); +} + +void RGWDeleteObj_ObjStore_SWIFT::send_response() +{ + int r = op_ret; + + if (multipart_delete) { + r = 0; + } else if(!r) { + r = STATUS_NO_CONTENT; + } + + set_req_state_err(s, r); + dump_errno(s); + + if (multipart_delete) { + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + + if (deleter) { + bulkdelete_respond(deleter->get_num_deleted(), + deleter->get_num_unfound(), + deleter->get_failures(), + s->prot_flags, + *s->formatter); + } else if (-ENOENT == op_ret) { + bulkdelete_respond(0, 1, {}, s->prot_flags, *s->formatter); + } else { + RGWBulkDelete::acct_path_t path; + path.bucket_name = s->bucket_name; + path.obj_key = s->object->get_key(); + + RGWBulkDelete::fail_desc_t fail_desc; + fail_desc.err = op_ret; + fail_desc.path = path; + + bulkdelete_respond(0, 0, { fail_desc }, s->prot_flags, *s->formatter); + } + } else { + end_header(s, this); + } + + rgw_flush_formatter_and_reset(s, s->formatter); + +} + +static void get_contype_from_attrs(map& attrs, + string& content_type) +{ + map::iterator iter = attrs.find(RGW_ATTR_CONTENT_TYPE); + if (iter != attrs.end()) { + content_type = rgw_bl_str(iter->second); + } 
+} + +static void dump_object_metadata(const DoutPrefixProvider* dpp, req_state * const s, + const map& attrs) +{ + map response_attrs; + + for (auto kv : attrs) { + const char * name = kv.first.c_str(); + const auto aiter = rgw_to_http_attrs.find(name); + + if (aiter != std::end(rgw_to_http_attrs)) { + response_attrs[aiter->second] = rgw_bl_str(kv.second); + } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) { + // this attr has an extra length prefix from encode() in prior versions + dump_header(s, "X-Object-Meta-Static-Large-Object", "True"); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, + sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + name += sizeof(RGW_ATTR_META_PREFIX) - 1; + dump_header_prefixed(s, "X-Object-Meta-", + camelcase_dash_http_attr(name), kv.second); + } + } + + /* Handle override and fallback for Content-Disposition HTTP header. + * At the moment this will be used only by TempURL of the Swift API. */ + const auto cditer = rgw_to_http_attrs.find(RGW_ATTR_CONTENT_DISP); + if (cditer != std::end(rgw_to_http_attrs)) { + const auto& name = cditer->second; + + if (!s->content_disp.override.empty()) { + response_attrs[name] = s->content_disp.override; + } else if (!s->content_disp.fallback.empty() + && response_attrs.find(name) == std::end(response_attrs)) { + response_attrs[name] = s->content_disp.fallback; + } + } + + for (const auto& kv : response_attrs) { + dump_header(s, kv.first, kv.second); + } + + const auto iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != std::end(attrs)) { + utime_t delete_at; + try { + decode(delete_at, iter->second); + if (!delete_at.is_zero()) { + dump_header(s, "X-Delete-At", delete_at.sec()); + } + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: cannot decode object's " RGW_ATTR_DELETE_AT + " attr, ignoring" + << dendl; + } + } +} + +int RGWCopyObj_ObjStore_SWIFT::init_dest_policy() +{ + dest_policy.create_default(s->user->get_id(), s->user->get_display_name()); + + return 0; +} + +int 
RGWCopyObj_ObjStore_SWIFT::get_params(optional_yield y) +{ + /* Raw precondition values as found in the request environment; nothing + * is parsed here. */ + if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_COPY_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_COPY_IF_NONE_MATCH"); + + /* HTTP_X_FRESH_METADATA equal to TRUE (case-insensitive) selects attribute + * replacement; any other value, or no value at all, selects merging. */ + const char * const fresh_meta = s->info.env->get("HTTP_X_FRESH_METADATA"); + if (fresh_meta && strcasecmp(fresh_meta, "TRUE") == 0) { + attrs_mod = rgw::sal::ATTRSMOD_REPLACE; + } else { + attrs_mod = rgw::sal::ATTRSMOD_MERGE; + } + + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldpp_dout(this, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + return 0; +} + +/* First call: set the status and send the response header. Later calls: + * dump the current offset as a progress marker. The formatter is flushed + * on every call. */ +void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs) +{ + if (! sent_header) { + if (! op_ret) + op_ret = STATUS_CREATED; + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + + /* Send progress information. Note that this diverges from the original Swift + * spec. We do this in order to keep the connection alive. + * NOTE(review): op_ret was replaced with STATUS_CREATED just above when it + * was zero, so -- assuming STATUS_CREATED is non-zero -- the op_ret == 0 + * test below cannot be true; confirm whether the "progress" section is + * ever opened. + */ + if (op_ret == 0) { + s->formatter->open_array_section("progress"); + } + sent_header = true; + } else { + s->formatter->dump_int("ofs", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +/* Emit the three X-Copied-From* headers describing the copy source. */ +void RGWCopyObj_ObjStore_SWIFT::dump_copy_info() +{ + /* Dump X-Copied-From: url-encoded source bucket name, a slash, then the + * url-encoded source object name. */ + dump_header(s, "X-Copied-From", url_encode(src_bucket->get_name()) + + "/" + url_encode(s->src_object->get_name())); + + /* Dump X-Copied-From-Account. */ + /* XXX tenant */ + dump_header(s, "X-Copied-From-Account", url_encode(s->user->get_id().id)); + + /* Dump X-Copied-From-Last-Modified. */ + dump_time_header(s, "X-Copied-From-Last-Modified", src_mtime); +} + +void RGWCopyObj_ObjStore_SWIFT::send_response() +{ + if (! sent_header) { + string content_type; + if (! 
op_ret) + op_ret = STATUS_CREATED; + set_req_state_err(s, op_ret); + dump_errno(s); + dump_etag(s, etag); + dump_last_modified(s, mtime); + dump_copy_info(); + get_contype_from_attrs(attrs, content_type); + dump_object_metadata(this, s, attrs); + end_header(s, this, !content_type.empty() ? content_type.c_str() + : "binary/octet-stream"); + } else { + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + } +} + +int RGWGetObj_ObjStore_SWIFT::verify_permission(optional_yield y) +{ + op_ret = RGWGetObj_ObjStore::verify_permission(y); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWGetObj_ObjStore_SWIFT::get_params(optional_yield y) +{ + const string& mm = s->info.args.get("multipart-manifest"); + skip_manifest = (mm.compare("get") == 0); + + return RGWGetObj_ObjStore::get_params(y); +} + +int RGWGetObj_ObjStore_SWIFT::send_response_data_error(optional_yield y) +{ + std::string error_content; + op_ret = error_handler(op_ret, &error_content, y); + if (! op_ret) { + /* The error handler has taken care of the error. */ + return 0; + } + + bufferlist error_bl; + error_bl.append(error_content); + return send_response_data(error_bl, 0, error_bl.length()); +} + +int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, + const off_t bl_ofs, + const off_t bl_len) +{ + string content_type; + + if (sent_header) { + goto send_data; + } + + if (custom_http_ret) { + set_req_state_err(s, 0); + dump_errno(s, custom_http_ret); + } else { + set_req_state_err(s, (partial_content && !op_ret) ? 
STATUS_PARTIAL_CONTENT + : op_ret); + dump_errno(s); + + if (s->is_err()) { + end_header(s, NULL); + return 0; + } + } + + if (range_str) { + dump_range(s, ofs, end, s->obj_size); + } + + if (s->is_err()) { + end_header(s, NULL); + return 0; + } + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + dump_header(s, "X-Timestamp", utime_t(lastmod)); + if (is_slo) { + dump_header(s, "X-Static-Large-Object", "True"); + } + + if (! op_ret) { + if (! lo_etag.empty()) { + dump_etag(s, lo_etag, true /* quoted */); + } else { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + dump_etag(s, iter->second.to_str()); + } + } + + get_contype_from_attrs(attrs, content_type); + dump_object_metadata(this, s, attrs); + } + + end_header(s, this, !content_type.empty() ? content_type.c_str() + : "binary/octet-stream"); + + sent_header = true; + +send_data: + if (get_data && !op_ret) { + const auto r = dump_body(s, bl.c_str() + bl_ofs, bl_len); + if (r < 0) { + return r; + } + } + rgw_flush_formatter_and_reset(s, s->formatter); + + return 0; +} + +void RGWOptionsCORS_ObjStore_SWIFT::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (op_ret == -ENOENT) + op_ret = -EACCES; + if (op_ret < 0) { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL); + return; + } + get_response_params(hdrs, exp_hdrs, &max_age); + dump_errno(s); + dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), + max_age); + end_header(s, NULL); +} + +int RGWBulkDelete_ObjStore_SWIFT::get_data( + list& items, bool * const is_truncated) +{ + constexpr size_t MAX_LINE_SIZE = 2048; + + RGWClientIOStreamBuf ciosb(static_cast(*(s->cio)), + size_t(s->cct->_conf->rgw_max_chunk_size)); + istream cioin(&ciosb); + + char buf[MAX_LINE_SIZE]; + while (cioin.getline(buf, 
sizeof(buf))) { + string path_str(buf); + + ldpp_dout(this, 20) << "extracted Bulk Delete entry: " << path_str << dendl; + + RGWBulkDelete::acct_path_t path; + + /* We need to skip all slashes at the beginning in order to preserve + * compliance with Swift. A line holding nothing but slashes (or nothing + * at all) produces no item. */ + const size_t start_pos = path_str.find_first_not_of('/'); + + if (string::npos != start_pos) { + /* Separator is the first slash after the leading ones: what precedes it + * is the bucket name, what follows is the object key. Both parts are + * URL-decoded. */ + const size_t sep_pos = path_str.find('/', start_pos); + + if (string::npos != sep_pos) { + path.bucket_name = url_decode(path_str.substr(start_pos, + sep_pos - start_pos)); + path.obj_key = url_decode(path_str.substr(sep_pos + 1)); + } else { + /* It's guaranteed here that bucket name is at least one character + * long and is different than slash. No separator was found, so + * obj_key is left as constructed. */ + path.bucket_name = url_decode(path_str.substr(start_pos)); + } + + items.push_back(path); + } + + /* A full chunk has been collected: stop and report truncation. */ + if (items.size() == MAX_CHUNK_ENTRIES) { + *is_truncated = true; + return 0; + } + } + + *is_truncated = false; + return 0; +} + +/* Send status and header, then the bulk-delete report built from the + * deleter's counters and failure list. */ +void RGWBulkDelete_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + + bulkdelete_respond(deleter->get_num_deleted(), + deleter->get_num_unfound(), + deleter->get_failures(), + s->prot_flags, + *s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +std::unique_ptr +RGWBulkUploadOp_ObjStore_SWIFT::create_stream() +{ + /* StreamGetter that pulls the request body through recv_body(). conlen is + * the length handed to the constructor; curpos counts the bytes consumed + * so far. */ + class SwiftStreamGetter : public StreamGetter { + const DoutPrefixProvider* dpp; + const size_t conlen; + size_t curpos; + req_state* const s; + + public: + SwiftStreamGetter(const DoutPrefixProvider* dpp, req_state* const s, const size_t conlen) + : dpp(dpp), + conlen(conlen), + curpos(0), + s(s) { + } + + ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override { + /* maximum requested by a caller */ + /* data provided by client */ + /* RadosGW's limit. 
*/ + const size_t max_chunk_size = \ + static_cast(s->cct->_conf->rgw_max_chunk_size); + const size_t max_to_read = std::min({ want, conlen - curpos, max_chunk_size }); + + ldpp_dout(dpp, 20) << "bulk_upload: get_at_most max_to_read=" + << max_to_read + << ", dst.c_str()=" << reinterpret_cast(dst.c_str()) << dendl; + + bufferptr bp(max_to_read); + const auto read_len = recv_body(s, bp.c_str(), max_to_read); + dst.append(bp, 0, read_len); + //const auto read_len = recv_body(s, dst.c_str(), max_to_read); + if (read_len < 0) { + return read_len; + } + + curpos += read_len; + return curpos > s->cct->_conf->rgw_max_put_size ? -ERR_TOO_LARGE + : read_len; + } + + ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override { + ldpp_dout(dpp, 20) << "bulk_upload: get_exactly want=" << want << dendl; + + /* FIXME: do this in a loop. */ + const auto ret = get_at_most(want, dst); + ldpp_dout(dpp, 20) << "bulk_upload: get_exactly ret=" << ret << dendl; + if (ret < 0) { + return ret; + } else if (static_cast(ret) != want) { + return -EINVAL; + } else { + return want; + } + } + }; + + if (! s->length) { + op_ret = -EINVAL; + return nullptr; + } else { + ldpp_dout(this, 20) << "bulk upload: create_stream for length=" + << s->length << dendl; + + const size_t conlen = atoll(s->length); + return std::unique_ptr(new SwiftStreamGetter(this, s, conlen)); + } +} + +void RGWBulkUploadOp_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + rgw_flush_formatter_and_reset(s, s->formatter); + + s->formatter->open_object_section("delete"); + + std::string resp_status; + std::string resp_body; + + if (! failures.empty()) { + rgw_err err; + + const auto last_err = { failures.back().err }; + if (boost::algorithm::contains(last_err, terminal_errors)) { + /* The terminal errors are affecting the status of the whole upload. 
*/ + set_req_state_err(err, failures.back().err, s->prot_flags); + } else { + set_req_state_err(err, ERR_INVALID_REQUEST, s->prot_flags); + } + + dump_errno(err, resp_status); + } else if (0 == num_created && failures.empty()) { + /* Nothing created, nothing failed. This means the archive contained no + * entity we could understand (regular file or directory). We need to + * send 400 Bad Request to an HTTP client in the internal status field. */ + dump_errno(400, resp_status); + resp_body = "Invalid Tar File: No Valid Files"; + } else { + /* 201 Created: no failure was recorded and num_created is non-zero. */ + dump_errno(201, resp_status); + } + + encode_json("Number Files Created", num_created, s->formatter); + encode_json("Response Body", resp_body, s->formatter); + encode_json("Response Status", resp_status, s->formatter); + + /* One entry per failure: its path plus the status string derived from + * its error code. */ + s->formatter->open_array_section("Errors"); + for (const auto& fail_desc : failures) { + s->formatter->open_array_section("object"); + + encode_json("Name", fail_desc.path, s->formatter); + + rgw_err err; + set_req_state_err(err, fail_desc.err, s->prot_flags); + std::string status; + dump_errno(err, status); + encode_json("Status", status, s->formatter); + + s->formatter->close_section(); + } + s->formatter->close_section(); + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +/* Send the rgw_cross_domain_policy config value, framed by fixed literals, + * as an application/xml body. */ +void RGWGetCrossDomainPolicy_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + std::stringstream ss; + + ss << R"()" << "\n" + << R"()" << "\n" + << R"()" << "\n" + << g_conf()->rgw_cross_domain_policy << "\n" + << R"()"; + + dump_body(s, ss.str()); +} + +/* Send status and header; a body is written only when op_ret is non-zero. */ +void RGWGetHealthCheck_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret) { + static constexpr char DISABLED[] = "DISABLED BY FILE"; + dump_body(s, DISABLED, strlen(DISABLED)); + } +} + +const vector> RGWInfo_ObjStore_SWIFT::swift_info = +{ + {"bulk_delete", {false, 
nullptr}}, + {"container_quotas", {false, nullptr}}, + {"swift", {false, RGWInfo_ObjStore_SWIFT::list_swift_data}}, + {"tempurl", { false, RGWInfo_ObjStore_SWIFT::list_tempurl_data}}, + {"slo", {false, RGWInfo_ObjStore_SWIFT::list_slo_data}}, + {"account_quotas", {false, nullptr}}, + {"staticweb", {false, nullptr}}, + {"tempauth", {false, RGWInfo_ObjStore_SWIFT::list_tempauth_data}}, +}; + +void RGWInfo_ObjStore_SWIFT::execute(optional_yield y) +{ + bool is_admin_info_enabled = false; + + const string& swiftinfo_sig = s->info.args.get("swiftinfo_sig"); + const string& swiftinfo_expires = s->info.args.get("swiftinfo_expires"); + + if (!swiftinfo_sig.empty() && + !swiftinfo_expires.empty() && + !is_expired(swiftinfo_expires, this)) { + is_admin_info_enabled = true; + } + + s->formatter->open_object_section("info"); + + for (const auto& pair : swift_info) { + if(!is_admin_info_enabled && pair.second.is_admin_info) + continue; + + if (!pair.second.list_data) { + s->formatter->open_object_section((pair.first).c_str()); + s->formatter->close_section(); + } + else { + pair.second.list_data(*(s->formatter), s->cct->_conf, driver); + } + } + + s->formatter->close_section(); +} + +void RGWInfo_ObjStore_SWIFT::send_response() +{ + if (op_ret < 0) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWInfo_ObjStore_SWIFT::list_swift_data(Formatter& formatter, + const ConfigProxy& config, + rgw::sal::Driver* driver) +{ + formatter.open_object_section("swift"); + formatter.dump_int("max_file_size", config->rgw_max_put_size); + formatter.dump_int("container_listing_limit", RGW_LIST_BUCKETS_LIMIT_MAX); + + string ceph_version(CEPH_GIT_NICE_VER); + formatter.dump_string("version", ceph_version); + + const size_t max_attr_name_len = \ + g_conf().get_val("rgw_max_attr_name_len"); + if (max_attr_name_len) { + const size_t meta_name_limit = \ + max_attr_name_len - 
strlen(RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX); + formatter.dump_int("max_meta_name_length", meta_name_limit); + } + + const size_t meta_value_limit = g_conf().get_val("rgw_max_attr_size"); + if (meta_value_limit) { + formatter.dump_int("max_meta_value_length", meta_value_limit); + } + + const size_t meta_num_limit = \ + g_conf().get_val("rgw_max_attrs_num_in_req"); + if (meta_num_limit) { + formatter.dump_int("max_meta_count", meta_num_limit); + } + + formatter.open_array_section("policies"); + const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup(); + + std::set targets; + zonegroup.get_placement_target_names(targets); + for (const auto& placement_targets : targets) { + formatter.open_object_section("policy"); + if (placement_targets.compare(zonegroup.get_default_placement_name()) == 0) + formatter.dump_bool("default", true); + formatter.dump_string("name", placement_targets.c_str()); + formatter.close_section(); + } + formatter.close_section(); + + formatter.dump_int("max_object_name_size", RGWHandler_REST::MAX_OBJ_NAME_LEN); + formatter.dump_bool("strict_cors_mode", true); + formatter.dump_int("max_container_name_length", RGWHandler_REST::MAX_BUCKET_NAME_LEN); + formatter.close_section(); +} + +void RGWInfo_ObjStore_SWIFT::list_tempauth_data(Formatter& formatter, + const ConfigProxy& config, + rgw::sal::Driver* driver) +{ + formatter.open_object_section("tempauth"); + formatter.dump_bool("account_acls", true); + formatter.close_section(); +} +void RGWInfo_ObjStore_SWIFT::list_tempurl_data(Formatter& formatter, + const ConfigProxy& config, + rgw::sal::Driver* driver) +{ + formatter.open_object_section("tempurl"); + formatter.open_array_section("methods"); + formatter.dump_string("methodname", "GET"); + formatter.dump_string("methodname", "HEAD"); + formatter.dump_string("methodname", "PUT"); + formatter.dump_string("methodname", "POST"); + formatter.dump_string("methodname", "DELETE"); + formatter.close_section(); + formatter.close_section(); +} 
+ +void RGWInfo_ObjStore_SWIFT::list_slo_data(Formatter& formatter, + const ConfigProxy& config, + rgw::sal::Driver* driver) +{ + formatter.open_object_section("slo"); + formatter.dump_int("max_manifest_segments", config->rgw_max_slo_entries); + formatter.close_section(); +} + +bool RGWInfo_ObjStore_SWIFT::is_expired(const std::string& expires, const DoutPrefixProvider *dpp) +{ + string err; + const utime_t now = ceph_clock_now(); + const uint64_t expiration = (uint64_t)strict_strtoll(expires.c_str(), + 10, &err); + if (!err.empty()) { + ldpp_dout(dpp, 5) << "failed to parse siginfo_expires: " << err << dendl; + return true; + } + + if (expiration <= (uint64_t)now.sec()) { + ldpp_dout(dpp, 5) << "siginfo expired: " << expiration << " <= " << now.sec() << dendl; + return true; + } + + return false; +} + + +void RGWFormPost::init(rgw::sal::Driver* const driver, + req_state* const s, + RGWHandler* const dialect_handler) +{ + if (!rgw::sal::Object::empty(s->object)) { + prefix = std::move(s->object->get_name()); + s->object->set_key(rgw_obj_key()); + } + + return RGWPostObj_ObjStore::init(driver, s, dialect_handler); +} + +std::size_t RGWFormPost::get_max_file_size() /*const*/ +{ + std::string max_str = get_part_str(ctrl_parts, "max_file_size", "0"); + + std::string err; + const std::size_t max_file_size = + static_cast(strict_strtoll(max_str.c_str(), 10, &err)); + + if (! err.empty()) { + ldpp_dout(this, 5) << "failed to parse FormPost's max_file_size: " << err + << dendl; + return 0; + } + + return max_file_size; +} + +bool RGWFormPost::is_non_expired() +{ + std::string expires = get_part_str(ctrl_parts, "expires", "0"); + + std::string err; + const uint64_t expires_timestamp = + static_cast(strict_strtoll(expires.c_str(), 10, &err)); + + if (! 
err.empty()) { + ldpp_dout(this, 5) << "failed to parse FormPost's expires: " << err << dendl; + return false; + } + + const utime_t now = ceph_clock_now(); + if (std::cmp_less_equal(expires_timestamp, now.sec())) { + ldpp_dout(this, 5) << "FormPost form expired: " + << expires_timestamp << " <= " << now.sec() << dendl; + return false; + } + + return true; +} + +bool RGWFormPost::is_integral() +{ + const std::string form_signature = get_part_str(ctrl_parts, "signature"); + + try { + get_owner_info(s, s->user->get_info()); + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } catch (...) { + ldpp_dout(this, 5) << "cannot get user_info of account's owner" << dendl; + return false; + } + + for (const auto& kv : s->user->get_info().temp_url_keys) { + const int temp_url_key_num = kv.first; + const string& temp_url_key = kv.second; + + if (temp_url_key.empty()) { + continue; + } + + SignatureHelper sig_helper; + sig_helper.calc(temp_url_key, + s->info.request_uri, + get_part_str(ctrl_parts, "redirect"), + get_part_str(ctrl_parts, "max_file_size", "0"), + get_part_str(ctrl_parts, "max_file_count", "0"), + get_part_str(ctrl_parts, "expires", "0")); + + const auto local_sig = sig_helper.get_signature(); + + ldpp_dout(this, 20) << "FormPost signature [" << temp_url_key_num << "]" + << " (calculated): " << local_sig << dendl; + + if (sig_helper.is_equal_to(form_signature)) { + return true; + } else { + ldpp_dout(this, 5) << "FormPost's signature mismatch: " + << local_sig << " != " << form_signature << dendl; + } + } + + return false; +} + +void RGWFormPost::get_owner_info(const req_state* const s, + RGWUserInfo& owner_info) const +{ + /* We cannot use req_state::bucket_name because it isn't available + * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). */ + const string& bucket_name = s->init_state.url_bucket; + + std::unique_ptr user; + + /* TempURL in Formpost only requires that bucket name is specified. 
*/ + if (bucket_name.empty()) { + throw -EPERM; + } + + if (!s->account_name.empty()) { + RGWUserInfo uinfo; + bool found = false; + + /* When the account name carries no tenant, first try a uid built as + * (uid.id, uid.id); fall back to the plain uid if that user cannot be + * loaded. */ + const rgw_user uid(s->account_name); + if (uid.tenant.empty()) { + const rgw_user tenanted_uid(uid.id, uid.id); + user = driver->get_user(tenanted_uid); + + if (user->load_user(s, s->yield) >= 0) { + /* Succeeded. */ + found = true; + } + } + + if (!found) { + user = driver->get_user(uid); + if (user->load_user(s, s->yield) < 0) { + throw -EPERM; + } + } + } + + /* Need to get user info of bucket owner. + * NOTE(review): `user` is only assigned inside the account_name branch + * above; with an empty account name the user->get_tenant() call below + * would go through an unset pointer -- confirm that callers guarantee a + * non-empty account name. */ + std::unique_ptr bucket; + int ret = driver->get_bucket(s, user.get(), user->get_tenant(), bucket_name, &bucket, s->yield); + if (ret < 0) { + throw ret; + } + + ldpp_dout(this, 20) << "temp url user (bucket owner): " << bucket->get_info().owner + << dendl; + + /* Reload `user` as the bucket's owner; that is the info handed back. */ + user = driver->get_user(bucket->get_info().owner); + if (user->load_user(s, s->yield) < 0) { + throw -EPERM; + } + + owner_info = user->get_info(); +} + +int RGWFormPost::get_params(optional_yield y) +{ + /* The parent class extracts boundary info from the Content-Type. */ + int ret = RGWPostObj_ObjStore::get_params(y); + if (ret < 0) { + return ret; + } + + policy.create_default(s->user->get_id(), s->user->get_display_name()); + + /* Let's start parsing the HTTP body by parsing each form part step- + * by-step till encountering the first part with file data. 
*/ + do { + struct post_form_part part; + ret = read_form_part_header(&part, stream_done); + if (ret < 0) { + return ret; + } + + /* Debug-only dump of the part header just read. */ + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 20) << "read part header -- part.name=" + << part.name << dendl; + + for (const auto& pair : part.fields) { + ldpp_dout(this, 20) << "field.name=" << pair.first << dendl; + ldpp_dout(this, 20) << "field.val=" << pair.second.val << dendl; + ldpp_dout(this, 20) << "field.params:" << dendl; + + for (const auto& param_pair : pair.second.params) { + ldpp_dout(this, 20) << " " << param_pair.first + << " -> " << param_pair.second << dendl; + } + } + } + + if (stream_done) { + /* Unexpected here: the stream ended while we were still looking for + * the first data part. */ + err_msg = "Malformed request"; + return -EINVAL; + } + + /* A part is a data part when its Content-Disposition field carries a + * "filename" parameter. */ + const auto field_iter = part.fields.find("Content-Disposition"); + if (std::end(part.fields) != field_iter && + std::end(field_iter->second.params) != field_iter->second.params.find("filename")) { + /* First data part ahead. */ + current_data_part = std::move(part); + + /* Stop the iteration. We can assume that all control parts have been + * already parsed. The rest of HTTP body should contain data parts + * only. They will be picked up by ::get_data(). */ + break; + } else { + /* Control part ahead. Receive it, parse it and store it in ctrl_parts + * (keyed by part name) for later usage. */ + bool boundary; + ret = read_data(part.data, s->cct->_conf->rgw_max_chunk_size, + boundary, stream_done); + if (ret < 0) { + return ret; + } else if (! boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + + ctrl_parts[part.name] = std::move(part); + } + } while (! stream_done); + + min_len = 0; + max_len = get_max_file_size(); + + if (! current_data_part) { + err_msg = "FormPost: no files to process"; + return -EINVAL; + } + + if (! is_non_expired()) { + err_msg = "FormPost: Form Expired"; + return -EPERM; + } + + if (! 
is_integral()) { + err_msg = "FormPost: Invalid Signature"; + return -EPERM; + } + + return 0; +} + +std::string RGWFormPost::get_current_filename() const +{ + try { + const auto& field = current_data_part->fields.at("Content-Disposition"); + const auto iter = field.params.find("filename"); + + if (std::end(field.params) != iter) { + return prefix + iter->second; + } + } catch (std::out_of_range&) { + /* NOP */; + } + + return prefix; +} + +std::string RGWFormPost::get_current_content_type() const +{ + try { + const auto& field = current_data_part->fields.at("Content-Type"); + return field.val; + } catch (std::out_of_range&) { + /* NOP */; + } + + return std::string(); +} + +bool RGWFormPost::is_next_file_to_upload() +{ + if (! stream_done) { + /* We have at least one additional part in the body. */ + struct post_form_part part; + int r = read_form_part_header(&part, stream_done); + if (r < 0) { + return false; + } + + const auto field_iter = part.fields.find("Content-Disposition"); + if (std::end(part.fields) != field_iter) { + const auto& params = field_iter->second.params; + const auto& filename_iter = params.find("filename"); + + if (std::end(params) != filename_iter && ! filename_iter->second.empty()) { + current_data_part = std::move(part); + return true; + } + } + } + + return false; +} + +int RGWFormPost::get_data(ceph::bufferlist& bl, bool& again) +{ + bool boundary; + + int r = read_data(bl, s->cct->_conf->rgw_max_chunk_size, + boundary, stream_done); + if (r < 0) { + return r; + } + + /* Tell RGWPostObj::execute(optional_yield y) that it has some data to put. */ + again = !boundary; + + return bl.length(); +} + +void RGWFormPost::send_response() +{ + std::string redirect = get_part_str(ctrl_parts, "redirect"); + if (! redirect.empty()) { + op_ret = STATUS_REDIRECT; + } + + set_req_state_err(s, op_ret); + s->err.err_code = err_msg; + dump_errno(s); + if (! 
redirect.empty()) { + dump_redirect(s, redirect); + } + end_header(s, this); +} + +bool RGWFormPost::is_formpost_req(req_state* const s) +{ + std::string content_type; + std::map params; + + parse_boundary_params(s->info.env->get("CONTENT_TYPE", ""), + content_type, params); + + return boost::algorithm::iequals(content_type, "multipart/form-data") && + params.count("boundary") > 0; +} + + +RGWOp *RGWHandler_REST_Service_SWIFT::op_get() +{ + return new RGWListBuckets_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_head() +{ + return new RGWStatAccount_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_put() +{ + if (s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + return nullptr; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_post() +{ + if (s->info.args.exists("bulk-delete")) { + return new RGWBulkDelete_ObjStore_SWIFT; + } + return new RGWPutMetadataAccount_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_delete() +{ + if (s->info.args.exists("bulk-delete")) { + return new RGWBulkDelete_ObjStore_SWIFT; + } + return NULL; +} + +int RGWSwiftWebsiteHandler::serve_errordoc(const int http_ret, + const std::string error_doc, + optional_yield y) +{ + /* Try to throw it all away. */ + s->formatter->reset(); + + class RGWGetErrorPage : public RGWGetObj_ObjStore_SWIFT { + public: + RGWGetErrorPage(rgw::sal::Driver* const driver, + RGWHandler_REST* const handler, + req_state* const s, + const int http_ret) { + /* Calling a virtual from the base class is safe as the subobject should + * be properly initialized and we haven't overridden the init method. */ + init(driver, s, handler); + set_get_data(true); + set_custom_http_response(http_ret); + } + + int error_handler(const int err_no, + std::string* const error_content, optional_yield y) override { + /* Enforce that any error generated while getting the error page will + * not be send to a client. 
This allows us to recover from the double + * fault situation by sending the original message. */ + return 0; + } + } get_errpage_op(driver, handler, s, http_ret); + + /* This is okay. It's an error, so nothing will run after this, and it can be + * called by abort_early(), which can be called before s->object or s->bucket + * are set up. */ + if (!rgw::sal::Bucket::empty(s->bucket.get())) { + s->object = s->bucket->get_object(rgw_obj_key(std::to_string(http_ret) + error_doc)); + } else { + s->object = driver->get_object(rgw_obj_key(std::to_string(http_ret) + error_doc)); + } + + RGWOp* newop = &get_errpage_op; + RGWRequest req(0); + return rgw_process_authenticated(handler, newop, &req, s, y, driver, true); +} + +int RGWSwiftWebsiteHandler::error_handler(const int err_no, + std::string* const error_content, + optional_yield y) +{ + if (!s->bucket.get()) { + /* No bucket, default no-op handler */ + return err_no; + } + + const auto& ws_conf = s->bucket->get_info().website_conf; + + if (can_be_website_req() && ! ws_conf.error_doc.empty()) { + set_req_state_err(s, err_no); + return serve_errordoc(s->err.http_ret, ws_conf.error_doc, y); + } + + /* Let's go to the default, no-op handler. */ + return err_no; +} + +bool RGWSwiftWebsiteHandler::is_web_mode() const +{ + const std::string_view webmode = s->info.env->get("HTTP_X_WEB_MODE", ""); + return boost::algorithm::iequals(webmode, "true"); +} + +bool RGWSwiftWebsiteHandler::can_be_website_req() const +{ + /* Static website works only with the GET or HEAD method. Nothing more. */ + static const std::set ws_methods = { "GET", "HEAD" }; + if (ws_methods.count(s->info.method) == 0) { + return false; + } + + /* We also need to handle early failures from the auth system. In such cases + * req_state::auth.identity may be empty. Let's treat that the same way as + * the anonymous access. */ + if (! 
s->auth.identity) { + return true; + } + + /* Swift serves websites only for anonymous requests unless client explicitly + * requested this behaviour by supplying X-Web-Mode HTTP header set to true. */ + if (s->auth.identity->is_anonymous() || is_web_mode()) { + return true; + } + + return false; +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_redirect_op() +{ + class RGWMovedPermanently: public RGWOp { + const std::string location; + public: + explicit RGWMovedPermanently(const std::string& location) + : location(location) { + } + + int verify_permission(optional_yield) override { + return 0; + } + + void execute(optional_yield) override { + op_ret = -ERR_PERMANENT_REDIRECT; + return; + } + + void send_response() override { + set_req_state_err(s, op_ret); + dump_errno(s); + dump_content_length(s, 0); + dump_redirect(s, location); + end_header(s, this); + } + + const char* name() const override { + return "RGWMovedPermanently"; + } + }; + + return new RGWMovedPermanently(s->info.request_uri + '/'); +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_index_op() +{ + /* Retarget to get obj on requested index file. */ + if (! s->object->empty()) { + s->object->set_name(s->object->get_name() + + s->bucket->get_info().website_conf.get_index_doc()); + } else { + s->object->set_name(s->bucket->get_info().website_conf.get_index_doc()); + } + + auto getop = new RGWGetObj_ObjStore_SWIFT; + getop->set_get_data(boost::algorithm::equals("GET", s->info.method)); + + return getop; +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_listing_op() +{ + class RGWWebsiteListing : public RGWListBucket_ObjStore_SWIFT { + const std::string prefix_override; + + int get_params(optional_yield) override { + prefix = prefix_override; + max = default_max; + delimiter = "/"; + return 0; + } + + void send_response() override { + /* Generate the header now. 
*/ + set_req_state_err(s, op_ret); + dump_errno(s); + dump_container_metadata(s, s->bucket.get(), quota.bucket_quota, + s->bucket->get_info().website_conf); + end_header(s, this, "text/html"); + if (op_ret < 0) { + return; + } + + /* Now it's the time to start generating HTML bucket listing. + * All the crazy stuff with crafting tags will be delegated to + * RGWSwiftWebsiteListingFormatter. */ + std::stringstream ss; + RGWSwiftWebsiteListingFormatter htmler(ss, prefix); + + const auto& ws_conf = s->bucket->get_info().website_conf; + htmler.generate_header(s->decoded_uri, + ws_conf.listing_css_doc); + + for (const auto& pair : common_prefixes) { + std::string subdir_name = pair.first; + if (! subdir_name.empty()) { + /* To be compliant with Swift we need to remove the trailing + * slash. */ + subdir_name.pop_back(); + } + + htmler.dump_subdir(subdir_name); + } + + for (const rgw_bucket_dir_entry& obj : objs) { + if (! common_prefixes.count(obj.key.name + '/')) { + htmler.dump_object(obj); + } + } + + htmler.generate_footer(); + dump_body(s, ss.str()); + } + public: + /* Taking prefix_override by value to leverage std::string r-value ref + * ctor and thus avoid extra memory copying/increasing ref counter. */ + explicit RGWWebsiteListing(std::string prefix_override) + : prefix_override(std::move(prefix_override)) { + } + }; + + std::string prefix = std::move(s->object->get_name()); + s->object->set_key(rgw_obj_key()); + + return new RGWWebsiteListing(std::move(prefix)); +} + +bool RGWSwiftWebsiteHandler::is_web_dir() const +{ + std::string subdir_name = url_decode(s->object->get_name()); + + /* Remove character from the subdir name if it is "/". */ + if (subdir_name.empty()) { + return false; + } else if (subdir_name.back() == '/') { + subdir_name.pop_back(); + if (subdir_name.empty()) { + return false; + } + } + + std::unique_ptr obj = s->bucket->get_object(rgw_obj_key(std::move(subdir_name))); + + /* First, get attrset of the object we'll try to retrieve. 
*/ + obj->set_atomic(); + obj->set_prefetch_data(); + + RGWObjState* state = nullptr; + if (obj->get_obj_state(s, &state, s->yield, false)) { + return false; + } + + /* A nonexistent object cannot be a considered as a marker representing + * the emulation of catalog in FS hierarchy. */ + if (! state->exists) { + return false; + } + + /* Decode the content type. */ + std::string content_type; + get_contype_from_attrs(state->attrset, content_type); + + const auto& ws_conf = s->bucket->get_info().website_conf; + const std::string subdir_marker = ws_conf.subdir_marker.empty() + ? "application/directory" + : ws_conf.subdir_marker; + return subdir_marker == content_type && state->size <= 1; +} + +bool RGWSwiftWebsiteHandler::is_index_present(const std::string& index) const +{ + std::unique_ptr obj = s->bucket->get_object(rgw_obj_key(index)); + + obj->set_atomic(); + obj->set_prefetch_data(); + + RGWObjState* state = nullptr; + if (obj->get_obj_state(s, &state, s->yield, false)) { + return false; + } + + /* A nonexistent object cannot be a considered as a viable index. We will + * try to list the bucket or - if this is impossible - return an error. */ + return state->exists; +} + +int RGWSwiftWebsiteHandler::retarget_bucket(RGWOp* op, RGWOp** new_op) +{ + ldpp_dout(s, 10) << "Starting retarget" << dendl; + RGWOp* op_override = nullptr; + + /* In Swift static web content is served if the request is anonymous or + * has X-Web-Mode HTTP header specified to true. */ + if (can_be_website_req()) { + const auto& ws_conf = s->bucket->get_info().website_conf; + const auto& index = s->bucket->get_info().website_conf.get_index_doc(); + + if (s->decoded_uri.back() != '/') { + op_override = get_ws_redirect_op(); + } else if (! 
index.empty() && is_index_present(index)) { + op_override = get_ws_index_op(); + } else if (ws_conf.listing_enabled) { + op_override = get_ws_listing_op(); + } + } + + if (op_override) { + handler->put_op(op); + op_override->init(driver, s, handler); + + *new_op = op_override; + } else { + *new_op = op; + } + + /* Return 404 Not Found is the request has web mode enforced but we static web + * wasn't able to serve it accordingly. */ + return ! op_override && is_web_mode() ? -ENOENT : 0; +} + +int RGWSwiftWebsiteHandler::retarget_object(RGWOp* op, RGWOp** new_op) +{ + ldpp_dout(s, 10) << "Starting object retarget" << dendl; + RGWOp* op_override = nullptr; + + /* In Swift static web content is served if the request is anonymous or + * has X-Web-Mode HTTP header specified to true. */ + if (can_be_website_req() && is_web_dir()) { + const auto& ws_conf = s->bucket->get_info().website_conf; + const auto& index = s->bucket->get_info().website_conf.get_index_doc(); + + if (s->decoded_uri.back() != '/') { + op_override = get_ws_redirect_op(); + } else if (! index.empty() && is_index_present(index)) { + op_override = get_ws_index_op(); + } else if (ws_conf.listing_enabled) { + op_override = get_ws_listing_op(); + } + } else { + /* A regular request or the specified object isn't a subdirectory marker. + * We don't need any re-targeting. Error handling (like sending a custom + * error page) will be performed by error_handler of the actual RGWOp. */ + return 0; + } + + if (op_override) { + handler->put_op(op); + op_override->init(driver, s, handler); + + *new_op = op_override; + } else { + *new_op = op; + } + + /* Return 404 Not Found if we aren't able to re-target for subdir marker. */ + return ! op_override ? 
-ENOENT : 0; +} + + +RGWOp *RGWHandler_REST_Bucket_SWIFT::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_SWIFT; + } + + if (get_data) + return new RGWListBucket_ObjStore_SWIFT; + else + return new RGWStatBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_head() +{ + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_SWIFT; + } + if(s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + return new RGWCreateBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_delete() +{ + return new RGWDeleteBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_post() +{ + if (RGWFormPost::is_formpost_req(s)) { + return new RGWFormPost; + } else { + return new RGWPutMetadataBucket_ObjStore_SWIFT; + } +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_options() +{ + return new RGWOptionsCORS_ObjStore_SWIFT; +} + + +RGWOp *RGWHandler_REST_Obj_SWIFT::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_SWIFT; + } + + RGWGetObj_ObjStore_SWIFT *get_obj_op = new RGWGetObj_ObjStore_SWIFT; + get_obj_op->set_get_data(get_data); + return get_obj_op; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_head() +{ + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_SWIFT; + } + if(s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + if (s->init_state.src_bucket.empty()) + return new RGWPutObj_ObjStore_SWIFT; + else + return new RGWCopyObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_delete() +{ + return new RGWDeleteObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_post() 
+{ + if (RGWFormPost::is_formpost_req(s)) { + return new RGWFormPost; + } else { + return new RGWPutMetadataObject_ObjStore_SWIFT; + } +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_copy() +{ + return new RGWCopyObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_options() +{ + return new RGWOptionsCORS_ObjStore_SWIFT; +} + + +int RGWHandler_REST_SWIFT::authorize(const DoutPrefixProvider *dpp, optional_yield y) +{ + return rgw::auth::Strategy::apply(dpp, auth_strategy, s, y); +} + +int RGWHandler_REST_SWIFT::postauth_init(optional_yield y) +{ + struct req_init_state* t = &s->init_state; + + /* XXX Stub this until Swift Auth sets account into URL. */ + if (g_conf()->rgw_swift_account_in_url + && s->user->get_id().id == RGW_USER_ANON_ID) { + s->bucket_tenant = s->account_name; + } else { + s->bucket_tenant = s->user->get_tenant(); + } + s->bucket_name = t->url_bucket; + + if (!s->object) { + /* Need an object, even an empty one */ + s->object = driver->get_object(rgw_obj_key()); + } + + ldpp_dout(s, 10) << "s->object=" << + (!s->object->empty() ? s->object->get_key() : rgw_obj_key("")) + << " s->bucket=" + << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) + << dendl; + + int ret; + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + ret = validate_bucket_name(s->bucket_name); + if (ret) + return ret; + ret = validate_object_name(s->object->get_name()); + if (ret) + return ret; + + if (!t->src_bucket.empty()) { + /* + * We don't allow cross-tenant copy at present. It requires account + * names in the URL for Swift. 
+ */ + s->src_tenant_name = s->user->get_tenant(); + s->src_bucket_name = t->src_bucket; + + ret = validate_bucket_name(s->src_bucket_name); + if (ret < 0) { + return ret; + } + ret = validate_object_name(s->src_object->get_name()); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket) +{ + const size_t len = bucket.size(); + + if (len > MAX_BUCKET_NAME_LEN) { + /* Bucket Name too long. Generate custom error message and bind it + * to an R-value reference. */ + const auto msg = boost::str( + boost::format("Container name length of %lld longer than %lld") + % len % int(MAX_BUCKET_NAME_LEN)); + set_req_state_err(s, ERR_INVALID_BUCKET_NAME, msg); + return -ERR_INVALID_BUCKET_NAME; + } + + + if (len == 0) + return 0; + + if (bucket[0] == '.') + return -ERR_INVALID_BUCKET_NAME; + + if (check_utf8(bucket.c_str(), len)) + return -ERR_INVALID_UTF8; + + const char *s = bucket.c_str(); + + for (size_t i = 0; i < len; ++i, ++s) { + if (*(unsigned char *)s == 0xff) + return -ERR_INVALID_BUCKET_NAME; + if (*(unsigned char *)s == '/') + return -ERR_INVALID_BUCKET_NAME; + } + + return 0; +} + +static void next_tok(string& str, string& tok, char delim) +{ + if (str.size() == 0) { + tok = ""; + return; + } + tok = str; + int pos = str.find(delim); + if (pos > 0) { + tok = str.substr(0, pos); + str = str.substr(pos + 1); + } else { + str = ""; + } +} + +int RGWHandler_REST_SWIFT::init_from_header(rgw::sal::Driver* driver, + req_state* const s, + const std::string& frontend_prefix) +{ + string req; + string first; + + s->prot_flags |= RGW_REST_SWIFT; + + char reqbuf[frontend_prefix.length() + s->decoded_uri.length() + 1]; + sprintf(reqbuf, "%s%s", frontend_prefix.c_str(), s->decoded_uri.c_str()); + const char *req_name = reqbuf; + + const char *p; + + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(s); + + /* Skip the 
leading slash of URL hierarchy. */ + if (req_name[0] != '/') { + return 0; + } else { + req_name++; + } + + if ('\0' == req_name[0]) { + return g_conf()->rgw_swift_url_prefix == "/" ? -ERR_BAD_URL : 0; + } + + req = req_name; + + size_t pos = req.find('/'); + if (std::string::npos != pos && g_conf()->rgw_swift_url_prefix != "/") { + bool cut_url = g_conf()->rgw_swift_url_prefix.length(); + first = req.substr(0, pos); + + if (first.compare(g_conf()->rgw_swift_url_prefix) == 0) { + if (cut_url) { + /* Rewind to the "v1/..." part. */ + next_tok(req, first, '/'); + } + } + } else if (req.compare(g_conf()->rgw_swift_url_prefix) == 0) { + s->formatter = new RGWFormatter_Plain; + return -ERR_BAD_URL; + } else { + first = req; + } + + std::string tenant_path; + if (! g_conf()->rgw_swift_tenant_name.empty()) { + tenant_path = "/AUTH_"; + tenant_path.append(g_conf()->rgw_swift_tenant_name); + } + + /* verify that the request_uri conforms with what's expected */ + char buf[g_conf()->rgw_swift_url_prefix.length() + 16 + tenant_path.length()]; + int blen; + if (g_conf()->rgw_swift_url_prefix == "/") { + blen = sprintf(buf, "/v1%s", tenant_path.c_str()); + } else { + blen = sprintf(buf, "/%s/v1%s", + g_conf()->rgw_swift_url_prefix.c_str(), tenant_path.c_str()); + } + + if (strncmp(reqbuf, buf, blen) != 0) { + return -ENOENT; + } + + int ret = allocate_formatter(s, RGWFormat::PLAIN, true); + if (ret < 0) + return ret; + + string ver; + + next_tok(req, ver, '/'); + + if (!tenant_path.empty() || g_conf()->rgw_swift_account_in_url) { + string account_name; + next_tok(req, account_name, '/'); + + /* Erase all pre-defined prefixes like "AUTH_" or "KEY_". */ + const vector skipped_prefixes = { "AUTH_", "KEY_" }; + + for (const auto& pfx : skipped_prefixes) { + const size_t comp_len = min(account_name.length(), pfx.length()); + if (account_name.compare(0, comp_len, pfx) == 0) { + /* Prefix is present. Drop it. 
*/ + account_name = account_name.substr(comp_len); + break; + } + } + + if (account_name.empty()) { + return -ERR_PRECONDITION_FAILED; + } else { + s->account_name = account_name; + } + } + + next_tok(req, first, '/'); + + ldpp_dout(s, 10) << "ver=" << ver << " first=" << first << " req=" << req << dendl; + if (first.size() == 0) + return 0; + + s->info.effective_uri = "/" + first; + + // Save bucket to tide us over until token is parsed. + s->init_state.url_bucket = first; + + if (req.size()) { + s->object = driver->get_object( + rgw_obj_key(req, s->info.env->get("HTTP_X_OBJECT_VERSION_ID", ""))); /* rgw swift extension */ + s->info.effective_uri.append("/" + s->object->get_name()); + } + + return 0; +} + +int RGWHandler_REST_SWIFT::init(rgw::sal::Driver* driver, req_state* s, + rgw::io::BasicClient *cio) +{ + struct req_init_state *t = &s->init_state; + + s->dialect = "swift"; + + std::string copy_source = s->info.env->get("HTTP_X_COPY_FROM", ""); + if (! copy_source.empty()) { + rgw_obj_key key; + bool result = RGWCopyObj::parse_copy_location(copy_source, t->src_bucket, key, s); + if (!result) + return -ERR_BAD_URL; + s->src_object = driver->get_object(key); + if (!s->src_object) + return -ERR_BAD_URL; + } + + if (s->op == OP_COPY) { + std::string req_dest = s->info.env->get("HTTP_DESTINATION", ""); + if (req_dest.empty()) + return -ERR_BAD_URL; + + std::string dest_bucket_name; + rgw_obj_key dest_obj_key; + bool result = + RGWCopyObj::parse_copy_location(req_dest, dest_bucket_name, + dest_obj_key, s); + if (!result) + return -ERR_BAD_URL; + + std::string dest_object_name = dest_obj_key.name; + + /* convert COPY operation into PUT */ + t->src_bucket = t->url_bucket; + s->src_object = s->object->clone(); + t->url_bucket = dest_bucket_name; + s->object->set_name(dest_object_name); + s->op = OP_PUT; + } + + s->info.storage_class = s->info.env->get("HTTP_X_OBJECT_STORAGE_CLASS", ""); + + return RGWHandler_REST::init(driver, s, cio); +} + +RGWHandler_REST* 
+RGWRESTMgr_SWIFT::get_handler(rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + int ret = RGWHandler_REST_SWIFT::init_from_header(driver, s, frontend_prefix); + if (ret < 0) { + ldpp_dout(s, 10) << "init_from_header returned err=" << ret << dendl; + return nullptr; + } + + const auto& auth_strategy = auth_registry.get_swift(); + + if (s->init_state.url_bucket.empty()) { + return new RGWHandler_REST_Service_SWIFT(auth_strategy); + } + + if (rgw::sal::Object::empty(s->object.get())) { + return new RGWHandler_REST_Bucket_SWIFT(auth_strategy); + } + + return new RGWHandler_REST_Obj_SWIFT(auth_strategy); +} + +/* Handler factory of the Swift info manager: marks the request as Swift and returns a new RGWHandler_REST_SWIFT_Info bound to the Swift auth strategy. Unlike RGWRESTMgr_SWIFT::get_handler() it does not parse the URI; driver and frontend_prefix are unused. The caller owns the returned handler. */ +RGWHandler_REST* RGWRESTMgr_SWIFT_Info::get_handler( + rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + s->prot_flags |= RGW_REST_SWIFT; + const auto& auth_strategy = auth_registry.get_swift(); + return new RGWHandler_REST_SWIFT_Info(auth_strategy); +} diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h new file mode 100644 index 000000000..89873131c --- /dev/null +++ b/src/rgw/rgw_rest_swift.h @@ -0,0 +1,685 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#define TIME_BUF_SIZE 128 + +#include + +#include +#include + +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_swift_auth.h" +#include "rgw_http_errors.h" + + +class RGWGetObj_ObjStore_SWIFT : public RGWGetObj_ObjStore { + int custom_http_ret = 0; +public: + RGWGetObj_ObjStore_SWIFT() {} + ~RGWGetObj_ObjStore_SWIFT() override {} + + int verify_permission(optional_yield y) override; + int get_params(optional_yield y) override; + int send_response_data_error(optional_yield y) override; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override; + + void set_custom_http_response(const int http_ret) { + custom_http_ret 
= http_ret; + } + + bool need_object_expiration() override { + return true; + } +}; + +/* Swift account listing (GET on the account). Reports 0 as the default maximum, asks for bucket stats while need_stats is set and supports account metadata. NOTE(review): the element type of reverse_buffer is missing in this copy of the source -- restore it from the original header. */ +class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore { + bool need_stats; + bool wants_reversed; + std::string prefix; + std::vector reverse_buffer; + + uint64_t get_default_max() const override { + return 0; + } + +public: + RGWListBuckets_ObjStore_SWIFT() + : need_stats(true), + wants_reversed(false) { + } + ~RGWListBuckets_ObjStore_SWIFT() override {} + + int get_params(optional_yield y) override; + void handle_listing_chunk(rgw::sal::BucketList&& buckets) override; + void send_response_begin(bool has_buckets) override; + void send_response_data(rgw::sal::BucketList& buckets) override; + void send_response_data_reversed(rgw::sal::BucketList& buckets); + void dump_bucket_entry(const rgw::sal::Bucket& obj); + void send_response_end() override; + + bool should_get_stats() override { return need_stats; } + bool supports_account_metadata() override { return true; } +}; + +/* Swift container listing (GET on a container); default_max is set to 10000 entries and container stats are requested. */ +class RGWListBucket_ObjStore_SWIFT : public RGWListBucket_ObjStore { + std::string path; +public: + RGWListBucket_ObjStore_SWIFT() { + default_max = 10000; + } + ~RGWListBucket_ObjStore_SWIFT() override {} + + int get_params(optional_yield y) override; + void send_response() override; + bool need_container_stats() override { return true; } +}; + +/* Swift account stat (HEAD on the account). NOTE(review): the key/value types of attrs are missing in this copy of the source. */ +class RGWStatAccount_ObjStore_SWIFT : public RGWStatAccount_ObjStore { + std::map attrs; +public: + RGWStatAccount_ObjStore_SWIFT() { + } + ~RGWStatAccount_ObjStore_SWIFT() override {} + + void execute(optional_yield y) override; + void send_response() override; +}; + +/* Swift container stat (HEAD on a container); only the response formatting is overridden here. */ +class RGWStatBucket_ObjStore_SWIFT : public RGWStatBucket_ObjStore { +public: + RGWStatBucket_ObjStore_SWIFT() {} + ~RGWStatBucket_ObjStore_SWIFT() override {} + + void send_response() override; +}; + +/* Swift container creation (PUT on a container); need_metadata_upload() is switched on. */ +class RGWCreateBucket_ObjStore_SWIFT : public RGWCreateBucket_ObjStore { +protected: + bool need_metadata_upload() const override { return true; } +public: + RGWCreateBucket_ObjStore_SWIFT() 
{} + ~RGWCreateBucket_ObjStore_SWIFT() override {} + + int get_params(optional_yield y) override; + void send_response() override; +}; + +/* Swift container deletion; only the response formatting is overridden here. */ +class RGWDeleteBucket_ObjStore_SWIFT : public RGWDeleteBucket_ObjStore { +public: + RGWDeleteBucket_ObjStore_SWIFT() {} + ~RGWDeleteBucket_ObjStore_SWIFT() override {} + + void send_response() override; +}; + +/* Swift object upload (PUT on an object). */ +class RGWPutObj_ObjStore_SWIFT : public RGWPutObj_ObjStore { + std::string lo_etag; +public: + RGWPutObj_ObjStore_SWIFT() {} + ~RGWPutObj_ObjStore_SWIFT() override {} + + int update_slo_segment_size(rgw_slo_entry& entry); + + int verify_permission(optional_yield y) override; + int get_params(optional_yield y) override; + void send_response() override; +}; + +/* Swift account metadata update (POST on the account). */ +class RGWPutMetadataAccount_ObjStore_SWIFT : public RGWPutMetadataAccount_ObjStore { +public: + RGWPutMetadataAccount_ObjStore_SWIFT() {} + ~RGWPutMetadataAccount_ObjStore_SWIFT() override {} + + int get_params(optional_yield y) override; + void send_response() override; +}; + +/* Swift container metadata update (POST on a container). */ +class RGWPutMetadataBucket_ObjStore_SWIFT : public RGWPutMetadataBucket_ObjStore { +public: + RGWPutMetadataBucket_ObjStore_SWIFT() {} + ~RGWPutMetadataBucket_ObjStore_SWIFT() override {} + + int get_params(optional_yield y) override; + void send_response() override; +}; + +/* Swift object metadata update (POST on an object); need_object_expiration() is switched on. */ +class RGWPutMetadataObject_ObjStore_SWIFT : public RGWPutMetadataObject_ObjStore { +public: + RGWPutMetadataObject_ObjStore_SWIFT() {} + ~RGWPutMetadataObject_ObjStore_SWIFT() override {} + + int get_params(optional_yield y) override; + void send_response() override; + bool need_object_expiration() override { return true; } +}; + +/* Swift object deletion; need_object_expiration() is switched on. */ +class RGWDeleteObj_ObjStore_SWIFT : public RGWDeleteObj_ObjStore { +public: + RGWDeleteObj_ObjStore_SWIFT() {} + ~RGWDeleteObj_ObjStore_SWIFT() override {} + + int verify_permission(optional_yield y) override; + int get_params(optional_yield y) override; + bool need_object_expiration() override { return true; } + void send_response() override; +}; + +/* Swift server-side object copy; sent_header starts out false and supports partial responses via send_partial_response(). */ +class RGWCopyObj_ObjStore_SWIFT : public 
RGWCopyObj_ObjStore { + bool sent_header; +protected: + void dump_copy_info(); +public: + RGWCopyObj_ObjStore_SWIFT() : sent_header(false) {} + ~RGWCopyObj_ObjStore_SWIFT() override {} + + int init_dest_policy() override; + int get_params(optional_yield y) override; + void send_response() override; + void send_partial_response(off_t ofs) override; +}; + +/* ACL retrieval op for the Swift dialect; send_response() is intentionally empty. */ +class RGWGetACLs_ObjStore_SWIFT : public RGWGetACLs_ObjStore { +public: + RGWGetACLs_ObjStore_SWIFT() {} + ~RGWGetACLs_ObjStore_SWIFT() override {} + + void send_response() override {} +}; + +/* ACL update op for the Swift dialect; send_response() is intentionally empty. */ +class RGWPutACLs_ObjStore_SWIFT : public RGWPutACLs_ObjStore { +public: + RGWPutACLs_ObjStore_SWIFT() : RGWPutACLs_ObjStore() {} + ~RGWPutACLs_ObjStore_SWIFT() override {} + + void send_response() override {} +}; + +/* CORS preflight (OPTIONS) op for the Swift dialect. */ +class RGWOptionsCORS_ObjStore_SWIFT : public RGWOptionsCORS_ObjStore { +public: + RGWOptionsCORS_ObjStore_SWIFT() {} + ~RGWOptionsCORS_ObjStore_SWIFT() override {} + + void send_response() override; +}; + +/* Swift bulk delete (?bulk-delete). NOTE(review): the element type of the items list is missing in this copy of the source. */ +class RGWBulkDelete_ObjStore_SWIFT : public RGWBulkDelete_ObjStore { +public: + RGWBulkDelete_ObjStore_SWIFT() {} + ~RGWBulkDelete_ObjStore_SWIFT() override {} + + int get_data(std::list& items, + bool * is_truncated) override; + void send_response() override; +}; + +/* Swift bulk upload (?extract-archive); conlen and curpos both start at 0. NOTE(review): the pointee type returned by create_stream() is missing in this copy of the source. */ +class RGWBulkUploadOp_ObjStore_SWIFT : public RGWBulkUploadOp_ObjStore { + size_t conlen; + size_t curpos; + +public: + RGWBulkUploadOp_ObjStore_SWIFT() + : conlen(0), + curpos(0) { + } + ~RGWBulkUploadOp_ObjStore_SWIFT() = default; + + std::unique_ptr create_stream() override; + void send_response() override; +}; + +class RGWInfo_ObjStore_SWIFT : public RGWInfo_ObjStore { +protected: + struct info + { + bool is_admin_info; + std::function list_data; + }; + + static const std::vector> swift_info; +public: + RGWInfo_ObjStore_SWIFT() {} + ~RGWInfo_ObjStore_SWIFT() override {} + + void execute(optional_yield y) override; + void send_response() override; + static void list_swift_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* 
driver); + static void list_tempauth_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver); + static void list_tempurl_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver); + static void list_slo_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver); + static bool is_expired(const std::string& expires, const DoutPrefixProvider* dpp); +}; + + +class RGWFormPost : public RGWPostObj_ObjStore { + std::string get_current_filename() const override; + std::string get_current_content_type() const override; + std::size_t get_max_file_size() /*const*/; + bool is_next_file_to_upload() override; + bool is_integral(); + bool is_non_expired(); + void get_owner_info(const req_state* s, + RGWUserInfo& owner_info) const; + + parts_collection_t ctrl_parts; + boost::optional current_data_part; + std::string prefix; + bool stream_done = false; + + class SignatureHelper; +public: + RGWFormPost() = default; + ~RGWFormPost() = default; + + void init(rgw::sal::Driver* driver, + req_state* s, + RGWHandler* dialect_handler) override; + + int get_params(optional_yield y) override; + int get_data(ceph::bufferlist& bl, bool& again) override; + void send_response() override; + + static bool is_formpost_req(req_state* const s); +}; + +class RGWFormPost::SignatureHelper +{ +private: + static constexpr uint32_t output_size = + CEPH_CRYPTO_HMACSHA1_DIGESTSIZE * 2 + 1; + + unsigned char dest[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; // 20 + char dest_str[output_size]; + +public: + SignatureHelper() = default; + + const char* calc(const std::string& key, + const std::string_view& path_info, + const std::string_view& redirect, + const std::string_view& max_file_size, + const std::string_view& max_file_count, + const std::string_view& expires) { + using ceph::crypto::HMACSHA1; + using UCHARPTR = const unsigned char*; + + HMACSHA1 hmac((UCHARPTR) key.data(), key.size()); + + hmac.Update((UCHARPTR) path_info.data(), path_info.size()); 
+ hmac.Update((UCHARPTR) "\n", 1); + + hmac.Update((UCHARPTR) redirect.data(), redirect.size()); + hmac.Update((UCHARPTR) "\n", 1); + + hmac.Update((UCHARPTR) max_file_size.data(), max_file_size.size()); + hmac.Update((UCHARPTR) "\n", 1); + + hmac.Update((UCHARPTR) max_file_count.data(), max_file_count.size()); + hmac.Update((UCHARPTR) "\n", 1); + + hmac.Update((UCHARPTR) expires.data(), expires.size()); + + hmac.Final(dest); + + buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str); + + return dest_str; + } + + const char* get_signature() const { + return dest_str; + } + + bool is_equal_to(const std::string& rhs) const { + /* never allow out-of-range exception */ + if (rhs.size() < (output_size - 1)) { + return false; + } + return rhs.compare(0 /* pos */, output_size, dest_str) == 0; + } + +}; /* RGWFormPost::SignatureHelper */ + + +class RGWSwiftWebsiteHandler { + rgw::sal::Driver* const driver; + req_state* const s; + RGWHandler_REST* const handler; + + bool is_web_mode() const; + bool can_be_website_req() const; + bool is_web_dir() const; + bool is_index_present(const std::string& index) const; + + int serve_errordoc(int http_ret, std::string error_doc, optional_yield y); + + RGWOp* get_ws_redirect_op(); + RGWOp* get_ws_index_op(); + RGWOp* get_ws_listing_op(); +public: + RGWSwiftWebsiteHandler(rgw::sal::Driver* const driver, + req_state* const s, + RGWHandler_REST* const handler) + : driver(driver), + s(s), + handler(handler) { + } + + int error_handler(const int err_no, + std::string* const error_content, + optional_yield y); + int retarget_bucket(RGWOp* op, RGWOp** new_op); + int retarget_object(RGWOp* op, RGWOp** new_op); +}; + + +class RGWHandler_REST_SWIFT : public RGWHandler_REST { + friend class RGWRESTMgr_SWIFT; + friend class RGWRESTMgr_SWIFT_Info; +protected: + const rgw::auth::Strategy& auth_strategy; + + virtual bool is_acl_op() const { + return false; + } + + static int init_from_header(rgw::sal::Driver* driver, req_state* s, + const std::string& 
frontend_prefix); +public: + explicit RGWHandler_REST_SWIFT(const rgw::auth::Strategy& auth_strategy) + : auth_strategy(auth_strategy) { + } + ~RGWHandler_REST_SWIFT() override = default; + + int validate_bucket_name(const std::string& bucket); + + int init(rgw::sal::Driver* driver, req_state *s, rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider *dpp, optional_yield y) override; + int postauth_init(optional_yield y) override; + + RGWAccessControlPolicy *alloc_policy() { return nullptr; /* return new RGWAccessControlPolicy_SWIFT; */ } + void free_policy(RGWAccessControlPolicy *policy) { delete policy; } +}; + +class RGWHandler_REST_Service_SWIFT : public RGWHandler_REST_SWIFT { +protected: + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_post() override; + RGWOp *op_delete() override; +public: + using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT; + ~RGWHandler_REST_Service_SWIFT() override = default; +}; + +class RGWHandler_REST_Bucket_SWIFT : public RGWHandler_REST_SWIFT { + /* We need the boost::optional here only because of handler's late + * initialization (see the init() method). 
*/ + boost::optional website_handler; +protected: + bool is_obj_update_op() const override { + return s->op == OP_POST; + } + + RGWOp *get_obj_op(bool get_data); + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + RGWOp *op_options() override; +public: + using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT; + ~RGWHandler_REST_Bucket_SWIFT() override = default; + + int error_handler(int err_no, std::string *error_content, optional_yield y) override { + return website_handler->error_handler(err_no, error_content, y); + } + + int retarget(RGWOp* op, RGWOp** new_op, optional_yield) override { + return website_handler->retarget_bucket(op, new_op); + } + + int init(rgw::sal::Driver* const driver, + req_state* const s, + rgw::io::BasicClient* const cio) override { + website_handler = boost::in_place(driver, s, this); + return RGWHandler_REST_SWIFT::init(driver, s, cio); + } +}; + +class RGWHandler_REST_Obj_SWIFT : public RGWHandler_REST_SWIFT { + /* We need the boost::optional here only because of handler's late + * initialization (see the init() method). 
*/ + boost::optional website_handler; +protected: + bool is_obj_update_op() const override { + return s->op == OP_POST; + } + + RGWOp *get_obj_op(bool get_data); + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + RGWOp *op_copy() override; + RGWOp *op_options() override; + +public: + using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT; + ~RGWHandler_REST_Obj_SWIFT() override = default; + + int error_handler(int err_no, std::string *error_content, + optional_yield y) override { + return website_handler->error_handler(err_no, error_content, y); + } + + int retarget(RGWOp* op, RGWOp** new_op, optional_yield) override { + return website_handler->retarget_object(op, new_op); + } + + int init(rgw::sal::Driver* const driver, + req_state* const s, + rgw::io::BasicClient* const cio) override { + website_handler = boost::in_place(driver, s, this); + return RGWHandler_REST_SWIFT::init(driver, s, cio); + } +}; + +class RGWRESTMgr_SWIFT : public RGWRESTMgr { +protected: + RGWRESTMgr* get_resource_mgr_as_default(req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this->get_resource_mgr(s, uri, out_uri); + } + +public: + RGWRESTMgr_SWIFT() = default; + ~RGWRESTMgr_SWIFT() override = default; + + RGWHandler_REST *get_handler(rgw::sal::Driver* driver, + req_state *s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override; +}; + + +class RGWGetCrossDomainPolicy_ObjStore_SWIFT + : public RGWGetCrossDomainPolicy_ObjStore { +public: + RGWGetCrossDomainPolicy_ObjStore_SWIFT() = default; + ~RGWGetCrossDomainPolicy_ObjStore_SWIFT() override = default; + + void send_response() override; +}; + +class RGWGetHealthCheck_ObjStore_SWIFT + : public RGWGetHealthCheck_ObjStore { +public: + RGWGetHealthCheck_ObjStore_SWIFT() = default; + ~RGWGetHealthCheck_ObjStore_SWIFT() override = default; + + void send_response() 
override; +}; + +class RGWHandler_SWIFT_CrossDomain : public RGWHandler_REST { +public: + RGWHandler_SWIFT_CrossDomain() = default; + ~RGWHandler_SWIFT_CrossDomain() override = default; + + RGWOp *op_get() override { + return new RGWGetCrossDomainPolicy_ObjStore_SWIFT(); + } + + int init(rgw::sal::Driver* const driver, + req_state* const state, + rgw::io::BasicClient* const cio) override { + state->dialect = "swift"; + state->formatter = new JSONFormatter; + state->format = RGWFormat::JSON; + + return RGWHandler::init(driver, state, cio); + } + + int authorize(const DoutPrefixProvider *dpp, optional_yield) override { + return 0; + } + + int postauth_init(optional_yield) override { + return 0; + } + + int read_permissions(RGWOp *, optional_yield y) override { + return 0; + } + + virtual RGWAccessControlPolicy *alloc_policy() { return nullptr; } + virtual void free_policy(RGWAccessControlPolicy *policy) {} +}; + +class RGWRESTMgr_SWIFT_CrossDomain : public RGWRESTMgr { +protected: + RGWRESTMgr *get_resource_mgr(req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + +public: + RGWRESTMgr_SWIFT_CrossDomain() = default; + ~RGWRESTMgr_SWIFT_CrossDomain() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry&, + const std::string&) override { + s->prot_flags |= RGW_REST_SWIFT; + return new RGWHandler_SWIFT_CrossDomain; + } +}; + + +class RGWHandler_SWIFT_HealthCheck : public RGWHandler_REST { +public: + RGWHandler_SWIFT_HealthCheck() = default; + ~RGWHandler_SWIFT_HealthCheck() override = default; + + RGWOp *op_get() override { + return new RGWGetHealthCheck_ObjStore_SWIFT(); + } + + int init(rgw::sal::Driver* const driver, + req_state* const state, + rgw::io::BasicClient* const cio) override { + state->dialect = "swift"; + state->formatter = new JSONFormatter; + state->format = RGWFormat::JSON; + + return RGWHandler::init(driver, state, 
cio); + } + + int authorize(const DoutPrefixProvider *dpp, optional_yield y) override { + return 0; + } + + int postauth_init(optional_yield) override { + return 0; + } + + int read_permissions(RGWOp *, optional_yield y) override { + return 0; + } + + virtual RGWAccessControlPolicy *alloc_policy() { return nullptr; } + virtual void free_policy(RGWAccessControlPolicy *policy) {} +}; + +class RGWRESTMgr_SWIFT_HealthCheck : public RGWRESTMgr { +protected: + RGWRESTMgr *get_resource_mgr(req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + +public: + RGWRESTMgr_SWIFT_HealthCheck() = default; + ~RGWRESTMgr_SWIFT_HealthCheck() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state* const s, + const rgw::auth::StrategyRegistry&, + const std::string&) override { + s->prot_flags |= RGW_REST_SWIFT; + return new RGWHandler_SWIFT_HealthCheck; + } +}; + + +class RGWHandler_REST_SWIFT_Info : public RGWHandler_REST_SWIFT { +public: + using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT; + ~RGWHandler_REST_SWIFT_Info() override = default; + + RGWOp *op_get() override { + return new RGWInfo_ObjStore_SWIFT(); + } + + int init(rgw::sal::Driver* const driver, + req_state* const state, + rgw::io::BasicClient* const cio) override { + state->dialect = "swift"; + state->formatter = new JSONFormatter; + state->format = RGWFormat::JSON; + + return RGWHandler::init(driver, state, cio); + } + + int authorize(const DoutPrefixProvider *dpp, optional_yield) override { + return 0; + } + + int postauth_init(optional_yield) override { + return 0; + } + + int read_permissions(RGWOp *, optional_yield y) override { + return 0; + } +}; + +class RGWRESTMgr_SWIFT_Info : public RGWRESTMgr { +public: + RGWRESTMgr_SWIFT_Info() = default; + ~RGWRESTMgr_SWIFT_Info() override = default; + + RGWHandler_REST *get_handler(rgw::sal::Driver* driver, + req_state* s, + const rgw::auth::StrategyRegistry& auth_registry, + const 
std::string& frontend_prefix) override; +}; diff --git a/src/rgw/rgw_rest_usage.cc b/src/rgw/rgw_rest_usage.cc new file mode 100644 index 000000000..9207a68cd --- /dev/null +++ b/src/rgw/rgw_rest_usage.cc @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_op.h" +#include "rgw_usage.h" +#include "rgw_rest_usage.h" +#include "rgw_sal.h" + +#include "include/str_list.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +class RGWOp_Usage_Get : public RGWRESTOp { + +public: + RGWOp_Usage_Get() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("usage", RGW_CAP_READ); + } + void execute(optional_yield y) override; + + const char* name() const override { return "get_usage"; } +}; + +void RGWOp_Usage_Get::execute(optional_yield y) { + map categories; + + string uid_str; + string bucket_name; + uint64_t start, end; + bool show_entries; + bool show_summary; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); + std::unique_ptr user = driver->get_user(rgw_user(uid_str)); + std::unique_ptr bucket; + + if (!bucket_name.empty()) { + driver->get_bucket(nullptr, user.get(), std::string(), bucket_name, &bucket, null_yield); + } + + RESTArgs::get_epoch(s, "start", 0, &start); + RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end); + RESTArgs::get_bool(s, "show-entries", true, &show_entries); + RESTArgs::get_bool(s, "show-summary", true, &show_summary); + + string cat_str; + RESTArgs::get_string(s, "categories", cat_str, &cat_str); + + if (!cat_str.empty()) { + list cat_list; + list::iterator iter; + get_str_list(cat_str, cat_list); + for (iter = cat_list.begin(); iter != cat_list.end(); ++iter) { + categories[*iter] = true; + } + } + + op_ret = RGWUsage::show(this, driver, user.get(), bucket.get(), start, end, show_entries, show_summary, &categories, flusher); +} + +class 
RGWOp_Usage_Delete : public RGWRESTOp { + +public: + RGWOp_Usage_Delete() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("usage", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + + const char* name() const override { return "trim_usage"; } +}; + +void RGWOp_Usage_Delete::execute(optional_yield y) { + string uid_str; + string bucket_name; + uint64_t start, end; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); + std::unique_ptr user = driver->get_user(rgw_user(uid_str)); + std::unique_ptr bucket; + + if (!bucket_name.empty()) { + driver->get_bucket(nullptr, user.get(), std::string(), bucket_name, &bucket, null_yield); + } + + RESTArgs::get_epoch(s, "start", 0, &start); + RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end); + + if (rgw::sal::User::empty(user.get()) && + bucket_name.empty() && + !start && + end == (uint64_t)-1) { + bool remove_all; + RESTArgs::get_bool(s, "remove-all", false, &remove_all); + if (!remove_all) { + op_ret = -EINVAL; + return; + } + } + + op_ret = RGWUsage::trim(this, driver, user.get(), bucket.get(), start, end); +} + +RGWOp *RGWHandler_Usage::op_get() +{ + return new RGWOp_Usage_Get; +} + +RGWOp *RGWHandler_Usage::op_delete() +{ + return new RGWOp_Usage_Delete; +} + + diff --git a/src/rgw/rgw_rest_usage.h b/src/rgw/rgw_rest_usage.h new file mode 100644 index 000000000..f68edb0ec --- /dev/null +++ b/src/rgw/rgw_rest_usage.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" + + +class RGWHandler_Usage : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_delete() override; +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Usage() override = default; + + int read_permissions(RGWOp*, optional_yield) override { + return 0; + } +}; + +class 
RGWRESTMgr_Usage : public RGWRESTMgr { +public: + RGWRESTMgr_Usage() = default; + ~RGWRESTMgr_Usage() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Usage(auth_registry); + } +}; diff --git a/src/rgw/rgw_rest_user_policy.cc b/src/rgw/rgw_rest_user_policy.cc new file mode 100644 index 000000000..2e300468b --- /dev/null +++ b/src/rgw/rgw_rest_user_policy.cc @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_user_policy.h" +#include "rgw_sal.h" +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + + +void RGWRestUserPolicy::dump(Formatter *f) const +{ + encode_json("PolicyName", policy_name , f); + encode_json("UserName", user_name , f); + encode_json("PolicyDocument", policy, f); +} + +void RGWRestUserPolicy::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWRestUserPolicy::verify_permission(optional_yield y) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if(int ret = check_caps(s->user->get_caps()); ret == 0) { + return ret; + } + + uint64_t op = get_op(); + std::string user_name = s->info.args.get("UserName"); + rgw_user user_id(user_name); + if (! 
verify_user_permission(this, s, rgw::ARN(rgw::ARN(user_id.id, + "user", + user_id.tenant)), op)) { + return -EACCES; + } + return 0; +} + +bool RGWRestUserPolicy::validate_input() +{ + if (policy_name.length() > MAX_POLICY_NAME_LEN) { + ldpp_dout(this, 0) << "ERROR: Invalid policy name length " << dendl; + return false; + } + + std::regex regex_policy_name("[A-Za-z0-9:=,.@-]+"); + if (! std::regex_match(policy_name, regex_policy_name)) { + ldpp_dout(this, 0) << "ERROR: Invalid chars in policy name " << dendl; + return false; + } + + return true; +} + +int RGWUserPolicyRead::check_caps(const RGWUserCaps& caps) +{ + return caps.check_cap("user-policy", RGW_CAP_READ); +} + +int RGWUserPolicyWrite::check_caps(const RGWUserCaps& caps) +{ + return caps.check_cap("user-policy", RGW_CAP_WRITE); +} + +uint64_t RGWPutUserPolicy::get_op() +{ + return rgw::IAM::iamPutUserPolicy; +} + +int RGWPutUserPolicy::get_params() +{ + policy_name = url_decode(s->info.args.get("PolicyName"), true); + user_name = url_decode(s->info.args.get("UserName"), true); + policy = url_decode(s->info.args.get("PolicyDocument"), true); + + if (policy_name.empty() || user_name.empty() || policy.empty()) { + ldpp_dout(this, 20) << "ERROR: one of policy name, user name or policy document is empty" + << dendl; + return -EINVAL; + } + + if (! 
validate_input()) { + return -EINVAL; + } + + return 0; +} + +void RGWPutUserPolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + bufferlist bl = bufferlist::static_from_string(policy); + + std::unique_ptr user = driver->get_user(rgw_user(user_name)); + + op_ret = user->load_user(s, s->yield); + if (op_ret < 0) { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + op_ret = user->read_attrs(s, s->yield); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + ceph::bufferlist in_data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + + try { + const rgw::IAM::Policy p( + s->cct, s->user->get_tenant(), bl, + s->cct->_conf.get_val("rgw_policy_reject_invalid_principals")); + std::map policies; + if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) { + bufferlist out_bl = it->second; + decode(policies, out_bl); + } + bufferlist in_bl; + policies[policy_name] = policy; + constexpr unsigned int USER_POLICIES_MAX_NUM = 100; + const unsigned int max_num = s->cct->_conf->rgw_user_policies_max_num < 0 ? 
+ USER_POLICIES_MAX_NUM : s->cct->_conf->rgw_user_policies_max_num; + if (policies.size() > max_num) { + ldpp_dout(this, 4) << "IAM user policies has reached the num config: " + << max_num << ", cant add another" << dendl; + op_ret = -ERR_INVALID_REQUEST; + s->err.message = + "The number of IAM user policies should not exceed allowed limit " + "of " + + std::to_string(max_num) + " policies."; + return; + } + encode(policies, in_bl); + user->get_attrs()[RGW_ATTR_USER_POLICY] = in_bl; + + op_ret = user->store_user(s, s->yield, false); + if (op_ret < 0) { + op_ret = -ERR_INTERNAL_ERROR; + } + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl; + op_ret = -EIO; + } catch (rgw::IAM::PolicyParseException& e) { + ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << dendl; + s->err.message = e.what(); + op_ret = -ERR_MALFORMED_DOC; + } + + if (op_ret == 0) { + s->formatter->open_object_section("PutUserPolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +uint64_t RGWGetUserPolicy::get_op() +{ + return rgw::IAM::iamGetUserPolicy; +} + +int RGWGetUserPolicy::get_params() +{ + policy_name = s->info.args.get("PolicyName"); + user_name = s->info.args.get("UserName"); + + if (policy_name.empty() || user_name.empty()) { + ldpp_dout(this, 20) << "ERROR: one of policy name or user name is empty" + << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWGetUserPolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + std::unique_ptr user = driver->get_user(rgw_user(user_name)); + op_ret = user->read_attrs(s, s->yield); + if (op_ret == -ENOENT) { + ldpp_dout(this, 0) << "ERROR: attrs not found for user" << user_name << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + if (op_ret == 0) { + 
s->formatter->open_object_section("GetUserPolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("GetUserPolicyResult"); + std::map policies; + if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) { + bufferlist bl = it->second; + try { + decode(policies, bl); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl; + op_ret = -EIO; + return; + } + if (auto it = policies.find(policy_name); it != policies.end()) { + policy = policies[policy_name]; + dump(s->formatter); + } else { + ldpp_dout(this, 0) << "ERROR: policy not found" << policy << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + } else { + ldpp_dout(this, 0) << "ERROR: RGW_ATTR_USER_POLICY not found" << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + s->formatter->close_section(); + s->formatter->close_section(); + } + if (op_ret < 0) { + op_ret = -ERR_INTERNAL_ERROR; + } +} + +uint64_t RGWListUserPolicies::get_op() +{ + return rgw::IAM::iamListUserPolicies; +} + +int RGWListUserPolicies::get_params() +{ + user_name = s->info.args.get("UserName"); + + if (user_name.empty()) { + ldpp_dout(this, 20) << "ERROR: user name is empty" << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWListUserPolicies::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + std::unique_ptr user = driver->get_user(rgw_user(user_name)); + op_ret = user->read_attrs(s, s->yield); + if (op_ret == -ENOENT) { + ldpp_dout(this, 0) << "ERROR: attrs not found for user" << user_name << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + if (op_ret == 0) { + std::map policies; + if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) { + s->formatter->open_object_section("ListUserPoliciesResponse"); + 
s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("ListUserPoliciesResult"); + bufferlist bl = it->second; + try { + decode(policies, bl); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl; + op_ret = -EIO; + return; + } + s->formatter->open_object_section("PolicyNames"); + for (const auto& p : policies) { + s->formatter->dump_string("member", p.first); + } + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); + } else { + ldpp_dout(this, 0) << "ERROR: RGW_ATTR_USER_POLICY not found" << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + } + if (op_ret < 0) { + op_ret = -ERR_INTERNAL_ERROR; + } +} + +uint64_t RGWDeleteUserPolicy::get_op() +{ + return rgw::IAM::iamDeleteUserPolicy; +} + +int RGWDeleteUserPolicy::get_params() +{ + policy_name = s->info.args.get("PolicyName"); + user_name = s->info.args.get("UserName"); + + if (policy_name.empty() || user_name.empty()) { + ldpp_dout(this, 20) << "ERROR: One of policy name or user name is empty"<< dendl; + return -EINVAL; + } + + return 0; +} + +void RGWDeleteUserPolicy::execute(optional_yield y) +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + std::unique_ptr user = driver->get_user(rgw_user(user_name)); + op_ret = user->load_user(s, s->yield); + if (op_ret < 0) { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + op_ret = user->read_attrs(this, s->yield); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + ceph::bufferlist in_data; + op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y); + if (op_ret < 0) { + // a policy might've been uploaded to this site when there was no sync + // req. 
in earlier releases, proceed deletion + if (op_ret != -ENOENT) { + ldpp_dout(this, 5) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + ldpp_dout(this, 0) << "ERROR: forward_request_to_master returned ret=" << op_ret << dendl; + } + + std::map policies; + if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) { + bufferlist out_bl = it->second; + try { + decode(policies, out_bl); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl; + op_ret = -EIO; + return; + } + + if (auto p = policies.find(policy_name); p != policies.end()) { + bufferlist in_bl; + policies.erase(p); + encode(policies, in_bl); + user->get_attrs()[RGW_ATTR_USER_POLICY] = in_bl; + + op_ret = user->store_user(s, s->yield, false); + if (op_ret < 0) { + op_ret = -ERR_INTERNAL_ERROR; + } + if (op_ret == 0) { + s->formatter->open_object_section("DeleteUserPoliciesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } + } else { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + } else { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } +} diff --git a/src/rgw/rgw_rest_user_policy.h b/src/rgw/rgw_rest_user_policy.h new file mode 100644 index 000000000..4a123456e --- /dev/null +++ b/src/rgw/rgw_rest_user_policy.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#include "rgw_rest.h" + +class RGWRestUserPolicy : public RGWRESTOp { +protected: + static constexpr int MAX_POLICY_NAME_LEN = 128; + std::string policy_name; + std::string user_name; + std::string policy; + + bool validate_input(); + +public: + int verify_permission(optional_yield y) override; + virtual uint64_t get_op() = 0; + void send_response() override; + void dump(Formatter *f) const; +}; + +class 
RGWUserPolicyRead : public RGWRestUserPolicy {
+  // Base for the read-type user-policy ops (Get, List); only declares the
+  // capability check, which is defined out of line.
+public:
+  RGWUserPolicyRead() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// Base for the write-type user-policy ops (Put, Delete); only declares the
+// capability check, which is defined out of line.
+class RGWUserPolicyWrite : public RGWRestUserPolicy {
+public:
+  RGWUserPolicyWrite() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// Op of type RGW_OP_PUT_USER_POLICY.
+// NOTE(review): name() uses a hyphen ("put_user-policy") while the three
+// sibling ops use underscores throughout - confirm whether that is intended
+// before relying on the op name.
+class RGWPutUserPolicy : public RGWUserPolicyWrite {
+public:
+  RGWPutUserPolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "put_user-policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_PUT_USER_POLICY; }
+};
+
+// Op of type RGW_OP_GET_USER_POLICY.
+class RGWGetUserPolicy : public RGWUserPolicyRead {
+public:
+  RGWGetUserPolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "get_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_GET_USER_POLICY; }
+};
+
+// Op of type RGW_OP_LIST_USER_POLICIES.
+class RGWListUserPolicies : public RGWUserPolicyRead {
+public:
+  RGWListUserPolicies() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "list_user_policies"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_LIST_USER_POLICIES; }
+};
+
+// Op of type RGW_OP_DELETE_USER_POLICY.
+class RGWDeleteUserPolicy : public RGWUserPolicyWrite {
+public:
+  RGWDeleteUserPolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "delete_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_DELETE_USER_POLICY; }
+};
diff --git a/src/rgw/rgw_role.cc b/src/rgw/rgw_role.cc new file mode 100644 index 000000000..fb188e7f8 --- /dev/null +++ b/src/rgw/rgw_role.cc @@ -0,0 +1,444 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include + +#include "common/errno.h" +#include 
"common/Formatter.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" +#include "rgw_rados.h" +#include "rgw_zone.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_role.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" +#include "services/svc_meta_be_sobj.h" +#include "services/svc_meta.h" +#include "services/svc_role_rados.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw { namespace sal { + +const string RGWRole::role_name_oid_prefix = "role_names."; +const string RGWRole::role_oid_prefix = "roles."; +const string RGWRole::role_path_oid_prefix = "role_paths."; +const string RGWRole::role_arn_prefix = "arn:aws:iam::"; + +void RGWRoleInfo::dump(Formatter *f) const +{ + encode_json("RoleId", id , f); + std::string role_name; + if (tenant.empty()) { + role_name = name; + } else { + role_name = tenant + '$' + name; + } + encode_json("RoleName", role_name , f); + encode_json("Path", path, f); + encode_json("Arn", arn, f); + encode_json("CreateDate", creation_date, f); + encode_json("MaxSessionDuration", max_session_duration, f); + encode_json("AssumeRolePolicyDocument", trust_policy, f); + if (!perm_policy_map.empty()) { + f->open_array_section("PermissionPolicies"); + for (const auto& it : perm_policy_map) { + f->open_object_section("Policy"); + encode_json("PolicyName", it.first, f); + encode_json("PolicyValue", it.second, f); + f->close_section(); + } + f->close_section(); + } + if (!tags.empty()) { + f->open_array_section("Tags"); + for (const auto& it : tags) { + f->open_object_section("Tag"); + encode_json("Key", it.first, f); + encode_json("Value", it.second, f); + f->close_section(); + } + f->close_section(); + } +} + +void RGWRoleInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("RoleId", id, obj); + JSONDecoder::decode_json("RoleName", name, obj); + JSONDecoder::decode_json("Path", path, obj); + 
JSONDecoder::decode_json("Arn", arn, obj); + JSONDecoder::decode_json("CreateDate", creation_date, obj); + JSONDecoder::decode_json("MaxSessionDuration", max_session_duration, obj); + JSONDecoder::decode_json("AssumeRolePolicyDocument", trust_policy, obj); + + auto tags_iter = obj->find_first("Tags"); + if (!tags_iter.end()) { + JSONObj* tags_json = *tags_iter; + auto iter = tags_json->find_first(); + + for (; !iter.end(); ++iter) { + std::string key, val; + JSONDecoder::decode_json("Key", key, *iter); + JSONDecoder::decode_json("Value", val, *iter); + this->tags.emplace(key, val); + } + } + + auto perm_policy_iter = obj->find_first("PermissionPolicies"); + if (!perm_policy_iter.end()) { + JSONObj* perm_policies = *perm_policy_iter; + auto iter = perm_policies->find_first(); + + for (; !iter.end(); ++iter) { + std::string policy_name, policy_val; + JSONDecoder::decode_json("PolicyName", policy_name, *iter); + JSONDecoder::decode_json("PolicyValue", policy_val, *iter); + this->perm_policy_map.emplace(policy_name, policy_val); + } + } + + if (auto pos = name.find('$'); pos != std::string::npos) { + tenant = name.substr(0, pos); + name = name.substr(pos+1); + } +} + +RGWRole::RGWRole(std::string name, + std::string tenant, + std::string path, + std::string trust_policy, + std::string max_session_duration_str, + std::multimap tags) +{ + info.name = std::move(name); + info.path = std::move(path); + info.trust_policy = std::move(trust_policy); + info.tenant = std::move(tenant); + info.tags = std::move(tags); + if (this->info.path.empty()) + this->info.path = "/"; + extract_name_tenant(this->info.name); + if (max_session_duration_str.empty()) { + info.max_session_duration = SESSION_DURATION_MIN; + } else { + info.max_session_duration = std::stoull(max_session_duration_str); + } + info.mtime = real_time(); +} + +RGWRole::RGWRole(std::string id) +{ + info.id = std::move(id); +} + +int RGWRole::get(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = 
read_name(dpp, y); + if (ret < 0) { + return ret; + } + + ret = read_info(dpp, y); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWRole::get_by_id(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = read_info(dpp, y); + if (ret < 0) { + return ret; + } + + return 0; +} + +void RGWRole::dump(Formatter *f) const +{ + info.dump(f); +} + +void RGWRole::decode_json(JSONObj *obj) +{ + info.decode_json(obj); +} + +bool RGWRole::validate_max_session_duration(const DoutPrefixProvider* dpp) +{ + if (info.max_session_duration < SESSION_DURATION_MIN || + info.max_session_duration > SESSION_DURATION_MAX) { + ldpp_dout(dpp, 0) << "ERROR: Invalid session duration, should be between 3600 and 43200 seconds " << dendl; + return false; + } + return true; +} + +bool RGWRole::validate_input(const DoutPrefixProvider* dpp) +{ + if (info.name.length() > MAX_ROLE_NAME_LEN) { + ldpp_dout(dpp, 0) << "ERROR: Invalid name length " << dendl; + return false; + } + + if (info.path.length() > MAX_PATH_NAME_LEN) { + ldpp_dout(dpp, 0) << "ERROR: Invalid path length " << dendl; + return false; + } + + std::regex regex_name("[A-Za-z0-9:=,.@-]+"); + if (! std::regex_match(info.name, regex_name)) { + ldpp_dout(dpp, 0) << "ERROR: Invalid chars in name " << dendl; + return false; + } + + std::regex regex_path("(/[!-~]+/)|(/)"); + if (! 
std::regex_match(info.path,regex_path)) { + ldpp_dout(dpp, 0) << "ERROR: Invalid chars in path " << dendl; + return false; + } + + if (!validate_max_session_duration(dpp)) { + return false; + } + return true; +} + +void RGWRole::extract_name_tenant(const std::string& str) { + if (auto pos = str.find('$'); + pos != std::string::npos) { + info.tenant = str.substr(0, pos); + info.name = str.substr(pos+1); + } +} + +int RGWRole::update(const DoutPrefixProvider *dpp, optional_yield y) +{ + int ret = store_info(dpp, false, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing info in Role pool: " + << info.id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +void RGWRole::set_perm_policy(const string& policy_name, const string& perm_policy) +{ + info.perm_policy_map[policy_name] = perm_policy; +} + +vector RGWRole::get_role_policy_names() +{ + vector policy_names; + for (const auto& it : info.perm_policy_map) + { + policy_names.push_back(std::move(it.first)); + } + + return policy_names; +} + +int RGWRole::get_role_policy(const DoutPrefixProvider* dpp, const string& policy_name, string& perm_policy) +{ + const auto it = info.perm_policy_map.find(policy_name); + if (it == info.perm_policy_map.end()) { + ldpp_dout(dpp, 0) << "ERROR: Policy name: " << policy_name << " not found" << dendl; + return -ENOENT; + } else { + perm_policy = it->second; + } + return 0; +} + +int RGWRole::delete_policy(const DoutPrefixProvider* dpp, const string& policy_name) +{ + const auto& it = info.perm_policy_map.find(policy_name); + if (it == info.perm_policy_map.end()) { + ldpp_dout(dpp, 0) << "ERROR: Policy name: " << policy_name << " not found" << dendl; + return -ENOENT; + } else { + info.perm_policy_map.erase(it); + } + return 0; +} + +void RGWRole::update_trust_policy(string& trust_policy) +{ + this->info.trust_policy = trust_policy; +} + +int RGWRole::set_tags(const DoutPrefixProvider* dpp, const multimap& tags_map) +{ + for (auto& it : tags_map) { + 
this->info.tags.emplace(it.first, it.second); + } + if (this->info.tags.size() > 50) { + ldpp_dout(dpp, 0) << "No. of tags is greater than 50" << dendl; + return -EINVAL; + } + return 0; +} + +boost::optional> RGWRole::get_tags() +{ + if(this->info.tags.empty()) { + return boost::none; + } + return this->info.tags; +} + +void RGWRole::erase_tags(const vector& tagKeys) +{ + for (auto& it : tagKeys) { + this->info.tags.erase(it); + } +} + +void RGWRole::update_max_session_duration(const std::string& max_session_duration_str) +{ + if (max_session_duration_str.empty()) { + info.max_session_duration = SESSION_DURATION_MIN; + } else { + info.max_session_duration = std::stoull(max_session_duration_str); + } +} + +const string& RGWRole::get_names_oid_prefix() +{ + return role_name_oid_prefix; +} + +const string& RGWRole::get_info_oid_prefix() +{ + return role_oid_prefix; +} + +const string& RGWRole::get_path_oid_prefix() +{ + return role_path_oid_prefix; +} + +RGWRoleMetadataHandler::RGWRoleMetadataHandler(Driver* driver, + RGWSI_Role_RADOS *role_svc) +{ + this->driver = driver; + base_init(role_svc->ctx(), role_svc->get_be_handler()); +} + +RGWMetadataObject *RGWRoleMetadataHandler::get_meta_obj(JSONObj *jo, + const obj_version& objv, + const ceph::real_time& mtime) +{ + RGWRoleInfo info; + + try { + info.decode_json(jo); + } catch (JSONDecoder:: err& e) { + return nullptr; + } + + return new RGWRoleMetadataObject(info, objv, mtime, driver); +} + +int RGWRoleMetadataHandler::do_get(RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, + RGWMetadataObject **obj, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + std::unique_ptr role = driver->get_role(entry); + int ret = role->read_info(dpp, y); + if (ret < 0) { + return ret; + } + + RGWObjVersionTracker objv_tracker = role->get_objv_tracker(); + real_time mtime = role->get_mtime(); + + RGWRoleInfo info = role->get_info(); + RGWRoleMetadataObject *rdo = new RGWRoleMetadataObject(info, objv_tracker.read_version, + 
mtime, driver); + *obj = rdo; + + return 0; +} + +int RGWRoleMetadataHandler::do_remove(RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + std::unique_ptr role = driver->get_role(entry); + int ret = role->read_info(dpp, y); + if (ret < 0) { + return ret == -ENOENT? 0 : ret; + } + + return role->delete_obj(dpp, y); +} + +class RGWMetadataHandlerPut_Role : public RGWMetadataHandlerPut_SObj +{ + RGWRoleMetadataHandler *rhandler; + RGWRoleMetadataObject *mdo; +public: + RGWMetadataHandlerPut_Role(RGWRoleMetadataHandler *handler, + RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + RGWMDLogSyncType type, + bool from_remote_zone) : + RGWMetadataHandlerPut_SObj(handler, op, entry, obj, objv_tracker, y, type, from_remote_zone), + rhandler(handler) { + mdo = static_cast(obj); + } + + int put_checked(const DoutPrefixProvider *dpp) override { + auto& info = mdo->get_role_info(); + auto mtime = mdo->get_mtime(); + auto* driver = mdo->get_driver(); + info.mtime = mtime; + std::unique_ptr role = driver->get_role(info); + int ret = role->create(dpp, true, info.id, y); + if (ret == -EEXIST) { + ret = role->update(dpp, y); + } + + return ret < 0 ? 
ret : STATUS_APPLIED; + } +}; + +int RGWRoleMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, + bool from_remote_zone) +{ + RGWMetadataHandlerPut_Role put_op(this, op , entry, obj, objv_tracker, y, type, from_remote_zone); + return do_put_operate(&put_op, dpp); +} + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_role.h b/src/rgw/rgw_role.h new file mode 100644 index 000000000..9183829d9 --- /dev/null +++ b/src/rgw/rgw_role.h @@ -0,0 +1,209 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include + +#include "common/async/yield_context.h" + +#include "common/ceph_json.h" +#include "common/ceph_context.h" +#include "rgw_rados.h" +#include "rgw_metadata.h" + +class RGWRados; + +namespace rgw { namespace sal { +struct RGWRoleInfo +{ + std::string id; + std::string name; + std::string path; + std::string arn; + std::string creation_date; + std::string trust_policy; + std::map perm_policy_map; + std::string tenant; + uint64_t max_session_duration; + std::multimap tags; + std::map attrs; + RGWObjVersionTracker objv_tracker; + real_time mtime; + + RGWRoleInfo() = default; + + ~RGWRoleInfo() = default; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(id, bl); + encode(name, bl); + encode(path, bl); + encode(arn, bl); + encode(creation_date, bl); + encode(trust_policy, bl); + encode(perm_policy_map, bl); + encode(tenant, bl); + encode(max_session_duration, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(id, bl); + decode(name, bl); + decode(path, bl); + decode(arn, bl); + decode(creation_date, bl); + decode(trust_policy, bl); + decode(perm_policy_map, bl); + if (struct_v >= 2) { + decode(tenant, bl); + } + if (struct_v >= 3) { + 
decode(max_session_duration, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWRoleInfo) + +class RGWRole +{ +public: + static const std::string role_name_oid_prefix; + static const std::string role_oid_prefix; + static const std::string role_path_oid_prefix; + static const std::string role_arn_prefix; + static constexpr int MAX_ROLE_NAME_LEN = 64; + static constexpr int MAX_PATH_NAME_LEN = 512; + static constexpr uint64_t SESSION_DURATION_MIN = 3600; // in seconds + static constexpr uint64_t SESSION_DURATION_MAX = 43200; // in seconds +protected: + RGWRoleInfo info; +public: + virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) = 0; + virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) = 0; + virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) = 0; + virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) = 0; + virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) = 0; + virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) = 0; + bool validate_max_session_duration(const DoutPrefixProvider* dpp); + bool validate_input(const DoutPrefixProvider* dpp); + void extract_name_tenant(const std::string& str); + + RGWRole(std::string name, + std::string tenant, + std::string path="", + std::string trust_policy="", + std::string max_session_duration_str="", + std::multimap tags={}); + + explicit RGWRole(std::string id); + + explicit RGWRole(const RGWRoleInfo& info) : info(info) {} + + RGWRole() = default; + + virtual ~RGWRole() = default; + + const std::string& get_id() const { return info.id; } + const std::string& get_name() const { return info.name; } + const std::string& get_tenant() const { return info.tenant; } + const std::string& get_path() const { return 
info.path; } + const std::string& get_create_date() const { return info.creation_date; } + const std::string& get_assume_role_policy() const { return info.trust_policy;} + const uint64_t& get_max_session_duration() const { return info.max_session_duration; } + const RGWObjVersionTracker& get_objv_tracker() const { return info.objv_tracker; } + const real_time& get_mtime() const { return info.mtime; } + std::map& get_attrs() { return info.attrs; } + RGWRoleInfo& get_info() { return info; } + + void set_id(const std::string& id) { this->info.id = id; } + void set_mtime(const real_time& mtime) { this->info.mtime = mtime; } + + virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string &role_id, optional_yield y) = 0; + virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) = 0; + int get(const DoutPrefixProvider *dpp, optional_yield y); + int get_by_id(const DoutPrefixProvider *dpp, optional_yield y); + int update(const DoutPrefixProvider *dpp, optional_yield y); + void update_trust_policy(std::string& trust_policy); + void set_perm_policy(const std::string& policy_name, const std::string& perm_policy); + std::vector get_role_policy_names(); + int get_role_policy(const DoutPrefixProvider* dpp, const std::string& policy_name, std::string& perm_policy); + int delete_policy(const DoutPrefixProvider* dpp, const std::string& policy_name); + int set_tags(const DoutPrefixProvider* dpp, const std::multimap& tags_map); + boost::optional> get_tags(); + void erase_tags(const std::vector& tagKeys); + void update_max_session_duration(const std::string& max_session_duration_str); + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + static const std::string& get_names_oid_prefix(); + static const std::string& get_info_oid_prefix(); + static const std::string& get_path_oid_prefix(); +}; + +class RGWRoleMetadataObject: public RGWMetadataObject { + RGWRoleInfo info; + Driver* driver; +public: + RGWRoleMetadataObject() = 
default; + RGWRoleMetadataObject(RGWRoleInfo& info, + const obj_version& v, + real_time m, + Driver* driver) : RGWMetadataObject(v,m), info(info), driver(driver) {} + + void dump(Formatter *f) const override { + info.dump(f); + } + + RGWRoleInfo& get_role_info() { + return info; + } + + Driver* get_driver() { + return driver; + } +}; + +class RGWRoleMetadataHandler: public RGWMetadataHandler_GenericMetaBE +{ +public: + RGWRoleMetadataHandler(Driver* driver, RGWSI_Role_RADOS *role_svc); + + std::string get_type() final { return "roles"; } + + RGWMetadataObject *get_meta_obj(JSONObj *jo, + const obj_version& objv, + const ceph::real_time& mtime); + + int do_get(RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, + RGWMetadataObject **obj, + optional_yield y, + const DoutPrefixProvider *dpp) final; + + int do_remove(RGWSI_MetaBackend_Handler::Op *op, + std::string& entry, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) final; + + int do_put(RGWSI_MetaBackend_Handler::Op *op, + std::string& entr, + RGWMetadataObject *obj, + RGWObjVersionTracker& objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWMDLogSyncType type, + bool from_remote_zone) override; + +private: + Driver* driver; +}; +} } // namespace rgw::sal diff --git a/src/rgw/rgw_s3select.cc b/src/rgw/rgw_s3select.cc new file mode 100644 index 000000000..c7eaa6984 --- /dev/null +++ b/src/rgw/rgw_s3select.cc @@ -0,0 +1,1001 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_s3select_private.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::s3select { +RGWOp* create_s3select_op() +{ + return new RGWSelectObj_ObjStore_S3(); +} +}; + +using namespace s3selectEngine; + +std::string& aws_response_handler::get_sql_result() +{ + return sql_result; +} + +uint64_t aws_response_handler::get_processed_size() +{ + return processed_size; +} + +void 
aws_response_handler::update_processed_size(uint64_t value) +{ + processed_size += value; +} + +uint64_t aws_response_handler::get_total_bytes_returned() +{ + return total_bytes_returned; +} + +void aws_response_handler::update_total_bytes_returned(uint64_t value) +{ + total_bytes_returned = value; +} + +void aws_response_handler::push_header(const char* header_name, const char* header_value) +{ + char x; + short s; + x = char(strlen(header_name)); + m_buff_header.append(&x, sizeof(x)); + m_buff_header.append(header_name); + x = char(7); + m_buff_header.append(&x, sizeof(x)); + s = htons(uint16_t(strlen(header_value))); + m_buff_header.append(reinterpret_cast(&s), sizeof(s)); + m_buff_header.append(header_value); +} + +#define IDX( x ) static_cast( x ) + +int aws_response_handler::create_header_records() +{ + //headers description(AWS) + //[header-name-byte-length:1][header-name:variable-length][header-value-type:1][header-value:variable-length] + //1 + push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::RECORDS)]); + //2 + push_header(header_name_str[IDX(header_name_En::CONTENT_TYPE)], header_value_str[IDX(header_value_En::OCTET_STREAM)]); + //3 + push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]); + return m_buff_header.size(); +} + +int aws_response_handler::create_header_continuation() +{ + //headers description(AWS) + //1 + push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::CONT)]); + //2 + push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]); + return m_buff_header.size(); +} + +int aws_response_handler::create_header_progress() +{ + //headers description(AWS) + //1 + push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::PROGRESS)]); + //2 + 
push_header(header_name_str[IDX(header_name_En::CONTENT_TYPE)], header_value_str[IDX(header_value_En::XML)]); + //3 + push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]); + return m_buff_header.size(); +} + +int aws_response_handler::create_header_stats() +{ + //headers description(AWS) + //1 + push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::STATS)]); + //2 + push_header(header_name_str[IDX(header_name_En::CONTENT_TYPE)], header_value_str[IDX(header_value_En::XML)]); + //3 + push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]); + return m_buff_header.size(); +} + +int aws_response_handler::create_header_end() +{ + //headers description(AWS) + //1 + push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::END)]); + //2 + push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]); + return m_buff_header.size(); +} + +int aws_response_handler::create_error_header_records(const char* error_message) +{ + //headers description(AWS) + //[header-name-byte-length:1][header-name:variable-length][header-value-type:1][header-value:variable-length] + //1 + push_header(header_name_str[IDX(header_name_En::ERROR_CODE)], header_value_str[IDX(header_value_En::ENGINE_ERROR)]); + //2 + push_header(header_name_str[IDX(header_name_En::ERROR_MESSAGE)], error_message); + //3 + push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::ERROR_TYPE)]); + return m_buff_header.size(); +} + +int aws_response_handler::create_message(u_int32_t header_len) +{ + //message description(AWS): + //[total-byte-length:4][header-byte-length:4][crc:4][headers:variable-length][payload:variable-length][crc:4] + //s3select result is produced into sql_result, the sql_result is also the response-message, thus 
the attach headers and CRC + //are created later to the produced SQL result, and actually wrapping the payload. + auto push_encode_int = [&](u_int32_t s, int pos) { + u_int32_t x = htonl(s); + sql_result.replace(pos, sizeof(x), reinterpret_cast(&x), sizeof(x)); + }; + u_int32_t total_byte_len = 0; + u_int32_t preload_crc = 0; + u_int32_t message_crc = 0; + total_byte_len = sql_result.size() + 4; //the total is greater in 4 bytes than current size + push_encode_int(total_byte_len, 0); + push_encode_int(header_len, 4); + crc32.reset(); + crc32 = std::for_each(sql_result.data(), sql_result.data() + 8, crc32); //crc for starting 8 bytes + preload_crc = crc32(); + push_encode_int(preload_crc, 8); + crc32.reset(); + crc32 = std::for_each(sql_result.begin(), sql_result.end(), crc32); //crc for payload + checksum + message_crc = crc32(); + u_int32_t x = htonl(message_crc); + sql_result.append(reinterpret_cast(&x), sizeof(x)); + return sql_result.size(); +} + +void aws_response_handler::init_response() +{ + //12 positions for header-crc + sql_result.resize(header_crc_size, '\0'); +} + +void aws_response_handler::init_success_response() +{ + m_buff_header.clear(); + header_size = create_header_records(); + sql_result.append(m_buff_header.c_str(), header_size); +#ifdef PAYLOAD_TAG + sql_result.append(PAYLOAD_LINE); +#endif +} + +void aws_response_handler::send_continuation_response() +{ + sql_result.resize(header_crc_size, '\0'); + m_buff_header.clear(); + header_size = create_header_continuation(); + sql_result.append(m_buff_header.c_str(), header_size); + int buff_len = create_message(header_size); + s->formatter->write_bin_data(sql_result.data(), buff_len); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void aws_response_handler::init_progress_response() +{ + sql_result.resize(header_crc_size, '\0'); + m_buff_header.clear(); + header_size = create_header_progress(); + sql_result.append(m_buff_header.c_str(), header_size); +} + +void 
aws_response_handler::init_stats_response() +{ + sql_result.resize(header_crc_size, '\0'); + m_buff_header.clear(); + header_size = create_header_stats(); + sql_result.append(m_buff_header.c_str(), header_size); +} + +void aws_response_handler::init_end_response() +{ + sql_result.resize(header_crc_size, '\0'); + m_buff_header.clear(); + header_size = create_header_end(); + sql_result.append(m_buff_header.c_str(), header_size); + int buff_len = create_message(header_size); + s->formatter->write_bin_data(sql_result.data(), buff_len); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void aws_response_handler::init_error_response(const char* error_message) +{ + //currently not in use. the headers in the case of error, are not extracted by AWS-cli. + m_buff_header.clear(); + header_size = create_error_header_records(error_message); + sql_result.append(m_buff_header.c_str(), header_size); +} + +void aws_response_handler::send_success_response() +{ +#ifdef PAYLOAD_TAG + sql_result.append(END_PAYLOAD_LINE); +#endif + int buff_len = create_message(header_size); + s->formatter->write_bin_data(sql_result.data(), buff_len); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void aws_response_handler::send_error_response(const char* error_code, + const char* error_message, + const char* resource_id) +{ + set_req_state_err(s, 0); + dump_errno(s, 400); + end_header(s, m_rgwop, "application/xml", CHUNKED_TRANSFER_ENCODING); + dump_start(s); + s->formatter->open_object_section("Error"); + s->formatter->dump_string("Code", error_code); + s->formatter->dump_string("Message", error_message); + s->formatter->dump_string("Resource", "#Resource#"); + s->formatter->dump_string("RequestId", resource_id); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void aws_response_handler::send_progress_response() +{ + std::string progress_payload = fmt::format("{}{}{}" + , get_processed_size(), get_processed_size(), get_total_bytes_returned()); + 
sql_result.append(progress_payload); + int buff_len = create_message(header_size); + s->formatter->write_bin_data(sql_result.data(), buff_len); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void aws_response_handler::send_stats_response() +{ + std::string stats_payload = fmt::format("{}{}{}" + , get_processed_size(), get_processed_size(), get_total_bytes_returned()); + sql_result.append(stats_payload); + int buff_len = create_message(header_size); + s->formatter->write_bin_data(sql_result.data(), buff_len); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +RGWSelectObj_ObjStore_S3::RGWSelectObj_ObjStore_S3(): + m_buff_header(std::make_unique(1000)), + m_scan_range_ind(false), + m_start_scan_sz(0), + m_end_scan_sz(0), + m_object_size_for_processing(0), + m_parquet_type(false), + m_json_type(false), + chunk_number(0), + m_requested_range(0), + m_scan_offset(1024), + m_skip_next_chunk(false), + m_is_trino_request(false) +{ + set_get_data(true); + fp_get_obj_size = [&]() { + return get_obj_size(); + }; + fp_range_req = [&](int64_t start, int64_t len, void* buff, optional_yield* y) { + ldout(s->cct, 10) << "S3select: range-request start: " << start << " length: " << len << dendl; + auto status = range_request(start, len, buff, *y); + return status; + }; +#ifdef _ARROW_EXIST + m_rgw_api.set_get_size_api(fp_get_obj_size); + m_rgw_api.set_range_req_api(fp_range_req); +#endif + fp_result_header_format = [this](std::string& result) { + m_aws_response_handler.init_response(); + m_aws_response_handler.init_success_response(); + return 0; + }; + fp_s3select_result_format = [this](std::string& result) { + fp_chunked_transfer_encoding(); + m_aws_response_handler.send_success_response(); + return 0; + }; + + fp_debug_mesg = [&](const char* mesg){ + ldpp_dout(this, 10) << mesg << dendl; + }; + + fp_chunked_transfer_encoding = [&](void){ + if (chunk_number == 0) { + if (op_ret < 0) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + // Explicitly use chunked 
transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + } + chunk_number++; + }; +} + +RGWSelectObj_ObjStore_S3::~RGWSelectObj_ObjStore_S3() +{} + +int RGWSelectObj_ObjStore_S3::get_params(optional_yield y) +{ + if(m_s3select_query.empty() == false) { + return 0; + } +#ifndef _ARROW_EXIST + m_parquet_type = false; + ldpp_dout(this, 10) << "arrow library is not installed" << dendl; +#endif + + //retrieve s3-select query from payload + bufferlist data; + int ret; + int max_size = 4096; + std::tie(ret, data) = read_all_input(s, max_size, false); + if (ret != 0) { + ldpp_dout(this, 10) << "s3-select query: failed to retrieve query; ret = " << ret << dendl; + return ret; + } + m_s3select_query = data.to_str(); + if (m_s3select_query.length() > 0) { + ldpp_dout(this, 10) << "s3-select query: " << m_s3select_query << dendl; + } else { + ldpp_dout(this, 10) << "s3-select query: failed to retrieve query;" << dendl; + return -1; + } + const auto& m = s->info.env->get_map(); + auto user_agent = m.find("HTTP_USER_AGENT"); { + if (user_agent != m.end()){ + if (user_agent->second.find("Trino") != std::string::npos){ + m_is_trino_request = true; + ldpp_dout(this, 10) << "s3-select query: request sent by Trino." 
<< dendl; + } + } + } + + int status = handle_aws_cli_parameters(m_sql_query); + if (status<0) { + return status; + } + return RGWGetObj_ObjStore_S3::get_params(y); +} + +int RGWSelectObj_ObjStore_S3::run_s3select_on_csv(const char* query, const char* input, size_t input_length) +{ + int status = 0; + uint32_t length_before_processing, length_post_processing; + csv_object::csv_defintions csv; + const char* s3select_syntax_error = "s3select-Syntax-Error"; + const char* s3select_resource_id = "resourcse-id"; + const char* s3select_processTime_error = "s3select-ProcessingTime-Error"; + + s3select_syntax.parse_query(query); + if (m_row_delimiter.size()) { + csv.row_delimiter = *m_row_delimiter.c_str(); + } + if (m_column_delimiter.size()) { + csv.column_delimiter = *m_column_delimiter.c_str(); + } + if (m_quot.size()) { + csv.quot_char = *m_quot.c_str(); + } + if (m_escape_char.size()) { + csv.escape_char = *m_escape_char.c_str(); + } + if (output_row_delimiter.size()) { + csv.output_row_delimiter = *output_row_delimiter.c_str(); + } + if (output_column_delimiter.size()) { + csv.output_column_delimiter = *output_column_delimiter.c_str(); + } + if (output_quot.size()) { + csv.output_quot_char = *output_quot.c_str(); + } + if (output_escape_char.size()) { + csv.output_escape_char = *output_escape_char.c_str(); + } + if(output_quote_fields.compare("ALWAYS") == 0) { + csv.quote_fields_always = true; + } else if(output_quote_fields.compare("ASNEEDED") == 0) { + csv.quote_fields_asneeded = true; + } + if(m_header_info.compare("IGNORE")==0) { + csv.ignore_header_info=true; + } else if(m_header_info.compare("USE")==0) { + csv.use_header_info=true; + } + //m_s3_csv_object.set_external_debug_system(fp_debug_mesg); + m_s3_csv_object.set_result_formatters(fp_s3select_result_format,fp_result_header_format); + m_s3_csv_object.set_csv_query(&s3select_syntax, csv); + if (s3select_syntax.get_error_description().empty() == false) { + //error-flow (syntax-error) + 
m_aws_response_handler.send_error_response(s3select_syntax_error, + s3select_syntax.get_error_description().c_str(), + s3select_resource_id); + ldpp_dout(this, 10) << "s3-select query: failed to prase the following query {" << query << "}" << dendl; + ldpp_dout(this, 10) << "s3-select query: syntax-error {" << s3select_syntax.get_error_description() << "}" << dendl; + return -1; + } else { /* syntax is valid: run the CSV engine on this chunk; a null input is treated as an empty string */ + if (input == nullptr) { + input = ""; + } + fp_result_header_format(m_aws_response_handler.get_sql_result()); + length_before_processing = m_s3_csv_object.get_return_result_size(); + //query is correct(syntax), processing is starting. + status = m_s3_csv_object.run_s3select_on_stream(m_aws_response_handler.get_sql_result(), input, input_length, m_object_size_for_processing); + length_post_processing = m_s3_csv_object.get_return_result_size(); + m_aws_response_handler.update_total_bytes_returned( m_s3_csv_object.get_return_result_size() ); + + if (status < 0) { + //error flow(processing-time) + m_aws_response_handler.send_error_response(s3select_processTime_error, + m_s3_csv_object.get_error_description().c_str(), + s3select_resource_id); + ldpp_dout(this, 10) << "s3-select query: failed to process query; {" << m_s3_csv_object.get_error_description() << "}" << dendl; + return -1; + } + + } /* when the engine's result size did not change for this chunk, a continuation message is sent instead */ + if ((length_post_processing-length_before_processing) != 0) { + ldpp_dout(this, 10) << "s3-select: sql-result-size = " << m_aws_response_handler.get_sql_result().size() << dendl; + } else { + m_aws_response_handler.send_continuation_response(); + } + ldpp_dout(this, 10) << "s3-select: complete chunk processing : chunk length = " << input_length << dendl; + if (enable_progress == true) { + fp_chunked_transfer_encoding(); + m_aws_response_handler.init_progress_response(); + m_aws_response_handler.send_progress_response(); + } + return status; +} + +/* Run the query against a Parquet object. The whole body sits inside #ifdef _ARROW_EXIST; without it the function just returns 0. The `query` argument is not used in the visible code, which parses m_sql_query instead. */ int RGWSelectObj_ObjStore_S3::run_s3select_on_parquet(const char* query) +{ + int status = 0; +#ifdef _ARROW_EXIST + if 
(!m_s3_parquet_object.is_set()) { /* first call only: parse the SQL and bind the Parquet reader to the range-request API */ + //parsing the SQL statement. + s3select_syntax.parse_query(m_sql_query.c_str()); + //m_s3_parquet_object.set_external_debug_system(fp_debug_mesg); + try { + //at this stage the Parquet-processing requires for the meta-data that reside on Parquet object + m_s3_parquet_object.set_parquet_object(std::string("s3object"), &s3select_syntax, &m_rgw_api); + } catch(base_s3select_exception& e) { /* reader construction failed: the exception text is sent as the result payload and -1 is returned */ + ldpp_dout(this, 10) << "S3select: failed upon parquet-reader construction: " << e.what() << dendl; + fp_result_header_format(m_aws_response_handler.get_sql_result()); + m_aws_response_handler.get_sql_result().append(e.what()); + fp_s3select_result_format(m_aws_response_handler.get_sql_result()); + return -1; + } + } /* a syntax error is likewise reported through the normal result channel, with status set to -1 */ + if (s3select_syntax.get_error_description().empty() == false) { + //the SQL statement failed the syntax parser + fp_result_header_format(m_aws_response_handler.get_sql_result()); + m_aws_response_handler.get_sql_result().append(s3select_syntax.get_error_description().data()); + fp_s3select_result_format(m_aws_response_handler.get_sql_result()); + ldpp_dout(this, 10) << "s3-select query: failed to prase query; {" << s3select_syntax.get_error_description() << "}" << dendl; + status = -1; + } else { + fp_result_header_format(m_aws_response_handler.get_sql_result()); + //at this stage the Parquet-processing "takes control", it keep calling to s3-range-request according to the SQL statement. 
+ status = m_s3_parquet_object.run_s3select_on_object(m_aws_response_handler.get_sql_result(), fp_s3select_result_format, fp_result_header_format); + if (status < 0) { /* execution failure: the engine's error text is appended and sent as the result payload */ + m_aws_response_handler.get_sql_result().append(m_s3_parquet_object.get_error_description()); + fp_s3select_result_format(m_aws_response_handler.get_sql_result()); + ldout(s->cct, 10) << "S3select: failure while execution" << m_s3_parquet_object.get_error_description() << dendl; + } + } +#endif + return status; +} + +/* Run the query over one chunk of JSON input. Returns -EINVAL when the JSON type is not "DOCUMENT", on a syntax error, or on a processing error/exception; otherwise the engine's status. The `query` argument is not used in the visible code, which parses m_sql_query instead. */ int RGWSelectObj_ObjStore_S3::run_s3select_on_json(const char* query, const char* input, size_t input_length) +{ + int status = 0; + + const char* s3select_processTime_error = "s3select-ProcessingTime-Error"; + const char* s3select_syntax_error = "s3select-Syntax-Error"; + const char* s3select_resource_id = "resourcse-id"; + const char* s3select_json_error = "json-Format-Error"; + + m_aws_response_handler.init_response(); + + //the JSON data-type should be(currently) only DOCUMENT + if (m_json_datatype.compare("DOCUMENT") != 0) { + const char* s3select_json_error_msg = "s3-select query: wrong json dataType should use DOCUMENT; "; + m_aws_response_handler.send_error_response(s3select_json_error, + s3select_json_error_msg, + s3select_resource_id); + ldpp_dout(this, 10) << s3select_json_error_msg << dendl; + return -EINVAL; + } + + //parsing the SQL statement + s3select_syntax.parse_query(m_sql_query.c_str()); + if (s3select_syntax.get_error_description().empty() == false) { + //SQL statement is wrong(syntax). 
+ m_aws_response_handler.send_error_response(s3select_syntax_error, + s3select_syntax.get_error_description().c_str(), + s3select_resource_id); + ldpp_dout(this, 10) << "s3-select query: failed to prase query; {" << s3select_syntax.get_error_description() << "}" << dendl; + return -EINVAL; + } + + /* syntax is valid: hand the query to the JSON engine and process this chunk */ //initializing json processor + m_s3_json_object.set_json_query(&s3select_syntax); + + if (input == nullptr) { + input = ""; + } + m_aws_response_handler.init_success_response(); + uint32_t length_before_processing = m_aws_response_handler.get_sql_result().size(); + //query is correct(syntax), processing is starting. + try { + status = m_s3_json_object.run_s3select_on_stream(m_aws_response_handler.get_sql_result(), input, input_length, m_object_size_for_processing); + } catch(base_s3select_exception& e) { + ldpp_dout(this, 10) << "S3select: failed to process JSON object: " << e.what() << dendl; + m_aws_response_handler.get_sql_result().append(e.what()); + m_aws_response_handler.send_error_response(s3select_processTime_error, + e.what(), + s3select_resource_id); + return -EINVAL; + } + /* bytes added to the result buffer by this chunk; recorded via update_total_bytes_returned(), which overwrites rather than accumulates */ uint32_t length_post_processing = m_aws_response_handler.get_sql_result().size(); + m_aws_response_handler.update_total_bytes_returned(length_post_processing - length_before_processing); + if (status < 0) { + //error flow(processing-time) + m_aws_response_handler.send_error_response(s3select_processTime_error, + m_s3_json_object.get_error_description().c_str(), + s3select_resource_id); + ldpp_dout(this, 10) << "s3-select query: failed to process query; {" << m_s3_json_object.get_error_description() << "}" << dendl; + return -EINVAL; + } + fp_chunked_transfer_encoding(); + + /* send the result when the chunk produced output, otherwise a continuation message */ if (length_post_processing-length_before_processing != 0) { + m_aws_response_handler.send_success_response(); + } else { + m_aws_response_handler.send_continuation_response(); + } + if (enable_progress == true) { + m_aws_response_handler.init_progress_response(); + m_aws_response_handler.send_progress_response(); + } + + 
return status; +} + +/* Parse the request held in m_s3select_query into the member settings (input/output delimiters, quoting, header info, compression type, scan range, progress flag). Does nothing and returns 0 when chunk_number is non-zero. Returns -1 for a compression type other than "NONE", else 0. NOTE(review): this text is visibly damaged by extraction - the GT/LT/APOS replacements are no-ops as written here, the statement starting `if (m_s3select_query.find(input_tag+">", 0);` is truncated, and two find("") calls have lost their search strings. Restore from upstream before relying on it. */ int RGWSelectObj_ObjStore_S3::handle_aws_cli_parameters(std::string& sql_query) +{ + std::string input_tag{"InputSerialization"}; + std::string output_tag{"OutputSerialization"}; + if (chunk_number !=0) { + return 0; + } +#define GT ">" +#define LT "<" +#define APOS "'" + + if (m_s3select_query.find(GT) != std::string::npos) { + boost::replace_all(m_s3select_query, GT, ">"); + } + if (m_s3select_query.find(LT) != std::string::npos) { + boost::replace_all(m_s3select_query, LT, "<"); + } + if (m_s3select_query.find(APOS) != std::string::npos) { + boost::replace_all(m_s3select_query, APOS, "'"); + } + //AWS cli s3select parameters + if (m_s3select_query.find(input_tag+">", 0); + size_t _qe = m_s3select_query.find("", _qi); + m_s3select_input = m_s3select_query.substr(_qi + input_tag.size() + 2, _qe - (_qi + input_tag.size() + 2)); + extract_by_tag(m_s3select_input, "FieldDelimiter", m_column_delimiter); + extract_by_tag(m_s3select_input, "QuoteCharacter", m_quot); + extract_by_tag(m_s3select_input, "RecordDelimiter", m_row_delimiter); + extract_by_tag(m_s3select_input, "FileHeaderInfo", m_header_info); + extract_by_tag(m_s3select_input, "Type", m_json_datatype); + if (m_row_delimiter.size()==0) { + m_row_delimiter='\n'; + } else if (m_row_delimiter.compare(" ") == 0) { + //presto change + m_row_delimiter='\n'; + } + extract_by_tag(m_s3select_input, "QuoteEscapeCharacter", m_escape_char); + extract_by_tag(m_s3select_input, "CompressionType", m_compression_type); + size_t _qo = m_s3select_query.find("<" + output_tag + ">", 0); + size_t _qs = m_s3select_query.find("", _qi); + m_s3select_output = m_s3select_query.substr(_qo + output_tag.size() + 2, _qs - (_qo + output_tag.size() + 2)); + extract_by_tag(m_s3select_output, "FieldDelimiter", output_column_delimiter); + extract_by_tag(m_s3select_output, "QuoteCharacter", output_quot); + extract_by_tag(m_s3select_output, "QuoteEscapeCharacter", output_escape_char); + extract_by_tag(m_s3select_output, 
"QuoteFields", output_quote_fields); + extract_by_tag(m_s3select_output, "RecordDelimiter", output_row_delimiter); + if (output_row_delimiter.size()==0) { + output_row_delimiter='\n'; + } else if (output_row_delimiter.compare(" ") == 0) { + //presto change + output_row_delimiter='\n'; + } + if (m_compression_type.length()>0 && m_compression_type.compare("NONE") != 0) { + ldpp_dout(this, 10) << "RGW supports currently only NONE option for compression type" << dendl; + return -1; + } + extract_by_tag(m_s3select_query, "Start", m_start_scan); + extract_by_tag(m_s3select_query, "End", m_end_scan); + /* NOTE(review): std::stol throws on non-numeric or out-of-range text and nothing here catches it; the template argument of std::numeric_limits was lost in extraction */ if (m_start_scan.size() || m_end_scan.size()) { + m_scan_range_ind = true; + if (m_start_scan.size()) { + m_start_scan_sz = std::stol(m_start_scan); + } + if (m_end_scan.size()) { + m_end_scan_sz = std::stol(m_end_scan); + } else { + m_end_scan_sz = std::numeric_limits::max(); + } + } + if (m_enable_progress.compare("true")==0) { + enable_progress = true; + } else { + enable_progress = false; + } + return 0; +} + +/* Extract the text that follows the opening tag <tag_name> in input into result. result is cleared first; returns -1 when the opening tag or the closing marker is not found, else 0. NOTE(review): the closing-tag search string reads "" here and was presumably lost in extraction; as written, find("") matches at qs_input and result is always empty. */ int RGWSelectObj_ObjStore_S3::extract_by_tag(std::string input, std::string tag_name, std::string& result) +{ + result = ""; + size_t _qs = input.find("<" + tag_name + ">", 0); + size_t qs_input = _qs + tag_name.size() + 2; + if (_qs == std::string::npos) { + return -1; + } + size_t _qe = input.find("", qs_input); + if (_qe == std::string::npos) { + return -1; + } + result = input.substr(qs_input, _qe - qs_input); + return 0; +} + +/* Object size as recorded on the request state. */ size_t RGWSelectObj_ObjStore_S3::get_obj_size() +{ + return s->obj_size; +} + +/* Fetch `len` bytes starting at offset `ofs` by issuing a ranged RGWGetObj::execute(), copy them into buff when buff is non-null, and return len. */ int RGWSelectObj_ObjStore_S3::range_request(int64_t ofs, int64_t len, void* buff, optional_yield y) +{ + //purpose: implementation for arrow::ReadAt, this may take several async calls. + //send_response_date(call_back) accumulate buffer, upon completion control is back to ReadAt. 
+ range_req_str = "bytes=" + std::to_string(ofs) + "-" + std::to_string(ofs+len-1); + range_str = range_req_str.c_str(); + range_parsed = false; + RGWGetObj::parse_range(); + requested_buffer.clear(); + m_request_range = len; + ldout(s->cct, 10) << "S3select: calling execute(async):" << " request-offset :" << ofs << " request-length :" << len << " buffer size : " << requested_buffer.size() << dendl; + RGWGetObj::execute(y); + if (buff) { + memcpy(buff, requested_buffer.data(), len); + } + ldout(s->cct, 10) << "S3select: done waiting, buffer is complete buffer-size:" << requested_buffer.size() << dendl; + return len; +} + +void RGWSelectObj_ObjStore_S3::execute(optional_yield y) +{ + int status = 0; + char parquet_magic[4]; + static constexpr uint8_t parquet_magic1[4] = {'P', 'A', 'R', '1'}; + static constexpr uint8_t parquet_magicE[4] = {'P', 'A', 'R', 'E'}; + get_params(y); +#ifdef _ARROW_EXIST + m_rgw_api.m_y = &y; +#endif + if (m_parquet_type) { + //parquet processing + range_request(0, 4, parquet_magic, y); + if (memcmp(parquet_magic, parquet_magic1, 4) && memcmp(parquet_magic, parquet_magicE, 4)) { + ldout(s->cct, 10) << s->object->get_name() << " does not contain parquet magic" << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + s3select_syntax.parse_query(m_sql_query.c_str()); + status = run_s3select_on_parquet(m_sql_query.c_str()); + if (status) { + ldout(s->cct, 10) << "S3select: failed to process query <" << m_sql_query << "> on object " << s->object->get_name() << dendl; + op_ret = -ERR_INVALID_REQUEST; + } else { + ldout(s->cct, 10) << "S3select: complete query with success " << dendl; + } + } else { + //CSV or JSON processing + if (m_scan_range_ind) { + + m_requested_range = (m_end_scan_sz - m_start_scan_sz); + + if(m_is_trino_request){ + // fetch more than requested(m_scan_offset), that additional bytes are scanned for end of row, + // thus the additional length will be processed, and no broken row for Trino. 
+ // assumption: row is smaller than m_scan_offset. (a different approach is to request for additional range) + range_request(m_start_scan_sz, m_requested_range + m_scan_offset, nullptr, y); + } else { + range_request(m_start_scan_sz, m_requested_range, nullptr, y); + } + + } else { + RGWGetObj::execute(y); + } + }//if (m_parquet_type) +} + +int RGWSelectObj_ObjStore_S3::parquet_processing(bufferlist& bl, off_t ofs, off_t len) +{ + fp_chunked_transfer_encoding(); + size_t append_in_callback = 0; + int part_no = 1; + //concat the requested buffer + for (auto& it : bl.buffers()) { + if (it.length() == 0) { + ldout(s->cct, 10) << "S3select: get zero-buffer while appending request-buffer " << dendl; + } + append_in_callback += it.length(); + ldout(s->cct, 10) << "S3select: part " << part_no++ << " it.length() = " << it.length() << dendl; + requested_buffer.append(&(it)[0]+ofs, len); + } + ldout(s->cct, 10) << "S3select:append_in_callback = " << append_in_callback << dendl; + if (requested_buffer.size() < m_request_range) { + ldout(s->cct, 10) << "S3select: need another round buffe-size: " << requested_buffer.size() << " request range length:" << m_request_range << dendl; + return 0; + } else {//buffer is complete + ldout(s->cct, 10) << "S3select: buffer is complete " << requested_buffer.size() << " request range length:" << m_request_range << dendl; + m_request_range = 0; + } + return 0; +} + +void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, off_t& ofs, off_t& len) +{ +//in case it is a scan range request and sent by Trino client. +//this routine chops the start/end of chunks. +//the purpose is to return "perfect" results, with no broken or missing lines. + + off_t new_offset = 0; + if(m_scan_range_ind){//only upon range-scan + int64_t sc=0; + int64_t start =0; + const char* row_delimiter = m_row_delimiter.c_str(); + + ldpp_dout(this, 10) << "s3select query: per Trino request the first and last chunk should modified." 
<< dendl; + + //chop the head of the first chunk and only upon the slice does not include the head of the object. + if(m_start_scan_sz && (m_aws_response_handler.get_processed_size()==0)){ + char* p = const_cast(it_cp+ofs); + while(strncmp(row_delimiter,p,1) && (p - (it_cp+ofs)) < len)p++; + if(!strncmp(row_delimiter,p,1)){ + new_offset += (p - (it_cp+ofs))+1; + } + } + + //RR : end of the range-request. the original request sent by Trino client + //RD : row-delimiter + //[ ... ] : chunk boundaries + + //chop the end of the last chunk for this request + //if it's the last chunk, search for first row-delimiter for the following different use-cases + if((m_aws_response_handler.get_processed_size()+len) >= m_requested_range){ + //had pass the requested range, start to search for first delimiter + if(m_aws_response_handler.get_processed_size()>m_requested_range){ + //the previous chunk contain the complete request(all data) and an extra bytes. + //thus, search for the first row-delimiter + //[:previous (RR) ... ][:current (RD) ] + start = 0; + } else if(m_aws_response_handler.get_processed_size()){ + //the *current* chunk contain the complete request in the middle of the chunk. + //thus, search for the first row-delimiter after the complete request position + //[:current (RR) .... (RD) ] + start = m_requested_range - m_aws_response_handler.get_processed_size(); + } else { + //the current chunk is the first chunk and it contains complete request + //[:current:first-chunk (RR) .... (RD) ] + start = m_requested_range; + } + + for(sc=start;sc(it_cp) + ofs + sc; + if(!strncmp(row_delimiter,p,1)){ + ldout(s->cct, 10) << "S3select: found row-delimiter on " << sc << " get_processed_size = " << m_aws_response_handler.get_processed_size() << dendl; + len = sc + 1;//+1 is for delimiter. TODO what about m_object_size_for_processing (to update according to len) + //the end of row exist in current chunk. 
+ //thus, the next chunk should be skipped + m_skip_next_chunk = true; + break; + } + } + } + ofs += new_offset; + } + + ldout(s->cct, 10) << "S3select: shape_chunk_per_trino_requests:update progress len = " << len << dendl; + len -= new_offset; +} + +int RGWSelectObj_ObjStore_S3::csv_processing(bufferlist& bl, off_t ofs, off_t len) +{ + int status = 0; + if(m_skip_next_chunk == true){ + return status; + } + + if (s->obj_size == 0 || m_object_size_for_processing == 0) { + status = run_s3select_on_csv(m_sql_query.c_str(), nullptr, 0); + if (status<0){ + return -EINVAL; + } + } else { + auto bl_len = bl.get_num_buffers(); + int buff_no=0; + for(auto& it : bl.buffers()) { + ldpp_dout(this, 10) << "s3select :processing segment " << buff_no << " out of " << bl_len << " off " << ofs + << " len " << len << " obj-size " << m_object_size_for_processing << dendl; + if (it.length() == 0 || len == 0) { + ldpp_dout(this, 10) << "s3select :it->_len is zero. segment " << buff_no << " out of " << bl_len + << " obj-size " << m_object_size_for_processing << dendl; + continue; + } + + if((ofs + len) > it.length()){ + ldpp_dout(this, 10) << "offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl; + ofs = 0; + len = it.length(); + } + + if(m_is_trino_request){ + shape_chunk_per_trino_requests(&(it)[0], ofs, len); + } + + ldpp_dout(this, 10) << "s3select: chunk: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << " m_object_size_for_processing = " << m_object_size_for_processing << dendl; + + m_aws_response_handler.update_processed_size(it.length());//NOTE : to run analysis to validate len is aligned with m_processed_bytes + status = run_s3select_on_csv(m_sql_query.c_str(), &(it)[0] + ofs, len); + if (status<0) { + return -EINVAL; + } + if (m_s3_csv_object.is_sql_limit_reached()) { + break; + } + buff_no++; + }//for + }//else + + ldpp_dout(this, 10) << "s3select : 
m_aws_response_handler.get_processed_size() " << m_aws_response_handler.get_processed_size() + << " m_object_size_for_processing " << uint64_t(m_object_size_for_processing) << dendl; + + if (m_aws_response_handler.get_processed_size() >= uint64_t(m_object_size_for_processing) || m_s3_csv_object.is_sql_limit_reached()) { + if (status >=0) { + m_aws_response_handler.init_stats_response(); + m_aws_response_handler.send_stats_response(); + m_aws_response_handler.init_end_response(); + ldpp_dout(this, 10) << "s3select : reached the end of query request : aws_response_handler.get_processed_size() " << m_aws_response_handler.get_processed_size() + << "m_object_size_for_processing : " << m_object_size_for_processing << dendl; + } + if (m_s3_csv_object.is_sql_limit_reached()) { + //stop fetching chunks + ldpp_dout(this, 10) << "s3select : reached the limit :" << m_aws_response_handler.get_processed_size() << dendl; + status = -ENOENT; + } + } + + return status; +} + +int RGWSelectObj_ObjStore_S3::json_processing(bufferlist& bl, off_t ofs, off_t len) +{ + int status = 0; + + if (s->obj_size == 0 || m_object_size_for_processing == 0) { + //in case of empty object the s3select-function returns a correct "empty" result(for aggregation and non-aggregation queries). + status = run_s3select_on_json(m_sql_query.c_str(), nullptr, 0); + if (status<0) + return -EINVAL; + } else { + //loop on buffer-list(chunks) + auto bl_len = bl.get_num_buffers(); + int i=0; + for(auto& it : bl.buffers()) { + ldpp_dout(this, 10) << "processing segment " << i << " out of " << bl_len << " off " << ofs + << " len " << len << " obj-size " << m_object_size_for_processing << dendl; + //skipping the empty chunks + if (len == 0) { + ldpp_dout(this, 10) << "s3select:it->_len is zero. 
segment " << i << " out of " << bl_len + << " obj-size " << m_object_size_for_processing << dendl; + continue; + } + + if((ofs + len) > it.length()){ + ldpp_dout(this, 10) << "s3select: offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl; + ofs = 0; + len = it.length(); + } + + m_aws_response_handler.update_processed_size(len); + status = run_s3select_on_json(m_sql_query.c_str(), &(it)[0] + ofs, len); + if (status<0) { + status = -EINVAL; + break; + } + if (m_s3_json_object.is_sql_limit_reached()) { + break; + } + i++; + }//for + }//else + + if (status>=0 && (m_aws_response_handler.get_processed_size() == uint64_t(m_object_size_for_processing) || m_s3_json_object.is_sql_limit_reached())) { + //flush the internal JSON buffer(upon last chunk) + status = run_s3select_on_json(m_sql_query.c_str(), nullptr, 0); + if (status<0) { + return -EINVAL; + } + if (status >=0) { + m_aws_response_handler.init_stats_response(); + m_aws_response_handler.send_stats_response(); + m_aws_response_handler.init_end_response(); + } + if (m_s3_json_object.is_sql_limit_reached()){ + //stop fetching chunks + status = -ENOENT; + ldpp_dout(this, 10) << "s3select : reached the limit :" << m_aws_response_handler.get_processed_size() << dendl; + } + } + return status; +} + +int RGWSelectObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t ofs, off_t len) +{ + if (m_scan_range_ind == false){ + m_object_size_for_processing = s->obj_size; + } + if (m_scan_range_ind == true){ + if (m_end_scan_sz == -1){ + m_end_scan_sz = s->obj_size; + } + if (static_cast((m_end_scan_sz - m_start_scan_sz))>s->obj_size){ //in the case user provides range bigger than object-size + m_object_size_for_processing = s->obj_size; + } else { + m_object_size_for_processing = m_end_scan_sz - m_start_scan_sz; + } + } + if (!m_aws_response_handler.is_set()) { + m_aws_response_handler.set(s, this); + } + if (len == 0 && s->obj_size != 0) { + return 0; + } + 
if (m_parquet_type) { + return parquet_processing(bl,ofs,len); + } + if (m_json_type) { + return json_processing(bl,ofs,len); + } + return csv_processing(bl,ofs,len); +} + diff --git a/src/rgw/rgw_s3select.h b/src/rgw/rgw_s3select.h new file mode 100644 index 000000000..4a506ba4c --- /dev/null +++ b/src/rgw/rgw_s3select.h @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +// + +#pragma once + +namespace rgw::s3select { +RGWOp* create_s3select_op(); +} + diff --git a/src/rgw/rgw_s3select_private.h b/src/rgw/rgw_s3select_private.h new file mode 100644 index 000000000..fa595b0da --- /dev/null +++ b/src/rgw/rgw_s3select_private.h @@ -0,0 +1,258 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +// +#pragma once + +#include +#include +#include +#include + +#include "common/ceph_crypto.h" +#include "common/split.h" +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" +#include "common/safe_io.h" +#include "common/errno.h" +#include "auth/Crypto.h" +#include +#include +#include +#define BOOST_BIND_GLOBAL_PLACEHOLDERS +#ifdef HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion" +#endif +#ifdef HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION +#pragma clang diagnostic pop +#endif +#undef BOOST_BIND_GLOBAL_PLACEHOLDERS + +#include + + +#pragma GCC diagnostic push +#pragma clang diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated" +#pragma clang diagnostic ignored "-Wdeprecated" +#include +#pragma GCC diagnostic pop +#pragma clang diagnostic pop + +#include "rgw_rest_s3.h" +#include "rgw_s3select.h" + +class aws_response_handler +{ + +private: + std::string sql_result; + req_state* s; + uint32_t header_size; + // the parameters are according to CRC-32 algorithm and its aligned with AWS-cli checksum + 
boost::crc_optimal<32, 0x04C11DB7, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc32; + RGWOp* m_rgwop; + std::string m_buff_header; + uint64_t total_bytes_returned; + uint64_t processed_size; + + enum class header_name_En { + EVENT_TYPE, + CONTENT_TYPE, + MESSAGE_TYPE, + ERROR_CODE, + ERROR_MESSAGE + }; + + enum class header_value_En { + RECORDS, + OCTET_STREAM, + EVENT, + CONT, + PROGRESS, + END, + XML, + STATS, + ENGINE_ERROR, + ERROR_TYPE + }; + + const char* PAYLOAD_LINE= "\n\n\n\n"; + const char* END_PAYLOAD_LINE= "\n"; + const char* header_name_str[5] = {":event-type", ":content-type", ":message-type", ":error-code", ":error-message"}; + const char* header_value_str[10] = {"Records", "application/octet-stream", "event", "Cont", "Progress", "End", "text/xml", "Stats", "s3select-engine-error", "error"}; + static constexpr size_t header_crc_size = 12; + + void push_header(const char* header_name, const char* header_value); + + int create_message(u_int32_t header_len); + +public: + aws_response_handler(req_state* ps, RGWOp* rgwop) : s(ps), m_rgwop(rgwop), total_bytes_returned{0}, processed_size{0} + {} + + aws_response_handler() : s(nullptr), m_rgwop(nullptr), total_bytes_returned{0}, processed_size{0} + {} + + bool is_set() + { + if(s==nullptr || m_rgwop == nullptr){ + return false; + } + return true; + } + + void set(req_state* ps, RGWOp* rgwop) + { + s = ps; + m_rgwop = rgwop; + } + + std::string& get_sql_result(); + + uint64_t get_processed_size(); + + void update_processed_size(uint64_t value); + + uint64_t get_total_bytes_returned(); + + void update_total_bytes_returned(uint64_t value); + + int create_header_records(); + + int create_header_continuation(); + + int create_header_progress(); + + int create_header_stats(); + + int create_header_end(); + + int create_error_header_records(const char* error_message); + + void init_response(); + + void init_success_response(); + + void send_continuation_response(); + + void init_progress_response(); + + void 
init_end_response(); + + void init_stats_response(); + + void init_error_response(const char* error_message); + + void send_success_response(); + + void send_progress_response(); + + void send_stats_response(); + + void send_error_response(const char* error_code, + const char* error_message, + const char* resource_id); + +}; //end class aws_response_handler + +class RGWSelectObj_ObjStore_S3 : public RGWGetObj_ObjStore_S3 +{ + +private: + s3selectEngine::s3select s3select_syntax; + std::string m_s3select_query; + std::string m_s3select_input; + std::string m_s3select_output; + s3selectEngine::csv_object m_s3_csv_object; +#ifdef _ARROW_EXIST + s3selectEngine::parquet_object m_s3_parquet_object; +#endif + s3selectEngine::json_object m_s3_json_object; + std::string m_column_delimiter; + std::string m_quot; + std::string m_row_delimiter; + std::string m_compression_type; + std::string m_escape_char; + std::unique_ptr m_buff_header; + std::string m_header_info; + std::string m_sql_query; + std::string m_enable_progress; + std::string output_column_delimiter; + std::string output_quot; + std::string output_escape_char; + std::string output_quote_fields; + std::string output_row_delimiter; + std::string m_start_scan; + std::string m_end_scan; + bool m_scan_range_ind; + int64_t m_start_scan_sz; + int64_t m_end_scan_sz; + int64_t m_object_size_for_processing; + aws_response_handler m_aws_response_handler; + bool enable_progress; + + //parquet request + bool m_parquet_type; + //json request + std::string m_json_datatype; + bool m_json_type; +#ifdef _ARROW_EXIST + s3selectEngine::rgw_s3select_api m_rgw_api; +#endif + //a request for range may statisfy by several calls to send_response_date; + size_t m_request_range; + std::string requested_buffer; + std::string range_req_str; + std::function fp_result_header_format; + std::function fp_s3select_result_format; + std::function fp_debug_mesg; + std::function fp_chunked_transfer_encoding; + int m_header_size; + +public: + unsigned 
int chunk_number; + size_t m_requested_range; + size_t m_scan_offset; + bool m_skip_next_chunk; + bool m_is_trino_request; + + RGWSelectObj_ObjStore_S3(); + virtual ~RGWSelectObj_ObjStore_S3(); + + virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) override; + + virtual int get_params(optional_yield y) override; + + virtual void execute(optional_yield) override; + +private: + + int csv_processing(bufferlist& bl, off_t ofs, off_t len); + + int parquet_processing(bufferlist& bl, off_t ofs, off_t len); + + int json_processing(bufferlist& bl, off_t ofs, off_t len); + + int run_s3select_on_csv(const char* query, const char* input, size_t input_length); + + int run_s3select_on_parquet(const char* query); + + int run_s3select_on_json(const char* query, const char* input, size_t input_length); + + int extract_by_tag(std::string input, std::string tag_name, std::string& result); + + void convert_escape_seq(std::string& esc); + + int handle_aws_cli_parameters(std::string& sql_query); + + int range_request(int64_t start, int64_t len, void*, optional_yield); + + size_t get_obj_size(); + std::function fp_range_req; + std::function fp_get_obj_size; + + void shape_chunk_per_trino_requests(const char*, off_t& ofs, off_t& len); +}; + diff --git a/src/rgw/rgw_sal.cc b/src/rgw/rgw_sal.cc new file mode 100644 index 000000000..58a21f707 --- /dev/null +++ b/src/rgw/rgw_sal.cc @@ -0,0 +1,402 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "common/errno.h" + +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "driver/rados/config/store.h" +#include "driver/json_config/store.h" +#include "rgw_d3n_datacache.h" + +#ifdef WITH_RADOSGW_DBSTORE +#include "rgw_sal_dbstore.h" +#include "driver/dbstore/config/store.h" +#endif + +#ifdef WITH_RADOSGW_MOTR +#include "rgw_sal_motr.h" +#endif + +#ifdef WITH_RADOSGW_DAOS +#include "rgw_sal_daos.h" +#endif + +#define dout_subsys ceph_subsys_rgw + +extern "C" { +extern rgw::sal::Driver* newRadosStore(void); +#ifdef WITH_RADOSGW_DBSTORE +extern rgw::sal::Driver* newDBStore(CephContext *cct); +#endif +#ifdef WITH_RADOSGW_MOTR +extern rgw::sal::Driver* newMotrStore(CephContext *cct); +#endif +#ifdef WITH_RADOSGW_DAOS +extern rgw::sal::Driver* newDaosStore(CephContext *cct); +#endif +extern rgw::sal::Driver* newBaseFilter(rgw::sal::Driver* next); + +} + +RGWObjState::RGWObjState() { +} + +RGWObjState::~RGWObjState() { +} + +RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) { + is_atomic = rhs.is_atomic; + has_attrs = rhs.has_attrs; + exists = rhs.exists; + size = rhs.size; + accounted_size = rhs.accounted_size; + mtime = rhs.mtime; + epoch = rhs.epoch; + if (rhs.obj_tag.length()) { + obj_tag = rhs.obj_tag; + } + if (rhs.tail_tag.length()) { + tail_tag = rhs.tail_tag; + } + write_tag = rhs.write_tag; + fake_tag = rhs.fake_tag; + shadow_obj = rhs.shadow_obj; + has_data = rhs.has_data; + if (rhs.data.length()) { + data = rhs.data; + } + prefetch_data = rhs.prefetch_data; + keep_tail = rhs.keep_tail; + is_olh = rhs.is_olh; + objv_tracker = rhs.objv_tracker; + pg_ver = rhs.pg_ver; + compressed = rhs.compressed; +} + +rgw::sal::Driver* DriverManager::init_storage_provider(const DoutPrefixProvider* dpp, + CephContext* cct, + const Config& cfg, + bool use_gc_thread, + bool use_lc_thread, + bool quota_threads, + bool run_sync_thread, + bool run_reshard_thread, + bool use_cache, + bool 
use_gc) +{ + rgw::sal::Driver* driver{nullptr}; + + if (cfg.store_name.compare("rados") == 0) { + driver = newRadosStore(); + RGWRados* rados = static_cast(driver)->getRados(); + + if ((*rados).set_use_cache(use_cache) + .set_use_datacache(false) + .set_use_gc(use_gc) + .set_run_gc_thread(use_gc_thread) + .set_run_lc_thread(use_lc_thread) + .set_run_quota_threads(quota_threads) + .set_run_sync_thread(run_sync_thread) + .set_run_reshard_thread(run_reshard_thread) + .init_begin(cct, dpp) < 0) { + delete driver; + return nullptr; + } + if (driver->initialize(cct, dpp) < 0) { + delete driver; + return nullptr; + } + if (rados->init_complete(dpp) < 0) { + delete driver; + return nullptr; + } + } + else if (cfg.store_name.compare("d3n") == 0) { + driver = new rgw::sal::RadosStore(); + RGWRados* rados = new D3nRGWDataCache; + dynamic_cast(driver)->setRados(rados); + rados->set_store(static_cast(driver)); + + if ((*rados).set_use_cache(use_cache) + .set_use_datacache(true) + .set_run_gc_thread(use_gc_thread) + .set_run_lc_thread(use_lc_thread) + .set_run_quota_threads(quota_threads) + .set_run_sync_thread(run_sync_thread) + .set_run_reshard_thread(run_reshard_thread) + .init_begin(cct, dpp) < 0) { + delete driver; + return nullptr; + } + if (driver->initialize(cct, dpp) < 0) { + delete driver; + return nullptr; + } + if (rados->init_complete(dpp) < 0) { + delete driver; + return nullptr; + } + + lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_local_datacache_enabled=" << + cct->_conf->rgw_d3n_l1_local_datacache_enabled << dendl; + lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_datacache_persistent_path='" << + cct->_conf->rgw_d3n_l1_datacache_persistent_path << "'" << dendl; + lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_datacache_size=" << + cct->_conf->rgw_d3n_l1_datacache_size << dendl; + lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_evict_cache_on_start=" << + cct->_conf->rgw_d3n_l1_evict_cache_on_start << dendl; + lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_fadvise=" 
<< + cct->_conf->rgw_d3n_l1_fadvise << dendl; + lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_eviction_policy=" << + cct->_conf->rgw_d3n_l1_eviction_policy << dendl; + } +#ifdef WITH_RADOSGW_DBSTORE + else if (cfg.store_name.compare("dbstore") == 0) { + driver = newDBStore(cct); + + if ((*(rgw::sal::DBStore*)driver).set_run_lc_thread(use_lc_thread) + .initialize(cct, dpp) < 0) { + delete driver; + return nullptr; + } + } +#endif + +#ifdef WITH_RADOSGW_MOTR + else if (cfg.store_name.compare("motr") == 0) { + driver = newMotrStore(cct); + if (driver == nullptr) { + ldpp_dout(dpp, 0) << "newMotrStore() failed!" << dendl; + return driver; + } + ((rgw::sal::MotrStore *)driver)->init_metadata_cache(dpp, cct); + + return store; + } +#endif + +#ifdef WITH_RADOSGW_DAOS + else if (cfg.store_name.compare("daos") == 0) { + driver = newDaosStore(cct); + if (driver == nullptr) { + ldpp_dout(dpp, 0) << "newDaosStore() failed!" << dendl; + return driver; + } + int ret = driver->initialize(cct, dpp); + if (ret != 0) { + ldpp_dout(dpp, 20) << "ERROR: store->initialize() failed: " << ret << dendl; + delete driver; + return nullptr; + } + } +#endif + + if (cfg.filter_name.compare("base") == 0) { + rgw::sal::Driver* next = driver; + driver = newBaseFilter(next); + + if (driver->initialize(cct, dpp) < 0) { + delete driver; + delete next; + return nullptr; + } + } + + return driver; +} + +rgw::sal::Driver* DriverManager::init_raw_storage_provider(const DoutPrefixProvider* dpp, CephContext* cct, const Config& cfg) +{ + rgw::sal::Driver* driver = nullptr; + if (cfg.store_name.compare("rados") == 0) { + driver = newRadosStore(); + RGWRados* rados = static_cast(driver)->getRados(); + + rados->set_context(cct); + + int ret = rados->init_svc(true, dpp); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl; + delete driver; + return nullptr; + } + + if (rados->init_rados() < 0) { + delete driver; + return nullptr; + } + if 
(driver->initialize(cct, dpp) < 0) { + delete driver; + return nullptr; + } + } else if (cfg.store_name.compare("dbstore") == 0) { +#ifdef WITH_RADOSGW_DBSTORE + driver = newDBStore(cct); + + if ((*(rgw::sal::DBStore*)driver).initialize(cct, dpp) < 0) { + delete driver; + return nullptr; + } +#else + driver = nullptr; +#endif + } else if (cfg.store_name.compare("motr") == 0) { +#ifdef WITH_RADOSGW_MOTR + driver = newMotrStore(cct); +#else + driver = nullptr; +#endif + } else if (cfg.store_name.compare("daos") == 0) { +#ifdef WITH_RADOSGW_DAOS + driver = newDaosStore(cct); + + if (driver->initialize(cct, dpp) < 0) { + delete driver; + return nullptr; + } +#else + driver = nullptr; +#endif + } + + if (cfg.filter_name.compare("base") == 0) { + rgw::sal::Driver* next = driver; + driver = newBaseFilter(next); + + if (driver->initialize(cct, dpp) < 0) { + delete driver; + delete next; + return nullptr; + } + } + + return driver; +} + +void DriverManager::close_storage(rgw::sal::Driver* driver) +{ + if (!driver) + return; + + driver->finalize(); + + delete driver; +} + +DriverManager::Config DriverManager::get_config(bool admin, CephContext* cct) +{ + DriverManager::Config cfg; + + // Get the store backend + const auto& config_store = g_conf().get_val("rgw_backend_store"); + if (config_store == "rados") { + cfg.store_name = "rados"; + + /* Check to see if d3n is configured, but only for non-admin */ + const auto& d3n = g_conf().get_val("rgw_d3n_l1_local_datacache_enabled"); + if (!admin && d3n) { + if (g_conf().get_val("rgw_max_chunk_size") != + g_conf().get_val("rgw_obj_stripe_size")) { + lsubdout(cct, rgw_datacache, 0) << "rgw_d3n: WARNING: D3N DataCache disabling (D3N requires that the chunk_size equals stripe_size)" << dendl; + } else if (!g_conf().get_val("rgw_beast_enable_async")) { + lsubdout(cct, rgw_datacache, 0) << "rgw_d3n: WARNING: D3N DataCache disabling (D3N requires yield context - rgw_beast_enable_async=true)" << dendl; + } else { + cfg.store_name = "d3n"; 
+ } + } + } +#ifdef WITH_RADOSGW_DBSTORE + else if (config_store == "dbstore") { + cfg.store_name = "dbstore"; + } +#endif +#ifdef WITH_RADOSGW_MOTR + else if (config_store == "motr") { + cfg.store_name = "motr"; + } +#endif +#ifdef WITH_RADOSGW_DAOS + else if (config_store == "daos") { + cfg.store_name = "daos"; + } +#endif + + // Get the filter + cfg.filter_name = "none"; + const auto& config_filter = g_conf().get_val("rgw_filter"); + if (config_filter == "base") { + cfg.filter_name = "base"; + } + + return cfg; +} + +auto DriverManager::create_config_store(const DoutPrefixProvider* dpp, + std::string_view type) + -> std::unique_ptr +{ + try { + if (type == "rados") { + return rgw::rados::create_config_store(dpp); +#ifdef WITH_RADOSGW_DBSTORE + } else if (type == "dbstore") { + const auto uri = g_conf().get_val("dbstore_config_uri"); + return rgw::dbstore::create_config_store(dpp, uri); +#endif + } else if (type == "json") { + auto filename = g_conf().get_val("rgw_json_config"); + return rgw::sal::create_json_config_store(dpp, filename); + } else { + ldpp_dout(dpp, -1) << "ERROR: unrecognized config store type '" + << type << "'" << dendl; + return nullptr; + } + } catch (const std::exception& e) { + ldpp_dout(dpp, -1) << "ERROR: failed to initialize config store '" + << type << "': " << e.what() << dendl; + } + return nullptr; +} + +namespace rgw::sal { +int Object::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) +{ + if (ofs < 0) { + ofs += obj_size; + if (ofs < 0) + ofs = 0; + end = obj_size - 1; + } else if (end < 0) { + end = obj_size - 1; + } + + if (obj_size > 0) { + if (ofs >= (off_t)obj_size) { + return -ERANGE; + } + if (end >= (off_t)obj_size) { + end = obj_size - 1; + } + } + return 0; +} +} // namespace rgw::sal diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h new file mode 100644 index 000000000..944737dee --- /dev/null +++ b/src/rgw/rgw_sal.h @@ -0,0 +1,1644 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "rgw_sal_fwd.h" +#include "rgw_lua.h" +#include "rgw_user.h" +#include "rgw_notify_event_type.h" +#include "common/tracer.h" +#include "rgw_datalog_notify.h" +#include "include/random.h" + +class RGWRESTMgr; +class RGWAccessListFilter; +class RGWLC; +struct rgw_user_bucket; +class RGWUsageBatch; +class RGWCoroutinesManagerRegistry; +class RGWBucketSyncPolicyHandler; +using RGWBucketSyncPolicyHandlerRef = std::shared_ptr; +class RGWDataSyncStatusManager; +class RGWSyncModuleInstance; +typedef std::shared_ptr RGWSyncModuleInstanceRef; +class RGWCompressionInfo; +struct rgw_pubsub_topics; +struct rgw_pubsub_bucket_topics; + + +using RGWBucketListNameFilter = std::function; + + +namespace rgw { + class Aio; + namespace IAM { struct Policy; } +} + +class RGWGetDataCB { +public: + virtual int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) = 0; + RGWGetDataCB() {} + virtual ~RGWGetDataCB() {} +}; + +struct RGWUsageIter { + std::string read_iter; + uint32_t index; + + RGWUsageIter() : index(0) {} +}; + +/** + * @struct RGWClusterStat + * Cluster-wide usage information + */ +struct RGWClusterStat { + /// total device size + uint64_t kb; + /// total used + uint64_t kb_used; + /// total available/free + uint64_t kb_avail; + /// number of objects + uint64_t num_objects; +}; + +class RGWGetBucketStats_CB : public RefCountedObject { +protected: + rgw_bucket bucket; + std::map* stats; +public: + explicit RGWGetBucketStats_CB(const rgw_bucket& _bucket) : bucket(_bucket), stats(NULL) {} + ~RGWGetBucketStats_CB() override {} + virtual void handle_response(int r) = 0; + virtual void 
set_response(std::map* _stats) { + stats = _stats; + } +}; + +class RGWGetUserStats_CB : public RefCountedObject { +protected: + rgw_user user; + RGWStorageStats stats; +public: + explicit RGWGetUserStats_CB(const rgw_user& _user) : user(_user) {} + ~RGWGetUserStats_CB() override {} + virtual void handle_response(int r) = 0; + virtual void set_response(RGWStorageStats& _stats) { + stats = _stats; + } +}; + +struct RGWObjState { + rgw_obj obj; + bool is_atomic{false}; + bool has_attrs{false}; + bool exists{false}; + uint64_t size{0}; ///< size of raw object + uint64_t accounted_size{0}; ///< size before compression, encryption + ceph::real_time mtime; + uint64_t epoch{0}; + bufferlist obj_tag; + bufferlist tail_tag; + std::string write_tag; + bool fake_tag{false}; + std::string shadow_obj; + bool has_data{false}; + bufferlist data; + bool prefetch_data{false}; + bool keep_tail{false}; + bool is_olh{false}; + bufferlist olh_tag; + uint64_t pg_ver{false}; /* NOTE(review): integer initialized from a bool literal; the resulting value is 0 */ + uint32_t zone_short_id{0}; + bool compressed{false}; + + /* important! don't forget to update copy constructor */ + + RGWObjVersionTracker objv_tracker; + + std::map attrset; + + RGWObjState(); + RGWObjState(const RGWObjState& rhs); + ~RGWObjState(); + + bool get_attr(std::string name, bufferlist& dest) { + auto iter = attrset.find(name); + if (iter != attrset.end()) { + dest = iter->second; + return true; + } + return false; + } +}; + +/** + * @defgroup RGWSAL RGW Store Abstraction Layer + * + * The Store Abstraction Layer is an API that separates the top layer of RGW that + * handles client protocols (such as S3 or Swift) from the bottom layer of RGW that + * interacts with a backing store. It allows the creation of multiple backing stores + * that can co-exist with a single RGW instance, and allows the creation of stacking + * layers of translators that can modify operations as they pass down the stack.
+ * Examples of translators might be a cache layer, a duplication layer that copies + * operations to multiple stores, or a policy layer that sends some operations to one + * store and some to another. + * + * The basic unit of a SAL implementation is the Store. Whether an actual backing store + * or a translator, there will be a Store implementation that represents it. Examples + * are the RadosStore that communicates via RADOS with a Ceph cluster, and the DBStore + * that uses a SQL db (such as SQLite3) as a backing store. There is a singleton + * instance of each Store. + * + * Data within RGW is owned by a User. The User is the unit of authentication and + * access control. + * + * Data within RGW is stored as an Object. Each Object is a single chunk of data, owned + * by a single User, contained within a single Bucket. It has metadata associated with + * it, such as size, owner, and so on, and a set of key-value attributes that can + * contain anything needed by the top half. + * + * Data within RGW is organized into Buckets. Each Bucket is owned by a User, and + * contains Objects. There is a single, flat layer of Buckets; there is no hierarchy, + * and each Object is contained in a single Bucket. + * + * Instantiations of SAL classes are done as unique pointers, using std::unique_ptr. + * Instances of these classes are acquired via getters, and it's up to the caller to + * manage the lifetime. + * + * @note Anything using RGWObjContext is subject to change, as that type will not be + * used in the final API.
+ * @{ + */ + +/** + * @file rgw_sal.h + * @brief Base abstractions and API for SAL + */ + +namespace rgw { namespace sal { + +/** + * @addtogroup RGWSAL + * @{ + */ + +#define RGW_SAL_VERSION 1 + +struct MPSerializer; +class GCChain; +class RGWOIDCProvider; +class RGWRole; + +enum AttrsMod { + ATTRSMOD_NONE = 0, + ATTRSMOD_REPLACE = 1, + ATTRSMOD_MERGE = 2 +}; + +// a simple streaming data processing abstraction +/** + * @brief A simple streaming data processing abstraction + */ +class DataProcessor { + public: + virtual ~DataProcessor() {} + + /** + * @brief Consume a bufferlist in its entirety at the given object offset. + * + * An empty bufferlist is given to request that any buffered data be flushed, though this doesn't + * wait for completions. + */ + virtual int process(bufferlist&& data, uint64_t offset) = 0; +}; + +/** + * @brief A data consumer that writes an object in a bucket + */ +class ObjectProcessor : public DataProcessor { + public: + /** Prepare to start processing object data */ + virtual int prepare(optional_yield y) = 0; + + /** Complete the operation and make its result visible to clients */ + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) = 0; +}; + +/** Base class for AIO completions */ +class Completions { + public: + Completions() {} + virtual ~Completions() = default; + virtual int drain() = 0; +}; + +/** A map of key-value attributes */ + using Attrs = std::map; + +/** + * @brief Base singleton representing a Store or Filter + * + * The Driver is the base abstraction of the SAL layer. It represents a base storage + * mechanism, or an intermediate stacking layer. There is a single instance of a given + * Driver per RGW, and this Driver mediates all access to its backing.
+ * + * A Driver contains, loosely, @a User, @a Bucket, and @a Object entities. The @a Object + * contains data, and its associated metadata. The @a Bucket contains Objects, and + * metadata about the bucket. Both Buckets and Objects are owned by a @a User, which is + * the basic unit of access control. + * + * A Driver also has metadata and some global responsibilities. For example, a driver is + * responsible for managing the LifeCycle activities for its data. + */ +class Driver { + public: + Driver() {} + virtual ~Driver() = default; + + /** Post-creation initialization of driver */ + virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) = 0; + /** Name of this driver provider (e.g., "rados") */ + virtual const std::string get_name() const = 0; + /** Get cluster unique identifier */ + virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) = 0; + /** Get a User from a rgw_user. Does not query driver for user info, so quick */ + virtual std::unique_ptr get_user(const rgw_user& u) = 0; + /** Lookup a User by access key. Queries driver for user info. */ + virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr* user) = 0; + /** Lookup a User by email address. Queries driver for user info. */ + virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr* user) = 0; + /** Lookup a User by swift username. Queries driver for user info. */ + virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) = 0; + /** Get a basic Object. This Object is not looked up, and is incomplete, since it + * does not have a bucket. This should only be used when an Object is needed before + * there is a Bucket, otherwise use the get_object() in the Bucket class.
*/ + virtual std::unique_ptr get_object(const rgw_obj_key& k) = 0; + /** Get a Bucket by info. Does not query the driver, just uses the given bucket info. */ + virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) = 0; + /** Lookup a Bucket by key. Queries driver for bucket info. */ + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) = 0; + /** Lookup a Bucket by name. Queries driver for bucket info. */ + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr* bucket, optional_yield y) = 0; + /** For multisite, this driver is the zone's master */ + virtual bool is_meta_master() = 0; + /** For multisite, forward an OP to the zone's master */ + virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv, + bufferlist& in_data, JSONParser* jp, req_info& info, + optional_yield y) = 0; + virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) = 0; + /** Get zone info for this driver */ + virtual Zone* get_zone() = 0; + /** Get a unique ID specific to this zone. */ + virtual std::string zone_unique_id(uint64_t unique_num) = 0; + /** Get a unique Swift transaction ID specific to this zone */ + virtual std::string zone_unique_trans_id(const uint64_t unique_num) = 0; + /** Lookup a zonegroup by ID */ + virtual int get_zonegroup(const std::string& id, std::unique_ptr* zonegroup) = 0; + /** List all zones in all zone groups by ID */ + virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list& zone_ids) = 0; + /** Get statistics about the cluster represented by this driver */ + virtual int cluster_stat(RGWClusterStat& stats) = 0; + /** Get a @a Lifecycle object.
Used to manage/run lifecycle transitions */ + virtual std::unique_ptr get_lifecycle(void) = 0; + /** Get a @a Completions object. Used for Async I/O tracking */ + virtual std::unique_ptr get_completions(void) = 0; + + /** Get a @a Notification object. Used to communicate with non-RGW daemons, such as + * management/tracking software */ + /** RGWOp variant */ + virtual std::unique_ptr get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, + rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) = 0; + /** No-req_state variant (e.g., rgwlc) */ + virtual std::unique_ptr get_notification( + const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, + rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, + std::string& _req_id, optional_yield y) = 0; + /** Read the topic config entry into @a topics and (optionally) @a objv_tracker */ + virtual int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) = 0; + /** Write @a topics and (optionally) @a objv_tracker into the config */ + virtual int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) = 0; + /** Remove the topic config, optionally a specific version */ + virtual int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker, + optional_yield y,const DoutPrefixProvider *dpp) = 0; + /** Get access to the lifecycle management thread */ + virtual RGWLC* get_rgwlc(void) = 0; + /** Get access to the coroutine registry. Used to create new coroutine managers */ + virtual RGWCoroutinesManagerRegistry* get_cr_registry() = 0; + + /** Log usage data to the driver.
Usage data is things like bytes sent/received and + * op count */ + virtual int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info) = 0; + /** Log OP data to the driver. Data is opaque to SAL */ + virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) = 0; + /** Register this driver to the service map. Somewhat Rados specific; may be removed */ + virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, + const std::map& meta) = 0; + /** Get default quota info. Used as fallback if a user or bucket has no quota set */ + virtual void get_quota(RGWQuota& quota) = 0; + /** Get global rate limit configuration */ + virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) = 0; + /** Enable or disable a set of buckets, e.g. if a User is suspended */ + virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector& buckets, bool enabled) = 0; + /** Get a new request ID */ + virtual uint64_t get_new_req_id() = 0; + /** Get a handler for bucket sync policy.
*/ + virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) = 0; + /** Get a status manager for bucket sync */ + virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) = 0; + /** Wake up sync threads for bucket metadata sync */ + virtual void wakeup_meta_sync_shards(std::set& shard_ids) = 0; + /** Wake up sync threads for bucket data sync */ + virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map>& shard_ids) = 0; + /** Clear all usage statistics globally */ + virtual int clear_usage(const DoutPrefixProvider *dpp) = 0; + /** Get usage statistics for all users and buckets */ + virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, + RGWUsageIter& usage_iter, + std::map& usage) = 0; + /** Trim usage log for all users and buckets */ + virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) = 0; + /** Get a configuration value for the given name */ + virtual int get_config_key_val(std::string name, bufferlist* bl) = 0; + /** Start a metadata listing of the given section */ + virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) = 0; + /** Get the next key from a metadata list */ + virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list& keys, bool* truncated) = 0; + /** Complete a metadata listing */ + virtual void meta_list_keys_complete(void* handle) = 0; + /** Get the marker associated with the current metadata listing */ + virtual std::string meta_get_marker(void* handle) = 0; + /** Remove a specific metadata key */ + virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& 
metadata_key, optional_yield y) = 0; + /** Get an instance of the Sync module for bucket sync */ + virtual const RGWSyncModuleInstanceRef& get_sync_module() = 0; + /** Get the ID of the current host */ + virtual std::string get_host_id() = 0; + /** Get a Lua script manager for running lua scripts */ + virtual std::unique_ptr get_lua_manager() = 0; + /** Get an IAM Role by name etc. */ + virtual std::unique_ptr get_role(std::string name, + std::string tenant, + std::string path="", + std::string trust_policy="", + std::string max_session_duration_str="", + std::multimap tags={}) = 0; + /** Get an IAM Role by ID */ + virtual std::unique_ptr get_role(std::string id) = 0; + virtual std::unique_ptr get_role(const RGWRoleInfo& info) = 0; + /** Get all IAM Roles optionally filtered by path */ + virtual int get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) = 0; + /** Get an empty OpenID Connect provider */ + virtual std::unique_ptr get_oidc_provider() = 0; + /** Get all OpenID Connect providers, optionally filtered by tenant */ + virtual int get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + std::vector>& providers) = 0; + /** Get a Writer that appends to an object */ + virtual std::unique_ptr get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) = 0; + /** Get a Writer that atomically writes an entire object */ + virtual std::unique_ptr get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) = 0; + + /** Get the compression type of a placement rule */ + virtual const std::string&
get_compression_type(const rgw_placement_rule& rule) = 0; + /** Check to see if this placement rule is valid */ + virtual bool valid_placement(const rgw_placement_rule& rule) = 0; + + /** Clean up a driver for termination */ + virtual void finalize(void) = 0; + + /** Get the Ceph context associated with this driver. May be removed. */ + virtual CephContext* ctx(void) = 0; + + /** Register admin APIs unique to this driver */ + virtual void register_admin_apis(RGWRESTMgr* mgr) = 0; +}; + +/** + * @brief User abstraction + * + * This represents a user. In general, there will be a @a User associated with an OP + * (the user performing the OP), and potentially several others acting as owners. + * Lifetime of a User is a bit tricky, since it must last as long as any Buckets + * associated with it. A User has associated metadata, including a set of key/value + * attributes, and statistics (including usage) about the User. + */ +class User { + public: + User() {} + virtual ~User() = default; + + /** Clone a copy of this user. Used when modification is necessary of the copy */ + virtual std::unique_ptr clone() = 0; + /** List the buckets owned by a user */ + virtual int list_buckets(const DoutPrefixProvider* dpp, + const std::string& marker, const std::string& end_marker, + uint64_t max, bool need_stats, BucketList& buckets, + optional_yield y) = 0; + /** Create a new bucket owned by this user. Creates in the backing store, not just the instantiation.
*/ + virtual int create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + std::unique_ptr* bucket, + optional_yield y) = 0; + + /** Get the display name for this User */ + virtual std::string& get_display_name() = 0; + /** Get the tenant name for this User */ + virtual const std::string& get_tenant() = 0; + /** Set the tenant name for this User */ + virtual void set_tenant(std::string& _t) = 0; + /** Get the namespace for this User */ + virtual const std::string& get_ns() = 0; + /** Set the namespace for this User */ + virtual void set_ns(std::string& _ns) = 0; + /** Clear the namespace for this User */ + virtual void clear_ns() = 0; + /** Get the full ID for this User */ + virtual const rgw_user& get_id() const = 0; + /** Get the type of this User */ + virtual uint32_t get_type() const = 0; + /** Get the maximum number of buckets allowed for this User */ + virtual int32_t get_max_buckets() const = 0; + /** Get the capabilities for this User */ + virtual const RGWUserCaps& get_caps() const = 0; + /** Get the version tracker for this User */ + virtual RGWObjVersionTracker& get_version_tracker() = 0; + /** Get the cached attributes for this User */ + virtual Attrs& get_attrs() = 0; + /** Set the cached attributes for this User */ + virtual void set_attrs(Attrs& _attrs) = 0; + /** Check if a User is empty */ + virtual bool empty() const = 0; + /** Check if a User pointer is empty */ + static bool empty(const User* u) { return (!u || u->empty()); } + /** Check if a User unique_pointer is empty */ + static bool empty(const std::unique_ptr& u) { return (!u || u->empty()); } + /** Read the User attributes from the backing Store */ +
virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) = 0; + /** Set the attributes in attrs, leaving any other existing attrs set, and + * write them to the backing store; a merge operation */ + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) = 0; + /** Read the User stats from the backing Store, synchronous */ + virtual int read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time* last_stats_sync = nullptr, + ceph::real_time* last_stats_update = nullptr) = 0; + /** Read the User stats from the backing Store, asynchronous */ + virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) = 0; + /** Flush accumulated stat changes for this User to the backing store */ + virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) = 0; + /** Read detailed usage stats for this User from the backing store */ + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) = 0; + /** Trim User usage stats to the given epoch range */ + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) = 0; + + /** Load this User from the backing store. Requires ID to be set, fills all other fields.
*/ + virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) = 0; + /** Store this User to the backing store */ + virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) = 0; + /** Remove this User from the backing store */ + virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) = 0; + /** Verify multi-factor authentication for this user */ + virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) = 0; + + /* dang temporary; will be removed when User is complete */ + virtual RGWUserInfo& get_info() = 0; + + /** Print the User to @a out */ + virtual void print(std::ostream& out) const = 0; + + friend inline std::ostream& operator<<(std::ostream& out, const User& u) { + u.print(out); + return out; + } + + friend inline std::ostream& operator<<(std::ostream& out, const User* u) { + if (!u) + out << ""; + else + u->print(out); + return out; + } + + friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr& p) { + out << p.get(); + return out; + } +}; + +/** + * @brief Bucket abstraction + * + * This represents a bucket. A bucket is a container for objects. It is owned by a user, and has + * its own set of metadata, including a set of key/value attributes. A bucket may not contain + * other buckets, only objects. Buckets have Access Control Lists (ACLs) that control what users + * can access the contents of the bucket, and in what ways.
+ */ +class Bucket { + public: + + /** + * @brief Parameters for a bucket list operation + */ + struct ListParams { + std::string prefix; + std::string delim; + rgw_obj_key marker; + rgw_obj_key end_marker; + std::string ns; + bool enforce_ns{true}; + RGWAccessListFilter* access_list_filter{nullptr}; + RGWBucketListNameFilter force_check_filter; + bool list_versions{false}; + bool allow_unordered{false}; + int shard_id{RGW_NO_SHARD}; + + friend std::ostream& operator<<(std::ostream& out, const ListParams& p) { + out << "rgw::sal::Bucket::ListParams{ prefix=\"" << p.prefix << + "\", delim=\"" << p.delim << + "\", marker=\"" << p.marker << + "\", end_marker=\"" << p.end_marker << + "\", ns=\"" << p.ns << + "\", enforce_ns=" << p.enforce_ns << + ", list_versions=" << p.list_versions << + ", allow_unordered=" << p.allow_unordered << + ", shard_id=" << p.shard_id << + " }"; + return out; + } + }; + /** + * @brief Results from a bucket list operation + */ + struct ListResults { + std::vector objs; + std::map common_prefixes; + bool is_truncated{false}; + rgw_obj_key next_marker; + }; + + Bucket() = default; + virtual ~Bucket() = default; + + /** Get an @a Object belonging to this bucket */ + virtual std::unique_ptr get_object(const rgw_obj_key& key) = 0; + /** List the contents of this bucket */ + virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) = 0; + /** Get the cached attributes associated with this bucket */ + virtual Attrs& get_attrs(void) = 0; + /** Set the cached attributes on this bucket */ + virtual int set_attrs(Attrs a) = 0; + /** Remove this bucket from the backing store */ + virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) = 0; + /** Remove this bucket, bypassing garbage collection. 
May be removed */ + virtual int remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) = 0; + /** Get the ACL for this bucket */ + virtual RGWAccessControlPolicy& get_acl(void) = 0; + /** Set the ACL for this bucket */ + virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) = 0; + + // XXXX hack + virtual void set_owner(rgw::sal::User* _owner) = 0; + + /** Load this bucket from the backing store. Requires the key to be set, fills other fields. + * If @a get_stats is true, then statistics on the bucket are also looked up. */ + virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) = 0; + /** Read the bucket stats from the backing Store, synchronous */ + virtual int read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, std::string* master_ver, + std::map& stats, + std::string* max_marker = nullptr, + bool* syncstopped = nullptr) = 0; + /** Read the bucket stats from the backing Store, asynchronous */ + virtual int read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB* ctx) = 0; + /** Sync this bucket's stats to the owning user's stats in the backing store */ + virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) = 0; + /** Refresh the metadata stats (size, count, and so on) from the backing store */ + virtual int update_container_stats(const DoutPrefixProvider* dpp) = 0; + /** Check if this bucket needs resharding, and schedule it if it does */ + virtual int check_bucket_shards(const DoutPrefixProvider* dpp) = 0; + /** Change the owner of this bucket in the backing store. Current owner must be set. Does not + * change ownership of the objects in the bucket.
*/ + virtual int chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) = 0; + /** Store the cached bucket info into the backing store */ + virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) = 0; + /** Check to see if the given user is the owner of this bucket */ + virtual bool is_owner(User* user) = 0; + /** Get the owner of this bucket */ + virtual User* get_owner(void) = 0; + /** Get the owner of this bucket in the form of an ACLOwner object */ + virtual ACLOwner get_acl_owner(void) = 0; + /** Check in the backing store if this bucket is empty */ + virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) = 0; + /** Check if the given size fits within the quota */ + virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) = 0; + /** Set the attributes in attrs, leaving any other existing attrs set, and + * write them to the backing store; a merge operation */ + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) = 0; + /** Try to refresh the cached bucket info from the backing store. Used in + * read-modify-update loop. */ + virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) = 0; + /** Read usage information about this bucket from the backing store */ + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) = 0; + /** Trim the usage information to the given epoch range */ + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) = 0; + /** Remove objects from the bucket index of this bucket.
May be removed from API */ + virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) = 0; + /** Check the state of the bucket index, and get stats from it. May be removed from API */ + virtual int check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) = 0; + /** Rebuild the bucket index. May be removed from API */ + virtual int rebuild_index(const DoutPrefixProvider *dpp) = 0; + /** Set a timeout on the check_index() call. May be removed from API */ + virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) = 0; + /** Remove this specific bucket instance from the backing store. May be removed from API */ + virtual int purge_instance(const DoutPrefixProvider* dpp) = 0; + + /** Check if this instantiation is empty */ + virtual bool empty() const = 0; + /** Get the cached name of this bucket */ + virtual const std::string& get_name() const = 0; + /** Get the cached tenant of this bucket */ + virtual const std::string& get_tenant() const = 0; + /** Get the cached marker of this bucket */ + virtual const std::string& get_marker() const = 0; + /** Get the cached ID of this bucket */ + virtual const std::string& get_bucket_id() const = 0; + /** Get the cached size of this bucket */ + virtual size_t get_size() const = 0; + /** Get the cached rounded size of this bucket */ + virtual size_t get_size_rounded() const = 0; + /** Get the cached object count of this bucket */ + virtual uint64_t get_count() const = 0; + /** Get the cached placement rule of this bucket */ + virtual rgw_placement_rule& get_placement_rule() = 0; + /** Get the cached creation time of this bucket */ + virtual ceph::real_time& get_creation_time() = 0; + /** Get the cached modification time of this bucket */ + virtual ceph::real_time& get_modification_time() = 0; + /** Get the cached version of this bucket */ + virtual obj_version& get_version() = 0; + /** Set the cached version of this bucket */ + virtual 
void set_version(obj_version &ver) = 0; + /** Check if this bucket is versioned */ + virtual bool versioned() = 0; + /** Check if this bucket has versioning enabled */ + virtual bool versioning_enabled() = 0; + + /** Check if a Bucket pointer is empty */ + static bool empty(const Bucket* b) { return (!b || b->empty()); } + /** Check if a Bucket unique pointer is empty */ + static bool empty(const std::unique_ptr& b) { return (!b || b->empty()); } + /** Clone a copy of this bucket. Used when modification is necessary of the copy */ + virtual std::unique_ptr clone() = 0; + + /** Create a multipart upload in this bucket */ + virtual std::unique_ptr get_multipart_upload( + const std::string& oid, + std::optional upload_id=std::nullopt, + ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) = 0; + /** List multipart uploads currently in this bucket */ + virtual int list_multiparts(const DoutPrefixProvider *dpp, + const std::string& prefix, + std::string& marker, + const std::string& delim, + const int& max_uploads, + std::vector>& uploads, + std::map *common_prefixes, + bool *is_truncated) = 0; + /** Abort multipart uploads in a bucket */ + virtual int abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) = 0; + + /** Read the bucket notification config into @a notifications with and (optionally) @a objv_tracker */ + virtual int read_topics(rgw_pubsub_bucket_topics& notifications, + RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0; + /** Write @a notifications with (optionally) @a objv_tracker into the bucket notification config */ + virtual int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) = 0; + /** Remove the bucket notification config with (optionally) @a objv_tracker */ + virtual int remove_topics(RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) = 0; + + /* dang - This 
is temporary, until the API is completed */ + virtual rgw_bucket& get_key() = 0; + virtual RGWBucketInfo& get_info() = 0; + + /** Print the Bucket to @a out */ + virtual void print(std::ostream& out) const = 0; + + friend inline std::ostream& operator<<(std::ostream& out, const Bucket& b) { + b.print(out); + return out; + } + + friend inline std::ostream& operator<<(std::ostream& out, const Bucket* b) { + if (!b) + out << ""; + else + b->print(out); + return out; + } + + friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr& p) { + out << p.get(); + return out; + } + + virtual bool operator==(const Bucket& b) const = 0; + virtual bool operator!=(const Bucket& b) const = 0; + + friend class BucketList; +}; + +/** + * @brief A list of buckets + * + * This is the result from a bucket listing operation. + */ +class BucketList { + std::map> buckets; + bool truncated; + +public: + BucketList() : buckets(), truncated(false) {} + BucketList(BucketList&& _bl) : + buckets(std::move(_bl.buckets)), + truncated(_bl.truncated) + { } + BucketList& operator=(const BucketList&) = delete; + BucketList& operator=(BucketList&& _bl) { + for (auto& ent : _bl.buckets) { + buckets.emplace(ent.first, std::move(ent.second)); + } + truncated = _bl.truncated; + return *this; + }; + + /** Get the list of buckets. The list is a map of pairs. */ + std::map>& get_buckets() { return buckets; } + /** True if the list is truncated (that is, there are more buckets to list) */ + bool is_truncated(void) const { return truncated; } + /** Set the truncated state of the list */ + void set_truncated(bool trunc) { truncated = trunc; } + /** Add a bucket to the list. 
Takes ownership of the bucket */ + void add(std::unique_ptr bucket) { + buckets.emplace(bucket->get_name(), std::move(bucket)); + } + /** The number of buckets in this list */ + size_t count() const { return buckets.size(); } + /** Clear the list */ + void clear(void) { + buckets.clear(); + truncated = false; + } +}; + +/** + * @brief Object abstraction + * + * This represents an Object. An Object is the basic unit of data storage. It + * represents a blob of data, a set of metadata (such as size, owner, ACLs, etc.) and + * a set of key/value attributes. Objects may be versioned. If a versioned object + * is written to, a new object with the same name but a different version is created, + * and the old version of the object is still accessible. If an unversioned object + * is written to, it is replaced, and the old data is not accessible. + */ +class Object { + public: + + /** + * @brief Read operation on an Object + * + * This represents a Read operation on an Object. Read operations are optionally + * asynchronous, using the iterate() API. + */ + struct ReadOp { + struct Params { + const ceph::real_time* mod_ptr{nullptr}; + const ceph::real_time* unmod_ptr{nullptr}; + bool high_precision_time{false}; + uint32_t mod_zone_id{0}; + uint64_t mod_pg_ver{0}; + const char* if_match{nullptr}; + const char* if_nomatch{nullptr}; + ceph::real_time* lastmod{nullptr}; + rgw_obj* target_obj{nullptr}; // XXX dang remove? + } params; + + virtual ~ReadOp() = default; + + /** Prepare the Read op. Must be called first */ + virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) = 0; + + /** Synchronous read. Read from @a ofs to @a end (inclusive) + * into @a bl. Length is `end - ofs + 1`. */ + virtual int read(int64_t ofs, int64_t end, bufferlist& bl, + optional_yield y, const DoutPrefixProvider* dpp) = 0; + + /** Asynchronous read. Read from @a ofs to @a end (inclusive) + * calling @a cb on each read chunk. Length is `end - ofs + + * 1`. 
*/ + virtual int iterate(const DoutPrefixProvider* dpp, int64_t ofs, + int64_t end, RGWGetDataCB* cb, optional_yield y) = 0; + + /** Get an attribute by name */ + virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) = 0; + }; + + /** + * @brief Delete operation on an Object + * + * This deletes an Object from the backing store. + */ + struct DeleteOp { + struct Params { + ACLOwner bucket_owner; + ACLOwner obj_owner; + int versioning_status{0}; + uint64_t olh_epoch{0}; + std::string marker_version_id; + uint32_t bilog_flags{0}; + std::list* remove_objs{nullptr}; + ceph::real_time expiration_time; + ceph::real_time unmod_since; + ceph::real_time mtime; + bool high_precision_time{false}; + rgw_zone_set* zones_trace{nullptr}; + bool abortmp{false}; + uint64_t parts_accounted_size{0}; + } params; + + struct Result { + bool delete_marker{false}; + std::string version_id; + } result; + + virtual ~DeleteOp() = default; + + /** Delete the object */ + virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) = 0; + }; + + Object() {} + virtual ~Object() = default; + + /** Shortcut synchronous delete call for common deletes */ + virtual int delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + bool prevent_versioning = false) = 0; + /** Asynchronous delete call */ + virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio, + bool keep_index_consistent, optional_yield y) = 0; + /** Copy this object to another object. 
*/ + virtual int copy_object(User* user, + req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, std::string* tag, std::string* etag, + void (*progress_cb)(off_t, void *), void* progress_data, + const DoutPrefixProvider* dpp, optional_yield y) = 0; + /** Get the ACL for this object */ + virtual RGWAccessControlPolicy& get_acl(void) = 0; + /** Set the ACL for this object */ + virtual int set_acl(const RGWAccessControlPolicy& acl) = 0; + /** Mark further operations on this object as being atomic */ + virtual void set_atomic() = 0; + /** Check if this object is atomic */ + virtual bool is_atomic() = 0; + /** Pre-fetch data when reading */ + virtual void set_prefetch_data() = 0; + /** Check if this object should prefetch */ + virtual bool is_prefetch_data() = 0; + /** Mark data as compressed */ + virtual void set_compressed() = 0; + /** Check if this object is compressed */ + virtual bool is_compressed() = 0; + /** Invalidate cached info about this object, except atomic, prefetch, and + * compressed */ + virtual void invalidate() = 0; + + /** Check to see if this object has an empty key. This means it's uninitialized */ + virtual bool empty() const = 0; + /** Get the name of this object */ + virtual const std::string &get_name() const = 0; + + /** Get the object state for this object. 
Will be removed in the future */ + virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) = 0; + /** Set attributes for this object from the backing store. Attrs can be set or + * deleted. @note the attribute APIs may be revisited in the future. */ + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) = 0; + /** Get attributes for this object */ + virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) = 0; + /** Modify attributes for this object. */ + virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) = 0; + /** Delete attributes for this object */ + virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) = 0; + /** Check to see if this object has expired */ + virtual bool is_expired() = 0; + /** Create a randomized instance ID for this object */ + virtual void gen_rand_obj_instance_name() = 0; + /** Get a multipart serializer for this object */ + virtual std::unique_ptr get_serializer(const DoutPrefixProvider *dpp, + const std::string& lock_name) = 0; + /** Move the data of an object to new placement storage */ + virtual int transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) = 0; + /** Move an object to the cloud */ + virtual int transition_to_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_bucket_dir_entry& o, + std::set& cloud_targets, + CephContext* cct, + bool update_object, + const DoutPrefixProvider* dpp, + optional_yield y) = 0; + /** Check to see if two placement rules match */ + virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) = 0; + /** Dump driver-specific object layout info in JSON */ + virtual int 
dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) = 0; + + /** Get the cached attributes for this object */ + virtual Attrs& get_attrs(void) = 0; + /** Get the (const) cached attributes for this object */ + virtual const Attrs& get_attrs(void) const = 0; + /** Set the cached attributes for this object */ + virtual int set_attrs(Attrs a) = 0; + /** Check to see if attributes are cached on this object */ + virtual bool has_attrs(void) = 0; + /** Get the cached modification time for this object */ + virtual ceph::real_time get_mtime(void) const = 0; + /** Get the cached size for this object */ + virtual uint64_t get_obj_size(void) const = 0; + /** Get the bucket containing this object */ + virtual Bucket* get_bucket(void) const = 0; + /** Set the bucket containing this object */ + virtual void set_bucket(Bucket* b) = 0; + /** Get the sharding hash representation of this object */ + virtual std::string get_hash_source(void) = 0; + /** Set the sharding hash representation of this object */ + virtual void set_hash_source(std::string s) = 0; + /** Build an Object Identifier string for this object */ + virtual std::string get_oid(void) const = 0; + /** True if this object is a delete marker (newest version is deleted) */ + virtual bool get_delete_marker(void) = 0; + /** True if this object is stored in the extra data pool */ + virtual bool get_in_extra_data(void) = 0; + /** Set the in_extra_data field */ + virtual void set_in_extra_data(bool i) = 0; + /** Helper to sanitize object size, offset, and end values */ + int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); + /** Set the cached size of this object */ + virtual void set_obj_size(uint64_t s) = 0; + /** Set the cached name of this object */ + virtual void set_name(const std::string& n) = 0; + /** Set the cached key of this object */ + virtual void set_key(const rgw_obj_key& k) = 0; + /** Get an rgw_obj representing this object */ + virtual rgw_obj get_obj(void) const = 0; + + 
/** Restore the previous swift version of this object */ + virtual int swift_versioning_restore(bool& restored, /* out */ + const DoutPrefixProvider* dpp) = 0; + /** Copy the current version of a swift object to the configured destination bucket*/ + virtual int swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) = 0; + + /** Get a new ReadOp for this object */ + virtual std::unique_ptr get_read_op() = 0; + /** Get a new DeleteOp for this object */ + virtual std::unique_ptr get_delete_op() = 0; + + /** Get @a count OMAP values via listing, starting at @a marker for this object */ + virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map* m, + bool* pmore, optional_yield y) = 0; + /** Get all OMAP key/value pairs for this object */ + virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map* m, + optional_yield y) = 0; + /** Get the OMAP values matching the given set of keys */ + virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) = 0; + /** Set a single OMAP value for the given key */ + virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) = 0; + /** Change the ownership of this object */ + virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) = 0; + + /** Check to see if the given object pointer is uninitialized */ + static bool empty(const Object* o) { return (!o || o->empty()); } + /** Check to see if the given object unique pointer is uninitialized */ + static bool empty(const std::unique_ptr& o) { return (!o || o->empty()); } + /** Get a unique copy of this object */ + virtual std::unique_ptr clone() = 0; + + /* dang - This is temporary, until the API is completed */ + /** Get the key for this object */ + virtual rgw_obj_key& get_key() = 0; + /** Set the instance for this object */ + 
virtual void set_instance(const std::string &i) = 0; + /** Get the instance for this object */ + virtual const std::string &get_instance() const = 0; + /** Check to see if this object has an instance set */ + virtual bool have_instance(void) = 0; + /** Clear the instance on this object */ + virtual void clear_instance() = 0; + + /** Print the Object to @a out */ + virtual void print(std::ostream& out) const = 0; + + friend inline std::ostream& operator<<(std::ostream& out, const Object& o) { + o.print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const Object* o) { + if (!o) + out << ""; + else + o->print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr& p) { + out << p.get(); + return out; + } +}; + +/** + * @brief Abstraction of a single part of a multipart upload + */ +class MultipartPart { +public: + MultipartPart() = default; + virtual ~MultipartPart() = default; + + /** Get the part number of this part */ + virtual uint32_t get_num() = 0; + /** Get the size of this part */ + virtual uint64_t get_size() = 0; + /** Get the etag of this part */ + virtual const std::string& get_etag() = 0; + /** Get the modification time of this part */ + virtual ceph::real_time& get_mtime() = 0; +}; + +/** + * @brief Abstraction of a multipart upload + * + * This represents a multipart upload. For large objects, it's inefficient to do a + * single, long-lived upload of the object. Instead, protocols such as S3 allow the + * client to start a multipart upload, and then upload object in smaller parts in + * parallel. A MultipartUpload consists of a target bucket, a unique identifier, and a + * set of upload parts. 
+ */ +class MultipartUpload { +public: + MultipartUpload() = default; + virtual ~MultipartUpload() = default; + + /** Get the name of the object representing this upload in the backing store */ + virtual const std::string& get_meta() const = 0; + /** Get the name of the target object for this upload */ + virtual const std::string& get_key() const = 0; + /** Get the unique ID of this upload */ + virtual const std::string& get_upload_id() const = 0; + /** Get the owner of this upload */ + virtual const ACLOwner& get_owner() const = 0; + /** Get the modification time of this upload */ + virtual ceph::real_time& get_mtime() = 0; + + /** Get all the cached parts that make up this upload */ + virtual std::map>& get_parts() = 0; + + /** Get the trace context of this upload */ + virtual const jspan_context& get_trace() = 0; + + /** Get the Object that represents this upload */ + virtual std::unique_ptr get_meta_obj() = 0; + + /** Initialize this upload */ + virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) = 0; + /** List all the parts of this upload, filling the parts cache */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int num_parts, int marker, + int* next_marker, bool* truncated, + bool assume_unsorted = false) = 0; + /** Abort this upload */ + virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) = 0; + /** Complete this upload, making it available as a normal object */ + virtual int complete(const DoutPrefixProvider* dpp, + optional_yield y, CephContext* cct, + std::map& part_etags, + std::list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& ofs, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) = 0; + + /** Get placement and/or attribute info for this upload */ + virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, 
rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) = 0; + + /** Get a Writer to write to a part of this upload */ + virtual std::unique_ptr get_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) = 0; + + /** Print the Upload to @a out */ + virtual void print(std::ostream& out) const = 0; + + friend inline std::ostream& operator<<(std::ostream& out, const MultipartUpload& u) { + u.print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const MultipartUpload* u) { + if (!u) + out << ""; + else + u->print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const + std::unique_ptr& p) { + out << p.get(); + return out; + } +}; + +/** + * @brief Interface of a lock/serialization + */ +class Serializer { +public: + Serializer() = default; + virtual ~Serializer() = default; + + /** Try to take the lock for the given amount of time. 
*/ + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) = 0; + /** Unlock the lock */ + virtual int unlock() = 0; + + /** Print the Serializer to @a out */ + virtual void print(std::ostream& out) const = 0; + + friend inline std::ostream& operator<<(std::ostream& out, const Serializer& s) { + s.print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const Serializer* s) { + if (!s) + out << ""; + else + s->print(out); + return out; + } +}; + +/** @brief Abstraction of a serializer for multipart uploads + */ +class MPSerializer : public Serializer { +public: + MPSerializer() = default; + virtual ~MPSerializer() = default; + + virtual void clear_locked() = 0; + /** Check to see if locked */ + virtual bool is_locked() = 0; +}; + +/** @brief Abstraction of a serializer for Lifecycle + */ +class LCSerializer : public Serializer { +public: + LCSerializer() {} + virtual ~LCSerializer() = default; +}; + +/** + * @brief Abstraction for lifecycle processing + * + * Lifecycle processing loops over the objects in a bucket, applying per-bucket policy + * to each object. Examples of policy can be deleting after a certain amount of time, + * deleting extra versions, changing the storage class, and so on. + */ +class Lifecycle { +public: + /** Head of a lifecycle run. Used for tracking parallel lifecycle runs. */ + struct LCHead { + LCHead() = default; + virtual ~LCHead() = default; + + virtual time_t& get_start_date() = 0; + virtual void set_start_date(time_t) = 0; + virtual std::string& get_marker() = 0; + virtual void set_marker(const std::string&) = 0; + virtual time_t& get_shard_rollover_date() = 0; + virtual void set_shard_rollover_date(time_t) = 0; + }; + + /** Single entry in a lifecycle run. Multiple entries can exist processing different + * buckets. 
*/ + struct LCEntry { + LCEntry() = default; + virtual ~LCEntry() = default; + + virtual std::string& get_bucket() = 0; + virtual void set_bucket(const std::string&) = 0; + virtual std::string& get_oid() = 0; + virtual void set_oid(const std::string&) = 0; + virtual uint64_t get_start_time() = 0; + virtual void set_start_time(uint64_t) = 0; + virtual uint32_t get_status() = 0; + virtual void set_status(uint32_t) = 0; + + /** Print the entry to @a out */ + virtual void print(std::ostream& out) const = 0; + + friend inline std::ostream& operator<<(std::ostream& out, const LCEntry& e) { + e.print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const LCEntry* e) { + if (!e) + out << ""; + else + e->print(out); + return out; + } + friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr& p) { + out << p.get(); + return out; + } + }; + + Lifecycle() = default; + virtual ~Lifecycle() = default; + + /** Get an empty entry */ + virtual std::unique_ptr get_entry() = 0; + /** Get an entry matching the given marker */ + virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr* entry) = 0; + /** Get the entry following the given marker */ + virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr* entry) = 0; + /** Store a modified entry in the backing store */ + virtual int set_entry(const std::string& oid, LCEntry& entry) = 0; + /** List all known entries */ + virtual int list_entries(const std::string& oid, const std::string& marker, + uint32_t max_entries, + std::vector>& entries) = 0; + /** Remove an entry from the backing store */ + virtual int rm_entry(const std::string& oid, LCEntry& entry) = 0; + /** Get a head */ + virtual int get_head(const std::string& oid, std::unique_ptr* head) = 0; + /** Store a modified head to the backing store */ + virtual int put_head(const std::string& oid, LCHead& head) = 0; + + /** Get a serializer for lifecycle 
*/ + virtual std::unique_ptr get_serializer(const std::string& lock_name, + const std::string& oid, + const std::string& cookie) = 0; +}; + +/** + * @brief Abstraction for a Notification event + * + * RGW can generate notifications for various events, such as object creation or + * deletion. + */ +class Notification { +protected: + public: + Notification() {} + + virtual ~Notification() = default; + + /** Indicate the start of the event associated with this notification */ + virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) = 0; + /** Indicate the successful completion of the event associated with this notification */ + virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, const std::string& etag, const std::string& version) = 0; +}; + +/** + * @brief Abstraction for an asynchronous writer + * + * Writing is done through a set of filters. This allows chaining filters to do things + * like compression and encryption on async writes. This is the base abstraction for + * those filters. + */ +class Writer : public ObjectProcessor { +public: + Writer() {} + virtual ~Writer() = default; + + /** prepare to start processing object data */ + virtual int prepare(optional_yield y) = 0; + + /** + * Process a buffer. Called multiple times to write different buffers. + * data.length() == 0 indicates the last call and may be used to flush + * the data buffers. 
+ */ + virtual int process(bufferlist&& data, uint64_t offset) = 0; + + /** complete the operation and make its result visible to clients */ + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) = 0; +}; + + +/** + * @brief Abstraction of a placement tier + * + * This abstraction allows access to information about placement tiers, + * including storage class. + */ +class PlacementTier { +public: + virtual ~PlacementTier() = default; + + /** Get the type of this tier */ + virtual const std::string& get_tier_type() = 0; + /** Get the storage class of this tier */ + virtual const std::string& get_storage_class() = 0; + /** Should we retain the head object when transitioning */ + virtual bool retain_head_object() = 0; + /** Get the placement rule associated with this tier */ +}; + +/** + * @brief Abstraction of a zone group + * + * This class allows access to information about a zonegroup. It may be the + * group containing the current zone, or another group. 
+ */ +class ZoneGroup { +public: + virtual ~ZoneGroup() = default; + /** Get the ID of this zonegroup */ + virtual const std::string& get_id() const = 0; + /** Get the name of this zonegroup */ + virtual const std::string& get_name() const = 0; + /** Determine if two zonegroups are the same */ + virtual int equals(const std::string& other_zonegroup) const = 0; + /** Get the endpoint from zonegroup, or from master zone if not set */ + virtual const std::string& get_endpoint() const = 0; + /** Check if a placement target (by name) exists in this zonegroup */ + virtual bool placement_target_exists(std::string& target) const = 0; + /** Check if this is the master zonegroup */ + virtual bool is_master_zonegroup() const = 0; + /** Get the API name of this zonegroup */ + virtual const std::string& get_api_name() const = 0; + /** Get the list of placement target names for this zone */ + virtual void get_placement_target_names(std::set& names) const = 0; + /** Get the name of the default placement target for this zone */ + virtual const std::string& get_default_placement_name() const = 0; + /** Get the list of hostnames from this zone */ + virtual int get_hostnames(std::list& names) const = 0; + /** Get the list of hostnames that host s3 websites from this zone */ + virtual int get_s3website_hostnames(std::list& names) const = 0; + /** Get the number of zones in this zonegroup */ + virtual int get_zone_count() const = 0; + /** Get the placement tier associated with the rule */ + virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr* tier) = 0; + /** Get a zone by ID */ + virtual int get_zone_by_id(const std::string& id, std::unique_ptr* zone) = 0; + /** Get a zone by Name */ + virtual int get_zone_by_name(const std::string& name, std::unique_ptr* zone) = 0; + /** List zones in zone group by ID */ + virtual int list_zones(std::list& zone_ids) = 0; + /// Return true if the given feature is enabled in the zonegroup. 
+ virtual bool supports(std::string_view feature) const = 0; + /** Clone a copy of this zonegroup. */ + virtual std::unique_ptr clone() = 0; +}; + +/** + * @brief Abstraction of a Zone + * + * This abstraction allows access to information about zones. This can be the zone + * containing the RGW, or another zone. + */ +class Zone { + public: + virtual ~Zone() = default; + + /** Clone a copy of this zone. */ + virtual std::unique_ptr clone() = 0; + /** Get info about the zonegroup containing this zone */ + virtual ZoneGroup& get_zonegroup() = 0; + /** Get the ID of this zone */ + virtual const std::string& get_id() = 0; + /** Get the name of this zone */ + virtual const std::string& get_name() const = 0; + /** True if this zone is writable */ + virtual bool is_writeable() = 0; + /** Get the URL for the endpoint for redirecting to this zone */ + virtual bool get_redirect_endpoint(std::string* endpoint) = 0; + /** Check to see if the given API is supported in this zone */ + virtual bool has_zonegroup_api(const std::string& api) const = 0; + /** Get the current period ID for this zone */ + virtual const std::string& get_current_period_id() = 0; + /** Get the system access key for this zone */ + virtual const RGWAccessKey& get_system_key() = 0; + /** Get the name of the realm containing this zone */ + virtual const std::string& get_realm_name() = 0; + /** Get the ID of the realm containing this zone */ + virtual const std::string& get_realm_id() = 0; + /** Get the tier type for the zone */ + virtual const std::string_view get_tier_type() = 0; + /** Get a handler for zone sync policy. */ + virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() = 0; +}; + +/** + * @brief Abstraction of a manager for Lua scripts and packages + * + * RGW can load and process Lua scripts. 
This will handle loading/storing scripts; adding, deleting, and listing packages + */ +class LuaManager { +public: + virtual ~LuaManager() = default; + + /** Get a script named with the given key from the backing store */ + virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) = 0; + /** Put a script named with the given key to the backing store */ + virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) = 0; + /** Delete a script named with the given key from the backing store */ + virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) = 0; + /** Add a lua package */ + virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) = 0; + /** Remove a lua package */ + virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) = 0; + /** List lua packages */ + virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) = 0; +}; + +/** @} namespace rgw::sal in group RGWSAL */ +} } // namespace rgw::sal + +/** + * @brief A manager for Drivers + * + * This will manage the singleton instances of the various drivers. Drivers come in two + * varieties: Full and Raw. A full driver is suitable for use in a radosgw daemon. It + * has full access to the cluster, if any. A raw driver is a stripped down driver, used + * for admin commands. 
+ */ +class DriverManager { +public: + struct Config { + /** Name of store to create */ + std::string store_name; + /** Name of filter to create or "none" */ + std::string filter_name; + }; + + DriverManager() {} + /** Get a full driver by service name */ + static rgw::sal::Driver* get_storage(const DoutPrefixProvider* dpp, + CephContext* cct, + const Config& cfg, + bool use_gc_thread, + bool use_lc_thread, + bool quota_threads, + bool run_sync_thread, + bool run_reshard_thread, + bool use_cache = true, + bool use_gc = true) { + rgw::sal::Driver* driver = init_storage_provider(dpp, cct, cfg, use_gc_thread, + use_lc_thread, + quota_threads, + run_sync_thread, + run_reshard_thread, + use_cache, use_gc); + return driver; + } + /** Get a stripped down driver by service name */ + static rgw::sal::Driver* get_raw_storage(const DoutPrefixProvider* dpp, + CephContext* cct, const Config& cfg) { + rgw::sal::Driver* driver = init_raw_storage_provider(dpp, cct, cfg); + return driver; + } + /** Initialize a new full Driver */ + static rgw::sal::Driver* init_storage_provider(const DoutPrefixProvider* dpp, + CephContext* cct, + const Config& cfg, + bool use_gc_thread, + bool use_lc_thread, + bool quota_threads, + bool run_sync_thread, + bool run_reshard_thread, + bool use_metadata_cache, + bool use_gc); + /** Initialize a new raw Driver */ + static rgw::sal::Driver* init_raw_storage_provider(const DoutPrefixProvider* dpp, + CephContext* cct, + const Config& cfg); + /** Close a Driver when it's no longer needed */ + static void close_storage(rgw::sal::Driver* driver); + + /** Get the config for Drivers */ + static Config get_config(bool admin, CephContext* cct); + + /** Create a ConfigStore */ + static auto create_config_store(const DoutPrefixProvider* dpp, + std::string_view type) + -> std::unique_ptr; + +}; + +/** @} */ diff --git a/src/rgw/rgw_sal_config.h b/src/rgw/rgw_sal_config.h new file mode 100644 index 000000000..705094022 --- /dev/null +++ b/src/rgw/rgw_sal_config.h @@ 
-0,0 +1,301 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include +#include +#include +#include +#include "rgw_sal_fwd.h" + +class DoutPrefixProvider; +class optional_yield; +struct RGWPeriod; +struct RGWPeriodConfig; +struct RGWRealm; +struct RGWZoneGroup; +struct RGWZoneParams; + +namespace rgw::sal { + +/// Results of a listing operation +template +struct ListResult { + /// The subspan of the input entries that contain results + std::span entries; + /// The next marker to resume listing, or empty + std::string next; +}; + +/// Storage abstraction for realm/zonegroup/zone configuration +class ConfigStore { + public: + virtual ~ConfigStore() {} + + /// @group Realm + ///@{ + + /// Set the cluster-wide default realm id + virtual int write_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id) = 0; + /// Read the cluster's default realm id + virtual int read_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string& realm_id) = 0; + /// Delete the cluster's default realm id + virtual int delete_default_realm_id(const DoutPrefixProvider* dpp, + optional_yield y) = 0; + + /// Create a realm + virtual int create_realm(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWRealm& info, + std::unique_ptr* writer) = 0; + /// Read a realm by id + virtual int read_realm_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWRealm& info, + std::unique_ptr* writer) = 0; + /// Read a realm by name + virtual int read_realm_by_name(const 
DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_name, + RGWRealm& info, + std::unique_ptr* writer) = 0; + /// Read the cluster's default realm + virtual int read_default_realm(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRealm& info, + std::unique_ptr* writer) = 0; + /// Look up a realm id by its name + virtual int read_realm_id(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view realm_name, + std::string& realm_id) = 0; + /// Notify the cluster of a new period, so radosgws can reload with the new + /// configuration + virtual int realm_notify_new_period(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWPeriod& period) = 0; + /// List up to 'entries.size()' realm names starting from the given marker + virtual int list_realm_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) = 0; + ///@} + + /// @group Period + ///@{ + + /// Write a period and advance its latest epoch + virtual int create_period(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWPeriod& info) = 0; + /// Read a period by id and epoch. 
If no epoch is given, read the latest + virtual int read_period(const DoutPrefixProvider* dpp, + optional_yield y, std::string_view period_id, + std::optional epoch, RGWPeriod& info) = 0; + /// Delete all period epochs with the given period id + virtual int delete_period(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view period_id) = 0; + /// List up to 'entries.size()' period ids starting from the given marker + virtual int list_period_ids(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) = 0; + ///@} + + /// @group ZoneGroup + ///@{ + + /// Set the cluster-wide default zonegroup id + virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zonegroup_id) = 0; + /// Read the cluster's default zonegroup id + virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zonegroup_id) = 0; + /// Delete the cluster's default zonegroup id + virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) = 0; + + /// Create a zonegroup + virtual int create_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneGroup& info, + std::unique_ptr* writer) = 0; + /// Read a zonegroup by id + virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_id, + RGWZoneGroup& info, + std::unique_ptr* writer) = 0; + /// Read a zonegroup by name + virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zonegroup_name, + RGWZoneGroup& info, + std::unique_ptr* writer) = 0; + /// Read the cluster's default zonegroup + virtual int read_default_zonegroup(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + 
RGWZoneGroup& info, + std::unique_ptr* writer) = 0; + /// List up to 'entries.size()' zonegroup names starting from the given marker + virtual int list_zonegroup_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) = 0; + ///@} + + /// @group Zone + ///@{ + + /// Set the realm-wide default zone id + virtual int write_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + std::string_view zone_id) = 0; + /// Read the realm's default zone id + virtual int read_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + std::string& zone_id) = 0; + /// Delete the realm's default zone id + virtual int delete_default_zone_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id) = 0; + + /// Create a zone + virtual int create_zone(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + const RGWZoneParams& info, + std::unique_ptr* writer) = 0; + /// Read a zone by id + virtual int read_zone_by_id(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_id, + RGWZoneParams& info, + std::unique_ptr* writer) = 0; + /// Read a zone by id or name. 
If both are empty, try to load the + /// cluster's default zone + virtual int read_zone_by_name(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view zone_name, + RGWZoneParams& info, + std::unique_ptr* writer) = 0; + /// Read the realm's default zone + virtual int read_default_zone(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWZoneParams& info, + std::unique_ptr* writer) = 0; + /// List up to 'entries.size()' zone names starting from the given marker + virtual int list_zone_names(const DoutPrefixProvider* dpp, + optional_yield y, const std::string& marker, + std::span entries, + ListResult& result) = 0; + ///@} + + /// @group PeriodConfig + ///@{ + + /// Read period config object + virtual int read_period_config(const DoutPrefixProvider* dpp, + optional_yield y, + std::string_view realm_id, + RGWPeriodConfig& info) = 0; + /// Write period config object + virtual int write_period_config(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, + std::string_view realm_id, + const RGWPeriodConfig& info) = 0; + ///@} + +}; // ConfigStore + + +/// A handle to manage the atomic updates of an existing realm object. This +/// is initialized on read, and any subsequent writes through this handle will +/// fail with -ECANCELED if another writer updates the object in the meantime. +class RealmWriter { + public: + virtual ~RealmWriter() {} + + /// Overwrite an existing realm. Must not change id or name + virtual int write(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWRealm& info) = 0; + /// Rename an existing realm. Must not change id + virtual int rename(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRealm& info, + std::string_view new_name) = 0; + /// Delete an existing realm + virtual int remove(const DoutPrefixProvider* dpp, + optional_yield y) = 0; +}; + +/// A handle to manage the atomic updates of an existing zonegroup object. 
This +/// is initialized on read, and any subsequent writes through this handle will +/// fail with -ECANCELED if another writer updates the object in the meantime. +class ZoneGroupWriter { + public: + virtual ~ZoneGroupWriter() {} + + /// Overwrite an existing zonegroup. Must not change id or name + virtual int write(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWZoneGroup& info) = 0; + /// Rename an existing zonegroup. Must not change id + virtual int rename(const DoutPrefixProvider* dpp, + optional_yield y, + RGWZoneGroup& info, + std::string_view new_name) = 0; + /// Delete an existing zonegroup + virtual int remove(const DoutPrefixProvider* dpp, + optional_yield y) = 0; +}; + +/// A handle to manage the atomic updates of an existing zone object. This +/// is initialized on read, and any subsequent writes through this handle will +/// fail with -ECANCELED if another writer updates the object in the meantime. +class ZoneWriter { + public: + virtual ~ZoneWriter() {} + + /// Overwrite an existing zone. Must not change id or name + virtual int write(const DoutPrefixProvider* dpp, + optional_yield y, + const RGWZoneParams& info) = 0; + /// Rename an existing zone. 
Must not change id + virtual int rename(const DoutPrefixProvider* dpp, + optional_yield y, + RGWZoneParams& info, + std::string_view new_name) = 0; + /// Delete an existing zone + virtual int remove(const DoutPrefixProvider* dpp, + optional_yield y) = 0; +}; + +} // namespace rgw::sal diff --git a/src/rgw/rgw_sal_daos.cc b/src/rgw/rgw_sal_daos.cc new file mode 100644 index 000000000..4b0234b1f --- /dev/null +++ b/src/rgw/rgw_sal_daos.cc @@ -0,0 +1,2473 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=2 sw=2 expandtab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * SAL implementation for the CORTX DAOS backend + * + * Copyright (C) 2022 Seagate Technology LLC and/or its Affiliates + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_sal_daos.h" + +#include +#include +#include + +#include +#include + +#include "common/Clock.h" +#include "common/errno.h" +#include "rgw_bucket.h" +#include "rgw_compression.h" +#include "rgw_sal.h" + +#define dout_subsys ceph_subsys_rgw + +using std::list; +using std::map; +using std::set; +using std::string; +using std::vector; + +namespace fs = std::filesystem; + +namespace rgw::sal { + +using ::ceph::decode; +using ::ceph::encode; + +int DaosUser::list_buckets(const DoutPrefixProvider* dpp, const string& marker, + const string& end_marker, uint64_t max, + bool need_stats, BucketList& buckets, + optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: list_user_buckets: marker=" << marker + << " end_marker=" << end_marker << " max=" << max << dendl; + int ret = 0; + bool is_truncated = false; + buckets.clear(); + vector bucket_infos(max); + daos_size_t bcount = bucket_infos.size(); + vector> values(bcount, vector(DS3_MAX_ENCODED_LEN)); + for (daos_size_t i = 0; i < bcount; i++) { + 
bucket_infos[i].encoded = values[i].data(); + bucket_infos[i].encoded_length = values[i].size(); + } + + char daos_marker[DS3_MAX_BUCKET_NAME]; + std::strncpy(daos_marker, marker.c_str(), sizeof(daos_marker)); + ret = ds3_bucket_list(&bcount, bucket_infos.data(), daos_marker, + &is_truncated, store->ds3, nullptr); + ldpp_dout(dpp, 20) << "DEBUG: ds3_bucket_list: bcount=" << bcount + << " ret=" << ret << dendl; + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_list failed!" << ret << dendl; + return ret; + } + + bucket_infos.resize(bcount); + values.resize(bcount); + + for (const auto& bi : bucket_infos) { + DaosBucketInfo dbinfo; + bufferlist bl; + bl.append(reinterpret_cast(bi.encoded), bi.encoded_length); + auto iter = bl.cbegin(); + dbinfo.decode(iter); + buckets.add(std::make_unique(this->store, dbinfo.info, this)); + } + + buckets.set_truncated(is_truncated); + return 0; +} + +int DaosUser::create_bucket( + const DoutPrefixProvider* dpp, const rgw_bucket& b, + const std::string& zonegroup_id, rgw_placement_rule& placement_rule, + std::string& swift_ver_location, const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, Attrs& attrs, RGWBucketInfo& info, + obj_version& ep_objv, bool exclusive, bool obj_lock_enabled, bool* existed, + req_info& req_info, std::unique_ptr* bucket_out, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: create_bucket:" << b.name << dendl; + int ret; + std::unique_ptr bucket; + + // Look up the bucket. Create it if it doesn't exist. 
+ ret = this->store->get_bucket(dpp, this, b, &bucket, y); + if (ret != 0 && ret != -ENOENT) { + return ret; + } + + if (ret != -ENOENT) { + *existed = true; + if (swift_ver_location.empty()) { + swift_ver_location = bucket->get_info().swift_ver_location; + } + placement_rule.inherit_from(bucket->get_info().placement_rule); + + // TODO: ACL policy + // // don't allow changes to the acl policy + // RGWAccessControlPolicy old_policy(ctx()); + // int rc = rgw_op_get_bucket_policy_from_attr( + // dpp, this, u, bucket->get_attrs(), &old_policy, y); + // if (rc >= 0 && old_policy != policy) { + // bucket_out->swap(bucket); + // return -EEXIST; + //} + } else { + placement_rule.name = "default"; + placement_rule.storage_class = "STANDARD"; + bucket = std::make_unique(store, b, this); + bucket->set_attrs(attrs); + + *existed = false; + } + + // TODO: how to handle zone and multi-site. + + if (!*existed) { + info.placement_rule = placement_rule; + info.bucket = b; + info.owner = this->get_info().user_id; + info.zonegroup = zonegroup_id; + info.creation_time = ceph::real_clock::now(); + if (obj_lock_enabled) + info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED; + bucket->set_version(ep_objv); + bucket->get_info() = info; + + // Create a new bucket: + DaosBucket* daos_bucket = static_cast(bucket.get()); + bufferlist bl; + std::unique_ptr bucket_info = + daos_bucket->get_encoded_info(bl, ceph::real_time()); + ret = ds3_bucket_create(bucket->get_name().c_str(), bucket_info.get(), + nullptr, store->ds3, nullptr); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_create failed! 
ret=" << ret + << dendl; + return ret; + } + } else { + bucket->set_version(ep_objv); + bucket->get_info() = info; + } + + bucket_out->swap(bucket); + + return ret; +} + +int DaosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosUser::read_stats(const DoutPrefixProvider* dpp, optional_yield y, + RGWStorageStats* stats, + ceph::real_time* last_stats_sync, + ceph::real_time* last_stats_update) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +/* stats - Not for first pass */ +int DaosUser::read_stats_async(const DoutPrefixProvider* dpp, + RGWGetUserStats_CB* cb) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosUser::complete_flush_stats(const DoutPrefixProvider* dpp, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosUser::read_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + map& usage) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosUser::trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch, + uint64_t end_epoch) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y) { + const string name = info.user_id.to_str(); + ldpp_dout(dpp, 20) << "DEBUG: load_user, name=" << name << dendl; + + DaosUserInfo duinfo; + int ret = read_user(dpp, name, &duinfo); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: load_user failed, name=" << name << dendl; + return ret; + } + + info = duinfo.info; + attrs = duinfo.attrs; + objv_tracker.read_version = duinfo.user_version; + return 0; +} + +int DaosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, + Attrs& new_attrs, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: merge_and_store_attrs, new_attrs=" << new_attrs + << dendl; + for (auto& it : new_attrs) { + attrs[it.first] = it.second; + } + return store_user(dpp, y, false); +} + +int 
DaosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, + bool exclusive, RGWUserInfo* old_info) { + const string name = info.user_id.to_str(); + ldpp_dout(dpp, 10) << "DEBUG: Store_user(): User name=" << name << dendl; + + // Read user + int ret = 0; + struct DaosUserInfo duinfo; + ret = read_user(dpp, name, &duinfo); + obj_version obj_ver = duinfo.user_version; + std::unique_ptr old_user_info; + std::vector old_access_ids; + + // Check if the user already exists + if (ret == 0 && obj_ver.ver) { + // already exists. + + if (old_info) { + *old_info = duinfo.info; + } + + if (objv_tracker.read_version.ver != obj_ver.ver) { + // Object version mismatch.. return ECANCELED + ret = -ECANCELED; + ldpp_dout(dpp, 0) << "User Read version mismatch read_version=" + << objv_tracker.read_version.ver + << " obj_ver=" << obj_ver.ver << dendl; + return ret; + } + + if (exclusive) { + // return + return ret; + } + obj_ver.ver++; + + for (auto const& [id, key] : duinfo.info.access_keys) { + old_access_ids.push_back(id.c_str()); + } + old_user_info.reset( + new ds3_user_info{.name = duinfo.info.user_id.to_str().c_str(), + .email = duinfo.info.user_email.c_str(), + .access_ids = old_access_ids.data(), + .access_ids_nr = old_access_ids.size()}); + } else { + obj_ver.ver = 1; + obj_ver.tag = "UserTAG"; + } + + bufferlist bl; + std::unique_ptr user_info = + get_encoded_info(bl, obj_ver); + + ret = ds3_user_set(name.c_str(), user_info.get(), old_user_info.get(), + store->ds3, nullptr); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "Error: ds3_user_set failed, name=" << name + << " ret=" << ret << dendl; + } + + return ret; +} + +int DaosUser::read_user(const DoutPrefixProvider* dpp, std::string name, + DaosUserInfo* duinfo) { + // Initialize ds3_user_info + bufferlist bl; + uint64_t size = DS3_MAX_ENCODED_LEN; + struct ds3_user_info user_info = {.encoded = bl.append_hole(size).c_str(), + .encoded_length = size}; + + int ret = ds3_user_get(name.c_str(), &user_info, store->ds3, 
nullptr); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "Error: ds3_user_get failed, name=" << name + << " ret=" << ret << dendl; + return ret; + } + + // Decode + bufferlist& blr = bl; + auto iter = blr.cbegin(); + duinfo->decode(iter); + return ret; +} + +std::unique_ptr DaosUser::get_encoded_info( + bufferlist& bl, obj_version& obj_ver) { + // Encode user data + struct DaosUserInfo duinfo; + duinfo.info = info; + duinfo.attrs = attrs; + duinfo.user_version = obj_ver; + duinfo.encode(bl); + + // Initialize ds3_user_info + access_ids.clear(); + for (auto const& [id, key] : info.access_keys) { + access_ids.push_back(id.c_str()); + } + return std::unique_ptr( + new ds3_user_info{.name = info.user_id.to_str().c_str(), + .email = info.user_email.c_str(), + .access_ids = access_ids.data(), + .access_ids_nr = access_ids.size(), + .encoded = bl.c_str(), + .encoded_length = bl.length()}); +} + +int DaosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) { + const string name = info.user_id.to_str(); + + // TODO: the expectation is that the object version needs to be passed in as a + // method arg see int DB::remove_user(const DoutPrefixProvider *dpp, + // RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv) + obj_version obj_ver; + bufferlist bl; + std::unique_ptr user_info = + get_encoded_info(bl, obj_ver); + + // Remove user + int ret = ds3_user_remove(name.c_str(), user_info.get(), store->ds3, nullptr); + if (ret != 0) { + ldpp_dout(dpp, 0) << "Error: ds3_user_set failed, name=" << name + << " ret=" << ret << dendl; + } + return ret; +} + +DaosBucket::~DaosBucket() { close(nullptr); } + +int DaosBucket::open(const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << "DEBUG: open, name=" << info.bucket.name.c_str() + << dendl; + // Idempotent + if (is_open()) { + return 0; + } + + int ret = ds3_bucket_open(get_name().c_str(), &ds3b, store->ds3, nullptr); + ldpp_dout(dpp, 20) << "DEBUG: ds3_bucket_open, name=" << get_name() + << ", ret=" << ret << dendl; + + return 
ret; +} + +int DaosBucket::close(const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << "DEBUG: close" << dendl; + // Idempotent + if (!is_open()) { + return 0; + } + + int ret = ds3_bucket_close(ds3b, nullptr); + ds3b = nullptr; + ldpp_dout(dpp, 20) << "DEBUG: ds3_bucket_close ret=" << ret << dendl; + + return ret; +} + +std::unique_ptr DaosBucket::get_encoded_info( + bufferlist& bl, ceph::real_time _mtime) { + DaosBucketInfo dbinfo; + dbinfo.info = info; + dbinfo.bucket_attrs = attrs; + dbinfo.mtime = _mtime; + dbinfo.bucket_version = bucket_version; + dbinfo.encode(bl); + + auto bucket_info = std::make_unique(); + bucket_info->encoded = bl.c_str(); + bucket_info->encoded_length = bl.length(); + std::strncpy(bucket_info->name, get_name().c_str(), sizeof(bucket_info->name)); + return bucket_info; +} + +int DaosBucket::remove_bucket(const DoutPrefixProvider* dpp, + bool delete_children, bool forward_to_master, + req_info* req_info, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: remove_bucket, delete_children=" + + << delete_children + + << " forward_to_master=" << forward_to_master << dendl; + + return ds3_bucket_destroy(get_name().c_str(), delete_children, store->ds3, + nullptr); +} + +int DaosBucket::remove_bucket_bypass_gc(int concurrent_max, + bool keep_index_consistent, + optional_yield y, + const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << "DEBUG: remove_bucket_bypass_gc, concurrent_max=" + + << concurrent_max + + << " keep_index_consistent=" << keep_index_consistent + + << dendl; + return ds3_bucket_destroy(get_name().c_str(), true, store->ds3, nullptr); +} + +int DaosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, + ceph::real_time _mtime) { + ldpp_dout(dpp, 20) << "DEBUG: put_info(): bucket name=" << get_name() + << dendl; + + int ret = open(dpp); + if (ret != 0) { + return ret; + } + + bufferlist bl; + std::unique_ptr bucket_info = + get_encoded_info(bl, ceph::real_time()); + + ret = ds3_bucket_set_info(bucket_info.get(), 
ds3b, nullptr); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_set_info failed: " << ret << dendl; + } + return ret; +} + +int DaosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, + bool get_stats) { + ldpp_dout(dpp, 20) << "DEBUG: load_bucket(): bucket name=" << get_name() + << dendl; + int ret = open(dpp); + if (ret != 0) { + return ret; + } + + bufferlist bl; + DaosBucketInfo dbinfo; + uint64_t size = DS3_MAX_ENCODED_LEN; + struct ds3_bucket_info bucket_info = {.encoded = bl.append_hole(size).c_str(), + .encoded_length = size}; + + ret = ds3_bucket_get_info(&bucket_info, ds3b, nullptr); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_get_info failed: " << ret << dendl; + return ret; + } + + auto iter = bl.cbegin(); + dbinfo.decode(iter); + info = dbinfo.info; + rgw_placement_rule placement_rule; + placement_rule.name = "default"; + placement_rule.storage_class = "STANDARD"; + info.placement_rule = placement_rule; + + attrs = dbinfo.bucket_attrs; + mtime = dbinfo.mtime; + bucket_version = dbinfo.bucket_version; + return ret; +} + +/* stats - Not for first pass */ +int DaosBucket::read_stats(const DoutPrefixProvider* dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, + std::string* master_ver, + std::map& stats, + std::string* max_marker, bool* syncstopped) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::read_stats_async( + const DoutPrefixProvider* dpp, + const bucket_index_layout_generation& idx_layout, int shard_id, + RGWGetBucketStats_CB* ctx) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::sync_user_stats(const DoutPrefixProvider* dpp, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::update_container_stats(const DoutPrefixProvider* dpp) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::check_bucket_shards(const DoutPrefixProvider* dpp) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int 
DaosBucket::chown(const DoutPrefixProvider* dpp, User& new_user, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +/* Make sure to call load_bucket() if you need it first */ +bool DaosBucket::is_owner(User* user) { + return (info.owner.compare(user->get_id()) == 0); +} + +int DaosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y) { + /* XXX: Check if bucket contains any objects */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::check_quota(const DoutPrefixProvider* dpp, RGWQuota& quota, + uint64_t obj_size, optional_yield y, + bool check_size_only) { + /* Not Handled in the first pass as stats are also needed */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, + Attrs& new_attrs, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: merge_and_store_attrs, new_attrs=" << new_attrs + << dendl; + for (auto& it : new_attrs) { + attrs[it.first] = it.second; + } + + return put_info(dpp, y, ceph::real_time()); +} + +int DaosBucket::try_refresh_info(const DoutPrefixProvider* dpp, + ceph::real_time* pmtime) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +/* XXX: usage and stats not supported in the first pass */ +int DaosBucket::read_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + map& usage) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch, + uint64_t end_epoch) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::remove_objs_from_index( + const DoutPrefixProvider* dpp, + std::list& objs_to_unlink) { + /* XXX: CHECK: Unlike RadosStore, there is no seperate bucket index table. 
+ * Delete all the object in the list from the object table of this + * bucket + */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::check_index( + const DoutPrefixProvider* dpp, + std::map& existing_stats, + std::map& calculated_stats) { + /* XXX: stats not supported yet */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::rebuild_index(const DoutPrefixProvider* dpp) { + /* there is no index table in DAOS. Not applicable */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::set_tag_timeout(const DoutPrefixProvider* dpp, + uint64_t timeout) { + /* XXX: CHECK: set tag timeout for all the bucket objects? */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::purge_instance(const DoutPrefixProvider* dpp) { + /* XXX: CHECK: for DAOS only single instance supported. + * Remove all the objects for that instance? Anything extra needed? + */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosBucket::set_acl(const DoutPrefixProvider* dpp, + RGWAccessControlPolicy& acl, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: set_acl" << dendl; + int ret = 0; + bufferlist aclbl; + + acls = acl; + acl.encode(aclbl); + + Attrs attrs = get_attrs(); + attrs[RGW_ATTR_ACL] = aclbl; + + return ret; +} + +std::unique_ptr DaosBucket::get_object(const rgw_obj_key& k) { + return std::make_unique(this->store, k, this); +} + +bool compare_rgw_bucket_dir_entry(rgw_bucket_dir_entry& entry1, + rgw_bucket_dir_entry& entry2) { + return (entry1.key < entry2.key); +} + +bool compare_multipart_upload(std::unique_ptr& upload1, + std::unique_ptr& upload2) { + return (upload1->get_key() < upload2->get_key()); +} + +int DaosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, + ListResults& results, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: list bucket=" << get_name() << " max=" << max + << " params=" << params << dendl; + // End + if (max == 0) { + return 0; + } + + int ret = open(dpp); + if (ret != 0) { + return ret; + } + + 
// Init needed structures + vector object_infos(max); + uint32_t nobj = object_infos.size(); + vector> values(nobj, vector(DS3_MAX_ENCODED_LEN)); + for (uint32_t i = 0; i < nobj; i++) { + object_infos[i].encoded = values[i].data(); + object_infos[i].encoded_length = values[i].size(); + } + + vector common_prefixes(max); + uint32_t ncp = common_prefixes.size(); + + char daos_marker[DS3_MAX_KEY_BUFF]; + std::strncpy(daos_marker, params.marker.get_oid().c_str(), sizeof(daos_marker)); + // strncpy() leaves the buffer unterminated when the marker is at least + // sizeof(daos_marker) bytes long, so force a trailing NUL + daos_marker[sizeof(daos_marker) - 1] = '\0'; + + ret = ds3_bucket_list_obj(&nobj, object_infos.data(), &ncp, + common_prefixes.data(), params.prefix.c_str(), + params.delim.c_str(), daos_marker, + params.list_versions, &results.is_truncated, ds3b); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_list_obj failed, name=" + << get_name() << ", ret=" << ret << dendl; + return ret; + } + + object_infos.resize(nobj); + values.resize(nobj); + common_prefixes.resize(ncp); + + // Fill common prefixes + for (auto const& cp : common_prefixes) { + results.common_prefixes[cp.prefix] = true; + } + + // Decode objs + for (auto const& obj : object_infos) { + bufferlist bl; + rgw_bucket_dir_entry ent; + bl.append(reinterpret_cast(obj.encoded), obj.encoded_length); + auto iter = bl.cbegin(); + ent.decode(iter); + if (params.list_versions || ent.is_visible()) { + results.objs.emplace_back(std::move(ent)); + } + } + + if (!params.allow_unordered) { + std::sort(results.objs.begin(), results.objs.end(), + compare_rgw_bucket_dir_entry); + } + + return ret; +} + +int DaosBucket::list_multiparts( + const DoutPrefixProvider* dpp, const string& prefix, string& marker, + const string& delim, const int& max_uploads, + vector>& uploads, + map* common_prefixes, bool* is_truncated) { + ldpp_dout(dpp, 20) << "DEBUG: list_multiparts" << dendl; + // End of uploading + if (max_uploads == 0) { + *is_truncated = false; + return 0; + } + + // Init needed structures + vector multipart_upload_infos(max_uploads); + uint32_t nmp = multipart_upload_infos.size();
+ vector> values(nmp, vector(DS3_MAX_ENCODED_LEN)); + for (uint32_t i = 0; i < nmp; i++) { + multipart_upload_infos[i].encoded = values[i].data(); + multipart_upload_infos[i].encoded_length = values[i].size(); + } + + vector cps(max_uploads); + uint32_t ncp = cps.size(); + + char daos_marker[DS3_MAX_KEY_BUFF]; + std::strncpy(daos_marker, marker.c_str(), sizeof(daos_marker)); + + int ret = ds3_bucket_list_multipart( + get_name().c_str(), &nmp, multipart_upload_infos.data(), &ncp, cps.data(), + prefix.c_str(), delim.c_str(), daos_marker, is_truncated, store->ds3); + + multipart_upload_infos.resize(nmp); + values.resize(nmp); + cps.resize(ncp); + + // Fill common prefixes + for (auto const& cp : cps) { + (*common_prefixes)[cp.prefix] = true; + } + + for (auto const& mp : multipart_upload_infos) { + // Decode the xattr + bufferlist bl; + rgw_bucket_dir_entry ent; + bl.append(reinterpret_cast(mp.encoded), mp.encoded_length); + auto iter = bl.cbegin(); + ent.decode(iter); + string name = ent.key.name; + + ACLOwner owner(rgw_user(ent.meta.owner)); + owner.set_name(ent.meta.owner_display_name); + uploads.push_back(this->get_multipart_upload( + name, mp.upload_id, std::move(owner), ent.meta.mtime)); + } + + // Sort uploads + std::sort(uploads.begin(), uploads.end(), compare_multipart_upload); + + return ret; +} + +int DaosBucket::abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +void DaosStore::finalize(void) { + ldout(cctx, 20) << "DEBUG: finalize" << dendl; + int ret; + + ret = ds3_disconnect(ds3, nullptr); + if (ret != 0) { + ldout(cctx, 0) << "ERROR: ds3_disconnect() failed: " << ret << dendl; + } + ds3 = nullptr; + + ret = ds3_fini(); + if (ret != 0) { + ldout(cctx, 0) << "ERROR: daos_fini() failed: " << ret << dendl; + } +} + +int DaosStore::initialize(CephContext* cct, const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << "DEBUG: initialize" << dendl; + int ret = ds3_init(); + + // DS3 init failed, 
allow the case where init is already done + if (ret != 0 && ret != DER_ALREADY) { + ldout(cct, 0) << "ERROR: ds3_init() failed: " << ret << dendl; + return ret; + } + + // XXX: these params should be taken from config settings and + // cct somehow? + const auto& daos_pool = cct->_conf.get_val("daos_pool"); + ldout(cct, 20) << "INFO: daos pool: " << daos_pool << dendl; + + ret = ds3_connect(daos_pool.c_str(), nullptr, &ds3, nullptr); + + if (ret != 0) { + ldout(cct, 0) << "ERROR: ds3_connect() failed: " << ret << dendl; + ds3_fini(); + } + + return ret; +} + +const std::string& DaosZoneGroup::get_endpoint() const { + if (!group.endpoints.empty()) { + return group.endpoints.front(); + } else { + // use zonegroup's master zone endpoints + auto z = group.zones.find(group.master_zone); + if (z != group.zones.end() && !z->second.endpoints.empty()) { + return z->second.endpoints.front(); + } + } + return empty; +} + +bool DaosZoneGroup::placement_target_exists(std::string& target) const { + return !!group.placement_targets.count(target); +} + +void DaosZoneGroup::get_placement_target_names( + std::set& names) const { + for (const auto& target : group.placement_targets) { + names.emplace(target.second.name); + } +} + +int DaosZoneGroup::get_placement_tier(const rgw_placement_rule& rule, + std::unique_ptr* tier) { + std::map::const_iterator titer; + titer = group.placement_targets.find(rule.name); + if (titer == group.placement_targets.end()) { + return -ENOENT; + } + + const auto& target_rule = titer->second; + std::map::const_iterator ttier; + ttier = target_rule.tier_targets.find(rule.storage_class); + if (ttier == target_rule.tier_targets.end()) { + // not found + return -ENOENT; + } + + PlacementTier* t; + t = new DaosPlacementTier(store, ttier->second); + if (!t) return -ENOMEM; + + tier->reset(t); + return 0; +} + +ZoneGroup& DaosZone::get_zonegroup() { return zonegroup; } + +int DaosZone::get_zonegroup(const std::string& id, + std::unique_ptr* group) { + /* XXX: for 
now only one zonegroup supported */ + ZoneGroup* zg; + zg = new DaosZoneGroup(store, zonegroup.get_group()); + + group->reset(zg); + return 0; +} + +const rgw_zone_id& DaosZone::get_id() { return cur_zone_id; } + +const std::string& DaosZone::get_name() const { + return zone_params->get_name(); +} + +bool DaosZone::is_writeable() { return true; } + +bool DaosZone::get_redirect_endpoint(std::string* endpoint) { return false; } + +bool DaosZone::has_zonegroup_api(const std::string& api) const { return false; } + +const std::string& DaosZone::get_current_period_id() { + return current_period->get_id(); +} + +std::unique_ptr DaosStore::get_lua_manager() { + return std::make_unique(this); +} + +int DaosObject::get_obj_state(const DoutPrefixProvider* dpp, + RGWObjState** _state, optional_yield y, + bool follow_olh) { + // Get object's metadata (those stored in rgw_bucket_dir_entry) + ldpp_dout(dpp, 20) << "DEBUG: get_obj_state" << dendl; + rgw_bucket_dir_entry ent; + *_state = &state; // state is required even if a failure occurs + + int ret = get_dir_entry_attrs(dpp, &ent); + if (ret != 0) { + return ret; + } + + // Set object state. 
+ state.exists = true; + state.size = ent.meta.size; + state.accounted_size = ent.meta.size; + state.mtime = ent.meta.mtime; + + state.has_attrs = true; + bufferlist etag_bl; + string& etag = ent.meta.etag; + ldpp_dout(dpp, 20) << __func__ << ": object's etag: " << ent.meta.etag + << dendl; + etag_bl.append(etag); + state.attrset[RGW_ATTR_ETAG] = etag_bl; + return 0; +} + +DaosObject::~DaosObject() { close(nullptr); } + +int DaosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, + Attrs* delattrs, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: DaosObject::set_obj_attrs()" << dendl; + // TODO handle target_obj + // Get object's metadata (those stored in rgw_bucket_dir_entry) + rgw_bucket_dir_entry ent; + int ret = get_dir_entry_attrs(dpp, &ent); + if (ret != 0) { + return ret; + } + + // Update object metadata + Attrs updateattrs = setattrs == nullptr ? attrs : *setattrs; + if (delattrs) { + for (auto const& [attr, attrval] : *delattrs) { + updateattrs.erase(attr); + } + } + + ret = set_dir_entry_attrs(dpp, &ent, &updateattrs); + return ret; +} + +int DaosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, + rgw_obj* target_obj) { + ldpp_dout(dpp, 20) << "DEBUG: DaosObject::get_obj_attrs()" << dendl; + // TODO handle target_obj + // Get object's metadata (those stored in rgw_bucket_dir_entry) + rgw_bucket_dir_entry ent; + int ret = get_dir_entry_attrs(dpp, &ent, &attrs); + return ret; +} + +int DaosObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, + optional_yield y, + const DoutPrefixProvider* dpp) { + // Get object's metadata (those stored in rgw_bucket_dir_entry) + ldpp_dout(dpp, 20) << "DEBUG: modify_obj_attrs" << dendl; + rgw_bucket_dir_entry ent; + int ret = get_dir_entry_attrs(dpp, &ent, &attrs); + if (ret != 0) { + return ret; + } + + // Update object attrs + set_atomic(); + attrs[attr_name] = attr_val; + + ret = set_dir_entry_attrs(dpp, &ent, &attrs); + return ret; +} + +int 
DaosObject::delete_obj_attrs(const DoutPrefixProvider* dpp, + const char* attr_name, optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: delete_obj_attrs" << dendl; + rgw_obj target = get_obj(); + Attrs rmattr; + bufferlist bl; + + rmattr[attr_name] = bl; + return set_obj_attrs(dpp, nullptr, &rmattr, y); +} + +bool DaosObject::is_expired() { + auto iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != attrs.end()) { + utime_t delete_at; + try { + auto bufit = iter->second.cbegin(); + decode(delete_at, bufit); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) + << "ERROR: " << __func__ + << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl; + return false; + } + + if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) { + return true; + } + } + + return false; +} + +// Taken from rgw_rados.cc +void DaosObject::gen_rand_obj_instance_name() { + enum { OBJ_INSTANCE_LEN = 32 }; + char buf[OBJ_INSTANCE_LEN + 1]; + + gen_rand_alphanumeric_no_underscore(store->ctx(), buf, OBJ_INSTANCE_LEN); + state.obj.key.set_instance(buf); +} + +int DaosObject::omap_get_vals(const DoutPrefixProvider* dpp, + const std::string& marker, uint64_t count, + std::map* m, bool* pmore, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::omap_get_all(const DoutPrefixProvider* dpp, + std::map* m, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::omap_get_vals_by_keys(const DoutPrefixProvider* dpp, + const std::string& oid, + const std::set& keys, + Attrs* vals) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::omap_set_val_by_key(const DoutPrefixProvider* dpp, + const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) { + return 0; +} + +std::unique_ptr DaosObject::get_serializer( + const DoutPrefixProvider* dpp, const std::string& lock_name) { + 
return std::make_unique(dpp, store, this, lock_name); +} + +int DaosObject::transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, uint64_t olh_epoch, + const DoutPrefixProvider* dpp, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::transition_to_cloud( + Bucket* bucket, rgw::sal::PlacementTier* tier, rgw_bucket_dir_entry& o, + std::set& cloud_targets, CephContext* cct, bool update_object, + const DoutPrefixProvider* dpp, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +bool DaosObject::placement_rules_match(rgw_placement_rule& r1, + rgw_placement_rule& r2) { + /* XXX: support single default zone and zonegroup for now */ + return true; +} + +int DaosObject::dump_obj_layout(const DoutPrefixProvider* dpp, optional_yield y, + Formatter* f) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +std::unique_ptr DaosObject::get_read_op() { + return std::make_unique(this); +} + +DaosObject::DaosReadOp::DaosReadOp(DaosObject* _source) : source(_source) {} + +int DaosObject::DaosReadOp::prepare(optional_yield y, + const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << __func__ + << ": bucket=" << source->get_bucket()->get_name() + << dendl; + + if (source->get_bucket()->versioned() && !source->have_instance()) { + // If the bucket is versioned and no version is specified, get the latest + // version + source->set_instance(DS3_LATEST_INSTANCE); + } + + rgw_bucket_dir_entry ent; + int ret = source->get_dir_entry_attrs(dpp, &ent); + if (ret != 0) { + // ent was not filled in; stop before its default-constructed key, size and + // etag are copied onto the source object below + return ret; + } + + // Set source object's attrs. The attrs is key/value map and is used + // in send_response_data() to set attributes, including etag.
+ bufferlist etag_bl; + string& etag = ent.meta.etag; + ldpp_dout(dpp, 20) << __func__ << ": object's etag: " << ent.meta.etag + << dendl; + etag_bl.append(etag.c_str(), etag.size()); + source->get_attrs().emplace(std::move(RGW_ATTR_ETAG), std::move(etag_bl)); + + source->set_key(ent.key); + source->set_obj_size(ent.meta.size); + ldpp_dout(dpp, 20) << __func__ << ": object's size: " << ent.meta.size + << dendl; + + return ret; +} + +int DaosObject::DaosReadOp::read(int64_t off, int64_t end, bufferlist& bl, + optional_yield y, + const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << __func__ << ": off=" << off << " end=" << end << dendl; + int ret = source->lookup(dpp); + if (ret != 0) { + return ret; + } + + // Calculate size, end is inclusive + uint64_t size = end - off + 1; + + // Read + ret = source->read(dpp, bl, off, size); + if (ret != 0) { + return ret; + } + + return ret; +} + +// RGWGetObj::execute() calls ReadOp::iterate() to read object from 'off' to +// 'end'. The returned data is processed in 'cb' which is a chain of +// post-processing filters such as decompression, de-encryption and sending back +// data to client (RGWGetObj_CB::handle_dta which in turn calls +// RGWGetObj::get_data_cb() to send data back.). +// +// POC implements a simple sync version of iterate() function in which it reads +// a block of data each time and call 'cb' for post-processing. +int DaosObject::DaosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t off, + int64_t end, RGWGetDataCB* cb, + optional_yield y) { + ldpp_dout(dpp, 20) << __func__ << ": off=" << off << " end=" << end << dendl; + int ret = source->lookup(dpp); + if (ret != 0) { + return ret; + } + + // Calculate size, end is inclusive + uint64_t size = end - off + 1; + + // Reserve buffers and read + bufferlist bl; + ret = source->read(dpp, bl, off, size); + if (ret != 0) { + return ret; + } + + // Call cb to process returned data. 
+ ldpp_dout(dpp, 20) << __func__ << ": call cb to process data, actual=" << size + << dendl; + // bl holds only the bytes read above, so the data starts at offset 0 of the + // buffer (not at the object offset); also report the callback's status + // instead of dropping it + return cb->handle_data(bl, 0, size); +} + +int DaosObject::DaosReadOp::get_attr(const DoutPrefixProvider* dpp, + const char* name, bufferlist& dest, + optional_yield y) { + Attrs attrs; + int ret = source->get_dir_entry_attrs(dpp, nullptr, &attrs); + if (ret != 0) { + // the attrs could not be loaded, so there is nothing to search + return -ENODATA; + } + + auto search = attrs.find(name); + if (search == attrs.end()) { + return -ENODATA; + } + + dest = search->second; + return 0; +} + +std::unique_ptr DaosObject::get_delete_op() { + return std::make_unique(this); +} + +DaosObject::DaosDeleteOp::DaosDeleteOp(DaosObject* _source) : source(_source) {} + +// Implementation of DELETE OBJ also requires DaosObject::get_obj_state() +// to retrieve and set object's state from object's metadata. +// +// TODO: +// 1. The POC only deletes the Daos objects. It doesn't handle the +// DeleteOp::params. Delete::delete_obj() in rgw_rados.cc shows how rados +// backend process the params. +// 2. Delete an object when its versioning is turned on. +// 3. Handle empty directories +// 4.
Fail when file doesn't exist +int DaosObject::DaosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, + optional_yield y) { + ldpp_dout(dpp, 20) << "DaosDeleteOp::delete_obj " + << source->get_key().get_oid() << " from " + << source->get_bucket()->get_name() << dendl; + if (source->get_instance() == "null") { + source->clear_instance(); + } + + // Open bucket + int ret = 0; + std::string key = source->get_key().get_oid(); + DaosBucket* daos_bucket = source->get_daos_bucket(); + ret = daos_bucket->open(dpp); + if (ret != 0) { + return ret; + } + + // Remove the daos object + ret = ds3_obj_destroy(key.c_str(), daos_bucket->ds3b); + ldpp_dout(dpp, 20) << "DEBUG: ds3_obj_destroy key=" << key << " ret=" << ret + << dendl; + + // result.delete_marker = parent_op.result.delete_marker; + // result.version_id = parent_op.result.version_id; + + return ret; +} + +int DaosObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, + bool prevent_versioning) { + ldpp_dout(dpp, 20) << "DEBUG: delete_object" << dendl; + DaosObject::DaosDeleteOp del_op(this); + del_op.params.bucket_owner = bucket->get_info().owner; + del_op.params.versioning_status = bucket->get_info().versioning_status(); + + return del_op.delete_obj(dpp, y); +} + +int DaosObject::delete_obj_aio(const DoutPrefixProvider* dpp, + RGWObjState* astate, Completions* aio, + bool keep_index_consistent, optional_yield y) { + /* XXX: Make it async */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::copy_object( + User* user, req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t 
olh_epoch, + boost::optional delete_at, std::string* version_id, + std::string* tag, std::string* etag, void (*progress_cb)(off_t, void*), + void* progress_data, const DoutPrefixProvider* dpp, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosObject::lookup(const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << "DEBUG: lookup" << dendl; + if (is_open()) { + return 0; + } + + if (get_instance() == "null") { + clear_instance(); + } + + int ret = 0; + DaosBucket* daos_bucket = get_daos_bucket(); + ret = daos_bucket->open(dpp); + if (ret != 0) { + return ret; + } + + ret = ds3_obj_open(get_key().get_oid().c_str(), &ds3o, daos_bucket->ds3b); + + if (ret == -ENOENT) { + ldpp_dout(dpp, 20) << "DEBUG: daos object (" << get_bucket()->get_name() + << ", " << get_key().get_oid() + << ") does not exist: ret=" << ret << dendl; + } else if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to open daos object (" + << get_bucket()->get_name() << ", " << get_key().get_oid() + << "): ret=" << ret << dendl; + } + return ret; +} + +int DaosObject::create(const DoutPrefixProvider* dpp) { + ldpp_dout(dpp, 20) << "DEBUG: create" << dendl; + if (is_open()) { + return 0; + } + + if (get_instance() == "null") { + clear_instance(); + } + + int ret = 0; + DaosBucket* daos_bucket = get_daos_bucket(); + ret = daos_bucket->open(dpp); + if (ret != 0) { + return ret; + } + + ret = ds3_obj_create(get_key().get_oid().c_str(), &ds3o, daos_bucket->ds3b); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to create daos object (" + << get_bucket()->get_name() << ", " << get_key().get_oid() + << "): ret=" << ret << dendl; + } + return ret; +} + +int DaosObject::close(const DoutPrefixProvider* dpp) { 
+ ldpp_dout(dpp, 20) << "DEBUG: close" << dendl; + if (!is_open()) { + return 0; + } + + int ret = ds3_obj_close(ds3o); + ds3o = nullptr; + ldpp_dout(dpp, 20) << "DEBUG: ds3_obj_close ret=" << ret << dendl; + return ret; +} + +int DaosObject::write(const DoutPrefixProvider* dpp, bufferlist&& data, + uint64_t offset) { + ldpp_dout(dpp, 20) << "DEBUG: write" << dendl; + uint64_t size = data.length(); + int ret = ds3_obj_write(data.c_str(), offset, &size, get_daos_bucket()->ds3b, + ds3o, nullptr); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to write into daos object (" + << get_bucket()->get_name() << ", " << get_key().get_oid() + << "): ret=" << ret << dendl; + } + return ret; +} + +int DaosObject::read(const DoutPrefixProvider* dpp, bufferlist& data, + uint64_t offset, uint64_t& size) { + ldpp_dout(dpp, 20) << "DEBUG: read" << dendl; + int ret = ds3_obj_read(data.append_hole(size).c_str(), offset, &size, + get_daos_bucket()->ds3b, ds3o, nullptr); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from daos object (" + << get_bucket()->get_name() << ", " << get_key().get_oid() + << "): ret=" << ret << dendl; + } + return ret; +} + +// Get the object's dirent and attrs +int DaosObject::get_dir_entry_attrs(const DoutPrefixProvider* dpp, + rgw_bucket_dir_entry* ent, + Attrs* getattrs) { + ldpp_dout(dpp, 20) << "DEBUG: get_dir_entry_attrs" << dendl; + int ret = 0; + vector value(DS3_MAX_ENCODED_LEN); + uint32_t size = value.size(); + + if (get_key().ns == RGW_OBJ_NS_MULTIPART) { + struct ds3_multipart_upload_info ui = {.encoded = value.data(), + .encoded_length = size}; + ret = ds3_upload_get_info(&ui, bucket->get_name().c_str(), + get_key().get_oid().c_str(), store->ds3); + } else { + ret = lookup(dpp); + if (ret != 0) { + return ret; + } + + auto object_info = std::make_unique(); + object_info->encoded = value.data(); + object_info->encoded_length = size; + ret = ds3_obj_get_info(object_info.get(), get_daos_bucket()->ds3b, ds3o); + size = 
object_info->encoded_length; + } + + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get info of daos object (" + << get_bucket()->get_name() << ", " << get_key().get_oid() + << "): ret=" << ret << dendl; + return ret; + } + + rgw_bucket_dir_entry dummy_ent; + if (!ent) { + // if ent is not passed, use a dummy ent + ent = &dummy_ent; + } + + bufferlist bl; + bl.append(reinterpret_cast(value.data()), size); + auto iter = bl.cbegin(); + ent->decode(iter); + if (getattrs) { + decode(*getattrs, iter); + } + + return ret; +} +// Set the object's dirent and attrs +int DaosObject::set_dir_entry_attrs(const DoutPrefixProvider* dpp, + rgw_bucket_dir_entry* ent, + Attrs* setattrs) { + ldpp_dout(dpp, 20) << "DEBUG: set_dir_entry_attrs" << dendl; + int ret = lookup(dpp); + if (ret != 0) { + return ret; + } + + // Set defaults + if (!ent) { + // if ent is not passed, return an error + return -EINVAL; + } + + if (!setattrs) { + // if setattrs is not passed, use object attrs + setattrs = &attrs; + } + + bufferlist wbl; + ent->encode(wbl); + encode(*setattrs, wbl); + + // Write rgw_bucket_dir_entry into object xattr + auto object_info = std::make_unique(); + object_info->encoded = wbl.c_str(); + object_info->encoded_length = wbl.length(); + ret = ds3_obj_set_info(object_info.get(), get_daos_bucket()->ds3b, ds3o); + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to set info of daos object (" + << get_bucket()->get_name() << ", " << get_key().get_oid() + << "): ret=" << ret << dendl; + } + return ret; +} + +int DaosObject::mark_as_latest(const DoutPrefixProvider* dpp, + ceph::real_time set_mtime) { + // TODO handle deletion + // TODO understand race conditions + ldpp_dout(dpp, 20) << "DEBUG: mark_as_latest" << dendl; + + // Get latest version so far + std::unique_ptr latest_object = std::make_unique( + store, rgw_obj_key(get_name(), DS3_LATEST_INSTANCE), get_bucket()); + + ldpp_dout(dpp, 20) << __func__ << ": key=" << get_key().get_oid() + << " latest_object_key= " + 
<< latest_object->get_key().get_oid() << dendl; + + int ret = latest_object->lookup(dpp); + if (ret == 0) { + // Get metadata only if file exists + rgw_bucket_dir_entry latest_ent; + Attrs latest_attrs; + ret = latest_object->get_dir_entry_attrs(dpp, &latest_ent, &latest_attrs); + if (ret != 0) { + return ret; + } + + // Update flags + latest_ent.flags = rgw_bucket_dir_entry::FLAG_VER; + latest_ent.meta.mtime = set_mtime; + ret = latest_object->set_dir_entry_attrs(dpp, &latest_ent, &latest_attrs); + if (ret != 0) { + return ret; + } + } + + // Get or create the link [latest], make it link to the current latest + // version. + ret = + ds3_obj_mark_latest(get_key().get_oid().c_str(), get_daos_bucket()->ds3b); + ldpp_dout(dpp, 20) << "DEBUG: ds3_obj_mark_latest ret=" << ret << dendl; + return ret; +} + +DaosAtomicWriter::DaosAtomicWriter( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, DaosStore* _store, + const rgw_user& _owner, const rgw_placement_rule* _ptail_placement_rule, + uint64_t _olh_epoch, const std::string& _unique_tag) + : StoreWriter(dpp, y), + store(_store), + owner(_owner), + ptail_placement_rule(_ptail_placement_rule), + olh_epoch(_olh_epoch), + unique_tag(_unique_tag), + obj(_store, obj->get_key(), obj->get_bucket()) {} + +int DaosAtomicWriter::prepare(optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: prepare" << dendl; + int ret = obj.create(dpp); + return ret; +} + +// TODO: Handle concurrent writes, a unique object id is a possible solution, or +// use DAOS transactions +// XXX: Do we need to accumulate writes as motr does? 
+int DaosAtomicWriter::process(bufferlist&& data, uint64_t offset) { + ldpp_dout(dpp, 20) << "DEBUG: process" << dendl; + if (data.length() == 0) { + return 0; + } + + int ret = 0; + if (!obj.is_open()) { + ret = obj.lookup(dpp); + if (ret != 0) { + return ret; + } + } + + // XXX: Combine multiple streams into one as motr does + uint64_t data_size = data.length(); + ret = obj.write(dpp, std::move(data), offset); + if (ret == 0) { + total_data_size += data_size; + } + return ret; +} + +int DaosAtomicWriter::complete( + size_t accounted_size, const std::string& etag, ceph::real_time* mtime, + ceph::real_time set_mtime, std::map& attrs, + ceph::real_time delete_at, const char* if_match, const char* if_nomatch, + const std::string* user_data, rgw_zone_set* zones_trace, bool* canceled, + optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: complete" << dendl; + bufferlist bl; + rgw_bucket_dir_entry ent; + int ret; + + // Set rgw_bucet_dir_entry. Some of the members of this structure may not + // apply to daos. + // + // Checkout AtomicObjectProcessor::complete() in rgw_putobj_processor.cc + // and RGWRados::Object::Write::write_meta() in rgw_rados.cc for what and + // how to set the dir entry. Only set the basic ones for POC, no ACLs and + // other attrs. + obj.get_key().get_index_key(&ent.key); + ent.meta.size = total_data_size; + ent.meta.accounted_size = accounted_size; + ent.meta.mtime = + real_clock::is_zero(set_mtime) ? 
ceph::real_clock::now() : set_mtime; + ent.meta.etag = etag; + ent.meta.owner = owner.to_str(); + ent.meta.owner_display_name = + obj.get_bucket()->get_owner()->get_display_name(); + bool is_versioned = obj.get_bucket()->versioned(); + if (is_versioned) + ent.flags = + rgw_bucket_dir_entry::FLAG_VER | rgw_bucket_dir_entry::FLAG_CURRENT; + ldpp_dout(dpp, 20) << __func__ << ": key=" << obj.get_key().get_oid() + << " etag: " << etag << dendl; + if (user_data) ent.meta.user_data = *user_data; + + RGWBucketInfo& info = obj.get_bucket()->get_info(); + if (info.obj_lock_enabled() && info.obj_lock.has_rule()) { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter == attrs.end()) { + real_time lock_until_date = + info.obj_lock.get_lock_until_date(ent.meta.mtime); + string mode = info.obj_lock.get_mode(); + RGWObjectRetention obj_retention(mode, lock_until_date); + bufferlist retention_bl; + obj_retention.encode(retention_bl); + attrs[RGW_ATTR_OBJECT_RETENTION] = retention_bl; + } + } + + ret = obj.set_dir_entry_attrs(dpp, &ent, &attrs); + if (ret != 0) { + // Report the metadata write failure now; otherwise mark_as_latest() below + // would overwrite ret on versioned buckets + return ret; + } + + if (is_versioned) { + ret = obj.mark_as_latest(dpp, set_mtime); + if (ret != 0) { + return ret; + } + } + + return ret; +} + +int DaosMultipartUpload::abort(const DoutPrefixProvider* dpp, + CephContext* cct) { + // Remove upload from bucket multipart index + ldpp_dout(dpp, 20) << "DEBUG: abort" << dendl; + return ds3_upload_remove(bucket->get_name().c_str(), get_upload_id().c_str(), + store->ds3); +} + +std::unique_ptr DaosMultipartUpload::get_meta_obj() { + return bucket->get_object( + rgw_obj_key(get_upload_id(), string(), RGW_OBJ_NS_MULTIPART)); +} + +int DaosMultipartUpload::init(const DoutPrefixProvider* dpp, optional_yield y, + ACLOwner& _owner, + rgw_placement_rule& dest_placement, + rgw::sal::Attrs& attrs) { + ldpp_dout(dpp, 20) << "DEBUG: init" << dendl; + int ret; + std::string oid = mp_obj.get_key(); + + // Create an initial entry in the bucket.
The entry will be + // updated when multipart upload is completed, for example, + // size, etag etc. + bufferlist bl; + rgw_bucket_dir_entry ent; + ent.key.name = oid; + ent.meta.owner = owner.get_id().to_str(); + ent.meta.category = RGWObjCategory::MultiMeta; + ent.meta.mtime = ceph::real_clock::now(); + + multipart_upload_info upload_info; + upload_info.dest_placement = dest_placement; + + ent.encode(bl); + encode(attrs, bl); + encode(upload_info, bl); + + struct ds3_multipart_upload_info ui; + std::strcpy(ui.upload_id, MULTIPART_UPLOAD_ID_PREFIX); + std::strncpy(ui.key, oid.c_str(), sizeof(ui.key)); + ui.encoded = bl.c_str(); + ui.encoded_length = bl.length(); + int prefix_length = strlen(ui.upload_id); + + do { + gen_rand_alphanumeric(store->ctx(), ui.upload_id + prefix_length, + sizeof(ui.upload_id) - 1 - prefix_length); + mp_obj.init(oid, ui.upload_id); + ret = ds3_upload_init(&ui, bucket->get_name().c_str(), store->ds3); + } while (ret == -EEXIST); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to create multipart upload dir (" + << bucket->get_name() << "/" << get_upload_id() + << "): ret=" << ret << dendl; + } + return ret; +} + +int DaosMultipartUpload::list_parts(const DoutPrefixProvider* dpp, + CephContext* cct, int num_parts, int marker, + int* next_marker, bool* truncated, + bool assume_unsorted) { + ldpp_dout(dpp, 20) << "DEBUG: list_parts" << dendl; + // Init needed structures + vector multipart_part_infos(num_parts); + uint32_t npart = multipart_part_infos.size(); + vector> values(npart, vector(DS3_MAX_ENCODED_LEN)); + for (uint32_t i = 0; i < npart; i++) { + multipart_part_infos[i].encoded = values[i].data(); + multipart_part_infos[i].encoded_length = values[i].size(); + } + + uint32_t daos_marker = marker; + int ret = ds3_upload_list_parts( + bucket->get_name().c_str(), get_upload_id().c_str(), &npart, + multipart_part_infos.data(), &daos_marker, truncated, store->ds3); + + if (ret != 0) { + if (ret == -ENOENT) { + ret = 
-ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + multipart_part_infos.resize(npart); + values.resize(npart); + parts.clear(); + + for (auto const& pi : multipart_part_infos) { + bufferlist bl; + bl.append(reinterpret_cast(pi.encoded), pi.encoded_length); + + std::unique_ptr part = + std::make_unique(); + auto iter = bl.cbegin(); + decode(part->info, iter); + parts[pi.part_num] = std::move(part); + } + + if (next_marker) { + *next_marker = daos_marker; + } + return ret; +} + +// Heavily copied from rgw_sal_rados.cc +int DaosMultipartUpload::complete( + const DoutPrefixProvider* dpp, optional_yield y, CephContext* cct, + map& part_etags, list& remove_objs, + uint64_t& accounted_size, bool& compressed, RGWCompressionInfo& cs_info, + off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, + rgw::sal::Object* target_obj) { + ldpp_dout(dpp, 20) << "DEBUG: complete" << dendl; + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + std::string etag; + bufferlist etag_bl; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + bool truncated; + int ret; + + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): enter" << dendl; + int total_parts = 0; + int handled_parts = 0; + int max_parts = 1000; + int marker = 0; + uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size; + auto etags_iter = part_etags.begin(); + rgw::sal::Attrs attrs = target_obj->get_attrs(); + + do { + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): list_parts()" + << dendl; + ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + if (ret != 0) return ret; + + total_parts += parts.size(); + if (!truncated && total_parts != (int)part_etags.size()) { + ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts + << " expected: " << part_etags.size() << dendl; + 
ret = -ERR_INVALID_PART; + return ret; + } + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): parts.size()=" + << parts.size() << dendl; + + for (auto obj_iter = parts.begin(); + etags_iter != part_etags.end() && obj_iter != parts.end(); + ++etags_iter, ++obj_iter, ++handled_parts) { + DaosMultipartPart* part = + dynamic_cast(obj_iter->second.get()); + uint64_t part_size = part->get_size(); + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): part_size=" + << part_size << dendl; + if (handled_parts < (int)part_etags.size() - 1 && + part_size < min_part_size) { + ret = -ERR_TOO_SMALL; + return ret; + } + + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + if (etags_iter->first != (int)obj_iter->first) { + ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: " + << etags_iter->first + << " next uploaded: " << obj_iter->first << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + string part_etag = rgw_string_unquote(etags_iter->second); + if (part_etag.compare(part->get_etag()) != 0) { + ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " + << etags_iter->first + << " etag: " << etags_iter->second << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + + hex_to_buf(part->get_etag().c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char*)petag, sizeof(petag)); + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): calc etag " + << dendl; + + RGWUploadPartInfo& obj_part = part->info; + string oid = mp_obj.get_part(obj_part.num); + rgw_obj src_obj; + src_obj.init_ns(bucket->get_key(), oid, RGW_OBJ_NS_MULTIPART); + + bool part_compressed = (obj_part.cs_info.compression_type != "none"); + if ((handled_parts > 0) && + ((part_compressed != compressed) || + (cs_info.compression_type != obj_part.cs_info.compression_type))) { + ldpp_dout(dpp, 0) + << "ERROR: compression type was changed during multipart upload (" + << cs_info.compression_type << ">>" + << obj_part.cs_info.compression_type << ")" << dendl; + ret = 
-ERR_INVALID_PART; + return ret; + } + + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): part compression" + << dendl; + if (part_compressed) { + int64_t new_ofs; // offset in compression data for new part + if (cs_info.blocks.size() > 0) + new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len; + else + new_ofs = 0; + for (const auto& block : obj_part.cs_info.blocks) { + compression_block cb; + cb.old_ofs = block.old_ofs + cs_info.orig_size; + cb.new_ofs = new_ofs; + cb.len = block.len; + cs_info.blocks.push_back(cb); + new_ofs = cb.new_ofs + cb.len; + } + if (!compressed) + cs_info.compression_type = obj_part.cs_info.compression_type; + cs_info.orig_size += obj_part.cs_info.orig_size; + compressed = true; + } + + // We may not need to do the following as remove_objs are those + // don't show when listing a bucket. As we store in-progress uploaded + // object's metadata in a separate index, they are not shown when + // listing a bucket. + rgw_obj_index_key remove_key; + src_obj.key.get_index_key(&remove_key); + + remove_objs.push_back(remove_key); + + off += obj_part.size; + accounted_size += obj_part.accounted_size; + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): off=" << off + << ", accounted_size = " << accounted_size << dendl; + } + } while (truncated); + hash.Final((unsigned char*)final_etag); + + buf_to_hex((unsigned char*)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], + sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, "-%lld", + (long long)part_etags.size()); + etag = final_etag_str; + ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl; + + etag_bl.append(etag); + + attrs[RGW_ATTR_ETAG] = etag_bl; + + if (compressed) { + // write compression attribute to full object + bufferlist tmp; + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + } + + // Different from rgw_sal_rados.cc starts here + // Read the object's multipart info + bufferlist 
bl; + uint64_t size = DS3_MAX_ENCODED_LEN; + struct ds3_multipart_upload_info ui = { + .encoded = bl.append_hole(size).c_str(), .encoded_length = size}; + ret = ds3_upload_get_info(&ui, bucket->get_name().c_str(), + get_upload_id().c_str(), store->ds3); + ldpp_dout(dpp, 20) << "DEBUG: ds3_upload_get_info entry=" + << bucket->get_name() << "/" << get_upload_id() << dendl; + if (ret != 0) { + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + rgw_bucket_dir_entry ent; + auto iter = bl.cbegin(); + ent.decode(iter); + + // Update entry data and name + target_obj->get_key().get_index_key(&ent.key); + ent.meta.size = off; + ent.meta.accounted_size = accounted_size; + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): obj size=" + << ent.meta.size + << " obj accounted size=" << ent.meta.accounted_size + << dendl; + ent.meta.category = RGWObjCategory::Main; + ent.meta.mtime = ceph::real_clock::now(); + bool is_versioned = target_obj->get_bucket()->versioned(); + if (is_versioned) + ent.flags = + rgw_bucket_dir_entry::FLAG_VER | rgw_bucket_dir_entry::FLAG_CURRENT; + ent.meta.etag = etag; + + // Open object + DaosObject* obj = static_cast(target_obj); + ret = obj->create(dpp); + if (ret != 0) { + return ret; + } + + // Copy data from parts to object + uint64_t write_off = 0; + for (auto const& [part_num, part] : get_parts()) { + ds3_part_t* ds3p; + ret = ds3_part_open(get_bucket_name().c_str(), get_upload_id().c_str(), + part_num, false, &ds3p, store->ds3); + if (ret != 0) { + return ret; + } + + // Reserve buffers and read + uint64_t size = part->get_size(); + bufferlist bl; + ret = ds3_part_read(bl.append_hole(size).c_str(), 0, &size, ds3p, + store->ds3, nullptr); + if (ret != 0) { + ds3_part_close(ds3p); + return ret; + } + + ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): part " << part_num + << " size is " << size << dendl; + + // write to obj + obj->write(dpp, std::move(bl), write_off); + ds3_part_close(ds3p); + write_off += 
part->get_size(); + } + + // Set attributes + ret = obj->set_dir_entry_attrs(dpp, &ent, &attrs); + + if (is_versioned) { + ret = obj->mark_as_latest(dpp, ent.meta.mtime); + if (ret != 0) { + return ret; + } + } + + // Remove upload from bucket multipart index + ret = ds3_upload_remove(get_bucket_name().c_str(), get_upload_id().c_str(), + store->ds3); + return ret; +} + +int DaosMultipartUpload::get_info(const DoutPrefixProvider* dpp, + optional_yield y, rgw_placement_rule** rule, + rgw::sal::Attrs* attrs) { + ldpp_dout(dpp, 20) << "DaosMultipartUpload::get_info(): enter" << dendl; + if (!rule && !attrs) { + return 0; + } + + if (rule) { + if (!placement.empty()) { + *rule = &placement; + if (!attrs) { + // Don't need attrs, done + return 0; + } + } else { + *rule = nullptr; + } + } + + // Read the multipart upload dirent from index + bufferlist bl; + uint64_t size = DS3_MAX_ENCODED_LEN; + struct ds3_multipart_upload_info ui = { + .encoded = bl.append_hole(size).c_str(), .encoded_length = size}; + int ret = ds3_upload_get_info(&ui, bucket->get_name().c_str(), + get_upload_id().c_str(), store->ds3); + + if (ret != 0) { + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + multipart_upload_info upload_info; + rgw_bucket_dir_entry ent; + Attrs decoded_attrs; + auto iter = bl.cbegin(); + ent.decode(iter); + decode(decoded_attrs, iter); + ldpp_dout(dpp, 20) << "DEBUG: decoded_attrs=" << attrs << dendl; + + if (attrs) { + *attrs = decoded_attrs; + if (!rule || *rule != nullptr) { + // placement was cached; don't actually read + return 0; + } + } + + // Now decode the placement rule + decode(upload_info, iter); + placement = upload_info.dest_placement; + *rule = &placement; + + return 0; +} + +std::unique_ptr DaosMultipartUpload::get_writer( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, uint64_t part_num, + const std::string& part_num_str) { + 
ldpp_dout(dpp, 20) << "DaosMultipartUpload::get_writer(): enter part=" + << part_num << " head_obj=" << _head_obj << dendl; + return std::make_unique( + dpp, y, this, obj, store, owner, ptail_placement_rule, + part_num, part_num_str); +} + +DaosMultipartWriter::~DaosMultipartWriter() { + if (is_open()) ds3_part_close(ds3p); +} + +int DaosMultipartWriter::prepare(optional_yield y) { + ldpp_dout(dpp, 20) << "DaosMultipartWriter::prepare(): enter part=" + << part_num_str << dendl; + int ret = ds3_part_open(get_bucket_name().c_str(), upload_id.c_str(), + part_num, true, &ds3p, store->ds3); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + return ret; +} + +const std::string& DaosMultipartWriter::get_bucket_name() { + return static_cast(upload)->get_bucket_name(); +} + +int DaosMultipartWriter::process(bufferlist&& data, uint64_t offset) { + ldpp_dout(dpp, 20) << "DaosMultipartWriter::process(): enter part=" + << part_num_str << " offset=" << offset << dendl; + if (data.length() == 0) { + return 0; + } + + uint64_t size = data.length(); + int ret = + ds3_part_write(data.c_str(), offset, &size, ds3p, store->ds3, nullptr); + if (ret == 0) { + // XXX: Combine multiple streams into one as motr does + actual_part_size += size; + } else { + ldpp_dout(dpp, 0) << "ERROR: failed to write into part (" + << get_bucket_name() << ", " << upload_id << ", " + << part_num << "): ret=" << ret << dendl; + } + return ret; +} + +int DaosMultipartWriter::complete( + size_t accounted_size, const std::string& etag, ceph::real_time* mtime, + ceph::real_time set_mtime, std::map& attrs, + ceph::real_time delete_at, const char* if_match, const char* if_nomatch, + const std::string* user_data, rgw_zone_set* zones_trace, bool* canceled, + optional_yield y) { + ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): enter part=" + << part_num_str << dendl; + + // Add an entry into part index + bufferlist bl; + RGWUploadPartInfo info; + info.num = part_num; + info.etag = etag; + info.size = 
actual_part_size; + info.accounted_size = accounted_size; + info.modified = real_clock::now(); + + bool compressed; + int ret = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); + ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): compression ret=" + << ret << dendl; + if (ret != 0) { + ldpp_dout(dpp, 1) << "cannot get compression info" << dendl; + return ret; + } + encode(info, bl); + encode(attrs, bl); + ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): entry size" + << bl.length() << dendl; + + struct ds3_multipart_part_info part_info = {.part_num = part_num, + .encoded = bl.c_str(), + .encoded_length = bl.length()}; + + ret = ds3_part_set_info(&part_info, ds3p, store->ds3, nullptr); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to set part info (" << get_bucket_name() + << ", " << upload_id << ", " << part_num + << "): ret=" << ret << dendl; + if (ret == ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + } + + return ret; +} + +std::unique_ptr DaosStore::get_role( + std::string name, std::string tenant, std::string path, + std::string trust_policy, std::string max_session_duration_str, + std::multimap tags) { + RGWRole* p = nullptr; + return std::unique_ptr(p); +} + +std::unique_ptr DaosStore::get_role(const RGWRoleInfo& info) { + RGWRole* p = nullptr; + return std::unique_ptr(p); +} + +std::unique_ptr DaosStore::get_role(std::string id) { + RGWRole* p = nullptr; + return std::unique_ptr(p); +} + +int DaosStore::get_roles(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + vector>& roles) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +std::unique_ptr DaosStore::get_oidc_provider() { + RGWOIDCProvider* p = nullptr; + return std::unique_ptr(p); +} + +int DaosStore::get_oidc_providers( + const DoutPrefixProvider* dpp, const std::string& tenant, + vector>& providers) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +std::unique_ptr DaosBucket::get_multipart_upload( + 
const std::string& oid, std::optional upload_id, + ACLOwner owner, ceph::real_time mtime) { + return std::make_unique(store, this, oid, upload_id, + owner, mtime); +} + +std::unique_ptr DaosStore::get_append_writer( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, + const std::string& unique_tag, uint64_t position, + uint64_t* cur_accounted_size) { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return nullptr; +} + +std::unique_ptr DaosStore::get_atomic_writer( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, uint64_t olh_epoch, + const std::string& unique_tag) { + ldpp_dout(dpp, 20) << "get_atomic_writer" << dendl; + return std::make_unique(dpp, y, obj, this, + owner, ptail_placement_rule, + olh_epoch, unique_tag); +} + +const std::string& DaosStore::get_compression_type( + const rgw_placement_rule& rule) { + return zone.zone_params->get_compression_type(rule); +} + +bool DaosStore::valid_placement(const rgw_placement_rule& rule) { + return zone.zone_params->valid_placement(rule); +} + +std::unique_ptr DaosStore::get_user(const rgw_user& u) { + ldout(cctx, 20) << "DEBUG: bucket's user: " << u.to_str() << dendl; + return std::make_unique(this, u); +} + +int DaosStore::get_user_by_access_key(const DoutPrefixProvider* dpp, + const std::string& key, optional_yield y, + std::unique_ptr* user) { + // Initialize ds3_user_info + bufferlist bl; + uint64_t size = DS3_MAX_ENCODED_LEN; + struct ds3_user_info user_info = {.encoded = bl.append_hole(size).c_str(), + .encoded_length = size}; + + int ret = ds3_user_get_by_key(key.c_str(), &user_info, ds3, nullptr); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "Error: ds3_user_get_by_key failed, key=" << key + << " ret=" << ret << dendl; + return ret; + } + + // Decode + DaosUserInfo duinfo; + bufferlist& blr = bl; + auto iter = blr.cbegin(); + 
duinfo.decode(iter); + + User* u = new DaosUser(this, duinfo.info); + if (!u) { + return -ENOMEM; + } + + user->reset(u); + return 0; +} + +int DaosStore::get_user_by_email(const DoutPrefixProvider* dpp, + const std::string& email, optional_yield y, + std::unique_ptr* user) { + // Initialize ds3_user_info + bufferlist bl; + uint64_t size = DS3_MAX_ENCODED_LEN; + struct ds3_user_info user_info = {.encoded = bl.append_hole(size).c_str(), + .encoded_length = size}; + + int ret = ds3_user_get_by_email(email.c_str(), &user_info, ds3, nullptr); + + if (ret != 0) { + ldpp_dout(dpp, 0) << "Error: ds3_user_get_by_email failed, email=" << email + << " ret=" << ret << dendl; + return ret; + } + + // Decode + DaosUserInfo duinfo; + bufferlist& blr = bl; + auto iter = blr.cbegin(); + duinfo.decode(iter); + + User* u = new DaosUser(this, duinfo.info); + if (!u) { + return -ENOMEM; + } + + user->reset(u); + return 0; +} + +int DaosStore::get_user_by_swift(const DoutPrefixProvider* dpp, + const std::string& user_str, optional_yield y, + std::unique_ptr* user) { + /* Swift keys and subusers are not supported for now */ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +std::unique_ptr DaosStore::get_object(const rgw_obj_key& k) { + return std::make_unique(this, k); +} + +inline std::ostream& operator<<(std::ostream& out, const rgw_user* u) { + std::string s; + if (u != nullptr) + u->to_str(s); + else + s = "(nullptr)"; + return out << s; +} + +int DaosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, + const rgw_bucket& b, std::unique_ptr* bucket, + optional_yield y) { + ldpp_dout(dpp, 20) << "DEBUG: get_bucket1: User: " << u << dendl; + int ret; + Bucket* bp; + + bp = new DaosBucket(this, b, u); + ret = bp->load_bucket(dpp, y); + if (ret != 0) { + delete bp; + return ret; + } + + bucket->reset(bp); + return 0; +} + +int DaosStore::get_bucket(User* u, const RGWBucketInfo& i, + std::unique_ptr* bucket) { + DaosBucket* bp; + + bp = new DaosBucket(this, i, u); + /* Don't need to 
fetch the bucket info, use the provided one */ + + bucket->reset(bp); + return 0; +} + +int DaosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, + const std::string& tenant, const std::string& name, + std::unique_ptr* bucket, optional_yield y) { + ldpp_dout(dpp, 20) << "get_bucket" << dendl; + rgw_bucket b; + + b.tenant = tenant; + b.name = name; + + return get_bucket(dpp, u, b, bucket, y); +} + +bool DaosStore::is_meta_master() { return true; } + +int DaosStore::forward_request_to_master(const DoutPrefixProvider* dpp, + User* user, obj_version* objv, + bufferlist& in_data, JSONParser* jp, + req_info& info, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosStore::forward_iam_request_to_master(const DoutPrefixProvider* dpp, + const RGWAccessKey& key, + obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, + req_info& info, optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +std::string DaosStore::zone_unique_id(uint64_t unique_num) { return ""; } + +std::string DaosStore::zone_unique_trans_id(const uint64_t unique_num) { + return ""; +} + +int DaosStore::cluster_stat(RGWClusterStat& stats) { + return DAOS_NOT_IMPLEMENTED_LOG(nullptr); +} + +std::unique_ptr DaosStore::get_lifecycle(void) { + DAOS_NOT_IMPLEMENTED_LOG(nullptr); + return 0; +} + +std::unique_ptr DaosStore::get_completions(void) { + DAOS_NOT_IMPLEMENTED_LOG(nullptr); + return 0; +} + +std::unique_ptr DaosStore::get_notification( + rgw::sal::Object* obj, rgw::sal::Object* src_obj, struct req_state* s, + rgw::notify::EventType event_type, const std::string* object_name) { + return std::make_unique(obj, src_obj, event_type); +} + +std::unique_ptr DaosStore::get_notification( + const DoutPrefixProvider* dpp, Object* obj, Object* src_obj, + rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, + std::string& _user_id, std::string& _user_tenant, std::string& _req_id, + optional_yield y) { + ldpp_dout(dpp, 20) << "get_notification" 
<< dendl; + return std::make_unique(obj, src_obj, event_type); +} + +int DaosStore::log_usage(const DoutPrefixProvider* dpp, + map& usage_info) { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return 0; +} + +int DaosStore::log_op(const DoutPrefixProvider* dpp, string& oid, + bufferlist& bl) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosStore::register_to_service_map(const DoutPrefixProvider* dpp, + const string& daemon_type, + const map& meta) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +void DaosStore::get_quota(RGWQuota& quota) { + // XXX: Not handled for the first pass + return; +} + +void DaosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, + RGWRateLimitInfo& user_ratelimit, + RGWRateLimitInfo& anon_ratelimit) { + return; +} + +int DaosStore::set_buckets_enabled(const DoutPrefixProvider* dpp, + std::vector& buckets, + bool enabled) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +RGWDataSyncStatusManager* DaosStore::get_data_sync_manager( + const rgw_zone_id& source_zone) { + DAOS_NOT_IMPLEMENTED_LOG(nullptr); + return 0; +} + +int DaosStore::read_all_usage( + const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter, + map& usage) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosStore::trim_all_usage(const DoutPrefixProvider* dpp, + uint64_t start_epoch, uint64_t end_epoch) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int DaosStore::get_config_key_val(string name, bufferlist* bl) { + return DAOS_NOT_IMPLEMENTED_LOG(nullptr); +} + +int DaosStore::meta_list_keys_init(const DoutPrefixProvider* dpp, + const string& section, const string& marker, + void** phandle) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +int 
DaosStore::meta_list_keys_next(const DoutPrefixProvider* dpp, void* handle, + int max, list& keys, + bool* truncated) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +void DaosStore::meta_list_keys_complete(void* handle) { return; } + +std::string DaosStore::meta_get_marker(void* handle) { return ""; } + +int DaosStore::meta_remove(const DoutPrefixProvider* dpp, string& metadata_key, + optional_yield y) { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + +std::string DaosStore::get_cluster_id(const DoutPrefixProvider* dpp, + optional_yield y) { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return ""; +} + +} // namespace rgw::sal + +extern "C" { + +void* newDaosStore(CephContext* cct) { + return new rgw::sal::DaosStore(cct); +} +} diff --git a/src/rgw/rgw_sal_daos.h b/src/rgw/rgw_sal_daos.h new file mode 100644 index 000000000..64bf49c7c --- /dev/null +++ b/src/rgw/rgw_sal_daos.h @@ -0,0 +1,1054 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=2 sw=2 expandtab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * SAL implementation for the CORTX Daos backend + * + * Copyright (C) 2022 Seagate Technology LLC and/or its Affiliates + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include "rgw_multi.h" +#include "rgw_notify.h" +#include "rgw_oidc_provider.h" +#include "rgw_putobj_processor.h" +#include "rgw_rados.h" +#include "rgw_role.h" +#include "rgw_sal_store.h" + +inline bool IsDebuggerAttached() { +#ifdef DEBUG + char buf[4096]; + + const int status_fd = ::open("/proc/self/status", O_RDONLY); + if (status_fd == -1) return false; + + const ssize_t num_read = ::read(status_fd, buf, sizeof(buf) - 1); + ::close(status_fd); + + if (num_read <= 0) return false; + + buf[num_read] = '\0'; + constexpr char tracerPidString[] = "TracerPid:"; + const auto tracer_pid_ptr = ::strstr(buf, tracerPidString); + if (!tracer_pid_ptr) return false; + + for (const char* characterPtr = tracer_pid_ptr + sizeof(tracerPidString) - 1; + characterPtr <= buf + num_read; ++characterPtr) { + if (::isspace(*characterPtr)) + continue; + else + return ::isdigit(*characterPtr) != 0 && *characterPtr != '0'; + } +#endif // DEBUG + return false; +} + +inline void DebugBreak() { +#ifdef DEBUG + // only break into the debugger if the debugger is attached + if (IsDebuggerAttached()) + raise(SIGINT); // breaks into GDB and stops, can be continued +#endif // DEBUG +} + +inline int NotImplementedLog(const DoutPrefixProvider* ldpp, + const char* filename, int linenumber, + const char* functionname) { + if (ldpp) + ldpp_dout(ldpp, 20) << filename << "(" << linenumber << ") " << functionname + << ": Not implemented" << dendl; + return 0; +} + +inline int NotImplementedGdbBreak(const DoutPrefixProvider* ldpp, + const char* filename, int linenumber, + const char* functionname) { + NotImplementedLog(ldpp, filename, linenumber, functionname); + DebugBreak(); + return 0; +} + +#define DAOS_NOT_IMPLEMENTED_GDB_BREAK(ldpp) \ + NotImplementedGdbBreak(ldpp, __FILE__, __LINE__, __FUNCTION__) +#define DAOS_NOT_IMPLEMENTED_LOG(ldpp) \ + NotImplementedLog(ldpp, __FILE__, __LINE__, 
__FUNCTION__) + +namespace rgw::sal { + +class DaosStore; +class DaosObject; + +#ifdef DEBUG +// Prepends each log entry with the "filename(source_line) function_name". Makes +// it simple to +// associate log entries with the source that generated the log entry +#undef ldpp_dout +#define ldpp_dout(dpp, v) \ + if (decltype(auto) pdpp = (dpp); \ + pdpp) /* workaround -Wnonnull-compare for 'this' */ \ + dout_impl(pdpp->get_cct(), ceph::dout::need_dynamic(pdpp->get_subsys()), v) \ + pdpp->gen_prefix(*_dout) \ + << __FILE__ << "(" << __LINE__ << ") " << __FUNCTION__ << " - " +#endif // DEBUG + +struct DaosUserInfo { + RGWUserInfo info; + obj_version user_version; + rgw::sal::Attrs attrs; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 3, bl); + encode(info, bl); + encode(user_version, bl); + encode(attrs, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(info, bl); + decode(user_version, bl); + decode(attrs, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(DaosUserInfo); + +class DaosNotification : public StoreNotification { + public: + DaosNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type) + : StoreNotification(_obj, _src_obj, _type) {} + ~DaosNotification() = default; + + virtual int publish_reserve(const DoutPrefixProvider* dpp, + RGWObjTags* obj_tags = nullptr) override { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); + } + virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + const std::string& version) override { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); + } +}; + +class DaosUser : public StoreUser { + private: + DaosStore* store; + std::vector access_ids; + + public: + DaosUser(DaosStore* _st, const rgw_user& _u) : StoreUser(_u), store(_st) {} + DaosUser(DaosStore* _st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) {} + DaosUser(DaosStore* _st) : store(_st) {} + DaosUser(DaosUser& _o) = 
default; + DaosUser() {} + + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, + const std::string& end_marker, uint64_t max, bool need_stats, + BucketList& buckets, optional_yield y) override; + virtual int create_bucket( + const DoutPrefixProvider* dpp, const rgw_bucket& b, + const std::string& zonegroup_id, rgw_placement_rule& placement_rule, + std::string& swift_ver_location, const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, Attrs& attrs, RGWBucketInfo& info, + obj_version& ep_objv, bool exclusive, bool obj_lock_enabled, + bool* existed, req_info& req_info, std::unique_ptr* bucket, + optional_yield y) override; + virtual int read_attrs(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, + Attrs& new_attrs, + optional_yield y) override; + virtual int read_stats(const DoutPrefixProvider* dpp, optional_yield y, + RGWStorageStats* stats, + ceph::real_time* last_stats_sync = nullptr, + ceph::real_time* last_stats_update = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider* dpp, + RGWGetUserStats_CB* cb) override; + virtual int complete_flush_stats(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int read_usage( + const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch, + uint64_t end_epoch) override; + + virtual int load_user(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, + bool exclusive, + RGWUserInfo* old_info = nullptr) override; + virtual int remove_user(const DoutPrefixProvider* dpp, + optional_yield y) override; + + /** Read user info 
without loading it */ + int read_user(const DoutPrefixProvider* dpp, std::string name, + DaosUserInfo* duinfo); + + std::unique_ptr get_encoded_info(bufferlist& bl, + obj_version& obj_ver); + + friend class DaosBucket; +}; + +// RGWBucketInfo and other information that are shown when listing a bucket is +// represented in struct DaosBucketInfo. The structure is encoded and stored +// as the value of the global bucket instance index. +// TODO: compare pros and cons of separating the bucket_attrs (ACLs, tag etc.) +// into a different index. +struct DaosBucketInfo { + RGWBucketInfo info; + + obj_version bucket_version; + ceph::real_time mtime; + + rgw::sal::Attrs bucket_attrs; + + void encode(bufferlist& bl) const { + ENCODE_START(4, 4, bl); + encode(info, bl); + encode(bucket_version, bl); + encode(mtime, bl); + encode(bucket_attrs, bl); // rgw_cache.h example for a map + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(4, bl); + decode(info, bl); + decode(bucket_version, bl); + decode(mtime, bl); + decode(bucket_attrs, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(DaosBucketInfo); + +class DaosBucket : public StoreBucket { + private: + DaosStore* store; + RGWAccessControlPolicy acls; + + public: + /** Container ds3b handle */ + ds3_bucket_t* ds3b = nullptr; + + DaosBucket(DaosStore* _st) : store(_st), acls() {} + + DaosBucket(const DaosBucket& _daos_bucket) + : store(_daos_bucket.store), acls(), ds3b(nullptr) { + // TODO: deep copy all objects + } + + DaosBucket(DaosStore* _st, User* _u) : StoreBucket(_u), store(_st), acls() {} + + DaosBucket(DaosStore* _st, const rgw_bucket& _b) + : StoreBucket(_b), store(_st), acls() {} + + DaosBucket(DaosStore* _st, const RGWBucketEnt& _e) + : StoreBucket(_e), store(_st), acls() {} + + DaosBucket(DaosStore* _st, const RGWBucketInfo& _i) + : StoreBucket(_i), store(_st), acls() {} + + DaosBucket(DaosStore* _st, const rgw_bucket& _b, User* _u) + : StoreBucket(_b, _u), store(_st), 
acls() {} + + DaosBucket(DaosStore* _st, const RGWBucketEnt& _e, User* _u) + : StoreBucket(_e, _u), store(_st), acls() {} + + DaosBucket(DaosStore* _st, const RGWBucketInfo& _i, User* _u) + : StoreBucket(_i, _u), store(_st), acls() {} + + ~DaosBucket(); + + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, + ListResults&, optional_yield y) override; + virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, + bool forward_to_master, req_info* req_info, + optional_yield y) override; + virtual int remove_bucket_bypass_gc(int concurrent_max, + bool keep_index_consistent, + optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } + virtual int set_acl(const DoutPrefixProvider* dpp, + RGWAccessControlPolicy& acl, optional_yield y) override; + virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, + bool get_stats = false) override; + virtual int read_stats(const DoutPrefixProvider* dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, + std::string* master_ver, + std::map& stats, + std::string* max_marker = nullptr, + bool* syncstopped = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider* dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, + RGWGetBucketStats_CB* ctx) override; + virtual int sync_user_stats(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int update_container_stats(const DoutPrefixProvider* dpp) override; + virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override; + virtual int chown(const DoutPrefixProvider* dpp, User& new_user, + optional_yield y) override; + virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, + ceph::real_time mtime) override; + virtual bool is_owner(User* user) override; + virtual int 
check_empty(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int check_quota(const DoutPrefixProvider* dpp, RGWQuota& quota, + uint64_t obj_size, optional_yield y, + bool check_size_only = false) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs, + optional_yield y) override; + virtual int try_refresh_info(const DoutPrefixProvider* dpp, + ceph::real_time* pmtime) override; + virtual int read_usage( + const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch, + uint64_t end_epoch) override; + virtual int remove_objs_from_index( + const DoutPrefixProvider* dpp, + std::list& objs_to_unlink) override; + virtual int check_index( + const DoutPrefixProvider* dpp, + std::map& existing_stats, + std::map& calculated_stats) override; + virtual int rebuild_index(const DoutPrefixProvider* dpp) override; + virtual int set_tag_timeout(const DoutPrefixProvider* dpp, + uint64_t timeout) override; + virtual int purge_instance(const DoutPrefixProvider* dpp) override; + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + virtual std::unique_ptr get_multipart_upload( + const std::string& oid, + std::optional upload_id = std::nullopt, ACLOwner owner = {}, + ceph::real_time mtime = real_clock::now()) override; + virtual int list_multiparts( + const DoutPrefixProvider* dpp, const std::string& prefix, + std::string& marker, const std::string& delim, const int& max_uploads, + std::vector>& uploads, + std::map* common_prefixes, + bool* is_truncated) override; + virtual int abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) override; + + int open(const DoutPrefixProvider* dpp); + int close(const DoutPrefixProvider* dpp); + bool is_open() { return ds3b != nullptr; } + std::unique_ptr 
get_encoded_info( + bufferlist& bl, ceph::real_time mtime); + + friend class DaosStore; +}; + +class DaosPlacementTier : public StorePlacementTier { + DaosStore* store; + RGWZoneGroupPlacementTier tier; + + public: + DaosPlacementTier(DaosStore* _store, const RGWZoneGroupPlacementTier& _tier) + : store(_store), tier(_tier) {} + virtual ~DaosPlacementTier() = default; + + virtual const std::string& get_tier_type() { return tier.tier_type; } + virtual const std::string& get_storage_class() { return tier.storage_class; } + virtual bool retain_head_object() { return tier.retain_head_object; } + RGWZoneGroupPlacementTier& get_rt() { return tier; } +}; + +class DaosZoneGroup : public StoreZoneGroup { + DaosStore* store; + const RGWZoneGroup group; + std::string empty; + + public: + DaosZoneGroup(DaosStore* _store) : store(_store), group() {} + DaosZoneGroup(DaosStore* _store, const RGWZoneGroup& _group) + : store(_store), group(_group) {} + virtual ~DaosZoneGroup() = default; + + virtual const std::string& get_id() const override { return group.get_id(); }; + virtual const std::string& get_name() const override { + return group.get_name(); + }; + virtual int equals(const std::string& other_zonegroup) const override { + return group.equals(other_zonegroup); + }; + /** Get the endpoint from zonegroup, or from master zone if not set */ + virtual const std::string& get_endpoint() const override; + virtual bool placement_target_exists(std::string& target) const override; + virtual bool is_master_zonegroup() const override { + return group.is_master_zonegroup(); + }; + virtual const std::string& get_api_name() const override { + return group.api_name; + }; + virtual void get_placement_target_names( + std::set& names) const override; + virtual const std::string& get_default_placement_name() const override { + return group.default_placement.name; + }; + virtual int get_hostnames(std::list& names) const override { + names = group.hostnames; + return 0; + }; + virtual int 
get_s3website_hostnames( + std::list& names) const override { + names = group.hostnames_s3website; + return 0; + }; + virtual int get_zone_count() const override { return group.zones.size(); } + virtual int get_placement_tier(const rgw_placement_rule& rule, + std::unique_ptr* tier); + bool supports(std::string_view feature) const override { + return group.supports(feature); + } + virtual std::unique_ptr clone() override { + return std::make_unique(store, group); + } + const RGWZoneGroup& get_group() { return group; } +}; + +class DaosZone : public StoreZone { + protected: + DaosStore* store; + RGWRealm* realm{nullptr}; + DaosZoneGroup zonegroup; + RGWZone* zone_public_config{ + nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */ + RGWZoneParams* zone_params{ + nullptr}; /* internal zone params, e.g., rados pools */ + RGWPeriod* current_period{nullptr}; + rgw_zone_id cur_zone_id; + + public: + DaosZone(DaosStore* _store) : store(_store), zonegroup(_store) { + realm = new RGWRealm(); + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); + cur_zone_id = rgw_zone_id(zone_params->get_id()); + + // XXX: only default and STANDARD supported for now + RGWZonePlacementInfo info; + RGWZoneStorageClasses sc; + sc.set_storage_class("STANDARD", nullptr, nullptr); + info.storage_classes = sc; + zone_params->placement_pools["default"] = info; + } + DaosZone(DaosStore* _store, DaosZoneGroup _zg) + : store(_store), zonegroup(_zg) { + realm = new RGWRealm(); + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); + cur_zone_id = rgw_zone_id(zone_params->get_id()); + + // XXX: only default and STANDARD supported for now + RGWZonePlacementInfo info; + RGWZoneStorageClasses sc; + sc.set_storage_class("STANDARD", nullptr, nullptr); + info.storage_classes = sc; + zone_params->placement_pools["default"] = info; + } + ~DaosZone() = default; + + virtual 
std::unique_ptr clone() override { + return std::make_unique(store); + } + virtual ZoneGroup& get_zonegroup() override; + virtual int get_zonegroup(const std::string& id, + std::unique_ptr* zonegroup) override; + virtual const rgw_zone_id& get_id() override; + virtual const std::string& get_name() const override; + virtual bool is_writeable() override; + virtual bool get_redirect_endpoint(std::string* endpoint) override; + virtual bool has_zonegroup_api(const std::string& api) const override; + virtual const std::string& get_current_period_id() override; + virtual const RGWAccessKey& get_system_key() { + return zone_params->system_key; + } + virtual const std::string& get_realm_name() { return realm->get_name(); } + virtual const std::string& get_realm_id() { return realm->get_id(); } + virtual const std::string_view get_tier_type() { return "rgw"; } + + friend class DaosStore; +}; + +class DaosLuaManager : public StoreLuaManager { + DaosStore* store; + + public: + DaosLuaManager(DaosStore* _s) : store(_s) {} + virtual ~DaosLuaManager() = default; + + virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& key, std::string& script) override { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return -ENOENT; + }; + + virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& key, + const std::string& script) override { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return -ENOENT; + }; + + virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& key) override { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return -ENOENT; + }; + + virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& package_name) override { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return -ENOENT; + }; + + virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& package_name) override { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return -ENOENT; + }; + + virtual int 
list_packages(const DoutPrefixProvider* dpp, optional_yield y, + rgw::lua::packages_t& packages) override { + DAOS_NOT_IMPLEMENTED_LOG(dpp); + return -ENOENT; + }; +}; + +class DaosObject : public StoreObject { + private: + DaosStore* store; + RGWAccessControlPolicy acls; + + public: + struct DaosReadOp : public StoreReadOp { + private: + DaosObject* source; + + public: + DaosReadOp(DaosObject* _source); + + virtual int prepare(optional_yield y, + const DoutPrefixProvider* dpp) override; + + /* + * Both `read` and `iterate` read up through index `end` + * *inclusive*. The number of bytes that could be returned is + * `end - ofs + 1`. + */ + virtual int read(int64_t off, int64_t end, bufferlist& bl, optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual int iterate(const DoutPrefixProvider* dpp, int64_t off, int64_t end, + RGWGetDataCB* cb, optional_yield y) override; + + virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, + bufferlist& dest, optional_yield y) override; + }; + + struct DaosDeleteOp : public StoreDeleteOp { + private: + DaosObject* source; + + public: + DaosDeleteOp(DaosObject* _source); + + virtual int delete_obj(const DoutPrefixProvider* dpp, + optional_yield y) override; + }; + + ds3_obj_t* ds3o = nullptr; + + DaosObject() = default; + + DaosObject(DaosStore* _st, const rgw_obj_key& _k) + : StoreObject(_k), store(_st), acls() {} + DaosObject(DaosStore* _st, const rgw_obj_key& _k, Bucket* _b) + : StoreObject(_k, _b), store(_st), acls() {} + + DaosObject(DaosObject& _o) = default; + + virtual ~DaosObject(); + + virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, + bool prevent_versioning = false) override; + virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, + Completions* aio, bool keep_index_consistent, + optional_yield y) override; + virtual int copy_object( + User* user, req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, 
rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t olh_epoch, + boost::optional delete_at, std::string* version_id, + std::string* tag, std::string* etag, void (*progress_cb)(off_t, void*), + void* progress_data, const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } + virtual int set_acl(const RGWAccessControlPolicy& acl) override { + acls = acl; + return 0; + } + + virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState** state, + optional_yield y, bool follow_olh = true) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, + Attrs* delattrs, optional_yield y) override; + virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, + rgw_obj* target_obj = NULL) override; + virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, + optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, + const char* attr_name, + optional_yield y) override; + virtual bool is_expired() override; + virtual void gen_rand_obj_instance_name() override; + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + virtual std::unique_ptr get_serializer( + const DoutPrefixProvider* dpp, const std::string& lock_name) override; + virtual int transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int transition_to_cloud(Bucket* bucket, rgw::sal::PlacementTier* tier, + 
rgw_bucket_dir_entry& o, + std::set& cloud_targets, + CephContext* cct, bool update_object, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual bool placement_rules_match(rgw_placement_rule& r1, + rgw_placement_rule& r2) override; + virtual int dump_obj_layout(const DoutPrefixProvider* dpp, optional_yield y, + Formatter* f) override; + + /* Swift versioning */ + virtual int swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) override; + virtual int swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) override; + + /* OPs */ + virtual std::unique_ptr get_read_op() override; + virtual std::unique_ptr get_delete_op() override; + + /* OMAP */ + virtual int omap_get_vals(const DoutPrefixProvider* dpp, + const std::string& marker, uint64_t count, + std::map* m, bool* pmore, + optional_yield y) override; + virtual int omap_get_all(const DoutPrefixProvider* dpp, + std::map* m, + optional_yield y) override; + virtual int omap_get_vals_by_keys(const DoutPrefixProvider* dpp, + const std::string& oid, + const std::set& keys, + Attrs* vals) override; + virtual int omap_set_val_by_key(const DoutPrefixProvider* dpp, + const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) override; + virtual int chown(User& new_user, const DoutPrefixProvider* dpp, + optional_yield y) override; + + bool is_open() { return ds3o != nullptr; }; + // Only lookup the object, do not create + int lookup(const DoutPrefixProvider* dpp); + // Create the object, truncate if exists + int create(const DoutPrefixProvider* dpp); + // Release the daos resources + int close(const DoutPrefixProvider* dpp); + // Write to object starting from offset + int write(const DoutPrefixProvider* dpp, bufferlist&& data, uint64_t offset); + // Read size bytes from object starting from offset + int read(const DoutPrefixProvider* dpp, bufferlist& data, uint64_t offset, + uint64_t& size); + // Get the object's dirent and attrs + int 
get_dir_entry_attrs(const DoutPrefixProvider* dpp, + rgw_bucket_dir_entry* ent, Attrs* getattrs = nullptr); + // Set the object's dirent and attrs + int set_dir_entry_attrs(const DoutPrefixProvider* dpp, + rgw_bucket_dir_entry* ent, Attrs* setattrs = nullptr); + // Marks this DAOS object as being the latest version and unmarks all other + // versions as latest + int mark_as_latest(const DoutPrefixProvider* dpp, ceph::real_time set_mtime); + // get_bucket casted as DaosBucket* + DaosBucket* get_daos_bucket() { + return static_cast(get_bucket()); + } +}; + +// A placeholder locking class for multipart upload. +class MPDaosSerializer : public StoreMPSerializer { + public: + MPDaosSerializer(const DoutPrefixProvider* dpp, DaosStore* store, + DaosObject* obj, const std::string& lock_name) {} + + virtual int try_lock(const DoutPrefixProvider* dpp, utime_t dur, + optional_yield y) override { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); + } + virtual int unlock() override { return DAOS_NOT_IMPLEMENTED_LOG(nullptr); } +}; + +class DaosAtomicWriter : public StoreWriter { + protected: + rgw::sal::DaosStore* store; + const rgw_user& owner; + const rgw_placement_rule* ptail_placement_rule; + uint64_t olh_epoch; + const std::string& unique_tag; + DaosObject obj; + uint64_t total_data_size = 0; // for total data being uploaded + + public: + DaosAtomicWriter(const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, + DaosStore* _store, const rgw_user& _owner, + const rgw_placement_rule* _ptail_placement_rule, + uint64_t _olh_epoch, const std::string& _unique_tag); + ~DaosAtomicWriter() = default; + + // prepare to start processing object data + virtual int prepare(optional_yield y) override; + + // Process a bufferlist + virtual int process(bufferlist&& data, uint64_t offset) override; + + // complete the operation and make its result visible to clients + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time* mtime, ceph::real_time 
set_mtime, + std::map& attrs, + ceph::real_time delete_at, const char* if_match, + const char* if_nomatch, const std::string* user_data, + rgw_zone_set* zones_trace, bool* canceled, + optional_yield y) override; +}; + +class DaosMultipartWriter : public StoreWriter { + protected: + rgw::sal::DaosStore* store; + MultipartUpload* upload; + std::string upload_id; + + // Part parameters. + const uint64_t part_num; + const std::string part_num_str; + uint64_t actual_part_size = 0; + + ds3_part_t* ds3p = nullptr; + bool is_open() { return ds3p != nullptr; }; + + public: + DaosMultipartWriter(const DoutPrefixProvider* dpp, optional_yield y, + MultipartUpload* _upload, + rgw::sal::Object* obj, + DaosStore* _store, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, + uint64_t _part_num, const std::string& part_num_str) + : StoreWriter(dpp, y), + store(_store), + upload(_upload), + upload_id(_upload->get_upload_id()), + part_num(_part_num), + part_num_str(part_num_str) {} + virtual ~DaosMultipartWriter(); + + // prepare to start processing object data + virtual int prepare(optional_yield y) override; + + // Process a bufferlist + virtual int process(bufferlist&& data, uint64_t offset) override; + + // complete the operation and make its result visible to clients + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time* mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, const char* if_match, + const char* if_nomatch, const std::string* user_data, + rgw_zone_set* zones_trace, bool* canceled, + optional_yield y) override; + + const std::string& get_bucket_name(); +}; + +class DaosMultipartPart : public StoreMultipartPart { + protected: + RGWUploadPartInfo info; + + public: + DaosMultipartPart() = default; + virtual ~DaosMultipartPart() = default; + + virtual uint32_t get_num() { return info.num; } + virtual uint64_t get_size() { return info.accounted_size; } + virtual const std::string& 
get_etag() { return info.etag; } + virtual ceph::real_time& get_mtime() { return info.modified; } + + friend class DaosMultipartUpload; +}; + +class DaosMultipartUpload : public StoreMultipartUpload { + DaosStore* store; + RGWMPObj mp_obj; + ACLOwner owner; + ceph::real_time mtime; + rgw_placement_rule placement; + RGWObjManifest manifest; + + public: + DaosMultipartUpload(DaosStore* _store, Bucket* _bucket, + const std::string& oid, + std::optional upload_id, ACLOwner _owner, + ceph::real_time _mtime) + : StoreMultipartUpload(_bucket), + store(_store), + mp_obj(oid, upload_id), + owner(_owner), + mtime(_mtime) {} + virtual ~DaosMultipartUpload() = default; + + virtual const std::string& get_meta() const { return mp_obj.get_meta(); } + virtual const std::string& get_key() const { return mp_obj.get_key(); } + virtual const std::string& get_upload_id() const { + return mp_obj.get_upload_id(); + } + virtual const ACLOwner& get_owner() const override { return owner; } + virtual ceph::real_time& get_mtime() { return mtime; } + virtual std::unique_ptr get_meta_obj() override; + virtual int init(const DoutPrefixProvider* dpp, optional_yield y, + ACLOwner& owner, rgw_placement_rule& dest_placement, + rgw::sal::Attrs& attrs) override; + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int num_parts, int marker, int* next_marker, + bool* truncated, + bool assume_unsorted = false) override; + virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override; + virtual int complete(const DoutPrefixProvider* dpp, optional_yield y, + CephContext* cct, std::map& part_etags, + std::list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& off, + std::string& tag, ACLOwner& owner, uint64_t olh_epoch, + rgw::sal::Object* target_obj) override; + virtual int get_info(const DoutPrefixProvider* dpp, optional_yield y, + rgw_placement_rule** rule, + rgw::sal::Attrs* attrs = nullptr) override; + virtual 
std::unique_ptr get_writer( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, uint64_t part_num, + const std::string& part_num_str) override; + const std::string& get_bucket_name() { return bucket->get_name(); } +}; + +class DaosStore : public StoreDriver { + private: + DaosZone zone; + RGWSyncModuleInstanceRef sync_module; + + public: + ds3_t* ds3 = nullptr; + + CephContext* cctx; + + DaosStore(CephContext* c) : zone(this), cctx(c) {} + ~DaosStore() = default; + + virtual const std::string get_name() const override { return "daos"; } + + virtual std::unique_ptr get_user(const rgw_user& u) override; + virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, + const std::string& key, optional_yield y, + std::unique_ptr* user) override; + virtual int get_user_by_email(const DoutPrefixProvider* dpp, + const std::string& email, optional_yield y, + std::unique_ptr* user) override; + virtual int get_user_by_swift(const DoutPrefixProvider* dpp, + const std::string& user_str, optional_yield y, + std::unique_ptr* user) override; + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, + const rgw_bucket& b, std::unique_ptr* bucket, + optional_yield y) override; + virtual int get_bucket(User* u, const RGWBucketInfo& i, + std::unique_ptr* bucket) override; + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, + const std::string& tenant, const std::string& name, + std::unique_ptr* bucket, + optional_yield y) override; + virtual bool is_meta_master() override; + virtual int forward_request_to_master(const DoutPrefixProvider* dpp, + User* user, obj_version* objv, + bufferlist& in_data, JSONParser* jp, + req_info& info, + optional_yield y) override; + virtual int 
forward_iam_request_to_master( + const DoutPrefixProvider* dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) override; + virtual Zone* get_zone() { return &zone; } + virtual std::string zone_unique_id(uint64_t unique_num) override; + virtual std::string zone_unique_trans_id(const uint64_t unique_num) override; + virtual int cluster_stat(RGWClusterStat& stats) override; + virtual std::unique_ptr get_lifecycle(void) override; + virtual std::unique_ptr get_completions(void) override; + virtual std::unique_ptr get_notification( + rgw::sal::Object* obj, rgw::sal::Object* src_obj, struct req_state* s, + rgw::notify::EventType event_type, optional_yield y, + const std::string* object_name = nullptr) override; + virtual std::unique_ptr get_notification( + const DoutPrefixProvider* dpp, rgw::sal::Object* obj, + rgw::sal::Object* src_obj, rgw::notify::EventType event_type, + rgw::sal::Bucket* _bucket, std::string& _user_id, + std::string& _user_tenant, std::string& _req_id, + optional_yield y) override; + virtual RGWLC* get_rgwlc(void) override { return NULL; } + virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { + return NULL; + } + + virtual int log_usage( + const DoutPrefixProvider* dpp, + std::map& usage_info) override; + virtual int log_op(const DoutPrefixProvider* dpp, std::string& oid, + bufferlist& bl) override; + virtual int register_to_service_map( + const DoutPrefixProvider* dpp, const std::string& daemon_type, + const std::map& meta) override; + virtual void get_quota(RGWQuota& quota) override; + virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, + RGWRateLimitInfo& user_ratelimit, + RGWRateLimitInfo& anon_ratelimit) override; + virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, + std::vector& buckets, + bool enabled) override; + virtual uint64_t get_new_req_id() override { + return DAOS_NOT_IMPLEMENTED_LOG(nullptr); + } + virtual 
int get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) override; + virtual RGWDataSyncStatusManager* get_data_sync_manager( + const rgw_zone_id& source_zone) override; + virtual void wakeup_meta_sync_shards(std::set& shard_ids) override { + return; + } + virtual void wakeup_data_sync_shards( + const DoutPrefixProvider* dpp, const rgw_zone_id& source_zone, + boost::container::flat_map< + int, boost::container::flat_set>& shard_ids) + override { + return; + } + virtual int clear_usage(const DoutPrefixProvider* dpp) override { + return DAOS_NOT_IMPLEMENTED_LOG(dpp); + } + virtual int read_all_usage( + const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_all_usage(const DoutPrefixProvider* dpp, + uint64_t start_epoch, uint64_t end_epoch) override; + virtual int get_config_key_val(std::string name, bufferlist* bl) override; + virtual int meta_list_keys_init(const DoutPrefixProvider* dpp, + const std::string& section, + const std::string& marker, + void** phandle) override; + virtual int meta_list_keys_next(const DoutPrefixProvider* dpp, void* handle, + int max, std::list& keys, + bool* truncated) override; + virtual void meta_list_keys_complete(void* handle) override; + virtual std::string meta_get_marker(void* handle) override; + virtual int meta_remove(const DoutPrefixProvider* dpp, + std::string& metadata_key, optional_yield y) override; + + virtual const RGWSyncModuleInstanceRef& get_sync_module() { + return sync_module; + } + virtual std::string get_host_id() { return ""; } + + virtual std::unique_ptr get_lua_manager() override; + virtual std::unique_ptr get_role( + std::string name, std::string tenant, std::string path = "", + std::string trust_policy = "", std::string max_session_duration_str = "", + std::multimap 
tags = {}) override; + virtual std::unique_ptr get_role(const RGWRoleInfo& info) override; + virtual std::unique_ptr get_role(std::string id) override; + virtual int get_roles(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) override; + virtual std::unique_ptr get_oidc_provider() override; + virtual int get_oidc_providers( + const DoutPrefixProvider* dpp, const std::string& tenant, + std::vector>& providers) override; + virtual std::unique_ptr get_append_writer( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, + const std::string& unique_tag, uint64_t position, + uint64_t* cur_accounted_size) override; + virtual std::unique_ptr get_atomic_writer( + const DoutPrefixProvider* dpp, optional_yield y, + rgw::sal::Object* obj, const rgw_user& owner, + const rgw_placement_rule* ptail_placement_rule, uint64_t olh_epoch, + const std::string& unique_tag) override; + virtual const std::string& get_compression_type( + const rgw_placement_rule& rule) override; + virtual bool valid_placement(const rgw_placement_rule& rule) override; + + virtual void finalize(void) override; + + virtual CephContext* ctx(void) override { return cctx; } + + virtual int initialize(CephContext* cct, + const DoutPrefixProvider* dpp) override; +}; + +} // namespace rgw::sal diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc new file mode 100644 index 000000000..5100dc41e --- /dev/null +++ b/src/rgw/rgw_sal_dbstore.cc @@ -0,0 +1,2045 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include + +#include "common/Clock.h" +#include "common/errno.h" + +#include "rgw_sal.h" +#include "rgw_sal_dbstore.h" +#include "rgw_bucket.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw::sal { + + int DBUser::list_buckets(const DoutPrefixProvider *dpp, const string& marker, + const string& end_marker, uint64_t max, bool need_stats, + BucketList &buckets, optional_yield y) + { + RGWUserBuckets ulist; + bool is_truncated = false; + int ret; + + buckets.clear(); + ret = store->getDB()->list_buckets(dpp, "", info.user_id, marker, end_marker, max, + need_stats, &ulist, &is_truncated); + if (ret < 0) + return ret; + + buckets.set_truncated(is_truncated); + for (const auto& ent : ulist.get_buckets()) { + buckets.add(std::make_unique(this->store, ent.second, this)); + } + + return 0; + } + + int DBUser::create_bucket(const DoutPrefixProvider *dpp, + const rgw_bucket& b, + const string& zonegroup_id, + rgw_placement_rule& placement_rule, + string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool *existed, + req_info& req_info, + std::unique_ptr* bucket_out, + optional_yield y) + { + int ret; + bufferlist in_data; + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket = nullptr; + uint32_t *pmaster_num_shards = nullptr; + real_time creation_time; + std::unique_ptr bucket; + obj_version objv, *pobjv = NULL; + + /* If it exists, look it up; otherwise create it */ + ret = store->get_bucket(dpp, this, b, &bucket, y); + if (ret < 0 && ret != -ENOENT) + return ret; + + if (ret != -ENOENT) { + RGWAccessControlPolicy 
old_policy(store->ctx()); + *existed = true; + if (swift_ver_location.empty()) { + swift_ver_location = bucket->get_info().swift_ver_location; + } + placement_rule.inherit_from(bucket->get_info().placement_rule); + + // don't allow changes to the acl policy + /* int r = rgw_op_get_bucket_policy_from_attr(dpp, this, this, bucket->get_attrs(), + &old_policy, y); + if (r >= 0 && old_policy != policy) { + bucket_out->swap(bucket); + return -EEXIST; + }*/ + } else { + bucket = std::make_unique(store, b, this); + *existed = false; + bucket->set_attrs(attrs); + // XXX: For now single default zone and STANDARD storage class + // supported. + placement_rule.name = "default"; + placement_rule.storage_class = "STANDARD"; + } + + /* + * XXX: If not master zone, fwd the request to master zone. + * For now DBStore has single zone. + */ + std::string zid = zonegroup_id; + /* if (zid.empty()) { + zid = svc()->zone->get_zonegroup().get_id(); + } */ + + if (*existed) { + rgw_placement_rule selected_placement_rule; + /* XXX: Handle this when zone is implemented + ret = svc()->zone->select_bucket_placement(this.get_info(), + zid, placement_rule, + &selected_placement_rule, nullptr, y); + if (selected_placement_rule != info.placement_rule) { + ret = -EEXIST; + bucket_out->swap(bucket); + return ret; + } */ + } else { + + /* XXX: We may not need to send all these params. 
Cleanup the unused ones */ + ret = store->getDB()->create_bucket(dpp, this->get_info(), bucket->get_key(), + zid, placement_rule, swift_ver_location, pquota_info, + attrs, info, pobjv, &ep_objv, creation_time, + pmaster_bucket, pmaster_num_shards, y, exclusive); + if (ret == -EEXIST) { + *existed = true; + ret = 0; + } else if (ret != 0) { + return ret; + } + } + + bucket->set_version(ep_objv); + bucket->get_info() = info; + + bucket_out->swap(bucket); + + return ret; + } + + int DBUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y) + { + int ret; + ret = store->getDB()->get_user(dpp, string("user_id"), get_id().id, info, &attrs, + &objv_tracker); + return ret; + } + + int DBUser::read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time *last_stats_sync, + ceph::real_time *last_stats_update) + { + return 0; + } + + /* stats - Not for first pass */ + int DBUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB *cb) + { + return 0; + } + + int DBUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) + { + return 0; + } + + int DBUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool *is_truncated, RGWUsageIter& usage_iter, + map& usage) + { + return 0; + } + + int DBUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) + { + return 0; + } + + int DBUser::load_user(const DoutPrefixProvider *dpp, optional_yield y) + { + int ret = 0; + + ret = store->getDB()->get_user(dpp, string("user_id"), get_id().id, info, &attrs, + &objv_tracker); + + return ret; + } + int DBUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) + { + for(auto& it : new_attrs) { + attrs[it.first] = it.second; + } + return store_user(dpp, y, false); + } + int DBUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info) 
+ { + int ret = 0; + + ret = store->getDB()->store_user(dpp, info, exclusive, &attrs, &objv_tracker, old_info); + + return ret; + } + + int DBUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) + { + int ret = 0; + + ret = store->getDB()->remove_user(dpp, info, &objv_tracker); + + return ret; + } + + int DBUser::verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider *dpp, optional_yield y) + { + *verified = false; + return 0; + } + + int DBBucket::remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) + { + int ret; + + ret = load_bucket(dpp, y); + if (ret < 0) + return ret; + + /* XXX: handle delete_children */ + + if (!delete_children) { + /* Check if there are any objects */ + rgw::sal::Bucket::ListParams params; + params.list_versions = true; + params.allow_unordered = true; + + rgw::sal::Bucket::ListResults results; + + results.objs.clear(); + + ret = list(dpp, params, 2, results, null_yield); + + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": Bucket list objects returned " << + ret << dendl; + return ret; + } + + if (!results.objs.empty()) { + ret = -ENOTEMPTY; + ldpp_dout(dpp, -1) << __func__ << ": Bucket Not Empty.. 
returning " << + ret << dendl; + return ret; + } + } + + ret = store->getDB()->remove_bucket(dpp, info); + + return ret; + } + + int DBBucket::remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) { + return 0; + } + + int DBBucket::load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats) + { + int ret = 0; + + ret = store->getDB()->get_bucket_info(dpp, string("name"), "", info, &attrs, + &mtime, &bucket_version); + + return ret; + } + + /* stats - Not for first pass */ + int DBBucket::read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, + std::string *bucket_ver, std::string *master_ver, + std::map& stats, + std::string *max_marker, bool *syncstopped) + { + return 0; + } + + int DBBucket::read_stats_async(const DoutPrefixProvider *dpp, const bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx) + { + return 0; + } + + int DBBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) + { + return 0; + } + + int DBBucket::update_container_stats(const DoutPrefixProvider *dpp) + { + return 0; + } + + int DBBucket::check_bucket_shards(const DoutPrefixProvider *dpp) + { + return 0; + } + + int DBBucket::chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y) + { + int ret; + + ret = store->getDB()->update_bucket(dpp, "owner", info, false, &(new_user.get_id()), nullptr, nullptr, nullptr); + return ret; + } + + int DBBucket::put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time _mtime) + { + int ret; + + ret = store->getDB()->update_bucket(dpp, "info", info, exclusive, nullptr, nullptr, &_mtime, &info.objv_tracker); + + return ret; + + } + + /* Make sure to call get_bucket_info() if you need it first */ + bool DBBucket::is_owner(User* user) + { + return (info.owner.compare(user->get_id()) == 0); + } + + int DBBucket::check_empty(const 
DoutPrefixProvider *dpp, optional_yield y) + { + /* XXX: Check if bucket contains any objects */ + return 0; + } + + int DBBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, + optional_yield y, bool check_size_only) + { + /* Not Handled in the first pass as stats are also needed */ + return 0; + } + + int DBBucket::merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& new_attrs, optional_yield y) + { + int ret = 0; + + for(auto& it : new_attrs) { + attrs[it.first] = it.second; + } + + /* XXX: handle has_instance_obj like in set_bucket_instance_attrs() */ + + ret = store->getDB()->update_bucket(dpp, "attrs", info, false, nullptr, &new_attrs, nullptr, &get_info().objv_tracker); + + return ret; + } + + int DBBucket::try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime) + { + int ret = 0; + + ret = store->getDB()->get_bucket_info(dpp, string("name"), "", info, &attrs, + pmtime, &bucket_version); + + return ret; + } + + /* XXX: usage and stats not supported in the first pass */ + int DBBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, + RGWUsageIter& usage_iter, + map& usage) + { + return 0; + } + + int DBBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) + { + return 0; + } + + int DBBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) + { + /* XXX: CHECK: Unlike RadosStore, there is no seperate bucket index table. + * Delete all the object in the list from the object table of this + * bucket + */ + return 0; + } + + int DBBucket::check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) + { + /* XXX: stats not supported yet */ + return 0; + } + + int DBBucket::rebuild_index(const DoutPrefixProvider *dpp) + { + /* there is no index table in dbstore. 
Not applicable */ + return 0; + } + + int DBBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) + { + /* XXX: CHECK: set tag timeout for all the bucket objects? */ + return 0; + } + + int DBBucket::purge_instance(const DoutPrefixProvider *dpp) + { + /* XXX: CHECK: for dbstore only single instance supported. + * Remove all the objects for that instance? Anything extra needed? + */ + return 0; + } + + int DBBucket::set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy &acl, optional_yield y) + { + int ret = 0; + bufferlist aclbl; + + acls = acl; + acl.encode(aclbl); + + Attrs attrs = get_attrs(); + attrs[RGW_ATTR_ACL] = aclbl; + + ret = store->getDB()->update_bucket(dpp, "attrs", info, false, &(acl.get_owner().get_id()), &attrs, nullptr, nullptr); + + return ret; + } + + std::unique_ptr DBBucket::get_object(const rgw_obj_key& k) + { + return std::make_unique(this->store, k, this); + } + + int DBBucket::list(const DoutPrefixProvider *dpp, ListParams& params, int max, ListResults& results, optional_yield y) + { + int ret = 0; + + results.objs.clear(); + + DB::Bucket target(store->getDB(), get_info()); + DB::Bucket::List list_op(&target); + + list_op.params.prefix = params.prefix; + list_op.params.delim = params.delim; + list_op.params.marker = params.marker; + list_op.params.ns = params.ns; + list_op.params.end_marker = params.end_marker; + list_op.params.ns = params.ns; + list_op.params.enforce_ns = params.enforce_ns; + list_op.params.access_list_filter = params.access_list_filter; + list_op.params.force_check_filter = params.force_check_filter; + list_op.params.list_versions = params.list_versions; + list_op.params.allow_unordered = params.allow_unordered; + + results.objs.clear(); + ret = list_op.list_objects(dpp, max, &results.objs, &results.common_prefixes, &results.is_truncated); + if (ret >= 0) { + results.next_marker = list_op.get_next_marker(); + params.marker = results.next_marker; + } + + return ret; + } + + std::unique_ptr 
DBBucket::get_multipart_upload( + const std::string& oid, + std::optional upload_id, + ACLOwner owner, ceph::real_time mtime) { + return std::make_unique(this->store, this, oid, upload_id, + std::move(owner), mtime); + } + + int DBBucket::list_multiparts(const DoutPrefixProvider *dpp, + const string& prefix, + string& marker, + const string& delim, + const int& max_uploads, + vector>& uploads, + map *common_prefixes, + bool *is_truncated) { + return 0; + } + + int DBBucket::abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) { + return 0; + } + + void DBStore::finalize(void) + { + if (dbsm) + dbsm->destroyAllHandles(); + } + + const std::string& DBZoneGroup::get_endpoint() const { + if (!group->endpoints.empty()) { + return group->endpoints.front(); + } else { + // use zonegroup's master zone endpoints + auto z = group->zones.find(group->master_zone); + if (z != group->zones.end() && !z->second.endpoints.empty()) { + return z->second.endpoints.front(); + } + } + return empty; + } + + bool DBZoneGroup::placement_target_exists(std::string& target) const { + return !!group->placement_targets.count(target); + } + + void DBZoneGroup::get_placement_target_names(std::set& names) const { + for (const auto& target : group->placement_targets) { + names.emplace(target.second.name); + } + } + + ZoneGroup& DBZone::get_zonegroup() + { + return *zonegroup; + } + + const RGWZoneParams& DBZone::get_rgw_params() + { + return *zone_params; + } + + const std::string& DBZone::get_id() + { + return zone_params->get_id(); + } + + + const std::string& DBZone::get_name() const + { + return zone_params->get_name(); + } + + bool DBZone::is_writeable() + { + return true; + } + + bool DBZone::get_redirect_endpoint(std::string* endpoint) + { + return false; + } + + bool DBZone::has_zonegroup_api(const std::string& api) const + { + return false; + } + + const std::string& DBZone::get_current_period_id() + { + return current_period->get_id(); + } + + const RGWAccessKey& 
DBZone::get_system_key() + { + return zone_params->system_key; + } + + const std::string& DBZone::get_realm_name() + { + return realm->get_name(); + } + + const std::string& DBZone::get_realm_id() + { + return realm->get_id(); + } + + RGWBucketSyncPolicyHandlerRef DBZone::get_sync_policy_handler() + { + return nullptr; + } + + std::unique_ptr DBStore::get_lua_manager() + { + return std::make_unique(this); + } + + int DBObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh) + { + RGWObjState* astate; + DB::Object op_target(store->getDB(), get_bucket()->get_info(), get_obj()); + int ret = op_target.get_obj_state(dpp, get_bucket()->get_info(), get_obj(), follow_olh, &astate); + if (ret < 0) { + return ret; + } + + /* Don't overwrite obj, atomic, or prefetch */ + rgw_obj obj = get_obj(); + bool is_atomic = state.is_atomic; + bool prefetch_data = state.prefetch_data; + + state = *astate; + *pstate = &state; + + state.obj = obj; + state.is_atomic = is_atomic; + state.prefetch_data = prefetch_data; + return ret; + } + + int DBObject::read_attrs(const DoutPrefixProvider* dpp, DB::Object::Read &read_op, optional_yield y, rgw_obj* target_obj) + { + read_op.params.attrs = &state.attrset; + read_op.params.target_obj = target_obj; + read_op.params.obj_size = &state.size; + read_op.params.lastmod = &state.mtime; + + return read_op.prepare(dpp); + } + + int DBObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) + { + Attrs empty; + DB::Object op_target(store->getDB(), + get_bucket()->get_info(), get_obj()); + return op_target.set_attrs(dpp, setattrs ? 
*setattrs : empty, delattrs); + } + + int DBObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj) + { + DB::Object op_target(store->getDB(), get_bucket()->get_info(), get_obj()); + DB::Object::Read read_op(&op_target); + + return read_attrs(dpp, read_op, y, target_obj); + } + + int DBObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) + { + rgw_obj target = get_obj(); + int r = get_obj_attrs(y, dpp, &target); + if (r < 0) { + return r; + } + set_atomic(); + state.attrset[attr_name] = attr_val; + return set_obj_attrs(dpp, &state.attrset, nullptr, y); + } + + int DBObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) + { + Attrs rmattr; + bufferlist bl; + + set_atomic(); + rmattr[attr_name] = bl; + return set_obj_attrs(dpp, nullptr, &rmattr, y); + } + + bool DBObject::is_expired() { + return false; + } + + void DBObject::gen_rand_obj_instance_name() + { + store->getDB()->gen_rand_obj_instance_name(&state.obj.key); + } + + + int DBObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool* pmore, optional_yield y) + { + DB::Object op_target(store->getDB(), + get_bucket()->get_info(), get_obj()); + return op_target.obj_omap_get_vals(dpp, marker, count, m, pmore); + } + + int DBObject::omap_get_all(const DoutPrefixProvider *dpp, std::map *m, + optional_yield y) + { + DB::Object op_target(store->getDB(), + get_bucket()->get_info(), get_obj()); + return op_target.obj_omap_get_all(dpp, m); + } + + int DBObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) + { + DB::Object op_target(store->getDB(), + get_bucket()->get_info(), get_obj()); + return op_target.obj_omap_get_vals_by_keys(dpp, oid, keys, vals); + } + + int DBObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, 
bufferlist& val, + bool must_exist, optional_yield y) + { + DB::Object op_target(store->getDB(), + get_bucket()->get_info(), get_obj()); + return op_target.obj_omap_set_val_by_key(dpp, key, val, must_exist); + } + + int DBObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) + { + return 0; + } + + std::unique_ptr DBObject::get_serializer(const DoutPrefixProvider *dpp, + const std::string& lock_name) + { + return std::make_unique(dpp, store, this, lock_name); + } + + int DBObject::transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) + { + DB::Object op_target(store->getDB(), + get_bucket()->get_info(), get_obj()); + return op_target.transition(dpp, placement_rule, mtime, olh_epoch); + } + + bool DBObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) + { + /* XXX: support single default zone and zonegroup for now */ + return true; + } + + int DBObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) + { + return 0; + } + + std::unique_ptr DBObject::get_read_op() + { + return std::make_unique(this, nullptr); + } + + DBObject::DBReadOp::DBReadOp(DBObject *_source, RGWObjectCtx *_rctx) : + source(_source), + rctx(_rctx), + op_target(_source->store->getDB(), + _source->get_bucket()->get_info(), + _source->get_obj()), + parent_op(&op_target) + { } + + int DBObject::DBReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp) + { + uint64_t obj_size; + + parent_op.conds.mod_ptr = params.mod_ptr; + parent_op.conds.unmod_ptr = params.unmod_ptr; + parent_op.conds.high_precision_time = params.high_precision_time; + parent_op.conds.mod_zone_id = params.mod_zone_id; + parent_op.conds.mod_pg_ver = params.mod_pg_ver; + parent_op.conds.if_match = params.if_match; + parent_op.conds.if_nomatch = params.if_nomatch; + parent_op.params.lastmod = params.lastmod; + 
parent_op.params.target_obj = params.target_obj; + parent_op.params.obj_size = &obj_size; + parent_op.params.attrs = &source->get_attrs(); + + int ret = parent_op.prepare(dpp); + if (ret < 0) + return ret; + + source->set_key(parent_op.state.obj.key); + source->set_obj_size(obj_size); + + return ret; + } + + int DBObject::DBReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp) + { + return parent_op.read(ofs, end, bl, dpp); + } + + int DBObject::DBReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) + { + return parent_op.get_attr(dpp, name, dest); + } + + std::unique_ptr DBObject::get_delete_op() + { + return std::make_unique(this); + } + + DBObject::DBDeleteOp::DBDeleteOp(DBObject *_source) : + source(_source), + op_target(_source->store->getDB(), + _source->get_bucket()->get_info(), + _source->get_obj()), + parent_op(&op_target) + { } + + int DBObject::DBDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y) + { + parent_op.params.bucket_owner = params.bucket_owner.get_id(); + parent_op.params.versioning_status = params.versioning_status; + parent_op.params.obj_owner = params.obj_owner; + parent_op.params.olh_epoch = params.olh_epoch; + parent_op.params.marker_version_id = params.marker_version_id; + parent_op.params.bilog_flags = params.bilog_flags; + parent_op.params.remove_objs = params.remove_objs; + parent_op.params.expiration_time = params.expiration_time; + parent_op.params.unmod_since = params.unmod_since; + parent_op.params.mtime = params.mtime; + parent_op.params.high_precision_time = params.high_precision_time; + parent_op.params.zones_trace = params.zones_trace; + parent_op.params.abortmp = params.abortmp; + parent_op.params.parts_accounted_size = params.parts_accounted_size; + + int ret = parent_op.delete_obj(dpp); + if (ret < 0) + return ret; + + result.delete_marker = parent_op.result.delete_marker; + result.version_id = 
parent_op.result.version_id; + + return ret; + } + + int DBObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, bool prevent_versioning) + { + DB::Object del_target(store->getDB(), bucket->get_info(), get_obj()); + DB::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = bucket->get_info().owner; + del_op.params.versioning_status = bucket->get_info().versioning_status(); + + return del_op.delete_obj(dpp); + } + + int DBObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, + Completions* aio, bool keep_index_consistent, + optional_yield y) + { + /* XXX: Make it async */ + return 0; + } + + int DBObject::copy_object(User* user, + req_info* info, + const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, + rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, + ceph::real_time* mtime, + const ceph::real_time* mod_ptr, + const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, + const char* if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + Attrs& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, + std::string* tag, + std::string* etag, + void (*progress_cb)(off_t, void *), + void* progress_data, + const DoutPrefixProvider* dpp, + optional_yield y) + { + return 0; + } + + int DBObject::DBReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y) + { + return parent_op.iterate(dpp, ofs, end, cb); + } + + int DBObject::swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) + { + return 0; + } + + int DBObject::swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) + { + return 0; + } + + int DBMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct) + { + std::unique_ptr meta_obj = get_meta_obj(); + 
meta_obj->set_in_extra_data(true); + meta_obj->set_hash_source(mp_obj.get_key()); + int ret; + + std::unique_ptr del_op = meta_obj->get_delete_op(); + del_op->params.bucket_owner = bucket->get_acl_owner(); + del_op->params.versioning_status = 0; + + // Since the data objects are associated with meta obj till + // MultipartUpload::Complete() is done, removing the metadata obj + // should remove all the uploads so far. + ret = del_op->delete_obj(dpp, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " << + ret << dendl; + } + return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret; + } + + static string mp_ns = RGW_OBJ_NS_MULTIPART; + + std::unique_ptr DBMultipartUpload::get_meta_obj() + { + return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns)); + } + + int DBMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) + { + int ret; + std::string oid = mp_obj.get_key(); + + char buf[33]; + std::unique_ptr obj; // create meta obj + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */ + upload_id.append(buf); + + mp_obj.init(oid, upload_id); + obj = get_meta_obj(); + + DB::Object op_target(store->getDB(), obj->get_bucket()->get_info(), + obj->get_obj()); + DB::Object::Write obj_op(&op_target); + + /* Create meta object */ + obj_op.meta.owner = owner.get_id(); + obj_op.meta.category = RGWObjCategory::MultiMeta; + obj_op.meta.flags = PUT_OBJ_CREATE_EXCL; + obj_op.meta.mtime = &mtime; + + multipart_upload_info upload_info; + upload_info.dest_placement = dest_placement; + + bufferlist bl; + encode(upload_info, bl); + obj_op.meta.data = &bl; + ret = obj_op.prepare(dpp); + if (ret < 0) + return ret; + ret = obj_op.write_meta(dpp, bl.length(), bl.length(), attrs); + + return ret; + } + + int DBMultipartUpload::list_parts(const DoutPrefixProvider *dpp, 
CephContext *cct, + int num_parts, int marker, + int *next_marker, bool *truncated, + bool assume_unsorted) + { + std::list parts_map; + + std::unique_ptr obj = get_meta_obj(); + + parts.clear(); + int ret; + + DB::Object op_target(store->getDB(), + obj->get_bucket()->get_info(), obj->get_obj()); + ret = op_target.get_mp_parts_list(dpp, parts_map); + if (ret < 0) { + return ret; + } + + int last_num = 0; + + while (!parts_map.empty()) { + std::unique_ptr part = std::make_unique(); + RGWUploadPartInfo &pinfo = parts_map.front(); + part->set_info(pinfo); + if ((int)pinfo.num > marker) { + last_num = pinfo.num; + parts[pinfo.num] = std::move(part); + } + parts_map.pop_front(); + } + + /* rebuild a map with only num_parts entries */ + std::map> new_parts; + std::map>::iterator piter; + int i; + for (i = 0, piter = parts.begin(); + i < num_parts && piter != parts.end(); + ++i, ++piter) { + last_num = piter->first; + new_parts[piter->first] = std::move(piter->second); + } + + if (truncated) { + *truncated = (piter != parts.end()); + } + + parts.swap(new_parts); + + if (next_marker) { + *next_marker = last_num; + } + + return 0; + } + + int DBMultipartUpload::complete(const DoutPrefixProvider *dpp, + optional_yield y, CephContext* cct, + map& part_etags, + list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& ofs, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) + { + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + std::string etag; + bufferlist etag_bl; + MD5 hash; + bool truncated; + int ret; + + int total_parts = 0; + int handled_parts = 0; + int max_parts = 1000; + int marker = 0; + uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size; + auto etags_iter = part_etags.begin(); + rgw::sal::Attrs attrs = target_obj->get_attrs(); + + ofs = 0; + accounted_size = 0; + do { + ret = list_parts(dpp, cct, max_parts, 
marker, &marker, &truncated); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_UPLOAD; + } + if (ret < 0) + return ret; + + total_parts += parts.size(); + if (!truncated && total_parts != (int)part_etags.size()) { + ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts + << " expected: " << part_etags.size() << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + + for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) { + DBMultipartPart* part = dynamic_cast(obj_iter->second.get()); + uint64_t part_size = part->get_size(); + if (handled_parts < (int)part_etags.size() - 1 && + part_size < min_part_size) { + ret = -ERR_TOO_SMALL; + return ret; + } + + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + if (etags_iter->first != (int)obj_iter->first) { + ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: " + << etags_iter->first << " next uploaded: " + << obj_iter->first << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + string part_etag = rgw_string_unquote(etags_iter->second); + if (part_etag.compare(part->get_etag()) != 0) { + ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first + << " etag: " << etags_iter->second << dendl; + ret = -ERR_INVALID_PART; + return ret; + } + + hex_to_buf(part->get_etag().c_str(), petag, + CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + + RGWUploadPartInfo& obj_part = part->get_info(); + + ofs += obj_part.size; + accounted_size += obj_part.accounted_size; + } + } while (truncated); + hash.Final((unsigned char *)final_etag); + + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], + sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)part_etags.size()); + etag = final_etag_str; + ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl; + + 
etag_bl.append(etag); + + attrs[RGW_ATTR_ETAG] = etag_bl; + + /* XXX: handle compression ? */ + + /* Rename all the object data entries with original object name (i.e + * from 'head_obj.name + "." + upload_id' to head_obj.name) */ + + /* Original head object */ + DB::Object op_target(store->getDB(), + target_obj->get_bucket()->get_info(), + target_obj->get_obj(), get_upload_id()); + DB::Object::Write obj_op(&op_target); + ret = obj_op.prepare(dpp); + + obj_op.meta.owner = owner.get_id(); + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.category = RGWObjCategory::Main; + obj_op.meta.modify_tail = true; + obj_op.meta.completeMultipart = true; + + ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs); + if (ret < 0) + return ret; + + /* No need to delete Meta obj here. It is deleted from sal */ + return ret; + } + + int DBMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) + { + if (!rule && !attrs) { + return 0; + } + + if (rule) { + if (!placement.empty()) { + *rule = &placement; + if (!attrs) { + /* Don't need attrs, done */ + return 0; + } + } else { + *rule = nullptr; + } + } + + /* We need either attributes or placement, so we need a read */ + std::unique_ptr meta_obj; + meta_obj = get_meta_obj(); + meta_obj->set_in_extra_data(true); + + multipart_upload_info upload_info; + bufferlist headbl; + + /* Read the obj head which contains the multipart_upload_info */ + std::unique_ptr read_op = meta_obj->get_read_op(); + int ret = read_op->prepare(y, dpp); + if (ret < 0) { + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + if (attrs) { + /* Attrs are filled in by prepare */ + *attrs = meta_obj->get_attrs(); + if (!rule || *rule != nullptr) { + /* placement was cached; don't actually read */ + return 0; + } + } + + /* Now read the placement from the head */ + ret = read_op->read(0, store->getDB()->get_max_head_size(), headbl, y, dpp); + if (ret < 0) { + if (ret 
== -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + return ret; + } + + if (headbl.length() <= 0) { + return -ERR_NO_SUCH_UPLOAD; + } + + /* Decode multipart_upload_info */ + auto hiter = headbl.cbegin(); + try { + decode(upload_info, hiter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl; + return -EIO; + } + placement = upload_info.dest_placement; + *rule = &placement; + + return 0; + } + + std::unique_ptr DBMultipartUpload::get_writer( + const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) + { + return std::make_unique(dpp, y, this, obj, store, owner, + ptail_placement_rule, part_num, part_num_str); + } + + DBMultipartWriter::DBMultipartWriter(const DoutPrefixProvider *dpp, + optional_yield y, + MultipartUpload* upload, + rgw::sal::Object* obj, + DBStore* _driver, + const rgw_user& _owner, + const rgw_placement_rule *_ptail_placement_rule, + uint64_t _part_num, const std::string& _part_num_str): + StoreWriter(dpp, y), + store(_driver), + owner(_owner), + ptail_placement_rule(_ptail_placement_rule), + head_obj(obj), + upload_id(upload->get_upload_id()), + part_num(_part_num), + oid(head_obj->get_name() + "." + upload_id + + "." + std::to_string(part_num)), + meta_obj(((DBMultipartUpload*)upload)->get_meta_obj()), + op_target(_driver->getDB(), head_obj->get_bucket()->get_info(), head_obj->get_obj(), upload_id), + parent_op(&op_target), + part_num_str(_part_num_str) {} + + int DBMultipartWriter::prepare(optional_yield y) + { + parent_op.prepare(NULL); + parent_op.set_mp_part_str(upload_id + "." + std::to_string(part_num)); + // XXX: do we need to handle part_num_str?? 
+ return 0; + } + + int DBMultipartWriter::process(bufferlist&& data, uint64_t offset) + { + /* XXX: same as AtomicWriter..consolidate code */ + total_data_size += data.length(); + + /* XXX: Optimize all bufferlist copies in this function */ + + /* copy head_data into meta. But for multipart we do not + * need to write head_data */ + uint64_t max_chunk_size = store->getDB()->get_max_chunk_size(); + int excess_size = 0; + + /* Accumulate tail_data till max_chunk_size or flush op */ + bufferlist tail_data; + + if (data.length() != 0) { + parent_op.meta.data = &head_data; /* Null data ?? */ + + /* handle tail )parts. + * First accumulate and write data into dbstore in its chunk_size + * parts + */ + if (!tail_part_size) { /* new tail part */ + tail_part_offset = offset; + } + data.begin(0).copy(data.length(), tail_data); + tail_part_size += tail_data.length(); + tail_part_data.append(tail_data); + + if (tail_part_size < max_chunk_size) { + return 0; + } else { + int write_ofs = 0; + while (tail_part_size >= max_chunk_size) { + excess_size = tail_part_size - max_chunk_size; + bufferlist tmp; + tail_part_data.begin(write_ofs).copy(max_chunk_size, tmp); + /* write tail objects data */ + int ret = parent_op.write_data(dpp, tmp, tail_part_offset); + + if (ret < 0) { + return ret; + } + + tail_part_size -= max_chunk_size; + write_ofs += max_chunk_size; + tail_part_offset += max_chunk_size; + } + /* reset tail parts or update if excess data */ + if (excess_size > 0) { /* wrote max_chunk_size data */ + tail_part_size = excess_size; + bufferlist tmp; + tail_part_data.begin(write_ofs).copy(excess_size, tmp); + tail_part_data = tmp; + } else { + tail_part_size = 0; + tail_part_data.clear(); + tail_part_offset = 0; + } + } + } else { + if (tail_part_size == 0) { + return 0; /* nothing more to write */ + } + + /* flush watever tail data is present */ + int ret = parent_op.write_data(dpp, tail_part_data, tail_part_offset); + if (ret < 0) { + return ret; + } + tail_part_size = 0; + 
tail_part_data.clear(); + tail_part_offset = 0; + } + + return 0; + } + + int DBMultipartWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) + { + int ret = 0; + /* XXX: same as AtomicWriter..consolidate code */ + parent_op.meta.mtime = mtime; + parent_op.meta.delete_at = delete_at; + parent_op.meta.if_match = if_match; + parent_op.meta.if_nomatch = if_nomatch; + parent_op.meta.user_data = user_data; + parent_op.meta.zones_trace = zones_trace; + + /* XXX: handle accounted size */ + accounted_size = total_data_size; + + if (ret < 0) + return ret; + + RGWUploadPartInfo info; + info.num = part_num; + info.etag = etag; + info.size = total_data_size; + info.accounted_size = accounted_size; + info.modified = real_clock::now(); + //info.manifest = manifest; + + DB::Object op_target(store->getDB(), + meta_obj->get_bucket()->get_info(), meta_obj->get_obj()); + ret = op_target.add_mp_part(dpp, info); + if (ret < 0) { + return ret == -ENOENT ? 
-ERR_NO_SUCH_UPLOAD : ret; + } + + return 0; + } + + DBAtomicWriter::DBAtomicWriter(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* _obj, + DBStore* _driver, + const rgw_user& _owner, + const rgw_placement_rule *_ptail_placement_rule, + uint64_t _olh_epoch, + const std::string& _unique_tag) : + StoreWriter(dpp, y), + store(_driver), + owner(_owner), + ptail_placement_rule(_ptail_placement_rule), + olh_epoch(_olh_epoch), + unique_tag(_unique_tag), + obj(_driver, _obj->get_key(), _obj->get_bucket()), + op_target(_driver->getDB(), obj.get_bucket()->get_info(), obj.get_obj()), + parent_op(&op_target) {} + + int DBAtomicWriter::prepare(optional_yield y) + { + return parent_op.prepare(NULL); /* send dpp */ + } + + int DBAtomicWriter::process(bufferlist&& data, uint64_t offset) + { + total_data_size += data.length(); + + /* XXX: Optimize all bufferlist copies in this function */ + + /* copy head_data into meta. */ + uint64_t head_size = store->getDB()->get_max_head_size(); + unsigned head_len = 0; + uint64_t max_chunk_size = store->getDB()->get_max_chunk_size(); + int excess_size = 0; + + /* Accumulate tail_data till max_chunk_size or flush op */ + bufferlist tail_data; + + if (data.length() != 0) { + if (offset < head_size) { + /* XXX: handle case (if exists) where offset > 0 & < head_size */ + head_len = std::min((uint64_t)data.length(), + head_size - offset); + bufferlist tmp; + data.begin(0).copy(head_len, tmp); + head_data.append(tmp); + + parent_op.meta.data = &head_data; + if (head_len == data.length()) { + return 0; + } + + /* Move offset by copy_len */ + offset = head_len; + } + + /* handle tail parts. 
+ * First accumulate and write data into dbstore in its chunk_size + * parts + */ + if (!tail_part_size) { /* new tail part */ + tail_part_offset = offset; + } + data.begin(head_len).copy(data.length() - head_len, tail_data); + tail_part_size += tail_data.length(); + tail_part_data.append(tail_data); + + if (tail_part_size < max_chunk_size) { + return 0; + } else { + int write_ofs = 0; + while (tail_part_size >= max_chunk_size) { + excess_size = tail_part_size - max_chunk_size; + bufferlist tmp; + tail_part_data.begin(write_ofs).copy(max_chunk_size, tmp); + /* write tail objects data */ + int ret = parent_op.write_data(dpp, tmp, tail_part_offset); + + if (ret < 0) { + return ret; + } + + tail_part_size -= max_chunk_size; + write_ofs += max_chunk_size; + tail_part_offset += max_chunk_size; + } + /* reset tail parts or update if excess data */ + if (excess_size > 0) { /* wrote max_chunk_size data */ + tail_part_size = excess_size; + bufferlist tmp; + tail_part_data.begin(write_ofs).copy(excess_size, tmp); + tail_part_data = tmp; + } else { + tail_part_size = 0; + tail_part_data.clear(); + tail_part_offset = 0; + } + } + } else { + if (tail_part_size == 0) { + return 0; /* nothing more to write */ + } + + /* flush watever tail data is present */ + int ret = parent_op.write_data(dpp, tail_part_data, tail_part_offset); + if (ret < 0) { + return ret; + } + tail_part_size = 0; + tail_part_data.clear(); + tail_part_offset = 0; + } + + return 0; + } + + int DBAtomicWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) + { + parent_op.meta.mtime = mtime; + parent_op.meta.delete_at = delete_at; + parent_op.meta.if_match = if_match; + parent_op.meta.if_nomatch = if_nomatch; + parent_op.meta.user_data = user_data; + 
parent_op.meta.zones_trace = zones_trace; + parent_op.meta.category = RGWObjCategory::Main; + + /* XXX: handle accounted size */ + accounted_size = total_data_size; + int ret = parent_op.write_meta(dpp, total_data_size, accounted_size, attrs); + if (canceled) { + *canceled = parent_op.meta.canceled; + } + + return ret; + + } + + std::unique_ptr DBStore::get_role(std::string name, + std::string tenant, + std::string path, + std::string trust_policy, + std::string max_session_duration_str, + std::multimap tags) + { + RGWRole* p = nullptr; + return std::unique_ptr(p); + } + + std::unique_ptr DBStore::get_role(std::string id) + { + RGWRole* p = nullptr; + return std::unique_ptr(p); + } + + std::unique_ptr DBStore::get_role(const RGWRoleInfo& info) + { + RGWRole* p = nullptr; + return std::unique_ptr(p); + } + + int DBStore::get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + vector>& roles) + { + return 0; + } + + std::unique_ptr DBStore::get_oidc_provider() + { + RGWOIDCProvider* p = nullptr; + return std::unique_ptr(p); + } + + int DBStore::get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + vector>& providers) + { + return 0; + } + + std::unique_ptr DBStore::get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) { + return nullptr; + } + + std::unique_ptr DBStore::get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) { + return std::make_unique(dpp, y, obj, this, owner, + ptail_placement_rule, olh_epoch, unique_tag); + } + + const std::string& DBStore::get_compression_type(const rgw_placement_rule& 
rule) { + return zone.get_rgw_params().get_compression_type(rule); + } + + bool DBStore::valid_placement(const rgw_placement_rule& rule) + { + // XXX: Till zonegroup, zone and storage-classes can be configured + // for dbstore return true + return true; //zone.get_rgw_params().valid_placement(rule); + } + + std::unique_ptr DBStore::get_user(const rgw_user &u) + { + return std::make_unique(this, u); + } + + int DBStore::get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y, std::unique_ptr* user) + { + RGWUserInfo uinfo; + User *u; + int ret = 0; + RGWObjVersionTracker objv_tracker; + + ret = getDB()->get_user(dpp, string("access_key"), key, uinfo, nullptr, + &objv_tracker); + + if (ret < 0) + return ret; + + u = new DBUser(this, uinfo); + + if (!u) + return -ENOMEM; + + u->get_version_tracker() = objv_tracker; + user->reset(u); + + return 0; + } + + int DBStore::get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr* user) + { + RGWUserInfo uinfo; + User *u; + int ret = 0; + RGWObjVersionTracker objv_tracker; + + ret = getDB()->get_user(dpp, string("email"), email, uinfo, nullptr, + &objv_tracker); + + if (ret < 0) + return ret; + + u = new DBUser(this, uinfo); + + if (!u) + return -ENOMEM; + + u->get_version_tracker() = objv_tracker; + user->reset(u); + + return ret; + } + + int DBStore::get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) + { + /* Swift keys and subusers are not supported for now */ + return -ENOTSUP; + } + + std::string DBStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) + { + return "PLACEHOLDER"; // for instance unique identifier + } + + std::unique_ptr DBStore::get_object(const rgw_obj_key& k) + { + return std::make_unique(this, k); + } + + + int DBStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, 
optional_yield y) + { + int ret; + Bucket* bp; + + bp = new DBBucket(this, b, u); + ret = bp->load_bucket(dpp, y); + if (ret < 0) { + delete bp; + return ret; + } + + bucket->reset(bp); + return 0; + } + + int DBStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) + { + Bucket* bp; + + bp = new DBBucket(this, i, u); + /* Don't need to fetch the bucket info, use the provided one */ + + bucket->reset(bp); + return 0; + } + + int DBStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr* bucket, optional_yield y) + { + rgw_bucket b; + + b.tenant = tenant; + b.name = name; + + return get_bucket(dpp, u, b, bucket, y); + } + + bool DBStore::is_meta_master() + { + return true; + } + + int DBStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version *objv, + bufferlist& in_data, + JSONParser *jp, req_info& info, + optional_yield y) + { + return 0; + } + + int DBStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) + { + return 0; + } + + std::string DBStore::zone_unique_id(uint64_t unique_num) + { + return ""; + } + + std::string DBStore::zone_unique_trans_id(const uint64_t unique_num) + { + return ""; + } + + int DBStore::get_zonegroup(const std::string& id, std::unique_ptr* zg) + { + /* XXX: for now only one zonegroup supported */ + ZoneGroup* group = new DBZoneGroup(this, std::make_unique()); + if (!group) + return -ENOMEM; + + zg->reset(group); + return 0; + } + + int DBStore::list_all_zones(const DoutPrefixProvider* dpp, + std::list& zone_ids) + { + zone_ids.push_back(zone.get_id()); + return 0; + } + + int DBStore::cluster_stat(RGWClusterStat& stats) + { + return 0; + } + + std::unique_ptr DBStore::get_lifecycle(void) + { + return std::make_unique(this); + } + + std::unique_ptr 
DBStore::get_completions(void) + { + return 0; + } + + int DBLifecycle::get_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) + { + return store->getDB()->get_entry(oid, marker, entry); + } + + int DBLifecycle::get_next_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) + { + return store->getDB()->get_next_entry(oid, marker, entry); + } + + int DBLifecycle::set_entry(const std::string& oid, LCEntry& entry) + { + return store->getDB()->set_entry(oid, entry); + } + + int DBLifecycle::list_entries(const std::string& oid, const std::string& marker, + uint32_t max_entries, vector>& entries) + { + return store->getDB()->list_entries(oid, marker, max_entries, entries); + } + + int DBLifecycle::rm_entry(const std::string& oid, LCEntry& entry) + { + return store->getDB()->rm_entry(oid, entry); + } + + int DBLifecycle::get_head(const std::string& oid, std::unique_ptr* head) + { + return store->getDB()->get_head(oid, head); + } + + int DBLifecycle::put_head(const std::string& oid, LCHead& head) + { + return store->getDB()->put_head(oid, head); + } + + std::unique_ptr DBLifecycle::get_serializer(const std::string& lock_name, + const std::string& oid, + const std::string& cookie) + { + return std::make_unique(store, oid, lock_name, cookie); + } + + std::unique_ptr DBStore::get_notification( + rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, + rgw::notify::EventType event_type, optional_yield y, + const std::string* object_name) + { + return std::make_unique(obj, src_obj, event_type); + } + + std::unique_ptr DBStore::get_notification( + const DoutPrefixProvider* dpp, rgw::sal::Object* obj, + rgw::sal::Object* src_obj, + rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, + std::string& _user_id, std::string& _user_tenant, std::string& _req_id, + optional_yield y) + { + return std::make_unique(obj, src_obj, event_type); + } + + RGWLC* DBStore::get_rgwlc(void) { + return lc; + } + + int 
DBStore::log_usage(const DoutPrefixProvider *dpp, map& usage_info) + { + return 0; + } + + int DBStore::log_op(const DoutPrefixProvider *dpp, string& oid, bufferlist& bl) + { + return 0; + } + + int DBStore::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, + const map& meta) + { + return 0; + } + + void DBStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) + { + return; + } + + void DBStore::get_quota(RGWQuota& quota) + { + // XXX: Not handled for the first pass + return; + } + + int DBStore::set_buckets_enabled(const DoutPrefixProvider *dpp, vector& buckets, bool enabled) + { + int ret = 0; + + vector::iterator iter; + + for (iter = buckets.begin(); iter != buckets.end(); ++iter) { + rgw_bucket& bucket = *iter; + if (enabled) { + ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl; + } else { + ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl; + } + + RGWBucketInfo info; + map attrs; + int r = getDB()->get_bucket_info(dpp, string("name"), "", info, &attrs, + nullptr, nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + if (enabled) { + info.flags &= ~BUCKET_SUSPENDED; + } else { + info.flags |= BUCKET_SUSPENDED; + } + + r = getDB()->update_bucket(dpp, "info", info, false, nullptr, &attrs, nullptr, &info.objv_tracker); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + } + return ret; + } + + int DBStore::get_sync_policy_handler(const DoutPrefixProvider *dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *phandler, + optional_yield y) + { + return 0; + } + + RGWDataSyncStatusManager* DBStore::get_data_sync_manager(const rgw_zone_id& 
source_zone) + { + return 0; + } + + int DBStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, + RGWUsageIter& usage_iter, + map& usage) + { + return 0; + } + + int DBStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) + { + return 0; + } + + int DBStore::get_config_key_val(string name, bufferlist *bl) + { + return -ENOTSUP; + } + + int DBStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const string& section, const string& marker, void** phandle) + { + return 0; + } + + int DBStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list& keys, bool* truncated) + { + return 0; + } + + void DBStore::meta_list_keys_complete(void* handle) + { + return; + } + + std::string DBStore::meta_get_marker(void* handle) + { + return ""; + } + + int DBStore::meta_remove(const DoutPrefixProvider *dpp, string& metadata_key, optional_yield y) + { + return 0; + } + + int DBStore::initialize(CephContext *_cct, const DoutPrefixProvider *_dpp) { + int ret = 0; + cct = _cct; + dpp = _dpp; + + lc = new RGWLC(); + lc->initialize(cct, this); + + if (use_lc_thread) { + ret = db->createLCTables(dpp); + lc->start_processor(); + } + + ret = db->createGC(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) <<"GC thread creation failed: ret = " << ret << dendl; + } + + return ret; + } + + int DBLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) + { + return -ENOENT; + } + + int DBLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) + { + return -ENOENT; + } + + int DBLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) + { + return -ENOENT; + } + + int DBLuaManager::add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) + { + return 
-ENOENT; + } + + int DBLuaManager::remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) + { + return -ENOENT; + } + + int DBLuaManager::list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) + { + return -ENOENT; + } +} // namespace rgw::sal + +extern "C" { + + void *newDBStore(CephContext *cct) + { + rgw::sal::DBStore *driver = new rgw::sal::DBStore(); + DBStoreManager *dbsm = new DBStoreManager(cct); + + DB *db = dbsm->getDB(); + if (!db) { + delete dbsm; + delete driver; + return nullptr; + } + + driver->setDBStoreManager(dbsm); + driver->setDB(db); + db->set_driver((rgw::sal::Driver*)driver); + db->set_context(cct); + + return driver; + } + +} diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h new file mode 100644 index 000000000..3acdb4ba3 --- /dev/null +++ b/src/rgw/rgw_sal_dbstore.h @@ -0,0 +1,921 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_sal_store.h" +#include "rgw_oidc_provider.h" +#include "rgw_role.h" +#include "rgw_lc.h" +#include "rgw_multi.h" + +#include "driver/dbstore/common/dbstore.h" +#include "driver/dbstore/dbstore_mgr.h" + +namespace rgw { namespace sal { + + class DBStore; + +class LCDBSerializer : public StoreLCSerializer { + +public: + LCDBSerializer(DBStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie) {} + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override { return 0; } + virtual int unlock() override { + return 0; + } +}; + +class DBLifecycle : public StoreLifecycle { + DBStore* store; + +public: + DBLifecycle(DBStore* _st) : store(_st) {} + + using StoreLifecycle::get_entry; + virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr* entry) override; + virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr* entry) override; + virtual int set_entry(const std::string& oid, LCEntry& entry) override; + virtual int list_entries(const std::string& oid, const std::string& marker, + uint32_t max_entries, + std::vector>& entries) override; + virtual int rm_entry(const std::string& oid, LCEntry& entry) override; + virtual int get_head(const std::string& oid, std::unique_ptr* head) override; + virtual int put_head(const std::string& oid, LCHead& head) override; + virtual std::unique_ptr get_serializer(const std::string& lock_name, + const std::string& oid, + const std::string& cookie) override; +}; + +class DBNotification : public StoreNotification { +protected: + public: + DBNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type) + : StoreNotification(_obj, _src_obj, _type) {} + ~DBNotification() = default; + + virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override { return 0;} + virtual int publish_commit(const 
DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, const std::string& etag, const std::string& version) override { return 0; } +}; + + class DBUser : public StoreUser { + private: + DBStore *store; + + public: + DBUser(DBStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { } + DBUser(DBStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { } + DBUser(DBStore *_st) : store(_st) { } + DBUser(DBUser& _o) = default; + DBUser() {} + + virtual std::unique_ptr clone() override { + return std::unique_ptr(new DBUser(*this)); + } + int list_buckets(const DoutPrefixProvider *dpp, const std::string& marker, const std::string& end_marker, + uint64_t max, bool need_stats, BucketList& buckets, optional_yield y) override; + virtual int create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + std::unique_ptr* bucket, + optional_yield y) override; + virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time *last_stats_sync = nullptr, + ceph::real_time *last_stats_update = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override; + virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) 
override; + + /* Placeholders */ + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override; + virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override; + virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override; + + friend class DBBucket; + }; + + class DBBucket : public StoreBucket { + private: + DBStore *store; + RGWAccessControlPolicy acls; + + public: + DBBucket(DBStore *_st) + : store(_st), + acls() { + } + + DBBucket(DBStore *_st, User* _u) + : StoreBucket(_u), + store(_st), + acls() { + } + + DBBucket(DBStore *_st, const rgw_bucket& _b) + : StoreBucket(_b), + store(_st), + acls() { + } + + DBBucket(DBStore *_st, const RGWBucketEnt& _e) + : StoreBucket(_e), + store(_st), + acls() { + } + + DBBucket(DBStore *_st, const RGWBucketInfo& _i) + : StoreBucket(_i), + store(_st), + acls() { + } + + DBBucket(DBStore *_st, const rgw_bucket& _b, User* _u) + : StoreBucket(_b, _u), + store(_st), + acls() { + } + + DBBucket(DBStore *_st, const RGWBucketEnt& _e, User* _u) + : StoreBucket(_e, _u), + store(_st), + acls() { + } + + DBBucket(DBStore *_st, const RGWBucketInfo& _i, User* _u) + : StoreBucket(_i, _u), + store(_st), + acls() { + } + + ~DBBucket() { } + + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int list(const DoutPrefixProvider *dpp, ListParams&, int, ListResults&, optional_yield y) override; + virtual int remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override; + virtual int remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider 
*dpp) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } + virtual int set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy& acl, optional_yield y) override; + virtual int load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats = false) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, + std::string *bucket_ver, std::string *master_ver, + std::map& stats, + std::string *max_marker = nullptr, + bool *syncstopped = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, const bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB* ctx) override; + virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int update_container_stats(const DoutPrefixProvider *dpp) override; + virtual int check_bucket_shards(const DoutPrefixProvider *dpp) override; + virtual int chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y) override; + virtual int put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time mtime) override; + virtual bool is_owner(User* user) override; + virtual int check_empty(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& attrs, optional_yield y) override; + virtual int try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool *is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + virtual int 
remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) override; + virtual int check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) override; + virtual int rebuild_index(const DoutPrefixProvider *dpp) override; + virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override; + virtual int purge_instance(const DoutPrefixProvider *dpp) override; + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + virtual std::unique_ptr get_multipart_upload( + const std::string& oid, std::optional upload_id, + ACLOwner owner={}, ceph::real_time mtime=ceph::real_clock::now()) override; + virtual int list_multiparts(const DoutPrefixProvider *dpp, + const std::string& prefix, + std::string& marker, + const std::string& delim, + const int& max_uploads, + std::vector>& uploads, + std::map *common_prefixes, + bool *is_truncated) override; + virtual int abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) override; + + friend class DBStore; + }; + + class DBPlacementTier: public StorePlacementTier { + DBStore* store; + RGWZoneGroupPlacementTier tier; + public: + DBPlacementTier(DBStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {} + virtual ~DBPlacementTier() = default; + + virtual const std::string& get_tier_type() { return tier.tier_type; } + virtual const std::string& get_storage_class() { return tier.storage_class; } + virtual bool retain_head_object() { return tier.retain_head_object; } + RGWZoneGroupPlacementTier& get_rt() { return tier; } + }; + + class DBZoneGroup : public StoreZoneGroup { + DBStore* store; + std::unique_ptr group; + std::string empty; + public: + DBZoneGroup(DBStore* _store, std::unique_ptr _group) : store(_store), group(std::move(_group)) {} + virtual ~DBZoneGroup() = default; + + virtual const std::string& get_id() const override { return group->get_id(); }; + virtual const std::string& 
get_name() const override { return group->get_name(); }; + virtual int equals(const std::string& other_zonegroup) const override { + return group->equals(other_zonegroup); + }; + /** Get the endpoint from zonegroup, or from master zone if not set */ + virtual const std::string& get_endpoint() const override; + virtual bool placement_target_exists(std::string& target) const override; + virtual bool is_master_zonegroup() const override { + return group->is_master_zonegroup(); + }; + virtual const std::string& get_api_name() const override { return group->api_name; }; + virtual void get_placement_target_names(std::set& names) const override; + virtual const std::string& get_default_placement_name() const override { + return group->default_placement.name; }; + virtual int get_hostnames(std::list& names) const override { + names = group->hostnames; + return 0; + }; + virtual int get_s3website_hostnames(std::list& names) const override { + names = group->hostnames_s3website; + return 0; + }; + virtual int get_zone_count() const override { + /* currently only 1 zone supported */ + return 1; + } + virtual int get_placement_tier(const rgw_placement_rule& rule, + std::unique_ptr* tier) { + return -1; + } + virtual int get_zone_by_id(const std::string& id, std::unique_ptr* zone) override { + return -1; + } + virtual int get_zone_by_name(const std::string& name, std::unique_ptr* zone) override { + return -1; + } + virtual int list_zones(std::list& zone_ids) override { + zone_ids.clear(); + return 0; + } + bool supports(std::string_view feature) const override { + return group->supports(feature); + } + virtual std::unique_ptr clone() override { + std::unique_ptrzg = std::make_unique(*group.get()); + return std::make_unique(store, std::move(zg)); + } + }; + + class DBZone : public StoreZone { + protected: + DBStore* store; + RGWRealm *realm{nullptr}; + DBZoneGroup *zonegroup{nullptr}; + RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, 
etc. */ + RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */ + RGWPeriod *current_period{nullptr}; + + public: + DBZone(DBStore* _store) : store(_store) { + realm = new RGWRealm(); + zonegroup = new DBZoneGroup(store, std::make_unique()); + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); + + // XXX: only default and STANDARD supported for now + RGWZonePlacementInfo info; + RGWZoneStorageClasses sc; + sc.set_storage_class("STANDARD", nullptr, nullptr); + info.storage_classes = sc; + zone_params->placement_pools["default"] = info; + } + ~DBZone() { + delete realm; + delete zonegroup; + delete zone_public_config; + delete zone_params; + delete current_period; + } + + virtual std::unique_ptr clone() override { + return std::make_unique(store); + } + virtual ZoneGroup& get_zonegroup() override; + const RGWZoneParams& get_rgw_params(); + virtual const std::string& get_id() override; + virtual const std::string& get_name() const override; + virtual bool is_writeable() override; + virtual bool get_redirect_endpoint(std::string* endpoint) override; + virtual bool has_zonegroup_api(const std::string& api) const override; + virtual const std::string& get_current_period_id() override; + virtual const RGWAccessKey& get_system_key() override; + virtual const std::string& get_realm_name() override; + virtual const std::string& get_realm_id() override; + virtual const std::string_view get_tier_type() override { return "rgw"; } + virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override; + }; + + class DBLuaManager : public StoreLuaManager { + DBStore* store; + + public: + DBLuaManager(DBStore* _s) : store(_s) + { + } + virtual ~DBLuaManager() = default; + + /** Get a script named with the given key from the backing store */ + virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override; + /** Put a script named 
with the given key to the backing store */ + virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override; + /** Delete a script named with the given key from the backing store */ + virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override; + /** Add a lua package */ + virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override; + /** Remove a lua package */ + virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override; + /** List lua packages */ + virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override; + }; + + class DBOIDCProvider : public RGWOIDCProvider { + DBStore* store; + public: + DBOIDCProvider(DBStore* _store) : store(_store) {} + ~DBOIDCProvider() = default; + + virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override { return 0; } + virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override { return 0; } + virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override { return 0;} + + void encode(bufferlist& bl) const { + RGWOIDCProvider::encode(bl); + } + void decode(bufferlist::const_iterator& bl) { + RGWOIDCProvider::decode(bl); + } + }; + + /* + * For multipart upload, below is the process flow - + * + * MultipartUpload::Init - create head object of meta obj (src_obj_name + "." + upload_id) + * [ Meta object stores all the parts upload info] + * MultipartWriter::process - create all data/tail objects with obj_name same as + * meta obj (so that they can all be identified & deleted + * during abort) + * MultipartUpload::Abort - Just delete meta obj .. 
that will indirectly delete all the + * uploads associated with that upload id / meta obj so far. + * MultipartUpload::Complete - create head object of the original object (if not exists) & + * rename all data/tail objects to orig object name and update + * metadata of the orig object. + */ + class DBMultipartPart : public StoreMultipartPart { + protected: + RGWUploadPartInfo info; /* XXX: info contains manifest also which is not needed */ + + public: + DBMultipartPart() = default; + virtual ~DBMultipartPart() = default; + + virtual RGWUploadPartInfo& get_info() { return info; } + virtual void set_info(const RGWUploadPartInfo& _info) { info = _info; } + virtual uint32_t get_num() { return info.num; } + virtual uint64_t get_size() { return info.accounted_size; } + virtual const std::string& get_etag() { return info.etag; } + virtual ceph::real_time& get_mtime() { return info.modified; } + + }; + + class DBMPObj { + std::string oid; // object name + std::string upload_id; + std::string meta; // multipart meta object = . + public: + DBMPObj() {} + DBMPObj(const std::string& _oid, const std::string& _upload_id) { + init(_oid, _upload_id, _upload_id); + } + DBMPObj(const std::string& _oid, std::optional _upload_id) { + if (_upload_id) { + init(_oid, *_upload_id, *_upload_id); + } else { + from_meta(_oid); + } + } + void init(const std::string& _oid, const std::string& _upload_id) { + init(_oid, _upload_id, _upload_id); + } + void init(const std::string& _oid, const std::string& _upload_id, const std::string& part_unique_str) { + if (_oid.empty()) { + clear(); + return; + } + oid = _oid; + upload_id = _upload_id; + meta = oid + "." + upload_id; + } + const std::string& get_upload_id() const { + return upload_id; + } + const std::string& get_key() const { + return oid; + } + const std::string& get_meta() const { return meta; } + bool from_meta(const std::string& meta) { + int end_pos = meta.length(); + int mid_pos = meta.rfind('.', end_pos - 1); // . 
+ if (mid_pos < 0) + return false; + oid = meta.substr(0, mid_pos); + upload_id = meta.substr(mid_pos + 1, end_pos - mid_pos - 1); + init(oid, upload_id, upload_id); + return true; + } + void clear() { + oid = ""; + meta = ""; + upload_id = ""; + } + }; + + class DBMultipartUpload : public StoreMultipartUpload { + DBStore* store; + DBMPObj mp_obj; + ACLOwner owner; + ceph::real_time mtime; + rgw_placement_rule placement; + + public: + DBMultipartUpload(DBStore* _store, Bucket* _bucket, const std::string& oid, std::optional upload_id, ACLOwner _owner, ceph::real_time _mtime) : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id), owner(_owner), mtime(_mtime) {} + virtual ~DBMultipartUpload() = default; + + virtual const std::string& get_meta() const { return mp_obj.get_meta(); } + virtual const std::string& get_key() const { return mp_obj.get_key(); } + virtual const std::string& get_upload_id() const { return mp_obj.get_upload_id(); } + virtual const ACLOwner& get_owner() const override { return owner; } + virtual ceph::real_time& get_mtime() { return mtime; } + virtual std::unique_ptr get_meta_obj() override; + virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override; + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int num_parts, int marker, + int* next_marker, bool* truncated, + bool assume_unsorted = false) override; + virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override; + virtual int complete(const DoutPrefixProvider* dpp, + optional_yield y, CephContext* cct, + std::map& part_etags, + std::list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& ofs, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) override; + virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, 
rgw::sal::Attrs* attrs = nullptr) override; + virtual std::unique_ptr get_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) override; + }; + + class DBObject : public StoreObject { + private: + DBStore* store; + RGWAccessControlPolicy acls; + + public: + struct DBReadOp : public ReadOp { + private: + DBObject* source; + RGWObjectCtx* rctx; + DB::Object op_target; + DB::Object::Read parent_op; + + public: + DBReadOp(DBObject *_source, RGWObjectCtx *_rctx); + + virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override; + + /* + * Both `read` and `iterate` read up through index `end` + * *inclusive*. The number of bytes that could be returned is + * `end - ofs + 1`. + */ + virtual int read(int64_t ofs, int64_t end, bufferlist& bl, + optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual int iterate(const DoutPrefixProvider* dpp, int64_t ofs, + int64_t end, RGWGetDataCB* cb, + optional_yield y) override; + + virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override; + }; + + struct DBDeleteOp : public DeleteOp { + private: + DBObject* source; + DB::Object op_target; + DB::Object::Delete parent_op; + + public: + DBDeleteOp(DBObject* _source); + + virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override; + }; + + DBObject() = default; + + DBObject(DBStore *_st, const rgw_obj_key& _k) + : StoreObject(_k), + store(_st), + acls() {} + + DBObject(DBStore *_st, const rgw_obj_key& _k, Bucket* _b) + : StoreObject(_k, _b), + store(_st), + acls() {} + + DBObject(DBObject& _o) = default; + + virtual int delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + bool prevent_versioning = false) override; + virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* 
aio, + bool keep_index_consistent, optional_yield y) override; + virtual int copy_object(User* user, + req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, std::string* tag, std::string* etag, + void (*progress_cb)(off_t, void *), void* progress_data, + const DoutPrefixProvider* dpp, optional_yield y) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } + virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } + + virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override; + virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; + virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; + virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override; + virtual bool is_expired() override; + virtual void gen_rand_obj_instance_name() override; + virtual std::unique_ptr clone() override { + return std::unique_ptr(new DBObject(*this)); + } + virtual std::unique_ptr get_serializer(const DoutPrefixProvider *dpp, + const std::string& lock_name) override; + virtual int transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& 
mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; + virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override; + + /* Swift versioning */ + virtual int swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) override; + virtual int swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) override; + + /* OPs */ + virtual std::unique_ptr get_read_op() override; + virtual std::unique_ptr get_delete_op() override; + + /* OMAP */ + virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool* pmore, optional_yield y) override; + virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map *m, + optional_yield y) override; + virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) override; + virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) override; + virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override; + private: + int read_attrs(const DoutPrefixProvider* dpp, DB::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr); + }; + + class MPDBSerializer : public StoreMPSerializer { + + public: + MPDBSerializer(const DoutPrefixProvider *dpp, DBStore* store, DBObject* obj, const std::string& lock_name) {} + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override {return 0; } + virtual int unlock() override { return 0;} + }; + + class DBAtomicWriter : public StoreWriter { + protected: + rgw::sal::DBStore* store; + const rgw_user& owner; + const rgw_placement_rule *ptail_placement_rule; + uint64_t olh_epoch; + const std::string& unique_tag; + DBObject 
obj; + DB::Object op_target; + DB::Object::Write parent_op; + uint64_t total_data_size = 0; /* for total data being uploaded */ + bufferlist head_data; + bufferlist tail_part_data; + uint64_t tail_part_offset; + uint64_t tail_part_size = 0; /* corresponds to each tail part being + written to dbstore */ + + public: + DBAtomicWriter(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + DBStore* _store, + const rgw_user& _owner, + const rgw_placement_rule *_ptail_placement_rule, + uint64_t _olh_epoch, + const std::string& _unique_tag); + ~DBAtomicWriter() = default; + + // prepare to start processing object data + virtual int prepare(optional_yield y) override; + + // Process a bufferlist + virtual int process(bufferlist&& data, uint64_t offset) override; + + // complete the operation and make its result visible to clients + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + }; + + class DBMultipartWriter : public StoreWriter { + protected: + rgw::sal::DBStore* store; + const rgw_user& owner; + const rgw_placement_rule *ptail_placement_rule; + uint64_t olh_epoch; + rgw::sal::Object* head_obj; + std::string upload_id; + int part_num; + std::string oid; /* object->name() + "." + "upload_id" + "." 
+ part_num */ + std::unique_ptr meta_obj; + DB::Object op_target; + DB::Object::Write parent_op; + std::string part_num_str; + uint64_t total_data_size = 0; /* for total data being uploaded */ + bufferlist head_data; + bufferlist tail_part_data; + uint64_t tail_part_offset; + uint64_t tail_part_size = 0; /* corresponds to each tail part being + written to dbstore */ + +public: + DBMultipartWriter(const DoutPrefixProvider *dpp, + optional_yield y, MultipartUpload* upload, + rgw::sal::Object* obj, + DBStore* _store, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, const std::string& part_num_str); + ~DBMultipartWriter() = default; + + // prepare to start processing object data + virtual int prepare(optional_yield y) override; + + // Process a bufferlist + virtual int process(bufferlist&& data, uint64_t offset) override; + + // complete the operation and make its result visible to clients + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + }; + + class DBStore : public StoreDriver { + private: + /* DBStoreManager is used in case multiple + * connections are needed one for each tenant. + */ + DBStoreManager *dbsm; + /* default db (single connection). 
If needed + * multiple db handles (for eg., one for each tenant), + * use dbsm->getDB(tenant) */ + DB *db; + DBZone zone; + RGWSyncModuleInstanceRef sync_module; + RGWLC* lc; + CephContext *cct; + const DoutPrefixProvider *dpp; + bool use_lc_thread; + + public: + DBStore(): dbsm(nullptr), zone(this), cct(nullptr), dpp(nullptr), + use_lc_thread(false) {} + ~DBStore() { delete dbsm; } + + DBStore& set_run_lc_thread(bool _use_lc_thread) { + use_lc_thread = _use_lc_thread; + return *this; + } + + virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override; + + virtual const std::string get_name() const override { + return "dbstore"; + } + + virtual std::unique_ptr get_user(const rgw_user& u) override; + virtual int get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y, std::unique_ptr* user) override; + virtual int get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr* user) override; + virtual int get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) override; + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y); + virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) override; + virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) override; + virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr* bucket, optional_yield y) override; + virtual bool is_meta_master() override; + virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv, + bufferlist& in_data, JSONParser *jp, req_info& info, + optional_yield y) override; + virtual int forward_iam_request_to_master(const 
DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) override; + virtual Zone* get_zone() { return &zone; } + virtual std::string zone_unique_id(uint64_t unique_num) override; + virtual std::string zone_unique_trans_id(const uint64_t unique_num) override; + virtual int get_zonegroup(const std::string& id, std::unique_ptr* zonegroup) override; + virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list& zone_ids) override; + virtual int cluster_stat(RGWClusterStat& stats) override; + virtual std::unique_ptr get_lifecycle(void) override; + virtual std::unique_ptr get_completions(void) override; + + virtual std::unique_ptr get_notification( + rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, + rgw::notify::EventType event_type, optional_yield y, const std::string* object_name) override; + + virtual std::unique_ptr get_notification( + const DoutPrefixProvider* dpp, rgw::sal::Object* obj, + rgw::sal::Object* src_obj, + rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, + std::string& _user_id, std::string& _user_tenant, std::string& _req_id, + optional_yield y) override; + + virtual RGWLC* get_rgwlc(void) override; + virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return NULL; } + virtual int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info) override; + virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override; + virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, + const std::map& meta) override; + virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override; + virtual void get_quota(RGWQuota& quota) override; + virtual int set_buckets_enabled(const DoutPrefixProvider *dpp, std::vector& buckets, bool enabled) override; + virtual int 
get_sync_policy_handler(const DoutPrefixProvider *dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *phandler, + optional_yield y) override; + virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override; + virtual void wakeup_meta_sync_shards(std::set& shard_ids) override { return; } + virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, + const rgw_zone_id& source_zone, + boost::container::flat_map< + int, + boost::container::flat_set>& shard_ids) override { return; } + virtual int clear_usage(const DoutPrefixProvider *dpp) override { return 0; } + virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, + RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + virtual int get_config_key_val(std::string name, bufferlist* bl) override; + virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override; + virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list& keys, bool* truncated) override; + virtual void meta_list_keys_complete(void* handle) override; + virtual std::string meta_get_marker(void *handle) override; + virtual int meta_remove(const DoutPrefixProvider *dpp, std::string& metadata_key, optional_yield y) override; + + virtual const RGWSyncModuleInstanceRef& get_sync_module() { return sync_module; } + virtual std::string get_host_id() { return ""; } + + virtual std::unique_ptr get_lua_manager() override; + virtual std::unique_ptr get_role(std::string name, + std::string tenant, + std::string path="", + std::string trust_policy="", + std::string max_session_duration_str="", + std::multimap tags={}) override; + virtual std::unique_ptr get_role(std::string id) 
override; + virtual std::unique_ptr get_role(const RGWRoleInfo& info) override; + virtual int get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) override; + virtual std::unique_ptr get_oidc_provider() override; + virtual int get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + std::vector>& providers) override; + virtual std::unique_ptr get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) override; + virtual std::unique_ptr get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) override; + + virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override; + virtual bool valid_placement(const rgw_placement_rule& rule) override; + + virtual void finalize(void) override; + + virtual CephContext *ctx(void) override { + return db->ctx(); + } + + virtual void register_admin_apis(RGWRESTMgr* mgr) override { }; + + /* Unique to DBStore */ + void setDBStoreManager(DBStoreManager *stm) { dbsm = stm; } + DBStoreManager *getDBStoreManager(void) { return dbsm; } + + void setDB(DB * st) { db = st; } + DB *getDB(void) { return db; } + + DB *getDB(std::string tenant) { return dbsm->getDB(tenant, false); } + }; + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc new file mode 100644 index 000000000..2a48cec9c --- /dev/null +++ b/src/rgw/rgw_sal_filter.cc @@ -0,0 +1,1370 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + 
* + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_sal_filter.h" + +namespace rgw { namespace sal { + +/* These are helpers for getting 'next' out of an object, handling nullptr */ +static inline PlacementTier* nextPlacementTier(PlacementTier* t) +{ + if (!t) + return nullptr; + + return dynamic_cast(t)->get_next(); +} + +static inline User* nextUser(User* t) +{ + if (!t) + return nullptr; + + return dynamic_cast(t)->get_next(); +} + +static inline Bucket* nextBucket(Bucket* t) +{ + if (!t) + return nullptr; + + return dynamic_cast(t)->get_next(); +} + +static inline Object* nextObject(Object* t) +{ + if (!t) + return nullptr; + + return dynamic_cast(t)->get_next(); +} + +int FilterZoneGroup::get_placement_tier(const rgw_placement_rule& rule, + std::unique_ptr* tier) +{ + std::unique_ptr nt; + int ret; + + ret = next->get_placement_tier(rule, &nt); + if (ret != 0) + return ret; + + PlacementTier* t = new FilterPlacementTier(std::move(nt)); + tier->reset(t); + return 0; +} + +int FilterZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr* zone) +{ + std::unique_ptr nz; + int ret = next->get_zone_by_id(id, &nz); + if (ret < 0) + return ret; + Zone *z = new FilterZone(std::move(nz)); + + zone->reset(z); + return 0; +} + +int FilterZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr* zone) +{ + std::unique_ptr nz; + int ret = next->get_zone_by_name(name, &nz); + if (ret < 0) + return ret; + Zone *z = new FilterZone(std::move(nz)); + + zone->reset(z); + return 0; +} + +int FilterDriver::initialize(CephContext *cct, const DoutPrefixProvider *dpp) +{ + zone = std::make_unique(next->get_zone()->clone()); + + return 0; +} + +const std::string FilterDriver::get_name() const +{ + std::string name = "filter<" + 
next->get_name() + ">"; + return name; +} + +std::string FilterDriver::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) +{ + return next->get_cluster_id(dpp, y); +} + +std::unique_ptr FilterDriver::get_user(const rgw_user &u) +{ + std::unique_ptr user = next->get_user(u); + return std::make_unique(std::move(user)); +} + +int FilterDriver::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr* user) +{ + std::unique_ptr nu; + int ret; + + ret = next->get_user_by_access_key(dpp, key, y, &nu); + if (ret != 0) + return ret; + + User* u = new FilterUser(std::move(nu)); + user->reset(u); + return 0; +} + +int FilterDriver::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr* user) +{ + std::unique_ptr nu; + int ret; + + ret = next->get_user_by_email(dpp, email, y, &nu); + if (ret != 0) + return ret; + + User* u = new FilterUser(std::move(nu)); + user->reset(u); + return 0; +} + +int FilterDriver::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) +{ + std::unique_ptr nu; + int ret; + + ret = next->get_user_by_swift(dpp, user_str, y, &nu); + if (ret != 0) + return ret; + + User* u = new FilterUser(std::move(nu)); + user->reset(u); + return 0; +} + +std::unique_ptr FilterDriver::get_object(const rgw_obj_key& k) +{ + std::unique_ptr o = next->get_object(k); + return std::make_unique(std::move(o)); +} + +int FilterDriver::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) +{ + std::unique_ptr nb; + int ret; + User* nu = nextUser(u); + + ret = next->get_bucket(dpp, nu, b, &nb, y); + if (ret != 0) + return ret; + + Bucket* fb = new FilterBucket(std::move(nb), u); + bucket->reset(fb); + return 0; +} + +int FilterDriver::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) +{ + std::unique_ptr nb; + int 
ret; + User* nu = nextUser(u); + + ret = next->get_bucket(nu, i, &nb); + if (ret != 0) + return ret; + + Bucket* fb = new FilterBucket(std::move(nb), u); + bucket->reset(fb); + return 0; +} + +int FilterDriver::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr* bucket, optional_yield y) +{ + std::unique_ptr nb; + int ret; + User* nu = nextUser(u); + + ret = next->get_bucket(dpp, nu, tenant, name, &nb, y); + if (ret != 0) + return ret; + + Bucket* fb = new FilterBucket(std::move(nb), u); + bucket->reset(fb); + return 0; +} + +bool FilterDriver::is_meta_master() +{ + return next->is_meta_master(); +} + +int FilterDriver::forward_request_to_master(const DoutPrefixProvider *dpp, + User* user, obj_version* objv, + bufferlist& in_data, + JSONParser* jp, req_info& info, + optional_yield y) +{ + return next->forward_request_to_master(dpp, user, objv, in_data, jp, info, y); +} + +int FilterDriver::forward_iam_request_to_master(const DoutPrefixProvider *dpp, + const RGWAccessKey& key, + obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, + req_info& info, + optional_yield y) +{ + return next->forward_iam_request_to_master(dpp, key, objv, in_data, parser, info, y); +} + +std::string FilterDriver::zone_unique_id(uint64_t unique_num) +{ + return next->zone_unique_id(unique_num); +} + +std::string FilterDriver::zone_unique_trans_id(uint64_t unique_num) +{ + return next->zone_unique_trans_id(unique_num); +} + +int FilterDriver::get_zonegroup(const std::string& id, + std::unique_ptr* zonegroup) +{ + std::unique_ptr ngz; + int ret; + + ret = next->get_zonegroup(id, &ngz); + if (ret != 0) + return ret; + + ZoneGroup* zg = new FilterZoneGroup(std::move(ngz)); + zonegroup->reset(zg); + return 0; +} + +int FilterDriver::cluster_stat(RGWClusterStat& stats) +{ + return next->cluster_stat(stats); +} + +std::unique_ptr FilterDriver::get_lifecycle(void) +{ + std::unique_ptr lc = 
next->get_lifecycle(); + return std::make_unique(std::move(lc)); +} + +std::unique_ptr FilterDriver::get_completions(void) +{ + std::unique_ptr c = next->get_completions(); + return std::make_unique(std::move(c)); +} + +std::unique_ptr FilterDriver::get_notification(rgw::sal::Object* obj, + rgw::sal::Object* src_obj, req_state* s, + rgw::notify::EventType event_type, optional_yield y, + const std::string* object_name) +{ + std::unique_ptr n = next->get_notification(nextObject(obj), + nextObject(src_obj), + s, event_type, y, + object_name); + return std::make_unique(std::move(n)); +} + +std::unique_ptr FilterDriver::get_notification(const DoutPrefixProvider* dpp, + rgw::sal::Object* obj, rgw::sal::Object* src_obj, + rgw::notify::EventType event_type, + rgw::sal::Bucket* _bucket, std::string& _user_id, + std::string& _user_tenant, std::string& _req_id, + optional_yield y) +{ + std::unique_ptr n = next->get_notification(dpp, nextObject(obj), + nextObject(src_obj), + event_type, + nextBucket(_bucket), + _user_id, + _user_tenant, + _req_id, y); + return std::make_unique(std::move(n)); +} + +RGWLC* FilterDriver::get_rgwlc() +{ + return next->get_rgwlc(); +} + +RGWCoroutinesManagerRegistry* FilterDriver::get_cr_registry() +{ + return next->get_cr_registry(); +} + +int FilterDriver::log_usage(const DoutPrefixProvider *dpp, std::map& usage_info) +{ + return next->log_usage(dpp, usage_info); +} + +int FilterDriver::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) +{ + return next->log_op(dpp, oid, bl); +} + +int FilterDriver::register_to_service_map(const DoutPrefixProvider *dpp, + const std::string& daemon_type, + const std::map& meta) +{ + return next->register_to_service_map(dpp, daemon_type, meta); +} + +void FilterDriver::get_quota(RGWQuota& quota) +{ + return next->get_quota(quota); +} + +void FilterDriver::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, + RGWRateLimitInfo& user_ratelimit, + RGWRateLimitInfo& anon_ratelimit) +{ + return 
next->get_ratelimit(bucket_ratelimit, user_ratelimit, anon_ratelimit); +} + +int FilterDriver::set_buckets_enabled(const DoutPrefixProvider* dpp, + std::vector& buckets, bool enabled) +{ + return next->set_buckets_enabled(dpp, buckets, enabled); +} + +uint64_t FilterDriver::get_new_req_id() +{ + return next->get_new_req_id(); +} + +int FilterDriver::get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) +{ + return next->get_sync_policy_handler(dpp, zone, bucket, phandler, y); +} + +RGWDataSyncStatusManager* FilterDriver::get_data_sync_manager(const rgw_zone_id& source_zone) +{ + return next->get_data_sync_manager(source_zone); +} + +void FilterDriver::wakeup_meta_sync_shards(std::set& shard_ids) +{ + return next->wakeup_meta_sync_shards(shard_ids); +} + +void FilterDriver::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, + const rgw_zone_id& source_zone, + boost::container::flat_map>& shard_ids) +{ + return next->wakeup_data_sync_shards(dpp, source_zone, shard_ids); +} + +int FilterDriver::clear_usage(const DoutPrefixProvider *dpp) +{ + return next->clear_usage(dpp); +} + +int FilterDriver::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) +{ + return next->read_all_usage(dpp, start_epoch, end_epoch, max_entries, + is_truncated, usage_iter, usage); +} + +int FilterDriver::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch) +{ + return next->trim_all_usage(dpp, start_epoch, end_epoch); +} + +int FilterDriver::get_config_key_val(std::string name, bufferlist* bl) +{ + return next->get_config_key_val(name, bl); +} + +int FilterDriver::meta_list_keys_init(const DoutPrefixProvider *dpp, + const std::string& section, + const std::string& marker, void** phandle) +{ + return 
next->meta_list_keys_init(dpp, section, marker, phandle); +} + +int FilterDriver::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, + int max, std::list& keys, + bool* truncated) +{ + return next->meta_list_keys_next(dpp, handle, max, keys, truncated); +} + +void FilterDriver::meta_list_keys_complete(void* handle) +{ + next->meta_list_keys_complete(handle); +} + +std::string FilterDriver::meta_get_marker(void* handle) +{ + return next->meta_get_marker(handle); +} + +int FilterDriver::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, + optional_yield y) +{ + return next->meta_remove(dpp, metadata_key, y); +} + +const RGWSyncModuleInstanceRef& FilterDriver::get_sync_module() +{ + return next->get_sync_module(); +} + +std::unique_ptr FilterDriver::get_lua_manager() +{ + std::unique_ptr nm = next->get_lua_manager(); + + return std::make_unique(std::move(nm)); +} + +std::unique_ptr FilterDriver::get_role(std::string name, + std::string tenant, + std::string path, + std::string trust_policy, + std::string max_session_duration_str, + std::multimap tags) +{ + return next->get_role(name, tenant, path, trust_policy, max_session_duration_str, tags); +} + +std::unique_ptr FilterDriver::get_role(std::string id) +{ + return next->get_role(id); +} + +std::unique_ptr FilterDriver::get_role(const RGWRoleInfo& info) +{ + return next->get_role(info); +} + +int FilterDriver::get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) +{ + return next->get_roles(dpp, y, path_prefix, tenant, roles); +} + +std::unique_ptr FilterDriver::get_oidc_provider() +{ + return next->get_oidc_provider(); +} + +int FilterDriver::get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + std::vector>& providers) +{ + return next->get_oidc_providers(dpp, tenant, providers); +} + +std::unique_ptr FilterDriver::get_append_writer(const DoutPrefixProvider *dpp, + 
optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) +{ + std::unique_ptr writer = next->get_append_writer(dpp, y, nextObject(obj), + owner, ptail_placement_rule, + unique_tag, position, + cur_accounted_size); + + return std::make_unique(std::move(writer), obj); +} + +std::unique_ptr FilterDriver::get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) +{ + std::unique_ptr writer = next->get_atomic_writer(dpp, y, nextObject(obj), + owner, ptail_placement_rule, + olh_epoch, unique_tag); + + return std::make_unique(std::move(writer), obj); +} + +const std::string& FilterDriver::get_compression_type(const rgw_placement_rule& rule) +{ + return next->get_compression_type(rule); +} + +bool FilterDriver::valid_placement(const rgw_placement_rule& rule) +{ + return next->valid_placement(rule); +} + +void FilterDriver::finalize(void) +{ + next->finalize(); +} + +CephContext* FilterDriver::ctx(void) +{ + return next->ctx(); +} + +int FilterUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, + const std::string& end_marker, uint64_t max, + bool need_stats, BucketList &buckets, optional_yield y) +{ + BucketList bl; + int ret; + + buckets.clear(); + ret = next->list_buckets(dpp, marker, end_marker, max, need_stats, bl, y); + if (ret < 0) + return ret; + + buckets.set_truncated(bl.is_truncated()); + for (auto& ent : bl.get_buckets()) { + buckets.add(std::make_unique(std::move(ent.second), this)); + } + + return 0; +} + +int FilterUser::create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo * 
pquota_info,
+                              const RGWAccessControlPolicy& policy,
+                              Attrs& attrs,
+                              RGWBucketInfo& info,
+                              obj_version& ep_objv,
+                              bool exclusive,
+                              bool obj_lock_enabled,
+                              bool* existed,
+                              req_info& req_info,
+                              std::unique_ptr* bucket_out,
+                              optional_yield y)
+{
+  // Ask the wrapped user (`next`) to create the bucket; `nb` receives the
+  // bucket object it produces.
+  std::unique_ptr nb;
+  int ret;
+
+  ret = next->create_bucket(dpp, b, zonegroup_id, placement_rule, swift_ver_location, pquota_info, policy, attrs, info, ep_objv, exclusive, obj_lock_enabled, existed, req_info, &nb, y);
+  if (ret < 0)
+    return ret;  // *bucket_out is left untouched on failure
+
+  // Wrap the result in a FilterBucket tied to this FilterUser. Building it
+  // with make_unique avoids holding the new object in a raw owning pointer.
+  *bucket_out = std::make_unique<FilterBucket>(std::move(nb), this);
+  return 0;
+}
+
+// The remaining FilterUser methods below hand their arguments to `next`
+// unchanged and return whatever it returns.
+int FilterUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return next->read_attrs(dpp, y);
+}
+
+int FilterUser::merge_and_store_attrs(const DoutPrefixProvider* dpp,
+                                      Attrs& new_attrs, optional_yield y)
+{
+  return next->merge_and_store_attrs(dpp, new_attrs, y);
+}
+
+int FilterUser::read_stats(const DoutPrefixProvider *dpp,
+                           optional_yield y, RGWStorageStats* stats,
+                           ceph::real_time* last_stats_sync,
+                           ceph::real_time* last_stats_update)
+{
+  return next->read_stats(dpp, y, stats, last_stats_sync, last_stats_update);
+}
+
+int FilterUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb)
+{
+  return next->read_stats_async(dpp, cb);
+}
+
+int FilterUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  return next->complete_flush_stats(dpp, y);
+}
+
+int FilterUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
+                           uint64_t end_epoch, uint32_t max_entries,
+                           bool* is_truncated, RGWUsageIter& usage_iter,
+                           std::map& usage)
+{
+  return next->read_usage(dpp, start_epoch, end_epoch, max_entries,
+                          is_truncated, usage_iter, usage);
+}
+
+int FilterUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
+                           uint64_t end_epoch)
+{
+  return next->trim_usage(dpp, start_epoch, end_epoch);
+}
+
+int FilterUser::load_user(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return
next->load_user(dpp, y); +} + +int FilterUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info) +{ + return next->store_user(dpp, y, exclusive, old_info); +} + +int FilterUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) +{ + return next->remove_user(dpp, y); +} + +int FilterUser::verify_mfa(const std::string& mfa_str, bool* verified, + const DoutPrefixProvider* dpp, optional_yield y) +{ + return next->verify_mfa(mfa_str, verified, dpp, y); +} + +std::unique_ptr FilterBucket::get_object(const rgw_obj_key& k) +{ + std::unique_ptr o = next->get_object(k); + + return std::make_unique(std::move(o), this); +} + +int FilterBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, + ListResults& results, optional_yield y) +{ + return next->list(dpp, params, max, results, y); +} + +int FilterBucket::remove_bucket(const DoutPrefixProvider* dpp, + bool delete_children, + bool forward_to_master, + req_info* req_info, + optional_yield y) +{ + return next->remove_bucket(dpp, delete_children, forward_to_master, req_info, y); +} + +int FilterBucket::remove_bucket_bypass_gc(int concurrent_max, + bool keep_index_consistent, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + return next->remove_bucket_bypass_gc(concurrent_max, keep_index_consistent, y, dpp); +} + +int FilterBucket::set_acl(const DoutPrefixProvider* dpp, + RGWAccessControlPolicy &acl, optional_yield y) +{ + return next->set_acl(dpp, acl, y); +} + +int FilterBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, + bool get_stats) +{ + return next->load_bucket(dpp, y, get_stats); +} + +int FilterBucket::read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, + std::string* master_ver, + std::map& stats, + std::string* max_marker, bool* syncstopped) +{ + return next->read_stats(dpp, idx_layout, shard_id, bucket_ver, master_ver, + stats, 
max_marker, syncstopped); +} + +int FilterBucket::read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB* ctx) +{ + return next->read_stats_async(dpp, idx_layout, shard_id, ctx); +} + +int FilterBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) +{ + return next->sync_user_stats(dpp, y); +} + +int FilterBucket::update_container_stats(const DoutPrefixProvider* dpp) +{ + return next->update_container_stats(dpp); +} + +int FilterBucket::check_bucket_shards(const DoutPrefixProvider* dpp) +{ + return next->check_bucket_shards(dpp); +} + +int FilterBucket::chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) +{ + return next->chown(dpp, new_user, y); +} + +int FilterBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, + ceph::real_time _mtime) +{ + return next->put_info(dpp, exclusive, _mtime); +} + +bool FilterBucket::is_owner(User* user) +{ + return next->is_owner(nextUser(user)); +} + +int FilterBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y) +{ + return next->check_empty(dpp, y); +} + +int FilterBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, + uint64_t obj_size, optional_yield y, + bool check_size_only) +{ + return next->check_quota(dpp, quota, obj_size, y, check_size_only); +} + +int FilterBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, + Attrs& new_attrs, optional_yield y) +{ + return next->merge_and_store_attrs(dpp, new_attrs, y); +} + +int FilterBucket::try_refresh_info(const DoutPrefixProvider* dpp, + ceph::real_time* pmtime) +{ + return next->try_refresh_info(dpp, pmtime); +} + +int FilterBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) +{ + return next->read_usage(dpp, start_epoch, end_epoch, max_entries, + is_truncated, usage_iter, usage); +} 
+// FilterBucket forwarders: each of the next six methods hands its arguments to `next` unchanged and returns its result.
+int FilterBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
+                             uint64_t end_epoch)
+{
+  return next->trim_usage(dpp, start_epoch, end_epoch);
+}
+
+int FilterBucket::remove_objs_from_index(const DoutPrefixProvider *dpp,
+                                         std::list& objs_to_unlink)
+{
+  return next->remove_objs_from_index(dpp, objs_to_unlink);
+}
+
+int FilterBucket::check_index(const DoutPrefixProvider *dpp,
+                              std::map& existing_stats,
+                              std::map& calculated_stats)
+{
+  return next->check_index(dpp, existing_stats, calculated_stats);
+}
+
+int FilterBucket::rebuild_index(const DoutPrefixProvider *dpp)
+{
+  return next->rebuild_index(dpp);
+}
+
+int FilterBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
+{
+  return next->set_tag_timeout(dpp, timeout);
+}
+
+int FilterBucket::purge_instance(const DoutPrefixProvider* dpp)
+{
+  return next->purge_instance(dpp);
+}
+// Gets the upload object from `next` and returns a wrapper constructed from it and this bucket. NOTE(review): template arguments look stripped in this listing - confirm against upstream.
+std::unique_ptr FilterBucket::get_multipart_upload(
+                                const std::string& oid,
+                                std::optional upload_id,
+                                ACLOwner owner, ceph::real_time mtime)
+{
+  std::unique_ptr nmu =
+    next->get_multipart_upload(oid, upload_id, owner, mtime);
+
+  return std::make_unique(std::move(nmu), this);
+}
+// Lists uploads through `next` into a temporary vector, then appends one wrapper per entry to `uploads`. Returns next's negative code on failure, 0 otherwise.
+int FilterBucket::list_multiparts(const DoutPrefixProvider *dpp,
+                                  const std::string& prefix,
+                                  std::string& marker,
+                                  const std::string& delim,
+                                  const int& max_uploads,
+                                  std::vector>& uploads,
+                                  std::map *common_prefixes,
+                                  bool *is_truncated)
+{
+  std::vector> nup;
+  int ret;
+
+  ret = next->list_multiparts(dpp, prefix, marker, delim, max_uploads, nup,
+                              common_prefixes, is_truncated);
+  if (ret < 0)
+    return ret;  // `uploads` is not modified on failure
+  // Entries are appended, not replaced: `uploads` is never cleared here.
+  for (auto& ent : nup) {
+    uploads.emplace_back(std::make_unique(std::move(ent), this));
+  }
+
+  return 0;
+}
+
+int FilterBucket::abort_multiparts(const DoutPrefixProvider* dpp, CephContext* cct)
+{
+  return next->abort_multiparts(dpp, cct);
+}
+// FilterObject forwarders start here; unless noted they pass their arguments to `next` unchanged.
+int FilterObject::delete_object(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                bool prevent_versioning)
+{
+  return
next->delete_object(dpp, y, prevent_versioning);
+}
+
+int FilterObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+                                 Completions* aio, bool keep_index_consistent,
+                                 optional_yield y)
+{
+  return next->delete_obj_aio(dpp, astate, aio, keep_index_consistent, y);
+}
+// Forwards the copy to `next`, translating dest_object, dest_bucket and src_bucket through nextObject()/nextBucket() first.
+int FilterObject::copy_object(User* user,
+                              req_info* info,
+                              const rgw_zone_id& source_zone,
+                              rgw::sal::Object* dest_object,
+                              rgw::sal::Bucket* dest_bucket,
+                              rgw::sal::Bucket* src_bucket,
+                              const rgw_placement_rule& dest_placement,
+                              ceph::real_time* src_mtime,
+                              ceph::real_time* mtime,
+                              const ceph::real_time* mod_ptr,
+                              const ceph::real_time* unmod_ptr,
+                              bool high_precision_time,
+                              const char* if_match,
+                              const char* if_nomatch,
+                              AttrsMod attrs_mod,
+                              bool copy_if_newer,
+                              Attrs& attrs,
+                              RGWObjCategory category,
+                              uint64_t olh_epoch,
+                              boost::optional delete_at,
+                              std::string* version_id,
+                              std::string* tag,
+                              std::string* etag,
+                              void (*progress_cb)(off_t, void *),
+                              void* progress_data,
+                              const DoutPrefixProvider* dpp,
+                              optional_yield y)
+{
+  return next->copy_object(user, info, source_zone,  // NOTE(review): `user` is passed as-is, while FilterBucket::is_owner unwraps with nextUser() - confirm this is intended
+                           nextObject(dest_object),
+                           nextBucket(dest_bucket),
+                           nextBucket(src_bucket),
+                           dest_placement, src_mtime, mtime,
+                           mod_ptr, unmod_ptr, high_precision_time, if_match,
+                           if_nomatch, attrs_mod, copy_if_newer, attrs,
+                           category, olh_epoch, delete_at, version_id, tag,
+                           etag, progress_cb, progress_data, dpp, y);
+}
+// Returns the reference obtained from `next` directly; no copy is made here.
+RGWAccessControlPolicy& FilterObject::get_acl()
+{
+  return next->get_acl();
+}
+
+int FilterObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate,
+                                optional_yield y, bool follow_olh)
+{
+  return next->get_obj_state(dpp, pstate, y, follow_olh);
+}
+
+int FilterObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
+                                Attrs* delattrs, optional_yield y)
+{
+  return next->set_obj_attrs(dpp, setattrs, delattrs, y);
+}
+
+int FilterObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp,
+                                rgw_obj* target_obj)
+{
+  return
next->get_obj_attrs(y, dpp, target_obj);
+}
+
+// Unless noted, the FilterObject methods below pass their arguments to
+// `next` unchanged and return its result.
+int FilterObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val,
+                                   optional_yield y, const DoutPrefixProvider* dpp)
+{
+  return next->modify_obj_attrs(attr_name, attr_val, y, dpp);
+}
+
+int FilterObject::delete_obj_attrs(const DoutPrefixProvider* dpp,
+                                   const char* attr_name, optional_yield y)
+{
+  return next->delete_obj_attrs(dpp, attr_name, y);
+}
+
+bool FilterObject::is_expired()
+{
+  return next->is_expired();
+}
+
+// Void forwarder: written as a plain call, matching the other void
+// forwarders in this file, rather than `return <void expression>;`.
+void FilterObject::gen_rand_obj_instance_name()
+{
+  next->gen_rand_obj_instance_name();
+}
+
+// Gets a serializer from `next` and returns a wrapper that takes ownership
+// of it.
+std::unique_ptr FilterObject::get_serializer(const DoutPrefixProvider *dpp,
+                                             const std::string& lock_name)
+{
+  std::unique_ptr s = next->get_serializer(dpp, lock_name);
+  return std::make_unique(std::move(s));
+}
+
+// `bucket` is translated with nextBucket() before being handed to `next`.
+int FilterObject::transition(Bucket* bucket,
+                             const rgw_placement_rule& placement_rule,
+                             const real_time& mtime,
+                             uint64_t olh_epoch,
+                             const DoutPrefixProvider* dpp,
+                             optional_yield y)
+{
+  return next->transition(nextBucket(bucket), placement_rule, mtime, olh_epoch,
+                          dpp, y);
+}
+
+// `bucket` and `tier` are translated with nextBucket()/nextPlacementTier()
+// before being handed to `next`.
+int FilterObject::transition_to_cloud(Bucket* bucket,
+                                      rgw::sal::PlacementTier* tier,
+                                      rgw_bucket_dir_entry& o,
+                                      std::set& cloud_targets,
+                                      CephContext* cct,
+                                      bool update_object,
+                                      const DoutPrefixProvider* dpp,
+                                      optional_yield y)
+{
+  return next->transition_to_cloud(nextBucket(bucket), nextPlacementTier(tier),
+                                   o, cloud_targets, cct, update_object, dpp, y);
+}
+
+bool FilterObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
+{
+  return next->placement_rules_match(r1, r2);
+}
+
+int FilterObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y,
+                                  Formatter* f)
+{
+  return next->dump_obj_layout(dpp, y, f);
+}
+
+// Records `b` as this wrapper's bucket and gives `next` the translated
+// (nextBucket) bucket so both layers stay in step.
+void FilterObject::set_bucket(Bucket* b)
+{
+  bucket = b;
+  next->set_bucket(nextBucket(b));
+}
+
+int FilterObject::swift_versioning_restore(bool& restored,
+                                           const DoutPrefixProvider* dpp)
+{
+  return
next->swift_versioning_restore(restored, dpp);
+}
+
+int FilterObject::swift_versioning_copy(const DoutPrefixProvider* dpp,
+                                        optional_yield y)
+{
+  return next->swift_versioning_copy(dpp, y);
+}
+// Gets a read op from `next` and returns a wrapper that takes ownership of it. NOTE(review): template arguments look stripped in this listing - confirm against upstream.
+std::unique_ptr FilterObject::get_read_op()
+{
+  std::unique_ptr r = next->get_read_op();
+  return std::make_unique(std::move(r));
+}
+// Same pattern as get_read_op(), for the delete op.
+std::unique_ptr FilterObject::get_delete_op()
+{
+  std::unique_ptr d = next->get_delete_op();
+  return std::make_unique(std::move(d));
+}
+// The omap_* methods and chown() below pass their arguments to `next` unchanged.
+int FilterObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker,
+                                uint64_t count, std::map *m,
+                                bool* pmore, optional_yield y)
+{
+  return next->omap_get_vals(dpp, marker, count, m, pmore, y);
+}
+
+int FilterObject::omap_get_all(const DoutPrefixProvider *dpp,
+                               std::map *m,
+                               optional_yield y)
+{
+  return next->omap_get_all(dpp, m, y);
+}
+
+int FilterObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
+                                        const std::string& oid,
+                                        const std::set& keys,
+                                        Attrs* vals)
+{
+  return next->omap_get_vals_by_keys(dpp, oid, keys, vals);
+}
+
+int FilterObject::omap_set_val_by_key(const DoutPrefixProvider *dpp,
+                                      const std::string& key, bufferlist& val,
+                                      bool must_exist, optional_yield y)
+{
+  return next->omap_set_val_by_key(dpp, key, val, must_exist, y);
+}
+
+int FilterObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return next->chown(new_user, dpp, y);
+}
+// Pushes this op's `params` into the wrapped op before preparing it; nothing is copied back here.
+int FilterObject::FilterReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+{
+  /* Copy params into next */
+  next->params = params;
+  return next->prepare(y, dpp);
+}
+// Reads through the wrapped op; `params` is refreshed from next->params only when the read succeeds.
+int FilterObject::FilterReadOp::read(int64_t ofs, int64_t end, bufferlist& bl,
+                                     optional_yield y, const DoutPrefixProvider* dpp)
+{
+  int ret = next->read(ofs, end, bl, y, dpp);
+  if (ret < 0)
+    return ret;  // failure path: params keeps its previous contents
+
+  /* Copy params out of next */
+  params = next->params;
+  return ret;
+}
+
+int FilterObject::FilterReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest,
+                                         optional_yield y)
+{
+  return next->get_attr(dpp, name, dest, y);
+}
+// Same copy-out rule as read(): `params` is refreshed from next->params only on success.
+int FilterObject::FilterReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs,
+                                        int64_t end, RGWGetDataCB* cb, optional_yield y)
+{
+  int ret = next->iterate(dpp, ofs, end, cb, y);
+  if (ret < 0)
+    return ret;
+
+  /* Copy params out of next */
+  params = next->params;
+  return ret;
+}
+// Copies `params` in, runs the wrapped delete, then copies `result` back whether or not the delete succeeded.
+int FilterObject::FilterDeleteOp::delete_obj(const DoutPrefixProvider* dpp,
+                                             optional_yield y)
+{
+  /* Copy params into next */
+  next->params = params;
+  int ret = next->delete_obj(dpp, y);
+  /* Copy result back */
+  result = next->result;
+  return ret;
+}
+// Wraps the meta object returned by `next`, passing this upload's `bucket` member as the second constructor argument.
+std::unique_ptr FilterMultipartUpload::get_meta_obj()
+{
+  std::unique_ptr no = next->get_meta_obj();
+
+  return std::make_unique(std::move(no), bucket);
+}
+
+int FilterMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y,
+                                ACLOwner& owner, rgw_placement_rule& dest_placement,
+                                rgw::sal::Attrs& attrs)
+{
+  return next->init(dpp, y, owner, dest_placement, attrs);
+}
+// Lists parts through `next`, then rebuilds this upload's `parts` from next->get_parts(). `parts` is cleared only after the wrapped call succeeds.
+int FilterMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
+                                      int num_parts, int marker,
+                                      int *next_marker, bool *truncated,
+                                      bool assume_unsorted)
+{
+  int ret;
+
+  ret = next->list_parts(dpp, cct, num_parts, marker, next_marker, truncated,
+                         assume_unsorted);
+  if (ret < 0)
+    return ret;
+
+  parts.clear();
+  // Each mapped value is std::move'd into its wrapper - presumably get_parts() returns a mutable reference, which would leave next's entries moved-from; verify.
+  for (auto& ent : next->get_parts()) {
+    parts.emplace(ent.first, std::make_unique(std::move(ent.second)));
+  }
+
+  return 0;
+}
+
+int FilterMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
+{
+  return next->abort(dpp, cct);
+}
+// Forwards to `next`; only target_obj is translated (nextObject), every other argument is passed through.
+int FilterMultipartUpload::complete(const DoutPrefixProvider *dpp,
+                                    optional_yield y, CephContext* cct,
+                                    std::map& part_etags,
+                                    std::list& remove_objs,
+                                    uint64_t& accounted_size, bool& compressed,
+                                    RGWCompressionInfo& cs_info, off_t& ofs,
+                                    std::string& tag, ACLOwner& owner,
+                                    uint64_t olh_epoch,
+                                    rgw::sal::Object* target_obj)
+{
+  return next->complete(dpp, y, cct,
part_etags, remove_objs, accounted_size,
+                        compressed, cs_info, ofs, tag, owner, olh_epoch,
+                        nextObject(target_obj));
+}
+
+int FilterMultipartUpload::get_info(const DoutPrefixProvider *dpp,
+                                    optional_yield y, rgw_placement_rule** rule,
+                                    rgw::sal::Attrs* attrs)
+{
+  return next->get_info(dpp, y, rule, attrs);
+}
+
+// Gets a writer from `next` for the translated (nextObject) object and
+// returns a wrapper constructed from that writer and the caller's `obj`.
+std::unique_ptr FilterMultipartUpload::get_writer(
+                                  const DoutPrefixProvider *dpp,
+                                  optional_yield y,
+                                  rgw::sal::Object* obj,
+                                  const rgw_user& owner,
+                                  const rgw_placement_rule *ptail_placement_rule,
+                                  uint64_t part_num,
+                                  const std::string& part_num_str)
+{
+  std::unique_ptr writer;
+  writer = next->get_writer(dpp, y, nextObject(obj), owner,
+                            ptail_placement_rule, part_num, part_num_str);
+
+  return std::make_unique(std::move(writer), obj);
+}
+
+int FilterMPSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur,
+                                 optional_yield y)
+{
+  return next->try_lock(dpp, dur, y);
+}
+
+int FilterLCSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur,
+                                 optional_yield y)
+{
+  return next->try_lock(dpp, dur, y);
+}
+
+// Gets an entry from `next` and returns a wrapper that takes ownership of it.
+std::unique_ptr FilterLifecycle::get_entry()
+{
+  std::unique_ptr e = next->get_entry();
+  return std::make_unique(std::move(e));
+}
+
+// Looks the entry up through `next`; on success `*entry` receives a
+// FilterLCEntry wrapping it, on failure `*entry` is left untouched and the
+// negative code from `next` is returned.
+int FilterLifecycle::get_entry(const std::string& oid, const std::string& marker,
+                               std::unique_ptr* entry)
+{
+  std::unique_ptr ne;
+  int ret;
+
+  ret = next->get_entry(oid, marker, &ne);
+  if (ret < 0)
+    return ret;
+
+  // make_unique instead of a raw owning pointer followed by reset().
+  *entry = std::make_unique<FilterLCEntry>(std::move(ne));
+
+  return 0;
+}
+
+// Same contract as get_entry(oid, marker, entry), using next->get_next_entry.
+int FilterLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
+                                    std::unique_ptr* entry)
+{
+  std::unique_ptr ne;
+  int ret;
+
+  ret = next->get_next_entry(oid, marker, &ne);
+  if (ret < 0)
+    return ret;
+
+  *entry = std::make_unique<FilterLCEntry>(std::move(ne));
+
+  return 0;
+}
+
+// NOTE(review): `entry` is forwarded as-is here and in rm_entry(), whereas
+// put_head() unwraps its argument before forwarding - confirm intended.
+int FilterLifecycle::set_entry(const std::string& oid, LCEntry& entry)
+{
+  return next->set_entry(oid, entry);
+}
+
+// Lists entries through `next` into a temporary vector, then appends one
+// wrapper per entry to `entries` (which is not cleared first).
+int FilterLifecycle::list_entries(const std::string& oid, const std::string& marker,
+                                  uint32_t max_entries,
+                                  std::vector>& entries)
+{
+  std::vector> ne;
+  int ret;
+
+  ret = next->list_entries(oid, marker, max_entries, ne);
+  if (ret < 0)
+    return ret;
+
+  for (auto& ent : ne) {
+    entries.emplace_back(std::make_unique(std::move(ent)));
+  }
+
+  return 0;
+}
+
+int FilterLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
+{
+  return next->rm_entry(oid, entry);
+}
+
+// Fetches the head through `next`; on success `*head` receives a
+// FilterLCHead wrapping it, on failure `*head` is left untouched.
+int FilterLifecycle::get_head(const std::string& oid, std::unique_ptr* head)
+{
+  std::unique_ptr nh;
+  int ret;
+
+  ret = next->get_head(oid, &nh);
+  if (ret < 0)
+    return ret;
+
+  *head = std::make_unique<FilterLCHead>(std::move(nh));
+
+  return 0;
+}
+
+// Unwraps `head` (cast to the filter type, then its `next` member) and
+// forwards the wrapped head to `next`.
+int FilterLifecycle::put_head(const std::string& oid, LCHead& head)
+{
+  return next->put_head(oid, *(dynamic_cast(head).next.get()));
+}
+
+std::unique_ptr FilterLifecycle::get_serializer(
+                                              const std::string& lock_name,
+                                              const std::string& oid,
+                                              const std::string& cookie)
+{
+  std::unique_ptr ns;
+  ns = next->get_serializer(lock_name, oid, cookie);
+
+  return std::make_unique(std::move(ns));
+}
+
+// FilterNotification and FilterWriter methods below pass their arguments
+// to `next` unchanged.
+int FilterNotification::publish_reserve(const DoutPrefixProvider *dpp,
+                                        RGWObjTags* obj_tags)
+{
+  return next->publish_reserve(dpp, obj_tags);
+}
+
+int FilterNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+                                       const ceph::real_time& mtime, const
+                                       std::string& etag, const std::string& version)
+{
+  return next->publish_commit(dpp, size, mtime, etag, version);
+}
+
+// `data` is an rvalue reference and is moved on into the wrapped writer.
+int FilterWriter::process(bufferlist&& data, uint64_t offset)
+{
+  return next->process(std::move(data), offset);
+}
+
+int FilterWriter::complete(size_t accounted_size, const std::string& etag,
+                           ceph::real_time *mtime, ceph::real_time set_mtime,
+                           std::map& attrs,
+                           ceph::real_time delete_at,
+                           const char *if_match, const char *if_nomatch,
+                           const std::string *user_data,
+                           rgw_zone_set *zones_trace, bool *canceled,
+                           optional_yield y)
+{
+  return next->complete(accounted_size, etag, mtime,
set_mtime, attrs, + delete_at, if_match, if_nomatch, user_data, zones_trace, + canceled, y); +} + +int FilterLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& key, std::string& script) +{ + return next->get_script(dpp, y, key, script); +} + +int FilterLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& key, const std::string& script) +{ + return next->put_script(dpp, y, key, script); +} + +int FilterLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& key) +{ + return next->del_script(dpp, y, key); +} + +int FilterLuaManager::add_package(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& package_name) +{ + return next->add_package(dpp, y, package_name); +} + +int FilterLuaManager::remove_package(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& package_name) +{ + return next->remove_package(dpp, y, package_name); +} + +int FilterLuaManager::list_packages(const DoutPrefixProvider* dpp, optional_yield y, + rgw::lua::packages_t& packages) +{ + return next->list_packages(dpp, y, packages); +} + +} } // namespace rgw::sal + +extern "C" { + +rgw::sal::Driver* newBaseFilter(rgw::sal::Driver* next) +{ + rgw::sal::FilterDriver* driver = new rgw::sal::FilterDriver(next); + + return driver; +} + +} diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h new file mode 100644 index 000000000..951a1de5f --- /dev/null +++ b/src/rgw/rgw_sal_filter.h @@ -0,0 +1,921 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_sal.h" +#include "rgw_oidc_provider.h" +#include "rgw_role.h" + +namespace rgw { namespace sal { + +class FilterCompletions : public Completions { +protected: + std::unique_ptr next; + +public: + FilterCompletions(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterCompletions() = default; + virtual int drain() override { return next->drain(); } +}; + +class FilterPlacementTier : public PlacementTier { +protected: + std::unique_ptr next; + +public: + FilterPlacementTier(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterPlacementTier() = default; + + virtual const std::string& get_tier_type() override { return next->get_tier_type(); } + virtual const std::string& get_storage_class() override { return next->get_storage_class(); } + virtual bool retain_head_object() override { return next->retain_head_object(); } + + /* Internal to Filters */ + PlacementTier* get_next() { return next.get(); } +}; + +class FilterZoneGroup : public ZoneGroup { +protected: + std::unique_ptr next; + +public: + FilterZoneGroup(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterZoneGroup() = default; + virtual const std::string& get_id() const override + { return next->get_id(); } + virtual const std::string& get_name() const override + { return next->get_name(); } + virtual int equals(const std::string& other_zonegroup) const override + { return next->equals(other_zonegroup); } + virtual const std::string& get_endpoint() const override + { return next->get_endpoint(); } + virtual bool placement_target_exists(std::string& target) const override + { return next->placement_target_exists(target); } + virtual bool is_master_zonegroup() const override + { return next->is_master_zonegroup(); } + virtual const std::string& get_api_name() const override + { return next->get_api_name(); } + virtual void get_placement_target_names(std::set& names) const override + { next->get_placement_target_names(names); } 
+ virtual const std::string& get_default_placement_name() const override + { return next->get_default_placement_name(); } + virtual int get_hostnames(std::list& names) const override + { return next->get_hostnames(names); } + virtual int get_s3website_hostnames(std::list& names) const override + { return next->get_s3website_hostnames(names); } + virtual int get_zone_count() const override + { return next->get_zone_count(); } + virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr* tier) override; + virtual int get_zone_by_id(const std::string& id, std::unique_ptr* zone) override; + virtual int get_zone_by_name(const std::string& name, std::unique_ptr* zone) override; + virtual int list_zones(std::list& zone_ids) override + { return next->list_zones(zone_ids); } + bool supports(std::string_view feature) const override { + return next->supports(feature); + } + virtual std::unique_ptr clone() override { + std::unique_ptr nzg = next->clone(); + return std::make_unique(std::move(nzg)); + } +}; + +class FilterZone : public Zone { +protected: + std::unique_ptr next; +private: + std::unique_ptr group; + +public: + FilterZone(std::unique_ptr _next) : next(std::move(_next)) + { + group = std::make_unique(next->get_zonegroup().clone()); + } + virtual ~FilterZone() = default; + + virtual std::unique_ptr clone() override { + std::unique_ptr nz = next->clone(); + return std::make_unique(std::move(nz)); + } + virtual ZoneGroup& get_zonegroup() override { + return *group.get(); + } + virtual const std::string& get_id() override { + return next->get_id(); + } + virtual const std::string& get_name() const override { + return next->get_name(); + } + virtual bool is_writeable() override { + return next->is_writeable(); + } + virtual bool get_redirect_endpoint(std::string* endpoint) override { + return next->get_redirect_endpoint(endpoint); + } + virtual bool has_zonegroup_api(const std::string& api) const override { + return next->has_zonegroup_api(api); + } + 
virtual const std::string& get_current_period_id() override { + return next->get_current_period_id(); + } + virtual const RGWAccessKey& get_system_key() override { + return next->get_system_key(); + } + virtual const std::string& get_realm_name() override { + return next->get_realm_name(); + } + virtual const std::string& get_realm_id() override { + return next->get_realm_id(); + } + virtual const std::string_view get_tier_type() override { + return next->get_tier_type(); + } + virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override { + return next->get_sync_policy_handler(); + } +}; + +class FilterDriver : public Driver { +protected: + Driver* next; +private: + std::unique_ptr zone; + +public: + FilterDriver(Driver* _next) : next(_next) {} + virtual ~FilterDriver() = default; + + virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override; + virtual const std::string get_name() const override; + virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual std::unique_ptr get_user(const rgw_user& u) override; + virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const + std::string& key, optional_yield y, + std::unique_ptr* user) override; + virtual int get_user_by_email(const DoutPrefixProvider* dpp, const + std::string& email, optional_yield y, + std::unique_ptr* user) override; + virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const + std::string& user_str, optional_yield y, + std::unique_ptr* user) override; + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int get_bucket(User* u, const RGWBucketInfo& i, + std::unique_ptr* bucket) override; + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const + rgw_bucket& b, std::unique_ptr* bucket, + optional_yield y) override; + virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const + std::string& tenant, const std::string& name, + std::unique_ptr* bucket, 
optional_yield y) override; + virtual bool is_meta_master() override; + virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, + obj_version* objv, bufferlist& in_data, + JSONParser* jp, req_info& info, + optional_yield y) override; + virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, + const RGWAccessKey& key, + obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, + req_info& info, + optional_yield y) override; + virtual Zone* get_zone() override { return zone.get(); } + virtual std::string zone_unique_id(uint64_t unique_num) override; + virtual std::string zone_unique_trans_id(const uint64_t unique_num) override; + virtual int get_zonegroup(const std::string& id, std::unique_ptr* zonegroup) override; + virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list& zone_ids) override { + return next->list_all_zones(dpp, zone_ids); + } + virtual int cluster_stat(RGWClusterStat& stats) override; + virtual std::unique_ptr get_lifecycle(void) override; + virtual std::unique_ptr get_completions(void) override; + + virtual std::unique_ptr get_notification(rgw::sal::Object* obj, + rgw::sal::Object* src_obj, struct req_state* s, + rgw::notify::EventType event_type, optional_yield y, + const std::string* object_name=nullptr) override; + virtual std::unique_ptr get_notification( + const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, + + rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, + std::string& _user_id, std::string& _user_tenant, + std::string& _req_id, optional_yield y) override; + + int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + return next->read_topics(tenant, topics, objv_tracker, y, dpp); + } + int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, 
const DoutPrefixProvider *dpp) override { + return next->write_topics(tenant, topics, objv_tracker, y, dpp); + } + int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + return next->remove_topics(tenant, objv_tracker, y, dpp); + } + + virtual RGWLC* get_rgwlc(void) override; + virtual RGWCoroutinesManagerRegistry* get_cr_registry() override; + + virtual int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info) override; + virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, + bufferlist& bl) override; + virtual int register_to_service_map(const DoutPrefixProvider *dpp, const + std::string& daemon_type, + const std::map& meta) override; + virtual void get_quota(RGWQuota& quota) override; + virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, + RGWRateLimitInfo& user_ratelimit, + RGWRateLimitInfo& anon_ratelimit) override; + virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, + std::vector& buckets, + bool enabled) override; + virtual uint64_t get_new_req_id() override; + virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef* phandler, + optional_yield y) override; + virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override; + virtual void wakeup_meta_sync_shards(std::set& shard_ids) override; + virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, + const rgw_zone_id& source_zone, + boost::container::flat_map>& shard_ids) override; + virtual int clear_usage(const DoutPrefixProvider *dpp) override; + virtual int read_all_usage(const DoutPrefixProvider *dpp, + uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool* is_truncated, + RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_all_usage(const DoutPrefixProvider *dpp, + uint64_t start_epoch, uint64_t 
end_epoch) override; + virtual int get_config_key_val(std::string name, bufferlist* bl) override; + virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, + const std::string& section, + const std::string& marker, + void** phandle) override; + virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, + int max, std::list& keys, + bool* truncated) override; + virtual void meta_list_keys_complete(void* handle) override; + virtual std::string meta_get_marker(void* handle) override; + virtual int meta_remove(const DoutPrefixProvider* dpp, + std::string& metadata_key, optional_yield y) override; + virtual const RGWSyncModuleInstanceRef& get_sync_module() override; + virtual std::string get_host_id() override { return next->get_host_id(); } + virtual std::unique_ptr get_lua_manager() override; + virtual std::unique_ptr get_role(std::string name, + std::string tenant, + std::string path="", + std::string trust_policy="", + std::string + max_session_duration_str="", + std::multimap tags={}) override; + virtual std::unique_ptr get_role(std::string id) override; + virtual std::unique_ptr get_role(const RGWRoleInfo& info) override; + virtual int get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) override; + virtual std::unique_ptr get_oidc_provider() override; + virtual int get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + std::vector>& + providers) override; + virtual std::unique_ptr get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule + *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) override; + virtual std::unique_ptr get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule 
*ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) override; + + virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override; + virtual bool valid_placement(const rgw_placement_rule& rule) override; + + virtual void finalize(void) override; + + virtual CephContext* ctx(void) override; + + virtual void register_admin_apis(RGWRESTMgr* mgr) override { + return next->register_admin_apis(mgr); + } +}; + +class FilterUser : public User { +protected: + std::unique_ptr next; + +public: + FilterUser(std::unique_ptr _next) : next(std::move(_next)) {} + FilterUser(FilterUser& u) : next(u.next->clone()) {}; + virtual ~FilterUser() = default; + + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + virtual int list_buckets(const DoutPrefixProvider* dpp, + const std::string& marker, const std::string& end_marker, + uint64_t max, bool need_stats, BucketList& buckets, + optional_yield y) override; + virtual int create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + std::unique_ptr* bucket, + optional_yield y) override; + + virtual std::string& get_display_name() override { return next->get_display_name(); } + virtual const std::string& get_tenant() override { return next->get_tenant(); } + virtual void set_tenant(std::string& _t) override { next->set_tenant(_t); } + virtual const std::string& get_ns() override { return next->get_ns(); } + virtual void set_ns(std::string& _ns) override { next->set_ns(_ns); } + virtual void clear_ns() override { next->clear_ns(); } + virtual const rgw_user& get_id() const override { return next->get_id(); } + virtual uint32_t 
get_type() const override { return next->get_type(); } + virtual int32_t get_max_buckets() const override { return next->get_max_buckets(); } + virtual const RGWUserCaps& get_caps() const override { return next->get_caps(); } + virtual RGWObjVersionTracker& get_version_tracker() override { + return next->get_version_tracker(); + } + virtual Attrs& get_attrs() override { return next->get_attrs(); } + virtual void set_attrs(Attrs& _attrs) override { next->set_attrs(_attrs); } + virtual bool empty() const override { return next->empty(); } + virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& + new_attrs, optional_yield y) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time* last_stats_sync = nullptr, + ceph::real_time* last_stats_update = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, + RGWGetUserStats_CB* cb) override; + virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch) override; + + virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool + exclusive, RGWUserInfo* old_info = nullptr) override; + virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int verify_mfa(const std::string& mfa_str, bool* verified, + const DoutPrefixProvider* dpp, optional_yield y) override; + + RGWUserInfo& get_info() override { return next->get_info(); } + virtual void print(std::ostream& out) const 
override { return next->print(out); } + + /* Internal to Filters */ + User* get_next() { return next.get(); } +}; + +class FilterBucket : public Bucket { +protected: + std::unique_ptr next; +private: + User* user; + +public: + + FilterBucket(std::unique_ptr _next, User* _user) : + next(std::move(_next)), user(_user) {} + virtual ~FilterBucket() = default; + + virtual std::unique_ptr get_object(const rgw_obj_key& key) override; + virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, + ListResults&, optional_yield y) override; + virtual Attrs& get_attrs(void) override { return next->get_attrs(); } + virtual int set_attrs(Attrs a) override { return next->set_attrs(a); } + virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, + bool forward_to_master, req_info* req_info, + optional_yield y) override; + virtual int remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return next->get_acl(); } + virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, + optional_yield y) override; + + virtual void set_owner(rgw::sal::User* _owner) override { next->set_owner(_owner); } + virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, + bool get_stats = false) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, std::string* bucket_ver, std::string* master_ver, + std::map& stats, + std::string* max_marker = nullptr, + bool* syncstopped = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB* ctx) override; + virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int update_container_stats(const DoutPrefixProvider* dpp) override; + 
virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override; + virtual int chown(const DoutPrefixProvider* dpp, User& new_user, + optional_yield y) override; + virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, + ceph::real_time mtime) override; + virtual bool is_owner(User* user) override; + virtual User* get_owner(void) override { return user; } + virtual ACLOwner get_acl_owner(void) override { return next->get_acl_owner(); } + virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, + uint64_t obj_size, optional_yield y, + bool check_size_only = false) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, + Attrs& new_attrs, optional_yield y) override; + virtual int try_refresh_info(const DoutPrefixProvider* dpp, + ceph::real_time* pmtime) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, + uint64_t end_epoch) override; + virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, + std::list& + objs_to_unlink) override; + virtual int check_index(const DoutPrefixProvider *dpp, + std::map& + existing_stats, + std::map& + calculated_stats) override; + virtual int rebuild_index(const DoutPrefixProvider *dpp) override; + virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override; + virtual int purge_instance(const DoutPrefixProvider* dpp) override; + virtual bool empty() const override { return next->empty(); } + virtual const std::string& get_name() const override { return next->get_name(); } + virtual const std::string& get_tenant() const override { return next->get_tenant(); } + virtual const std::string& get_marker() const override { return 
next->get_marker(); } + virtual const std::string& get_bucket_id() const override { return next->get_bucket_id(); } + virtual size_t get_size() const override { return next->get_size(); } + virtual size_t get_size_rounded() const override { return next->get_size_rounded(); } + virtual uint64_t get_count() const override { return next->get_count(); } + virtual rgw_placement_rule& get_placement_rule() override { return next->get_placement_rule(); } + virtual ceph::real_time& get_creation_time() override { return next->get_creation_time(); } + virtual ceph::real_time& get_modification_time() override { return next->get_modification_time(); } + virtual obj_version& get_version() override { return next->get_version(); } + virtual void set_version(obj_version &ver) override { next->set_version(ver); } + virtual bool versioned() override { return next->versioned(); } + virtual bool versioning_enabled() override { return next->versioning_enabled(); } + + virtual std::unique_ptr clone() override { + std::unique_ptr nb = next->clone(); + return std::make_unique(std::move(nb), user); + } + + virtual std::unique_ptr get_multipart_upload( + const std::string& oid, + std::optional upload_id=std::nullopt, + ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override; + virtual int list_multiparts(const DoutPrefixProvider *dpp, + const std::string& prefix, + std::string& marker, + const std::string& delim, + const int& max_uploads, + std::vector>& uploads, + std::map *common_prefixes, + bool *is_truncated) override; + virtual int abort_multiparts(const DoutPrefixProvider* dpp, + CephContext* cct) override; + + int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + return next->read_topics(notifications, objv_tracker, y, dpp); + } + int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* obj_tracker, + optional_yield y, const 
DoutPrefixProvider *dpp) override { + return next->write_topics(notifications, obj_tracker, y, dpp); + } + int remove_topics(RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override { + return next->remove_topics(objv_tracker, y, dpp); + } + + virtual rgw_bucket& get_key() override { return next->get_key(); } + virtual RGWBucketInfo& get_info() override { return next->get_info(); } + + virtual void print(std::ostream& out) const override { return next->print(out); } + + virtual bool operator==(const Bucket& b) const override { return next->operator==(b); } + virtual bool operator!=(const Bucket& b) const override { return next->operator!=(b); } + + friend class BucketList; + + /* Internal to Filters */ + Bucket* get_next() { return next.get(); } +}; + +class FilterObject : public Object { +protected: + std::unique_ptr next; +private: + Bucket* bucket{nullptr}; + +public: + + struct FilterReadOp : ReadOp { + std::unique_ptr next; + + FilterReadOp(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterReadOp() = default; + + virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override; + virtual int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual int iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, + RGWGetDataCB* cb, optional_yield y) override; + virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, + bufferlist& dest, optional_yield y) override; + }; + + struct FilterDeleteOp : DeleteOp { + std::unique_ptr next; + + FilterDeleteOp(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterDeleteOp() = default; + + virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override; + }; + + FilterObject(std::unique_ptr _next) : next(std::move(_next)) {} + FilterObject(std::unique_ptr _next, Bucket* _bucket) : + next(std::move(_next)), bucket(_bucket) {} + 
FilterObject(FilterObject& _o) { + next = _o.next->clone(); + bucket = _o.bucket; + } + virtual ~FilterObject() = default; + + virtual int delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + bool prevent_versioning = false) override; + virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, + Completions* aio, + bool keep_index_consistent, optional_yield y) override; + virtual int copy_object(User* user, + req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, std::string* tag, std::string* etag, + void (*progress_cb)(off_t, void *), void* progress_data, + const DoutPrefixProvider* dpp, optional_yield y) override; + virtual RGWAccessControlPolicy& get_acl(void) override; + virtual int set_acl(const RGWAccessControlPolicy& acl) override { return next->set_acl(acl); } + virtual void set_atomic() override { return next->set_atomic(); } + virtual bool is_atomic() override { return next->is_atomic(); } + virtual void set_prefetch_data() override { return next->set_prefetch_data(); } + virtual bool is_prefetch_data() override { return next->is_prefetch_data(); } + virtual void set_compressed() override { return next->set_compressed(); } + virtual bool is_compressed() override { return next->is_compressed(); } + virtual void invalidate() override { return next->invalidate(); } + virtual bool empty() const override { return next->empty(); } + virtual const std::string &get_name() const override { return next->get_name(); } + + virtual int 
get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, + optional_yield y, bool follow_olh = true) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, + Attrs* delattrs, optional_yield y) override; + virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, + rgw_obj* target_obj = NULL) override; + virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, + optional_yield y, const DoutPrefixProvider* dpp) override; + virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, + optional_yield y) override; + virtual bool is_expired() override; + virtual void gen_rand_obj_instance_name() override; + virtual std::unique_ptr get_serializer(const DoutPrefixProvider *dpp, + const std::string& lock_name) override; + virtual int transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual int transition_to_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_bucket_dir_entry& o, + std::set& cloud_targets, + CephContext* cct, + bool update_object, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; + virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, + Formatter* f) override; + + virtual Attrs& get_attrs(void) override { return next->get_attrs(); }; + virtual const Attrs& get_attrs(void) const override { return next->get_attrs(); }; + virtual int set_attrs(Attrs a) override { return next->set_attrs(a); }; + virtual bool has_attrs(void) override { return next->has_attrs(); }; + virtual ceph::real_time get_mtime(void) const override { return next->get_mtime(); }; + virtual uint64_t get_obj_size(void) const override { return next->get_obj_size(); }; + virtual Bucket* get_bucket(void) const override { return 
bucket; }; + virtual void set_bucket(Bucket* b) override; + virtual std::string get_hash_source(void) override { return next->get_hash_source(); }; + virtual void set_hash_source(std::string s) override { return next->set_hash_source(s); }; + virtual std::string get_oid(void) const override { return next->get_oid(); }; + virtual bool get_delete_marker(void) override { return next->get_delete_marker(); }; + virtual bool get_in_extra_data(void) override { return next->get_in_extra_data(); }; + virtual void set_in_extra_data(bool i) override { return next->set_in_extra_data(i); }; + int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) { + return next->range_to_ofs(obj_size, ofs, end); + }; + virtual void set_obj_size(uint64_t s) override { return next->set_obj_size(s); }; + virtual void set_name(const std::string& n) override { return next->set_name(n); }; + virtual void set_key(const rgw_obj_key& k) override { return next->set_key(k); }; + virtual rgw_obj get_obj(void) const override { return next->get_obj(); }; + virtual rgw_obj_key& get_key() override { return next->get_key(); } + virtual void set_instance(const std::string &i) override { return next->set_instance(i); } + virtual const std::string &get_instance() const override { return next->get_instance(); } + virtual bool have_instance(void) override { return next->have_instance(); } + virtual void clear_instance() override { return next->clear_instance(); } + + virtual int swift_versioning_restore(bool& restored, /* out */ + const DoutPrefixProvider* dpp) override; + virtual int swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) override; + + virtual std::unique_ptr get_read_op() override; + virtual std::unique_ptr get_delete_op() override; + + virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, + uint64_t count, std::map* m, + bool* pmore, optional_yield y) override; + virtual int omap_get_all(const DoutPrefixProvider *dpp, + std::map* m, + 
optional_yield y) override; + virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, + const std::string& oid, + const std::set& keys, + Attrs* vals) override; + virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, + const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) override; + virtual int chown(User& new_user, const DoutPrefixProvider* dpp, + optional_yield y) override; + + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + + virtual void print(std::ostream& out) const override { return next->print(out); } + + /* Internal to Filters */ + Object* get_next() { return next.get(); } +}; + +class FilterMultipartPart : public MultipartPart { +protected: + std::unique_ptr next; + +public: + FilterMultipartPart(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterMultipartPart() = default; + + virtual uint32_t get_num() override { return next->get_num(); } + virtual uint64_t get_size() override { return next->get_size(); } + virtual const std::string& get_etag() override { return next->get_etag(); } + virtual ceph::real_time& get_mtime() override { return next->get_mtime(); } +}; + +class FilterMultipartUpload : public MultipartUpload { +protected: + std::unique_ptr next; + Bucket* bucket; + std::map> parts; + +public: + FilterMultipartUpload(std::unique_ptr _next, Bucket* _b) : + next(std::move(_next)), bucket(_b) {} + virtual ~FilterMultipartUpload() = default; + + virtual const std::string& get_meta() const override { return next->get_meta(); } + virtual const std::string& get_key() const override { return next->get_key(); } + virtual const std::string& get_upload_id() const override { return next->get_upload_id(); } + virtual const ACLOwner& get_owner() const override { return next->get_owner(); } + virtual ceph::real_time& get_mtime() override { return next->get_mtime(); } + + virtual std::map>& get_parts() override { return parts; } + + virtual const jspan_context& 
get_trace() override { return next->get_trace(); } + + virtual std::unique_ptr get_meta_obj() override; + + virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override; + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int num_parts, int marker, + int* next_marker, bool* truncated, + bool assume_unsorted = false) override; + virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override; + virtual int complete(const DoutPrefixProvider* dpp, + optional_yield y, CephContext* cct, + std::map& part_etags, + std::list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& ofs, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) override; + + virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, + rgw_placement_rule** rule, + rgw::sal::Attrs* attrs = nullptr) override; + + virtual std::unique_ptr get_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) override; + virtual void print(std::ostream& out) const override { return next->print(out); } +}; + +class FilterMPSerializer : public MPSerializer { +protected: + std::unique_ptr next; + +public: + FilterMPSerializer(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterMPSerializer() = default; + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override; + virtual int unlock() override { return next->unlock(); } + virtual void clear_locked() override { next->clear_locked(); } + virtual bool is_locked() override { return next->is_locked(); } + virtual void print(std::ostream& out) const override { return next->print(out); } +}; + +class FilterLCSerializer : public LCSerializer { +protected: 
+ std::unique_ptr next; + +public: + FilterLCSerializer(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterLCSerializer() = default; + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override; + virtual int unlock() override { return next->unlock(); } + virtual void print(std::ostream& out) const override { return next->print(out); } +}; + +class FilterLifecycle : public Lifecycle { +protected: + std::unique_ptr next; + +public: + struct FilterLCHead : LCHead { + std::unique_ptr next; + + FilterLCHead(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterLCHead() = default; + + virtual time_t& get_start_date() override { return next->get_start_date(); } + virtual void set_start_date(time_t t) override { next->set_start_date(t); } + virtual std::string& get_marker() override { return next->get_marker(); } + virtual void set_marker(const std::string& m) override { next->set_marker(m); } + virtual time_t& get_shard_rollover_date() override { return next->get_shard_rollover_date(); } + virtual void set_shard_rollover_date(time_t t) override { next->set_shard_rollover_date(t); } + }; + + struct FilterLCEntry : LCEntry { + std::unique_ptr next; + + FilterLCEntry(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterLCEntry() = default; + + virtual std::string& get_bucket() override { return next->get_bucket(); } + virtual void set_bucket(const std::string& b) override { next->set_bucket(b); } + virtual std::string& get_oid() override { return next->get_oid(); } + virtual void set_oid(const std::string& o) override { next->set_oid(o); } + virtual uint64_t get_start_time() override { return next->get_start_time(); } + virtual void set_start_time(uint64_t t) override { next->set_start_time(t); } + virtual uint32_t get_status() override { return next->get_status(); } + virtual void set_status(uint32_t s) override { next->set_status(s); } + virtual void print(std::ostream& out) const override { 
return next->print(out); } + }; + + FilterLifecycle(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterLifecycle() = default; + + virtual std::unique_ptr get_entry() override; + virtual int get_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) override; + virtual int get_next_entry(const std::string& oid, const std::string& marker, + std::unique_ptr* entry) override; + virtual int set_entry(const std::string& oid, LCEntry& entry) override; + virtual int list_entries(const std::string& oid, const std::string& marker, + uint32_t max_entries, + std::vector>& entries) override; + virtual int rm_entry(const std::string& oid, LCEntry& entry) override; + virtual int get_head(const std::string& oid, std::unique_ptr* head) override; + virtual int put_head(const std::string& oid, LCHead& head) override; + virtual std::unique_ptr get_serializer(const std::string& lock_name, + const std::string& oid, + const std::string& cookie) override; +}; + +class FilterNotification : public Notification { +protected: + std::unique_ptr next; + +public: + FilterNotification(std::unique_ptr _next) : next(std::move(_next)) {} + + virtual ~FilterNotification() = default; + + virtual int publish_reserve(const DoutPrefixProvider *dpp, + RGWObjTags* obj_tags = nullptr) override; + virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + const std::string& version) override; +}; + +class FilterWriter : public Writer { +protected: + std::unique_ptr next; + Object* obj; +public: + FilterWriter(std::unique_ptr _next, Object* _obj) : + next(std::move(_next)), obj(_obj) {} + virtual ~FilterWriter() = default; + + virtual int prepare(optional_yield y) { return next->prepare(y); } + virtual int process(bufferlist&& data, uint64_t offset) override; + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + 
std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; +}; + +class FilterLuaManager : public LuaManager { +protected: + std::unique_ptr next; + +public: + FilterLuaManager(std::unique_ptr _next) : next(std::move(_next)) {} + virtual ~FilterLuaManager() = default; + + virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override; + virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override; + virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override; + virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override; + virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override; + virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override; +}; + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_sal_fwd.h b/src/rgw/rgw_sal_fwd.h new file mode 100644 index 000000000..08866c2be --- /dev/null +++ b/src/rgw/rgw_sal_fwd.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + + +namespace rgw { namespace sal { + + class Driver; + class User; + class Bucket; + class BucketList; + class Object; + class MultipartUpload; + class Lifecycle; + class Notification; + class Writer; + class PlacementTier; + class ZoneGroup; + class Zone; + class LuaManager; + struct RGWRoleInfo; + + class ConfigStore; + class RealmWriter; + class ZoneGroupWriter; + class ZoneWriter; + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_sal_motr.cc b/src/rgw/rgw_sal_motr.cc new file mode 100644 index 000000000..de18ba944 --- /dev/null +++ b/src/rgw/rgw_sal_motr.cc @@ -0,0 +1,4024 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=2 sw=2 expandtab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * SAL implementation for the CORTX Motr backend + * + * Copyright (C) 2021 Seagate Technology LLC and/or its Affiliates + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include + +extern "C" { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wextern-c-compat" +#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion" +#include "motr/config.h" +#include "lib/types.h" +#include "lib/trace.h" // m0_trace_set_mmapped_buffer +#include "motr/layout.h" // M0_OBJ_LAYOUT_ID +#include "helpers/helpers.h" // m0_ufid_next +#pragma clang diagnostic pop +} + +#include "common/Clock.h" +#include "common/errno.h" + +#include "rgw_compression.h" +#include "rgw_sal.h" +#include "rgw_sal_motr.h" +#include "rgw_bucket.h" + +#define dout_subsys ceph_subsys_rgw + +using std::string; +using std::map; +using std::vector; +using std::set; +using std::list; + +static string mp_ns = RGW_OBJ_NS_MULTIPART; +static struct m0_ufid_generator ufid_gr; + +namespace rgw::sal { + +using ::ceph::encode; +using ::ceph::decode; + +static std::string motr_global_indices[] = { + RGW_MOTR_USERS_IDX_NAME, + RGW_MOTR_BUCKET_INST_IDX_NAME, + RGW_MOTR_BUCKET_HD_IDX_NAME, + RGW_IAM_MOTR_ACCESS_KEY, + RGW_IAM_MOTR_EMAIL_KEY +}; + +void MotrMetaCache::invalid(const DoutPrefixProvider *dpp, + const string& name) +{ + cache.invalidate_remove(dpp, name); +} + +int MotrMetaCache::put(const DoutPrefixProvider *dpp, + const string& name, + const bufferlist& data) +{ + ldpp_dout(dpp, 0) << "Put into cache: name = " << name << dendl; + + ObjectCacheInfo info; + info.status = 0; + info.data = data; + info.flags = CACHE_FLAG_DATA; + info.meta.mtime = ceph::real_clock::now(); + info.meta.size = data.length(); + cache.put(dpp, name, info, NULL); + + // Inform other rgw instances. Do nothing if it gets some error? 
+ int rc = distribute_cache(dpp, name, info, UPDATE_OBJ); + if (rc < 0) + ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << name << dendl; + + return 0; +} + +int MotrMetaCache::get(const DoutPrefixProvider *dpp, + const string& name, + bufferlist& data) +{ + ObjectCacheInfo info; + uint32_t flags = CACHE_FLAG_DATA; + int rc = cache.get(dpp, name, info, flags, NULL); + if (rc == 0) { + if (info.status < 0) + return info.status; + + bufferlist& bl = info.data; + bufferlist::iterator it = bl.begin(); + data.clear(); + + it.copy_all(data); + ldpp_dout(dpp, 0) << "Cache hit: name = " << name << dendl; + return 0; + } + ldpp_dout(dpp, 0) << "Cache miss: name = " << name << ", rc = "<< rc << dendl; + if(rc == -ENODATA) + return -ENOENT; + + return rc; +} + +int MotrMetaCache::remove(const DoutPrefixProvider *dpp, + const string& name) + +{ + cache.invalidate_remove(dpp, name); + + ObjectCacheInfo info; + int rc = distribute_cache(dpp, name, info, INVALIDATE_OBJ); + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: " <<__func__<< "(): failed to distribute cache: rc =" << rc << dendl; + } + + ldpp_dout(dpp, 0) << "Remove from cache: name = " << name << dendl; + return 0; +} + +int MotrMetaCache::distribute_cache(const DoutPrefixProvider *dpp, + const string& normal_name, + ObjectCacheInfo& obj_info, int op) +{ + return 0; +} + +int MotrMetaCache::watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + return 0; +} + +void MotrMetaCache::set_enabled(bool status) +{ + cache.set_enabled(status); +} + +// TODO: properly handle the number of key/value pairs to get in +// one query. Now the POC simply tries to retrieve all `max` number of pairs +// with starting key `marker`. 
+int MotrUser::list_buckets(const DoutPrefixProvider *dpp, const string& marker, + const string& end_marker, uint64_t max, bool need_stats, + BucketList &buckets, optional_yield y) +{ + int rc; + vector keys(max); + vector vals(max); + bool is_truncated = false; + + ldpp_dout(dpp, 20) <<__func__<< ": list_user_buckets: marker=" << marker + << " end_marker=" << end_marker + << " max=" << max << dendl; + + // Retrieve all `max` number of pairs. + buckets.clear(); + string user_info_iname = "motr.rgw.user.info." + info.user_id.to_str(); + keys[0] = marker; + rc = store->next_query_by_name(user_info_iname, keys, vals); + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl; + return rc; + } + + // Process the returned pairs to add into BucketList. + uint64_t bcount = 0; + for (const auto& bl: vals) { + if (bl.length() == 0) + break; + + RGWBucketEnt ent; + auto iter = bl.cbegin(); + ent.decode(iter); + + std::time_t ctime = ceph::real_clock::to_time_t(ent.creation_time); + ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl; + + if (!end_marker.empty() && + end_marker.compare(ent.bucket.marker) <= 0) + break; + + buckets.add(std::make_unique(this->store, ent, this)); + bcount++; + } + if (bcount == max) + is_truncated = true; + buckets.set_truncated(is_truncated); + + return 0; +} + +int MotrUser::create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + std::unique_ptr* bucket_out, + optional_yield y) +{ + int ret; + std::unique_ptr bucket; + + // Look up the bucket. Create it if it doesn't exist. 
+ ret = this->store->get_bucket(dpp, this, b, &bucket, y); + if (ret < 0 && ret != -ENOENT) + return ret; + + if (ret != -ENOENT) { + *existed = true; + // if (swift_ver_location.empty()) { + // swift_ver_location = bucket->get_info().swift_ver_location; + // } + // placement_rule.inherit_from(bucket->get_info().placement_rule); + + // TODO: ACL policy + // // don't allow changes to the acl policy + //RGWAccessControlPolicy old_policy(ctx()); + //int rc = rgw_op_get_bucket_policy_from_attr( + // dpp, this, u, bucket->get_attrs(), &old_policy, y); + //if (rc >= 0 && old_policy != policy) { + // bucket_out->swap(bucket); + // return -EEXIST; + //} + } else { + + placement_rule.name = "default"; + placement_rule.storage_class = "STANDARD"; + bucket = std::make_unique(store, b, this); + bucket->set_attrs(attrs); + *existed = false; + } + + if (!*existed){ + // TODO: how to handle zone and multi-site. + info.placement_rule = placement_rule; + info.bucket = b; + info.owner = this->get_info().user_id; + info.zonegroup = zonegroup_id; + if (obj_lock_enabled) + info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED; + bucket->set_version(ep_objv); + bucket->get_info() = info; + + // Create a new bucket: (1) Add a key/value pair in the + // bucket instance index. (2) Create a new bucket index. + MotrBucket* mbucket = static_cast(bucket.get()); + ret = mbucket->put_info(dpp, y, ceph::real_time())? : + mbucket->create_bucket_index() ? : + mbucket->create_multipart_indices(); + if (ret < 0) + ldpp_dout(dpp, 0) << "ERROR: failed to create bucket indices! " << ret << dendl; + + // Insert the bucket entry into the user info index. + ret = mbucket->link_user(dpp, this, y); + if (ret < 0) + ldpp_dout(dpp, 0) << "ERROR: failed to add bucket entry! 
" << ret << dendl; + } else { + return -EEXIST; + // bucket->set_version(ep_objv); + // bucket->get_info() = info; + } + + bucket_out->swap(bucket); + + return ret; +} + +int MotrUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y) +{ + return 0; +} + +int MotrUser::read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time *last_stats_sync, + ceph::real_time *last_stats_update) +{ + return 0; +} + +/* stats - Not for first pass */ +int MotrUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB *cb) +{ + return 0; +} + +int MotrUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) +{ + return 0; +} + +int MotrUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool *is_truncated, RGWUsageIter& usage_iter, + map& usage) +{ + return 0; +} + +int MotrUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) +{ + return 0; +} + +int MotrUser::load_user_from_idx(const DoutPrefixProvider *dpp, + MotrStore *store, + RGWUserInfo& info, map *attrs, + RGWObjVersionTracker *objv_tr) +{ + struct MotrUserInfo muinfo; + bufferlist bl; + ldpp_dout(dpp, 20) << "info.user_id.id = " << info.user_id.id << dendl; + if (store->get_user_cache()->get(dpp, info.user_id.id, bl)) { + // Cache misses + int rc = store->do_idx_op_by_name(RGW_MOTR_USERS_IDX_NAME, + M0_IC_GET, info.user_id.to_str(), bl); + ldpp_dout(dpp, 20) << "do_idx_op_by_name() = " << rc << dendl; + if (rc < 0) + return rc; + + // Put into cache. 
+ store->get_user_cache()->put(dpp, info.user_id.id, bl); + } + + bufferlist& blr = bl; + auto iter = blr.cbegin(); + muinfo.decode(iter); + info = muinfo.info; + if (attrs) + *attrs = muinfo.attrs; + if (objv_tr) + { + objv_tr->read_version = muinfo.user_version; + objv_tracker.read_version = objv_tr->read_version; + } + + if (!info.access_keys.empty()) { + for(auto key : info.access_keys) { + access_key_tracker.insert(key.first); + } + } + + return 0; +} + +int MotrUser::load_user(const DoutPrefixProvider *dpp, + optional_yield y) +{ + ldpp_dout(dpp, 20) << "load user: user id = " << info.user_id.to_str() << dendl; + return load_user_from_idx(dpp, store, info, &attrs, &objv_tracker); +} + +int MotrUser::create_user_info_idx() +{ + string user_info_iname = "motr.rgw.user.info." + info.user_id.to_str(); + return store->create_motr_idx_by_name(user_info_iname); +} + +int MotrUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) +{ + for (auto& it : new_attrs) + attrs[it.first] = it.second; + + return store_user(dpp, y, false); +} + +int MotrUser::store_user(const DoutPrefixProvider* dpp, + optional_yield y, bool exclusive, RGWUserInfo* old_info) +{ + bufferlist bl; + struct MotrUserInfo muinfo; + RGWUserInfo orig_info; + RGWObjVersionTracker objv_tr = {}; + obj_version& obj_ver = objv_tr.read_version; + + ldpp_dout(dpp, 20) << "Store_user(): User = " << info.user_id.id << dendl; + orig_info.user_id = info.user_id; + // XXX: we open and close motr idx 2 times in this method: + // 1) on load_user_from_idx() here and 2) on do_idx_op_by_name(PUT) below. + // Maybe this can be optimised later somewhow. 
+ int rc = load_user_from_idx(dpp, store, orig_info, nullptr, &objv_tr); + ldpp_dout(dpp, 10) << "Get user: rc = " << rc << dendl; + + // Check if the user already exists + if (rc == 0 && obj_ver.ver > 0) { + if (old_info) + *old_info = orig_info; + + if (obj_ver.ver != objv_tracker.read_version.ver) { + rc = -ECANCELED; + ldpp_dout(dpp, 0) << "ERROR: User Read version mismatch" << dendl; + goto out; + } + + if (exclusive) + return rc; + + obj_ver.ver++; + } else { + obj_ver.ver = 1; + obj_ver.tag = "UserTAG"; + } + + // Insert the user to user info index. + muinfo.info = info; + muinfo.attrs = attrs; + muinfo.user_version = obj_ver; + muinfo.encode(bl); + rc = store->do_idx_op_by_name(RGW_MOTR_USERS_IDX_NAME, + M0_IC_PUT, info.user_id.to_str(), bl); + ldpp_dout(dpp, 10) << "Store user to motr index: rc = " << rc << dendl; + if (rc == 0) { + objv_tracker.read_version = obj_ver; + objv_tracker.write_version = obj_ver; + } + + // Store access key in access key index + if (!info.access_keys.empty()) { + std::string access_key; + std::string secret_key; + std::map::const_iterator iter = info.access_keys.begin(); + const RGWAccessKey& k = iter->second; + access_key = k.id; + secret_key = k.key; + MotrAccessKey MGWUserKeys(access_key, secret_key, info.user_id.to_str()); + store->store_access_key(dpp, y, MGWUserKeys); + access_key_tracker.insert(access_key); + } + + // Check if any key need to be deleted + if (access_key_tracker.size() != info.access_keys.size()) { + std::string key_for_deletion; + for (auto key : access_key_tracker) { + if (!info.get_key(key)) { + key_for_deletion = key; + ldpp_dout(dpp, 0) << "Deleting access key: " << key_for_deletion << dendl; + store->delete_access_key(dpp, y, key_for_deletion); + if (rc < 0) { + ldpp_dout(dpp, 0) << "Unable to delete access key" << rc << dendl; + } + } + } + if(rc >= 0){ + access_key_tracker.erase(key_for_deletion); + } + } + + if (!info.user_email.empty()) { + MotrEmailInfo MGWEmailInfo(info.user_id.to_str(), 
info.user_email); + store->store_email_info(dpp, y, MGWEmailInfo); + } + + // Create user info index to store all buckets that are belong + // to this bucket. + rc = create_user_info_idx(); + if (rc < 0 && rc != -EEXIST) { + ldpp_dout(dpp, 0) << "Failed to create user info index: rc = " << rc << dendl; + goto out; + } + + // Put the user info into cache. + rc = store->get_user_cache()->put(dpp, info.user_id.id, bl); + +out: + return rc; +} + +int MotrUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) +{ + // Remove user info from cache + // Delete access keys for user + // Delete user info + // Delete user from user index + // Delete email for user - TODO + bufferlist bl; + int rc; + // Remove the user info from cache. + store->get_user_cache()->remove(dpp, info.user_id.id); + + // Delete all access key of user + if (!info.access_keys.empty()) { + for(auto acc_key = info.access_keys.begin(); acc_key != info.access_keys.end(); acc_key++) { + auto access_key = acc_key->first; + rc = store->delete_access_key(dpp, y, access_key); + // TODO + // Check error code for access_key does not exist + // Continue to next step only if delete failed because key doesn't exists + if (rc < 0){ + ldpp_dout(dpp, 0) << "Unable to delete access key" << rc << dendl; + } + } + } + + //Delete email id + if (!info.user_email.empty()) { + rc = store->do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY, + M0_IC_DEL, info.user_email, bl); + if (rc < 0 && rc != -ENOENT) { + ldpp_dout(dpp, 0) << "Unable to delete email id " << rc << dendl; + } + } + + // Delete user info index + string user_info_iname = "motr.rgw.user.info." 
+ info.user_id.to_str(); + store->delete_motr_idx_by_name(user_info_iname); + ldpp_dout(dpp, 10) << "Deleted user info index - " << user_info_iname << dendl; + + // Delete user from user index + rc = store->do_idx_op_by_name(RGW_MOTR_USERS_IDX_NAME, + M0_IC_DEL, info.user_id.to_str(), bl); + if (rc < 0){ + ldpp_dout(dpp, 0) << "Unable to delete user from user index " << rc << dendl; + return rc; + } + + // TODO + // Delete email for user + // rc = store->do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY, + // M0_IC_DEL, info.user_email, bl); + // if (rc < 0){ + // ldpp_dout(dpp, 0) << "Unable to delete email for user" << rc << dendl; + // return rc; + // } + return 0; +} + +int MotrUser::verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider *dpp, optional_yield y) +{ + *verified = false; + return 0; +} + +int MotrBucket::remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) +{ + int ret; + + ldpp_dout(dpp, 20) << "remove_bucket Entry=" << info.bucket.name << dendl; + + // Refresh info + ret = load_bucket(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket load_bucket failed rc=" << ret << dendl; + return ret; + } + + ListParams params; + params.list_versions = true; + params.allow_unordered = true; + + ListResults results; + + // 1. Check if Bucket has objects. + // If bucket contains objects and delete_children is true, delete all objects. + // Else throw error that bucket is not empty. + do { + results.objs.clear(); + + // Check if bucket has objects. + ret = list(dpp, params, 1000, results, y); + if (ret < 0) { + return ret; + } + + // If result contains entries, bucket is not empty. 
+ if (!results.objs.empty() && !delete_children) { + ldpp_dout(dpp, 0) << "ERROR: could not remove non-empty bucket " << info.bucket.name << dendl; + return -ENOTEMPTY; + } + + for (const auto& obj : results.objs) { + rgw_obj_key key(obj.key); + /* xxx dang */ + ret = rgw_remove_object(dpp, store, this, key); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket rgw_remove_object failed rc=" << ret << dendl; + return ret; + } + } + } while(results.is_truncated); + + // 2. Abort Mp uploads on the bucket. + ret = abort_multiparts(dpp, store->ctx()); + if (ret < 0) { + return ret; + } + + // 3. Remove mp index?? + string bucket_multipart_iname = "motr.rgw.bucket." + info.bucket.name + ".multiparts"; + ret = store->delete_motr_idx_by_name(bucket_multipart_iname); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket failed to remove multipart index rc=" << ret << dendl; + return ret; + } + + // 4. Sync user stats. + ret = this->sync_user_stats(dpp, y); + if (ret < 0) { + ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl; + } + + // 5. Remove the bucket from user info index. (unlink user) + ret = this->unlink_user(dpp, owner, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket unlink_user failed rc=" << ret << dendl; + return ret; + } + + // 6. Remove bucket index. + string bucket_index_iname = "motr.rgw.bucket.index." + info.bucket.name; + ret = store->delete_motr_idx_by_name(bucket_index_iname); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket unlink_user failed rc=" << ret << dendl; + return ret; + } + + // 7. Remove bucket instance info. 
+ bufferlist bl; + ret = store->get_bucket_inst_cache()->remove(dpp, info.bucket.name); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket failed to remove bucket instance from cache rc=" + << ret << dendl; + return ret; + } + + ret = store->do_idx_op_by_name(RGW_MOTR_BUCKET_INST_IDX_NAME, + M0_IC_DEL, info.bucket.name, bl); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: remove_bucket failed to remove bucket instance rc=" + << ret << dendl; + return ret; + } + + // TODO : + // 8. Remove Notifications + // if bucket has notification definitions associated with it + // they should be removed (note that any pending notifications on the bucket are still going to be sent) + + // 9. Forward request to master. + if (forward_to_master) { + bufferlist in_data; + ret = store->forward_request_to_master(dpp, owner, &bucket_version, in_data, nullptr, *req_info, y); + if (ret < 0) { + if (ret == -ENOENT) { + /* adjust error, we want to return with NoSuchBucket and not + * NoSuchKey */ + ret = -ERR_NO_SUCH_BUCKET; + } + ldpp_dout(dpp, 0) << "ERROR: Forward to master failed. ret=" << ret << dendl; + return ret; + } + } + + ldpp_dout(dpp, 20) << "remove_bucket Exit=" << info.bucket.name << dendl; + + return ret; +} + +int MotrBucket::remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) { + return 0; +} + +int MotrBucket::put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time _mtime) +{ + bufferlist bl; + struct MotrBucketInfo mbinfo; + + ldpp_dout(dpp, 20) << "put_info(): bucket_id=" << info.bucket.bucket_id << dendl; + mbinfo.info = info; + mbinfo.bucket_attrs = attrs; + mbinfo.mtime = _mtime; + mbinfo.bucket_version = bucket_version; + mbinfo.encode(bl); + + // Insert bucket instance using bucket's marker (string). 
+ int rc = store->do_idx_op_by_name(RGW_MOTR_BUCKET_INST_IDX_NAME, + M0_IC_PUT, info.bucket.name, bl, !exclusive); + if (rc == 0) + store->get_bucket_inst_cache()->put(dpp, info.bucket.name, bl); + + return rc; +} + +int MotrBucket::load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats) +{ + // Get bucket instance using bucket's name (string). or bucket id? + bufferlist bl; + if (store->get_bucket_inst_cache()->get(dpp, info.bucket.name, bl)) { + // Cache misses. + ldpp_dout(dpp, 20) << "load_bucket(): name=" << info.bucket.name << dendl; + int rc = store->do_idx_op_by_name(RGW_MOTR_BUCKET_INST_IDX_NAME, + M0_IC_GET, info.bucket.name, bl); + ldpp_dout(dpp, 20) << "load_bucket(): rc=" << rc << dendl; + if (rc < 0) + return rc; + store->get_bucket_inst_cache()->put(dpp, info.bucket.name, bl); + } + + struct MotrBucketInfo mbinfo; + bufferlist& blr = bl; + auto iter =blr.cbegin(); + mbinfo.decode(iter); //Decode into MotrBucketInfo. + + info = mbinfo.info; + ldpp_dout(dpp, 20) << "load_bucket(): bucket_id=" << info.bucket.bucket_id << dendl; + rgw_placement_rule placement_rule; + placement_rule.name = "default"; + placement_rule.storage_class = "STANDARD"; + info.placement_rule = placement_rule; + + attrs = mbinfo.bucket_attrs; + mtime = mbinfo.mtime; + bucket_version = mbinfo.bucket_version; + + return 0; +} + +int MotrBucket::link_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y) +{ + bufferlist bl; + RGWBucketEnt new_bucket; + ceph::real_time creation_time = get_creation_time(); + + // RGWBucketEnt or cls_user_bucket_entry is the structure that is stored. 
+ new_bucket.bucket = info.bucket; + new_bucket.size = 0; + if (real_clock::is_zero(creation_time)) + creation_time = ceph::real_clock::now(); + new_bucket.creation_time = creation_time; + new_bucket.encode(bl); + std::time_t ctime = ceph::real_clock::to_time_t(new_bucket.creation_time); + ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl; + + // Insert the user into the user info index. + string user_info_idx_name = "motr.rgw.user.info." + new_user->get_info().user_id.to_str(); + return store->do_idx_op_by_name(user_info_idx_name, + M0_IC_PUT, info.bucket.name, bl); + +} + +int MotrBucket::unlink_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y) +{ + // Remove the user into the user info index. + bufferlist bl; + string user_info_idx_name = "motr.rgw.user.info." + new_user->get_info().user_id.to_str(); + return store->do_idx_op_by_name(user_info_idx_name, + M0_IC_DEL, info.bucket.name, bl); +} + +/* stats - Not for first pass */ +int MotrBucket::read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, int shard_id, + std::string *bucket_ver, std::string *master_ver, + std::map& stats, + std::string *max_marker, bool *syncstopped) +{ + return 0; +} + +int MotrBucket::create_bucket_index() +{ + string bucket_index_iname = "motr.rgw.bucket.index." + info.bucket.name; + return store->create_motr_idx_by_name(bucket_index_iname); +} + +int MotrBucket::create_multipart_indices() +{ + int rc; + + // Bucket multipart index stores in-progress multipart uploads. + // Key is the object name + upload_id, value is a rgw_bucket_dir_entry. + // An entry is inserted when a multipart upload is initialised ( + // MotrMultipartUpload::init()) and will be removed when the upload + // is completed (MotrMultipartUpload::complete()). + // MotrBucket::list_multiparts() will scan this index to return all + // in-progress multipart uploads in the bucket. 
+ string bucket_multipart_iname = "motr.rgw.bucket." + info.bucket.name + ".multiparts"; + rc = store->create_motr_idx_by_name(bucket_multipart_iname); + if (rc < 0) { + ldout(store->cctx, 0) << "Failed to create bucket multipart index " << bucket_multipart_iname << dendl; + return rc; + } + + return 0; +} + + +int MotrBucket::read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB *ctx) +{ + return 0; +} + +int MotrBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) +{ + return 0; +} + +int MotrBucket::update_container_stats(const DoutPrefixProvider *dpp) +{ + return 0; +} + +int MotrBucket::check_bucket_shards(const DoutPrefixProvider *dpp) +{ + return 0; +} + +int MotrBucket::chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y) +{ + // TODO: update bucket with new owner + return 0; +} + +/* Make sure to call load_bucket() if you need it first */ +bool MotrBucket::is_owner(User* user) +{ + return (info.owner.compare(user->get_id()) == 0); +} + +int MotrBucket::check_empty(const DoutPrefixProvider *dpp, optional_yield y) +{ + /* XXX: Check if bucket contains any objects */ + return 0; +} + +int MotrBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, + optional_yield y, bool check_size_only) +{ + /* Not Handled in the first pass as stats are also needed */ + return 0; +} + +int MotrBucket::merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& new_attrs, optional_yield y) +{ + for (auto& it : new_attrs) + attrs[it.first] = it.second; + + return put_info(dpp, y, ceph::real_time()); +} + +int MotrBucket::try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime) +{ + return 0; +} + +/* XXX: usage and stats not supported in the first pass */ +int MotrBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, + 
RGWUsageIter& usage_iter, + map& usage) +{ + return 0; +} + +int MotrBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) +{ + return 0; +} + +int MotrBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) +{ + /* XXX: CHECK: Unlike RadosStore, there is no seperate bucket index table. + * Delete all the object in the list from the object table of this + * bucket + */ + return 0; +} + +int MotrBucket::check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) +{ + /* XXX: stats not supported yet */ + return 0; +} + +int MotrBucket::rebuild_index(const DoutPrefixProvider *dpp) +{ + /* there is no index table in dbstore. Not applicable */ + return 0; +} + +int MotrBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) +{ + /* XXX: CHECK: set tag timeout for all the bucket objects? */ + return 0; +} + +int MotrBucket::purge_instance(const DoutPrefixProvider *dpp) +{ + /* XXX: CHECK: for dbstore only single instance supported. + * Remove all the objects for that instance? Anything extra needed? + */ + return 0; +} + +int MotrBucket::set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy &acl, optional_yield y) +{ + int ret = 0; + bufferlist aclbl; + + acls = acl; + acl.encode(aclbl); + + Attrs attrs = get_attrs(); + attrs[RGW_ATTR_ACL] = aclbl; + + // TODO: update bucket entry with the new attrs + + return ret; +} + +std::unique_ptr MotrBucket::get_object(const rgw_obj_key& k) +{ + return std::make_unique(this->store, k, this); +} + +int MotrBucket::list(const DoutPrefixProvider *dpp, ListParams& params, int max, ListResults& results, optional_yield y) +{ + int rc; + vector keys(max); + vector vals(max); + + ldpp_dout(dpp, 20) << "bucket=" << info.bucket.name + << " prefix=" << params.prefix + << " marker=" << params.marker + << " max=" << max << dendl; + + // Retrieve all `max` number of pairs. 
+ string bucket_index_iname = "motr.rgw.bucket.index." + info.bucket.name; + keys[0] = params.marker.empty() ? params.prefix : + params.marker.get_oid(); + rc = store->next_query_by_name(bucket_index_iname, keys, vals, params.prefix, + params.delim); + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl; + return rc; + } + + // Process the returned pairs to add into ListResults. + int i = 0; + for (; i < rc; ++i) { + if (vals[i].length() == 0) { + results.common_prefixes[keys[i]] = true; + } else { + rgw_bucket_dir_entry ent; + auto iter = vals[i].cbegin(); + ent.decode(iter); + if (params.list_versions || ent.is_visible()) + results.objs.emplace_back(std::move(ent)); + } + } + + if (i == max) { + results.is_truncated = true; + results.next_marker = keys[max - 1] + " "; + } else { + results.is_truncated = false; + } + + return 0; +} + +int MotrBucket::list_multiparts(const DoutPrefixProvider *dpp, + const string& prefix, + string& marker, + const string& delim, + const int& max_uploads, + vector>& uploads, + map *common_prefixes, + bool *is_truncated) +{ + int rc; + vector key_vec(max_uploads); + vector val_vec(max_uploads); + + string bucket_multipart_iname = + "motr.rgw.bucket." + this->get_name() + ".multiparts"; + key_vec[0].clear(); + key_vec[0].assign(marker.begin(), marker.end()); + rc = store->next_query_by_name(bucket_multipart_iname, key_vec, val_vec); + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl; + return rc; + } + + // Process the returned pairs to add into ListResults. + // The POC can only support listing all objects or selecting + // with prefix. 
+ int ocount = 0; + rgw_obj_key last_obj_key; + *is_truncated = false; + for (const auto& bl: val_vec) { + if (bl.length() == 0) + break; + + rgw_bucket_dir_entry ent; + auto iter = bl.cbegin(); + ent.decode(iter); + + if (prefix.size() && + (0 != ent.key.name.compare(0, prefix.size(), prefix))) { + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << + ": skippping \"" << ent.key << + "\" because doesn't match prefix" << dendl; + continue; + } + + rgw_obj_key key(ent.key); + uploads.push_back(this->get_multipart_upload(key.name)); + last_obj_key = key; + ocount++; + if (ocount == max_uploads) { + *is_truncated = true; + break; + } + } + marker = last_obj_key.name; + + // What is common prefix? We don't handle it for now. + + return 0; + +} + +int MotrBucket::abort_multiparts(const DoutPrefixProvider *dpp, CephContext *cct) +{ + return 0; +} + +void MotrStore::finalize(void) +{ + // close connection with motr + m0_client_fini(this->instance, true); +} + +const std::string& MotrZoneGroup::get_endpoint() const +{ + if (!group.endpoints.empty()) { + return group.endpoints.front(); + } else { + // use zonegroup's master zone endpoints + auto z = group.zones.find(group.master_zone); + if (z != group.zones.end() && !z->second.endpoints.empty()) { + return z->second.endpoints.front(); + } + } + return empty; +} + +bool MotrZoneGroup::placement_target_exists(std::string& target) const +{ + return !!group.placement_targets.count(target); +} + +void MotrZoneGroup::get_placement_target_names(std::set& names) const +{ + for (const auto& target : group.placement_targets) { + names.emplace(target.second.name); + } +} + +int MotrZoneGroup::get_placement_tier(const rgw_placement_rule& rule, + std::unique_ptr* tier) +{ + std::map::const_iterator titer; + titer = group.placement_targets.find(rule.name); + if (titer == group.placement_targets.end()) { + return -ENOENT; + } + + const auto& target_rule = titer->second; + std::map::const_iterator ttier; + ttier = 
target_rule.tier_targets.find(rule.storage_class); + if (ttier == target_rule.tier_targets.end()) { + // not found + return -ENOENT; + } + + PlacementTier* t; + t = new MotrPlacementTier(store, ttier->second); + if (!t) + return -ENOMEM; + + tier->reset(t); + return 0; +} + +ZoneGroup& MotrZone::get_zonegroup() +{ + return zonegroup; +} + +const std::string& MotrZone::get_id() +{ + return zone_params->get_id(); +} + +const std::string& MotrZone::get_name() const +{ + return zone_params->get_name(); +} + +bool MotrZone::is_writeable() +{ + return true; +} + +bool MotrZone::get_redirect_endpoint(std::string* endpoint) +{ + return false; +} + +bool MotrZone::has_zonegroup_api(const std::string& api) const +{ + return (zonegroup->api_name == api); +} + +const std::string& MotrZone::get_current_period_id() +{ + return current_period->get_id(); +} + +std::unique_ptr MotrStore::get_lua_manager() +{ + return std::make_unique(this); +} + +int MotrObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **_state, optional_yield y, bool follow_olh) +{ + // Get object's metadata (those stored in rgw_bucket_dir_entry). + bufferlist bl; + if (this->store->get_obj_meta_cache()->get(dpp, this->get_key().get_oid(), bl)) { + // Cache misses. + string bucket_index_iname = "motr.rgw.bucket.index." + this->get_bucket()->get_name(); + int rc = this->store->do_idx_op_by_name(bucket_index_iname, + M0_IC_GET, this->get_key().get_oid(), bl); + if (rc < 0) { + ldpp_dout(dpp, 0) << "Failed to get object's entry from bucket index. " << dendl; + return rc; + } + + // Put into cache. + this->store->get_obj_meta_cache()->put(dpp, this->get_key().get_oid(), bl); + } + + rgw_bucket_dir_entry ent; + bufferlist& blr = bl; + auto iter = blr.cbegin(); + ent.decode(iter); + + // Set object's type. + this->category = ent.meta.category; + + // Set object state. 
+ state.exists = true; + state.size = ent.meta.size; + state.accounted_size = ent.meta.size; + state.mtime = ent.meta.mtime; + + state.has_attrs = true; + bufferlist etag_bl; + string& etag = ent.meta.etag; + ldpp_dout(dpp, 20) <<__func__<< ": object's etag: " << ent.meta.etag << dendl; + etag_bl.append(etag); + state.attrset[RGW_ATTR_ETAG] = etag_bl; + + return 0; +} + +MotrObject::~MotrObject() { + this->close_mobj(); +} + +// int MotrObject::read_attrs(const DoutPrefixProvider* dpp, Motr::Object::Read &read_op, optional_yield y, rgw_obj* target_obj) +// { +// read_op.params.attrs = &attrs; +// read_op.params.target_obj = target_obj; +// read_op.params.obj_size = &obj_size; +// read_op.params.lastmod = &mtime; +// +// return read_op.prepare(dpp); +// } + +int MotrObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) +{ + // TODO: implement + ldpp_dout(dpp, 20) <<__func__<< ": MotrObject::set_obj_attrs()" << dendl; + return 0; +} + +int MotrObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj) +{ + if (this->category == RGWObjCategory::MultiMeta) + return 0; + + string bname, key; + if (target_obj) { + bname = target_obj->bucket.name; + key = target_obj->key.get_oid(); + } else { + bname = this->get_bucket()->get_name(); + key = this->get_key().get_oid(); + } + ldpp_dout(dpp, 20) << "MotrObject::get_obj_attrs(): " + << bname << "/" << key << dendl; + + // Get object's metadata (those stored in rgw_bucket_dir_entry). + bufferlist bl; + if (this->store->get_obj_meta_cache()->get(dpp, key, bl)) { + // Cache misses. + string bucket_index_iname = "motr.rgw.bucket.index." + bname; + int rc = this->store->do_idx_op_by_name(bucket_index_iname, M0_IC_GET, key, bl); + if (rc < 0) { + ldpp_dout(dpp, 0) << "Failed to get object's entry from bucket index. " << dendl; + return rc; + } + + // Put into cache. 
+ this->store->get_obj_meta_cache()->put(dpp, key, bl); + } + + rgw_bucket_dir_entry ent; + bufferlist& blr = bl; + auto iter = blr.cbegin(); + ent.decode(iter); + decode(attrs, iter); + + return 0; +} + +int MotrObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) +{ + rgw_obj target = get_obj(); + int r = get_obj_attrs(y, dpp, &target); + if (r < 0) { + return r; + } + set_atomic(); + attrs[attr_name] = attr_val; + return set_obj_attrs(dpp, &attrs, nullptr, y); +} + +int MotrObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) +{ + rgw_obj target = get_obj(); + Attrs rmattr; + bufferlist bl; + + set_atomic(); + rmattr[attr_name] = bl; + return set_obj_attrs(dpp, nullptr, &rmattr, y); +} + +bool MotrObject::is_expired() { + return false; +} + +// Taken from rgw_rados.cc +void MotrObject::gen_rand_obj_instance_name() +{ + enum {OBJ_INSTANCE_LEN = 32}; + char buf[OBJ_INSTANCE_LEN + 1]; + + gen_rand_alphanumeric_no_underscore(store->ctx(), buf, OBJ_INSTANCE_LEN); + state.obj.key.set_instance(buf); +} + +int MotrObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool* pmore, optional_yield y) +{ + return 0; +} + +int MotrObject::omap_get_all(const DoutPrefixProvider *dpp, std::map *m, + optional_yield y) +{ + return 0; +} + +int MotrObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) +{ + return 0; +} + +int MotrObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) +{ + return 0; +} + +int MotrObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) +{ + return 0; +} + +std::unique_ptr MotrObject::get_serializer(const DoutPrefixProvider *dpp, + const std::string& lock_name) +{ + return std::make_unique(dpp, store, this, 
lock_name); +} + +int MotrObject::transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) +{ + return 0; +} + +bool MotrObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) +{ + /* XXX: support single default zone and zonegroup for now */ + return true; +} + +int MotrObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) +{ + return 0; +} + +std::unique_ptr MotrObject::get_read_op() +{ + return std::make_unique(this); +} + +MotrObject::MotrReadOp::MotrReadOp(MotrObject *_source) : + source(_source) +{ } + +int MotrObject::MotrReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp) +{ + int rc; + ldpp_dout(dpp, 20) <<__func__<< ": bucket=" << source->get_bucket()->get_name() << dendl; + + rgw_bucket_dir_entry ent; + rc = source->get_bucket_dir_ent(dpp, ent); + if (rc < 0) + return rc; + + // Set source object's attrs. The attrs is key/value map and is used + // in send_response_data() to set attributes, including etag. 
+ bufferlist etag_bl; + string& etag = ent.meta.etag; + ldpp_dout(dpp, 20) <<__func__<< ": object's etag: " << ent.meta.etag << dendl; + etag_bl.append(etag.c_str(), etag.size()); + source->get_attrs().emplace(std::move(RGW_ATTR_ETAG), std::move(etag_bl)); + + source->set_key(ent.key); + source->set_obj_size(ent.meta.size); + source->category = ent.meta.category; + *params.lastmod = ent.meta.mtime; + + if (params.mod_ptr || params.unmod_ptr) { + // Convert all times go GMT to make them compatible + obj_time_weight src_weight; + src_weight.init(*params.lastmod, params.mod_zone_id, params.mod_pg_ver); + src_weight.high_precision = params.high_precision_time; + + obj_time_weight dest_weight; + dest_weight.high_precision = params.high_precision_time; + + // Check if-modified-since condition + if (params.mod_ptr && !params.if_nomatch) { + dest_weight.init(*params.mod_ptr, params.mod_zone_id, params.mod_pg_ver); + ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " & " + << "Last-Modified: " << src_weight << dendl; + if (!(dest_weight < src_weight)) { + return -ERR_NOT_MODIFIED; + } + } + + // Check if-unmodified-since condition + if (params.unmod_ptr && !params.if_match) { + dest_weight.init(*params.unmod_ptr, params.mod_zone_id, params.mod_pg_ver); + ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " & " + << "Last-Modified: " << src_weight << dendl; + if (dest_weight < src_weight) { + return -ERR_PRECONDITION_FAILED; + } + } + } + // Check if-match condition + if (params.if_match) { + string if_match_str = rgw_string_unquote(params.if_match); + ldpp_dout(dpp, 10) << "ETag: " << etag << " & " + << "If-Match: " << if_match_str << dendl; + if (if_match_str.compare(etag) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + // Check if-none-match condition + if (params.if_nomatch) { + string if_nomatch_str = rgw_string_unquote(params.if_nomatch); + ldpp_dout(dpp, 10) << "ETag: " << etag << " & " + << "If-NoMatch: " << if_nomatch_str << dendl; + 
if (if_nomatch_str.compare(etag) == 0) { + return -ERR_NOT_MODIFIED; + } + } + + // Skip opening an empty object. + if(source->get_obj_size() == 0) + return 0; + + // Open the object here. + if (source->category == RGWObjCategory::MultiMeta) { + ldpp_dout(dpp, 20) <<__func__<< ": open obj parts..." << dendl; + rc = source->get_part_objs(dpp, this->part_objs)? : + source->open_part_objs(dpp, this->part_objs); + return rc; + } else { + ldpp_dout(dpp, 20) <<__func__<< ": open object..." << dendl; + return source->open_mobj(dpp); + } +} + +int MotrObject::MotrReadOp::read(int64_t off, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp) +{ + ldpp_dout(dpp, 20) << "MotrReadOp::read(): sync read." << dendl; + return 0; +} + +// RGWGetObj::execute() calls ReadOp::iterate() to read object from 'off' to 'end'. +// The returned data is processed in 'cb' which is a chain of post-processing +// filters such as decompression, de-encryption and sending back data to client +// (RGWGetObj_CB::handle_dta which in turn calls RGWGetObj::get_data_cb() to +// send data back.). +// +// POC implements a simple sync version of iterate() function in which it reads +// a block of data each time and call 'cb' for post-processing. 
+int MotrObject::MotrReadOp::iterate(const DoutPrefixProvider* dpp, int64_t off, int64_t end, RGWGetDataCB* cb, optional_yield y) +{ + int rc; + + if (source->category == RGWObjCategory::MultiMeta) + rc = source->read_multipart_obj(dpp, off, end, cb, part_objs); + else + rc = source->read_mobj(dpp, off, end, cb); + + return rc; +} + +int MotrObject::MotrReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) +{ + //return 0; + return -ENODATA; +} + +std::unique_ptr MotrObject::get_delete_op() +{ + return std::make_unique(this); +} + +MotrObject::MotrDeleteOp::MotrDeleteOp(MotrObject *_source) : + source(_source) +{ } + +// Implementation of DELETE OBJ also requires MotrObject::get_obj_state() +// to retrieve and set object's state from object's metadata. +// +// TODO: +// 1. The POC only remove the object's entry from bucket index and delete +// corresponding Motr objects. It doesn't handle the DeleteOp::params. +// Delete::delete_obj() in rgw_rados.cc shows how rados backend process the +// params. +// 2. Delete an object when its versioning is turned on. +int MotrObject::MotrDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y) +{ + ldpp_dout(dpp, 20) << "delete " << source->get_key().get_oid() << " from " << source->get_bucket()->get_name() << dendl; + + rgw_bucket_dir_entry ent; + int rc = source->get_bucket_dir_ent(dpp, ent); + if (rc < 0) { + return rc; + } + + //TODO: When integrating with background GC for object deletion, + // we should consider adding object entry to GC before deleting the metadata. + // Delete from the cache first. + source->store->get_obj_meta_cache()->remove(dpp, source->get_key().get_oid()); + + // Delete the object's entry from the bucket index. + bufferlist bl; + string bucket_index_iname = "motr.rgw.bucket.index." 
+ source->get_bucket()->get_name(); + rc = source->store->do_idx_op_by_name(bucket_index_iname, + M0_IC_DEL, source->get_key().get_oid(), bl); + if (rc < 0) { + ldpp_dout(dpp, 0) << "Failed to del object's entry from bucket index. " << dendl; + return rc; + } + + if (ent.meta.size == 0) { + ldpp_dout(dpp, 0) << __func__ << ": Object size is 0, not deleting motr object." << dendl; + return 0; + } + // Remove the motr objects. + if (source->category == RGWObjCategory::MultiMeta) + rc = source->delete_part_objs(dpp); + else + rc = source->delete_mobj(dpp); + if (rc < 0) { + ldpp_dout(dpp, 0) << "Failed to delete the object from Motr. " << dendl; + return rc; + } + + //result.delete_marker = parent_op.result.delete_marker; + //result.version_id = parent_op.result.version_id; + return 0; +} + +int MotrObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, bool prevent_versioning) +{ + MotrObject::MotrDeleteOp del_op(this); + del_op.params.bucket_owner = bucket->get_info().owner; + del_op.params.versioning_status = bucket->get_info().versioning_status(); + + return del_op.delete_obj(dpp, y); +} + +int MotrObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, + Completions* aio, bool keep_index_consistent, + optional_yield y) +{ + /* XXX: Make it async */ + return 0; +} + +int MotrObject::copy_object(User* user, + req_info* info, + const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, + rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, + ceph::real_time* mtime, + const ceph::real_time* mod_ptr, + const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, + const char* if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + Attrs& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, + std::string* tag, + std::string* etag, + void (*progress_cb)(off_t, 
void *), + void* progress_data, + const DoutPrefixProvider* dpp, + optional_yield y) +{ + return 0; +} + +int MotrObject::swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) +{ + return 0; +} + +int MotrObject::swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) +{ + return 0; +} + +MotrAtomicWriter::MotrAtomicWriter(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + MotrStore* _store, + const rgw_user& _owner, + const rgw_placement_rule *_ptail_placement_rule, + uint64_t _olh_epoch, + const std::string& _unique_tag) : + StoreWriter(dpp, y), + store(_store), + owner(_owner), + ptail_placement_rule(_ptail_placement_rule), + olh_epoch(_olh_epoch), + unique_tag(_unique_tag), + obj(_store, obj->get_key(), obj->get_bucket()), + old_obj(_store, obj->get_key(), obj->get_bucket()) {} + +static const unsigned MAX_BUFVEC_NR = 256; + +int MotrAtomicWriter::prepare(optional_yield y) +{ + total_data_size = 0; + + if (obj.is_opened()) + return 0; + + rgw_bucket_dir_entry ent; + int rc = old_obj.get_bucket_dir_ent(dpp, ent); + if (rc == 0) { + ldpp_dout(dpp, 20) << __func__ << ": object exists." 
<< dendl; + } + + rc = m0_bufvec_empty_alloc(&buf, MAX_BUFVEC_NR) ?: + m0_bufvec_alloc(&attr, MAX_BUFVEC_NR, 1) ?: + m0_indexvec_alloc(&ext, MAX_BUFVEC_NR); + if (rc != 0) + this->cleanup(); + + return rc; +} + +int MotrObject::create_mobj(const DoutPrefixProvider *dpp, uint64_t sz) +{ + if (mobj != nullptr) { + ldpp_dout(dpp, 0) <<__func__<< "ERROR: object is already opened" << dendl; + return -EINVAL; + } + + int rc = m0_ufid_next(&ufid_gr, 1, &meta.oid); + if (rc != 0) { + ldpp_dout(dpp, 0) <<__func__<< "ERROR: m0_ufid_next() failed: " << rc << dendl; + return rc; + } + + char fid_str[M0_FID_STR_LEN]; + snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid)); + ldpp_dout(dpp, 20) <<__func__<< ": sz=" << sz << " oid=" << fid_str << dendl; + + int64_t lid = m0_layout_find_by_objsz(store->instance, nullptr, sz); + M0_ASSERT(lid > 0); + + M0_ASSERT(mobj == nullptr); + mobj = new m0_obj(); + m0_obj_init(mobj, &store->container.co_realm, &meta.oid, lid); + + struct m0_op *op = nullptr; + mobj->ob_entity.en_flags |= M0_ENF_META; + rc = m0_entity_create(nullptr, &mobj->ob_entity, &op); + if (rc != 0) { + this->close_mobj(); + ldpp_dout(dpp, 0) << "ERROR: m0_entity_create() failed: " << rc << dendl; + return rc; + } + ldpp_dout(dpp, 20) <<__func__<< ": call m0_op_launch()..." << dendl; + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc != 0) { + this->close_mobj(); + ldpp_dout(dpp, 0) << "ERROR: failed to create motr object: " << rc << dendl; + return rc; + } + + meta.layout_id = mobj->ob_attr.oa_layout_id; + meta.pver = mobj->ob_attr.oa_pver; + ldpp_dout(dpp, 20) <<__func__<< ": lid=0x" << std::hex << meta.layout_id + << std::dec << " rc=" << rc << dendl; + + // TODO: add key:user+bucket+key+obj.meta.oid value:timestamp to + // gc.queue.index. See more at github.com/Seagate/cortx-rgw/issues/7. 
+ + return rc; +} + +int MotrObject::open_mobj(const DoutPrefixProvider *dpp) +{ + char fid_str[M0_FID_STR_LEN]; + snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid)); + ldpp_dout(dpp, 20) <<__func__<< ": oid=" << fid_str << dendl; + + int rc; + if (meta.layout_id == 0) { + rgw_bucket_dir_entry ent; + rc = this->get_bucket_dir_ent(dpp, ent); + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_mobj() failed: rc=" << rc << dendl; + return rc; + } + } + + if (meta.layout_id == 0) + return -ENOENT; + + M0_ASSERT(mobj == nullptr); + mobj = new m0_obj(); + memset(mobj, 0, sizeof *mobj); + m0_obj_init(mobj, &store->container.co_realm, &meta.oid, store->conf.mc_layout_id); + + struct m0_op *op = nullptr; + mobj->ob_attr.oa_layout_id = meta.layout_id; + mobj->ob_attr.oa_pver = meta.pver; + mobj->ob_entity.en_flags |= M0_ENF_META; + rc = m0_entity_open(&mobj->ob_entity, &op); + if (rc != 0) { + ldpp_dout(dpp, 0) << "ERROR: m0_entity_open() failed: rc=" << rc << dendl; + this->close_mobj(); + return rc; + } + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc < 0) { + ldpp_dout(dpp, 10) << "ERROR: failed to open motr object: rc=" << rc << dendl; + this->close_mobj(); + return rc; + } + + ldpp_dout(dpp, 20) <<__func__<< ": rc=" << rc << dendl; + + return 0; +} + +int MotrObject::delete_mobj(const DoutPrefixProvider *dpp) +{ + int rc; + char fid_str[M0_FID_STR_LEN]; + snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid)); + if (!meta.oid.u_hi || !meta.oid.u_lo) { + ldpp_dout(dpp, 20) << __func__ << ": invalid motr object oid=" << fid_str << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 20) << __func__ << ": deleting motr object oid=" << fid_str << dendl; + + // Open the object. + if (mobj == nullptr) { + rc = this->open_mobj(dpp); + if (rc < 0) + return rc; + } + + // Create an DELETE op and execute it (sync version). 
+ struct m0_op *op = nullptr; + mobj->ob_entity.en_flags |= M0_ENF_META; + rc = m0_entity_delete(&mobj->ob_entity, &op); + if (rc != 0) { + ldpp_dout(dpp, 0) << "ERROR: m0_entity_delete() failed: " << rc << dendl; + return rc; + } + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to open motr object: " << rc << dendl; + return rc; + } + + this->close_mobj(); + + return 0; +} + +void MotrObject::close_mobj() +{ + if (mobj == nullptr) + return; + m0_obj_fini(mobj); + delete mobj; mobj = nullptr; +} + +int MotrObject::write_mobj(const DoutPrefixProvider *dpp, bufferlist&& data, uint64_t offset) +{ + int rc; + unsigned bs, left; + struct m0_op *op; + char *start, *p; + struct m0_bufvec buf; + struct m0_bufvec attr; + struct m0_indexvec ext; + + left = data.length(); + if (left == 0) + return 0; + + rc = m0_bufvec_empty_alloc(&buf, 1) ?: + m0_bufvec_alloc(&attr, 1, 1) ?: + m0_indexvec_alloc(&ext, 1); + if (rc != 0) + goto out; + + bs = this->get_optimal_bs(left); + ldpp_dout(dpp, 20) <<__func__<< ": left=" << left << " bs=" << bs << dendl; + + start = data.c_str(); + + for (p = start; left > 0; left -= bs, p += bs, offset += bs) { + if (left < bs) + bs = this->get_optimal_bs(left); + if (left < bs) { + data.append_zero(bs - left); + left = bs; + p = data.c_str(); + } + buf.ov_buf[0] = p; + buf.ov_vec.v_count[0] = bs; + ext.iv_index[0] = offset; + ext.iv_vec.v_count[0] = bs; + attr.ov_vec.v_count[0] = 0; + + op = nullptr; + rc = m0_obj_op(this->mobj, M0_OC_WRITE, &ext, &buf, &attr, 0, 0, &op); + if (rc != 0) + goto out; + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + if (rc != 0) + goto out; + } + +out: + m0_indexvec_free(&ext); + m0_bufvec_free(&attr); + m0_bufvec_free2(&buf); + return rc; +} + +int 
MotrObject::read_mobj(const DoutPrefixProvider* dpp, int64_t off, int64_t end, RGWGetDataCB* cb) +{ + int rc; + unsigned bs, actual, left; + struct m0_op *op; + struct m0_bufvec buf; + struct m0_bufvec attr; + struct m0_indexvec ext; + + // make end pointer exclusive: + // it's easier to work with it this way + end++; + ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): off=" << off << + " end=" << end << dendl; + // As `off` may not be parity group size aligned, even using optimal + // buffer block size, simply reading data from offset `off` could come + // across parity group boundary. And Motr only allows page-size aligned + // offset. + // + // The optimal size of each IO should also take into account the data + // transfer size to s3 client. For example, 16MB may be nice to read + // data from motr, but it could be too big for network transfer. + // + // TODO: We leave proper handling of offset in the future. + bs = this->get_optimal_bs(end - off); + ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): bs=" << bs << dendl; + + rc = m0_bufvec_empty_alloc(&buf, 1) ? : + m0_bufvec_alloc(&attr, 1, 1) ? : + m0_indexvec_alloc(&ext, 1); + if (rc < 0) + goto out; + + left = end - off; + for (; left > 0; off += actual) { + if (left < bs) + bs = this->get_optimal_bs(left); + actual = bs; + if (left < bs) + actual = left; + ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): off=" << off << + " actual=" << actual << dendl; + bufferlist bl; + buf.ov_buf[0] = bl.append_hole(bs).c_str(); + buf.ov_vec.v_count[0] = bs; + ext.iv_index[0] = off; + ext.iv_vec.v_count[0] = bs; + attr.ov_vec.v_count[0] = 0; + + left -= actual; + // Read from Motr. 
+ op = nullptr; + rc = m0_obj_op(this->mobj, M0_OC_READ, &ext, &buf, &attr, 0, 0, &op); + ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): init read op rc=" << rc << dendl; + if (rc != 0) { + ldpp_dout(dpp, 0) << __func__ << ": read failed during m0_obj_op, rc=" << rc << dendl; + goto out; + } + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + if (rc != 0) { + ldpp_dout(dpp, 0) << __func__ << ": read failed, m0_op_wait rc=" << rc << dendl; + goto out; + } + // Call `cb` to process returned data. + ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): call cb to process data" << dendl; + cb->handle_data(bl, 0, actual); + } + +out: + m0_indexvec_free(&ext); + m0_bufvec_free(&attr); + m0_bufvec_free2(&buf); + this->close_mobj(); + + return rc; +} + +int MotrObject::get_bucket_dir_ent(const DoutPrefixProvider *dpp, rgw_bucket_dir_entry& ent) +{ + int rc = 0; + string bucket_index_iname = "motr.rgw.bucket.index." + this->get_bucket()->get_name(); + int max = 1000; + vector keys(max); + vector vals(max); + bufferlist bl; + bufferlist::const_iterator iter; + + if (this->get_bucket()->get_info().versioning_status() == BUCKET_VERSIONED || + this->get_bucket()->get_info().versioning_status() == BUCKET_SUSPENDED) { + + rgw_bucket_dir_entry ent_to_check; + + if (this->store->get_obj_meta_cache()->get(dpp, this->get_name(), bl) == 0) { + iter = bl.cbegin(); + ent_to_check.decode(iter); + if (ent_to_check.is_current()) { + ent = ent_to_check; + rc = 0; + goto out; + } + } + + ldpp_dout(dpp, 20) <<__func__<< ": versioned bucket!" << dendl; + keys[0] = this->get_name(); + rc = store->next_query_by_name(bucket_index_iname, keys, vals); + if (rc < 0) { + ldpp_dout(dpp, 0) << __func__ << "ERROR: NEXT query failed. 
" << rc << dendl; + return rc; + } + + rc = -ENOENT; + for (const auto& bl: vals) { + if (bl.length() == 0) + break; + + iter = bl.cbegin(); + ent_to_check.decode(iter); + if (ent_to_check.is_current()) { + ldpp_dout(dpp, 20) <<__func__<< ": found current version!" << dendl; + ent = ent_to_check; + rc = 0; + + this->store->get_obj_meta_cache()->put(dpp, this->get_name(), bl); + + break; + } + } + } else { + if (this->store->get_obj_meta_cache()->get(dpp, this->get_key().get_oid(), bl)) { + ldpp_dout(dpp, 20) <<__func__<< ": non-versioned bucket!" << dendl; + rc = this->store->do_idx_op_by_name(bucket_index_iname, + M0_IC_GET, this->get_key().get_oid(), bl); + if (rc < 0) { + ldpp_dout(dpp, 0) << __func__ << "ERROR: failed to get object's entry from bucket index: rc=" + << rc << dendl; + return rc; + } + this->store->get_obj_meta_cache()->put(dpp, this->get_key().get_oid(), bl); + } + + bufferlist& blr = bl; + iter = blr.cbegin(); + ent.decode(iter); + } + +out: + if (rc == 0) { + sal::Attrs dummy; + decode(dummy, iter); + meta.decode(iter); + ldpp_dout(dpp, 20) <<__func__<< ": lid=0x" << std::hex << meta.layout_id << dendl; + char fid_str[M0_FID_STR_LEN]; + snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid)); + ldpp_dout(dpp, 70) << __func__ << ": oid=" << fid_str << dendl; + } else + ldpp_dout(dpp, 0) <<__func__<< ": rc=" << rc << dendl; + + return rc; +} + +int MotrObject::update_version_entries(const DoutPrefixProvider *dpp) +{ + int rc; + int max = 10; + vector keys(max); + vector vals(max); + + string bucket_index_iname = "motr.rgw.bucket.index." + this->get_bucket()->get_name(); + keys[0] = this->get_name(); + rc = store->next_query_by_name(bucket_index_iname, keys, vals); + ldpp_dout(dpp, 20) << "get all versions, name = " << this->get_name() << "rc = " << rc << dendl; + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl; + return rc; + } + + // no entries returned. 
+ if (rc == 0) + return 0; + + for (const auto& bl: vals) { + if (bl.length() == 0) + break; + + rgw_bucket_dir_entry ent; + auto iter = bl.cbegin(); + ent.decode(iter); + + if (0 != ent.key.name.compare(0, this->get_name().size(), this->get_name())) + continue; + + if (!ent.is_current()) + continue; + + // Remove from the cache. + store->get_obj_meta_cache()->remove(dpp, this->get_name()); + + rgw::sal::Attrs attrs; + decode(attrs, iter); + MotrObject::Meta meta; + meta.decode(iter); + + ent.flags = rgw_bucket_dir_entry::FLAG_VER; + string key; + if (ent.key.instance.empty()) + key = ent.key.name; + else { + char buf[ent.key.name.size() + ent.key.instance.size() + 16]; + snprintf(buf, sizeof(buf), "%s[%s]", ent.key.name.c_str(), ent.key.instance.c_str()); + key = buf; + } + ldpp_dout(dpp, 20) << "update one version, key = " << key << dendl; + bufferlist ent_bl; + ent.encode(ent_bl); + encode(attrs, ent_bl); + meta.encode(ent_bl); + + rc = store->do_idx_op_by_name(bucket_index_iname, + M0_IC_PUT, key, ent_bl); + if (rc < 0) + break; + } + return rc; +} + +// Scan object_nnn_part_index to get all parts then open their motr objects. +// TODO: all parts are opened in the POC. But for a large object, for example +// a 5GB object will have about 300 parts (for default 15MB part). A better +// way of managing opened object may be needed. 
+int MotrObject::get_part_objs(const DoutPrefixProvider* dpp, + std::map>& part_objs) +{ + int rc; + int max_parts = 1000; + int marker = 0; + uint64_t off = 0; + bool truncated = false; + std::unique_ptr upload; + + upload = this->get_bucket()->get_multipart_upload(this->get_name(), string()); + + do { + rc = upload->list_parts(dpp, store->ctx(), max_parts, marker, &marker, &truncated); + if (rc == -ENOENT) { + rc = -ERR_NO_SUCH_UPLOAD; + } + if (rc < 0) + return rc; + + std::map>& parts = upload->get_parts(); + for (auto part_iter = parts.begin(); part_iter != parts.end(); ++part_iter) { + + MultipartPart *mpart = part_iter->second.get(); + MotrMultipartPart *mmpart = static_cast(mpart); + uint32_t part_num = mmpart->get_num(); + uint64_t part_size = mmpart->get_size(); + + string part_obj_name = this->get_bucket()->get_name() + "." + + this->get_key().get_oid() + + ".part." + std::to_string(part_num); + std::unique_ptr obj; + obj = this->bucket->get_object(rgw_obj_key(part_obj_name)); + std::unique_ptr mobj(static_cast(obj.release())); + + ldpp_dout(dpp, 20) << "get_part_objs: off = " << off << ", size = " << part_size << dendl; + mobj->part_off = off; + mobj->part_size = part_size; + mobj->part_num = part_num; + mobj->meta = mmpart->meta; + + part_objs.emplace(part_num, std::move(mobj)); + + off += part_size; + } + } while (truncated); + + return 0; +} + +int MotrObject::open_part_objs(const DoutPrefixProvider* dpp, + std::map>& part_objs) +{ + //for (auto& iter: part_objs) { + for (auto iter = part_objs.begin(); iter != part_objs.end(); ++iter) { + MotrObject* obj = static_cast(iter->second.get()); + ldpp_dout(dpp, 20) << "open_part_objs: name = " << obj->get_name() << dendl; + int rc = obj->open_mobj(dpp); + if (rc < 0) + return rc; + } + + return 0; +} + +int MotrObject::delete_part_objs(const DoutPrefixProvider* dpp) +{ + std::unique_ptr upload; + upload = this->get_bucket()->get_multipart_upload(this->get_name(), string()); + std::unique_ptr 
mupload(static_cast(upload.release())); + return mupload->delete_parts(dpp); +} + +int MotrObject::read_multipart_obj(const DoutPrefixProvider* dpp, + int64_t off, int64_t end, RGWGetDataCB* cb, + std::map>& part_objs) +{ + int64_t cursor = off; + + ldpp_dout(dpp, 20) << "read_multipart_obj: off=" << off << " end=" << end << dendl; + + // Find the parts which are in the (off, end) range and + // read data from it. Note: `end` argument is inclusive. + for (auto iter = part_objs.begin(); iter != part_objs.end(); ++iter) { + MotrObject* obj = static_cast(iter->second.get()); + int64_t part_off = obj->part_off; + int64_t part_size = obj->part_size; + int64_t part_end = obj->part_off + obj->part_size - 1; + ldpp_dout(dpp, 20) << "read_multipart_obj: part_off=" << part_off + << " part_end=" << part_end << dendl; + if (part_end < off) + continue; + + int64_t local_off = cursor - obj->part_off; + int64_t local_end = part_end < end? part_size - 1 : end - part_off; + ldpp_dout(dpp, 20) << "real_multipart_obj: name=" << obj->get_name() + << " local_off=" << local_off + << " local_end=" << local_end << dendl; + int rc = obj->read_mobj(dpp, local_off, local_end, cb); + if (rc < 0) + return rc; + + cursor = part_end + 1; + if (cursor > end) + break; + } + + return 0; +} + +static unsigned roundup(unsigned x, unsigned by) +{ + return ((x - 1) / by + 1) * by; +} + +unsigned MotrObject::get_optimal_bs(unsigned len) +{ + struct m0_pool_version *pver; + + pver = m0_pool_version_find(&store->instance->m0c_pools_common, + &mobj->ob_attr.oa_pver); + M0_ASSERT(pver != nullptr); + struct m0_pdclust_attr *pa = &pver->pv_attr; + uint64_t lid = M0_OBJ_LAYOUT_ID(meta.layout_id); + unsigned unit_sz = m0_obj_layout_id_to_unit_size(lid); + unsigned grp_sz = unit_sz * pa->pa_N; + + // bs should be max 4-times pool-width deep counting by 1MB units, or + // 8-times deep counting by 512K units, 16-times deep by 256K units, + // and so on. 
Several units to one target will be aggregated to make + // fewer network RPCs, disk i/o operations and BE transactions. + // For unit sizes of 32K or less, the depth is 128, which + // makes it 32K * 128 == 4MB - the maximum amount per target when + // the performance is still good on LNet (which has max 1MB frames). + // TODO: it may be different on libfabric, should be re-measured. + unsigned depth = 128 / ((unit_sz + 0x7fff) / 0x8000); + if (depth == 0) + depth = 1; + // P * N / (N + K + S) - number of data units to span the pool-width + unsigned max_bs = depth * unit_sz * pa->pa_P * pa->pa_N / + (pa->pa_N + pa->pa_K + pa->pa_S); + max_bs = roundup(max_bs, grp_sz); // multiple of group size + if (len >= max_bs) + return max_bs; + else if (len <= grp_sz) + return grp_sz; + else + return roundup(len, grp_sz); +} + +void MotrAtomicWriter::cleanup() +{ + m0_indexvec_free(&ext); + m0_bufvec_free(&attr); + m0_bufvec_free2(&buf); + acc_data.clear(); + obj.close_mobj(); + old_obj.close_mobj(); +} + +unsigned MotrAtomicWriter::populate_bvec(unsigned len, bufferlist::iterator &bi) +{ + unsigned i, l, done = 0; + const char *data; + + for (i = 0; i < MAX_BUFVEC_NR && len > 0; ++i) { + l = bi.get_ptr_and_advance(len, &data); + buf.ov_buf[i] = (char*)data; + buf.ov_vec.v_count[i] = l; + ext.iv_index[i] = acc_off; + ext.iv_vec.v_count[i] = l; + attr.ov_vec.v_count[i] = 0; + acc_off += l; + len -= l; + done += l; + } + buf.ov_vec.v_nr = i; + ext.iv_vec.v_nr = i; + + return done; +} + +int MotrAtomicWriter::write() +{ + int rc; + unsigned bs, left; + struct m0_op *op; + bufferlist::iterator bi; + + left = acc_data.length(); + + if (!obj.is_opened()) { + rc = obj.create_mobj(dpp, left); + if (rc == -EEXIST) + rc = obj.open_mobj(dpp); + if (rc != 0) { + char fid_str[M0_FID_STR_LEN]; + snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&obj.meta.oid)); + ldpp_dout(dpp, 0) << "ERROR: failed to create/open motr object " + << fid_str << " (" << obj.get_bucket()->get_name() + << 
"/" << obj.get_key().get_oid() << "): rc=" << rc + << dendl; + goto err; + } + } + + total_data_size += left; + + bs = obj.get_optimal_bs(left); + ldpp_dout(dpp, 20) <<__func__<< ": left=" << left << " bs=" << bs << dendl; + + bi = acc_data.begin(); + while (left > 0) { + if (left < bs) + bs = obj.get_optimal_bs(left); + if (left < bs) { + acc_data.append_zero(bs - left); + auto off = bi.get_off(); + bufferlist tmp; + acc_data.splice(off, bs, &tmp); + acc_data.clear(); + acc_data.append(tmp.c_str(), bs); // make it a single buf + bi = acc_data.begin(); + left = bs; + } + + left -= this->populate_bvec(bs, bi); + + op = nullptr; + rc = m0_obj_op(obj.mobj, M0_OC_WRITE, &ext, &buf, &attr, 0, 0, &op); + if (rc != 0) + goto err; + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + if (rc != 0) + goto err; + } + acc_data.clear(); + + return 0; + +err: + this->cleanup(); + return rc; +} + +static const unsigned MAX_ACC_SIZE = 32 * 1024 * 1024; + +// Accumulate enough data first to make a reasonable decision about the +// optimal unit size for a new object, or bs for existing object (32M seems +// enough for 4M units in 8+2 parity groups, a common config on wide pools), +// and then launch the write operations. 
+int MotrAtomicWriter::process(bufferlist&& data, uint64_t offset) +{ + if (data.length() == 0) { // last call, flush data + int rc = 0; + if (acc_data.length() != 0) + rc = this->write(); + this->cleanup(); + return rc; + } + + if (acc_data.length() == 0) + acc_off = offset; + + acc_data.append(std::move(data)); + if (acc_data.length() < MAX_ACC_SIZE) + return 0; + + return this->write(); +} + +int MotrAtomicWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) +{ + int rc = 0; + + if (acc_data.length() != 0) { // check again, just in case + rc = this->write(); + this->cleanup(); + if (rc != 0) + return rc; + } + + bufferlist bl; + rgw_bucket_dir_entry ent; + + // Set rgw_bucet_dir_entry. Some of the member of this structure may not + // apply to motr. For example the storage_class. + // + // Checkout AtomicObjectProcessor::complete() in rgw_putobj_processor.cc + // and RGWRados::Object::Write::write_meta() in rgw_rados.cc for what and + // how to set the dir entry. Only set the basic ones for POC, no ACLs and + // other attrs. + obj.get_key().get_index_key(&ent.key); + ent.meta.size = total_data_size; + ent.meta.accounted_size = total_data_size; + ent.meta.mtime = real_clock::is_zero(set_mtime)? 
ceph::real_clock::now() : set_mtime; + ent.meta.etag = etag; + ent.meta.owner = owner.to_str(); + ent.meta.owner_display_name = obj.get_bucket()->get_owner()->get_display_name(); + bool is_versioned = obj.get_key().have_instance(); + if (is_versioned) + ent.flags = rgw_bucket_dir_entry::FLAG_VER | rgw_bucket_dir_entry::FLAG_CURRENT; + ldpp_dout(dpp, 20) <<__func__<< ": key=" << obj.get_key().get_oid() + << " etag: " << etag << " user_data=" << user_data << dendl; + if (user_data) + ent.meta.user_data = *user_data; + ent.encode(bl); + + RGWBucketInfo &info = obj.get_bucket()->get_info(); + if (info.obj_lock_enabled() && info.obj_lock.has_rule()) { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter == attrs.end()) { + real_time lock_until_date = info.obj_lock.get_lock_until_date(ent.meta.mtime); + string mode = info.obj_lock.get_mode(); + RGWObjectRetention obj_retention(mode, lock_until_date); + bufferlist retention_bl; + obj_retention.encode(retention_bl); + attrs[RGW_ATTR_OBJECT_RETENTION] = retention_bl; + } + } + encode(attrs, bl); + obj.meta.encode(bl); + ldpp_dout(dpp, 20) <<__func__<< ": lid=0x" << std::hex << obj.meta.layout_id + << dendl; + if (is_versioned) { + // get the list of all versioned objects with the same key and + // unset their FLAG_CURRENT later, if do_idx_op_by_name() is successful. + // Note: without distributed lock on the index - it is possible that 2 + // CURRENT entries would appear in the bucket. For example, consider the + // following scenario when two clients are trying to add the new object + // version concurrently: + // client 1: reads all the CURRENT entries + // client 2: updates the index and sets the new CURRENT + // client 1: updates the index and sets the new CURRENT + // At the step (1) client 1 would not see the new current record from step (2), + // so it won't update it. As a result, two CURRENT version entries will appear + // in the bucket. 
+ // TODO: update the current version (unset the flag) and insert the new current + // version can be launched in one motr op. This requires change at do_idx_op() + // and do_idx_op_by_name(). + rc = obj.update_version_entries(dpp); + if (rc < 0) + return rc; + } + // Insert an entry into bucket index. + string bucket_index_iname = "motr.rgw.bucket.index." + obj.get_bucket()->get_name(); + rc = store->do_idx_op_by_name(bucket_index_iname, + M0_IC_PUT, obj.get_key().get_oid(), bl); + if (rc == 0) + store->get_obj_meta_cache()->put(dpp, obj.get_key().get_oid(), bl); + + if (old_obj.get_bucket()->get_info().versioning_status() != BUCKET_VERSIONED) { + // Delete old object data if exists. + old_obj.delete_mobj(dpp); + } + + // TODO: We need to handle the object leak caused by parallel object upload by + // making use of background gc, which is currently not enabled for motr. + return rc; +} + +int MotrMultipartUpload::delete_parts(const DoutPrefixProvider *dpp) +{ + int rc; + int max_parts = 1000; + int marker = 0; + bool truncated = false; + + // Scan all parts and delete the corresponding motr objects. + do { + rc = this->list_parts(dpp, store->ctx(), max_parts, marker, &marker, &truncated); + if (rc == -ENOENT) { + truncated = false; + rc = 0; + } + if (rc < 0) + return rc; + + std::map>& parts = this->get_parts(); + for (auto part_iter = parts.begin(); part_iter != parts.end(); ++part_iter) { + + MultipartPart *mpart = part_iter->second.get(); + MotrMultipartPart *mmpart = static_cast(mpart); + uint32_t part_num = mmpart->get_num(); + + // Delete the part object. Note that the part object is not + // inserted into bucket index, only the corresponding motr object + // needs to be delete. That is why we don't call + // MotrObject::delete_object(). + string part_obj_name = bucket->get_name() + "." + + mp_obj.get_key() + + ".part." 
+ std::to_string(part_num); + std::unique_ptr obj; + obj = this->bucket->get_object(rgw_obj_key(part_obj_name)); + std::unique_ptr mobj(static_cast(obj.release())); + mobj->meta = mmpart->meta; + rc = mobj->delete_mobj(dpp); + if (rc < 0) { + ldpp_dout(dpp, 0) << __func__ << ": Failed to delete object from Motr. rc=" << rc << dendl; + return rc; + } + } + } while (truncated); + + // Delete object part index. + std::string oid = mp_obj.get_key(); + string obj_part_iname = "motr.rgw.object." + bucket->get_name() + "." + oid + ".parts"; + return store->delete_motr_idx_by_name(obj_part_iname); +} + +int MotrMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct) +{ + int rc; + // Check if multipart upload exists + bufferlist bl; + std::unique_ptr meta_obj; + meta_obj = get_meta_obj(); + string bucket_multipart_iname = + "motr.rgw.bucket." + meta_obj->get_bucket()->get_name() + ".multiparts"; + rc = store->do_idx_op_by_name(bucket_multipart_iname, + M0_IC_GET, meta_obj->get_key().to_str(), bl); + if (rc < 0) { + ldpp_dout(dpp, 0) << __func__ << ": Failed to get multipart upload. rc=" << rc << dendl; + return rc == -ENOENT ? -ERR_NO_SUCH_UPLOAD : rc; + } + + // Scan all parts and delete the corresponding motr objects. + rc = this->delete_parts(dpp); + if (rc < 0) + return rc; + + bl.clear(); + // Remove the upload from bucket multipart index. 
+ rc = store->do_idx_op_by_name(bucket_multipart_iname, + M0_IC_DEL, meta_obj->get_key().get_oid(), bl); + return rc; +} + +std::unique_ptr MotrMultipartUpload::get_meta_obj() +{ + std::unique_ptr obj = bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns)); + std::unique_ptr mobj(static_cast(obj.release())); + mobj->set_category(RGWObjCategory::MultiMeta); + return mobj; +} + +struct motr_multipart_upload_info +{ + rgw_placement_rule dest_placement; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(dest_placement, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(dest_placement, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(motr_multipart_upload_info) + +int MotrMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, + ACLOwner& _owner, + rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) +{ + int rc; + std::string oid = mp_obj.get_key(); + + owner = _owner; + + do { + char buf[33]; + string tmp_obj_name; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */ + upload_id.append(buf); + + mp_obj.init(oid, upload_id); + tmp_obj_name = mp_obj.get_meta(); + + std::unique_ptr obj; + obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns)); + // the meta object will be indexed with 0 size, we c + obj->set_in_extra_data(true); + obj->set_hash_source(oid); + + motr_multipart_upload_info upload_info; + upload_info.dest_placement = dest_placement; + bufferlist mpbl; + encode(upload_info, mpbl); + + // Create an initial entry in the bucket. The entry will be + // updated when multipart upload is completed, for example, + // size, etag etc. 
+ bufferlist bl; + rgw_bucket_dir_entry ent; + obj->get_key().get_index_key(&ent.key); + ent.meta.owner = owner.get_id().to_str(); + ent.meta.category = RGWObjCategory::MultiMeta; + ent.meta.mtime = ceph::real_clock::now(); + ent.meta.user_data.assign(mpbl.c_str(), mpbl.c_str() + mpbl.length()); + ent.encode(bl); + + // Insert an entry into bucket multipart index so it is not shown + // when listing a bucket. + string bucket_multipart_iname = + "motr.rgw.bucket." + obj->get_bucket()->get_name() + ".multiparts"; + rc = store->do_idx_op_by_name(bucket_multipart_iname, + M0_IC_PUT, obj->get_key().get_oid(), bl); + + } while (rc == -EEXIST); + + if (rc < 0) + return rc; + + // Create object part index. + // TODO: add bucket as part of the name. + string obj_part_iname = "motr.rgw.object." + bucket->get_name() + "." + oid + ".parts"; + ldpp_dout(dpp, 20) << "MotrMultipartUpload::init(): object part index=" << obj_part_iname << dendl; + rc = store->create_motr_idx_by_name(obj_part_iname); + if (rc == -EEXIST) + rc = 0; + if (rc < 0) + // TODO: clean the bucket index entry + ldpp_dout(dpp, 0) << "Failed to create object multipart index " << obj_part_iname << dendl; + + return rc; +} + +int MotrMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct, + int num_parts, int marker, + int *next_marker, bool *truncated, + bool assume_unsorted) +{ + int rc; + vector key_vec(num_parts); + vector val_vec(num_parts); + + std::string oid = mp_obj.get_key(); + string obj_part_iname = "motr.rgw.object." + bucket->get_name() + "." + oid + ".parts"; + ldpp_dout(dpp, 20) << __func__ << ": object part index = " << obj_part_iname << dendl; + key_vec[0].clear(); + key_vec[0] = "part."; + char buf[32]; + snprintf(buf, sizeof(buf), "%08d", marker + 1); + key_vec[0].append(buf); + rc = store->next_query_by_name(obj_part_iname, key_vec, val_vec); + if (rc < 0) { + ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. 
" << rc << dendl; + return rc; + } + + int last_num = 0; + int part_cnt = 0; + uint32_t expected_next = 0; + ldpp_dout(dpp, 20) << __func__ << ": marker = " << marker << dendl; + for (const auto& bl: val_vec) { + if (bl.length() == 0) + break; + + RGWUploadPartInfo info; + auto iter = bl.cbegin(); + info.decode(iter); + rgw::sal::Attrs attrs_dummy; + decode(attrs_dummy, iter); + MotrObject::Meta meta; + meta.decode(iter); + + ldpp_dout(dpp, 20) << __func__ << ": part_num=" << info.num + << " part_size=" << info.size << dendl; + ldpp_dout(dpp, 20) << __func__ << ": meta:oid=[" << meta.oid.u_hi << "," << meta.oid.u_lo + << "], meta:pvid=[" << meta.pver.f_container << "," << meta.pver.f_key + << "], meta:layout id=" << meta.layout_id << dendl; + + if (!expected_next) + expected_next = info.num + 1; + else if (expected_next && info.num != expected_next) + return -EINVAL; + else expected_next = info.num + 1; + + if ((int)info.num > marker) { + last_num = info.num; + parts.emplace(info.num, std::make_unique(info, meta)); + } + + part_cnt++; + } + + // Does it have more parts? + if (truncated) + *truncated = part_cnt < num_parts? 
false : true; + ldpp_dout(dpp, 20) << __func__ << ": truncated=" << *truncated << dendl; + + if (next_marker) + *next_marker = last_num; + + return 0; +} + +// Heavily copy from rgw_sal_rados.cc +int MotrMultipartUpload::complete(const DoutPrefixProvider *dpp, + optional_yield y, CephContext* cct, + map& part_etags, + list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& off, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) +{ + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + std::string etag; + bufferlist etag_bl; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + bool truncated; + int rc; + + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): enter" << dendl; + int total_parts = 0; + int handled_parts = 0; + int max_parts = 1000; + int marker = 0; + uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size; + auto etags_iter = part_etags.begin(); + rgw::sal::Attrs attrs = target_obj->get_attrs(); + + do { + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): list_parts()" << dendl; + rc = list_parts(dpp, cct, max_parts, marker, &marker, &truncated); + if (rc == -ENOENT) { + rc = -ERR_NO_SUCH_UPLOAD; + } + if (rc < 0) + return rc; + + total_parts += parts.size(); + if (!truncated && total_parts != (int)part_etags.size()) { + ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts + << " expected: " << part_etags.size() << dendl; + rc = -ERR_INVALID_PART; + return rc; + } + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): parts.size()=" << parts.size() << dendl; + + for (auto obj_iter = parts.begin(); + etags_iter != part_etags.end() && obj_iter != parts.end(); + ++etags_iter, ++obj_iter, ++handled_parts) { + MultipartPart *mpart = obj_iter->second.get(); + MotrMultipartPart *mmpart = 
static_cast(mpart); + RGWUploadPartInfo *part = &mmpart->info; + + uint64_t part_size = part->accounted_size; + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): part_size=" << part_size << dendl; + if (handled_parts < (int)part_etags.size() - 1 && + part_size < min_part_size) { + rc = -ERR_TOO_SMALL; + return rc; + } + + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + if (etags_iter->first != (int)obj_iter->first) { + ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: " + << etags_iter->first << " next uploaded: " + << obj_iter->first << dendl; + rc = -ERR_INVALID_PART; + return rc; + } + string part_etag = rgw_string_unquote(etags_iter->second); + if (part_etag.compare(part->etag) != 0) { + ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first + << " etag: " << etags_iter->second << dendl; + rc = -ERR_INVALID_PART; + return rc; + } + + hex_to_buf(part->etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): calc etag " << dendl; + + string oid = mp_obj.get_part(part->num); + rgw_obj src_obj; + src_obj.init_ns(bucket->get_key(), oid, mp_ns); + +#if 0 // does Motr backend need it? 
+ /* update manifest for part */ + if (part->manifest.empty()) { + ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj=" + << src_obj << dendl; + rc = -ERR_INVALID_PART; + return rc; + } else { + manifest.append(dpp, part->manifest, store->get_zone()); + } + ldpp_dout(dpp, 0) << "MotrMultipartUpload::complete(): manifest " << dendl; +#endif + + bool part_compressed = (part->cs_info.compression_type != "none"); + if ((handled_parts > 0) && + ((part_compressed != compressed) || + (cs_info.compression_type != part->cs_info.compression_type))) { + ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload (" + << cs_info.compression_type << ">>" << part->cs_info.compression_type << ")" << dendl; + rc = -ERR_INVALID_PART; + return rc; + } + + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): part compression" << dendl; + if (part_compressed) { + int64_t new_ofs; // offset in compression data for new part + if (cs_info.blocks.size() > 0) + new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len; + else + new_ofs = 0; + for (const auto& block : part->cs_info.blocks) { + compression_block cb; + cb.old_ofs = block.old_ofs + cs_info.orig_size; + cb.new_ofs = new_ofs; + cb.len = block.len; + cs_info.blocks.push_back(cb); + new_ofs = cb.new_ofs + cb.len; + } + if (!compressed) + cs_info.compression_type = part->cs_info.compression_type; + cs_info.orig_size += part->cs_info.orig_size; + compressed = true; + } + + // We may not need to do the following as remove_objs are those + // don't show when listing a bucket. As we store in-progress uploaded + // object's metadata in a separate index, they are not shown when + // listing a bucket. 
+ rgw_obj_index_key remove_key; + src_obj.key.get_index_key(&remove_key); + remove_objs.push_back(remove_key); + + off += part_size; + accounted_size += part->accounted_size; + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): off=" << off << ", accounted_size = " << accounted_size << dendl; + } + } while (truncated); + hash.Final((unsigned char *)final_etag); + + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], + sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)part_etags.size()); + etag = final_etag_str; + ldpp_dout(dpp, 20) << "calculated etag: " << etag << dendl; + etag_bl.append(etag); + attrs[RGW_ATTR_ETAG] = etag_bl; + + if (compressed) { + // write compression attribute to full object + bufferlist tmp; + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + } + + // Read the object's the multipart_upload_info. + // TODO: all those index name and key constructions should be implemented as + // member functions. + bufferlist bl; + std::unique_ptr meta_obj; + meta_obj = get_meta_obj(); + string bucket_multipart_iname = + "motr.rgw.bucket." + meta_obj->get_bucket()->get_name() + ".multiparts"; + rc = this->store->do_idx_op_by_name(bucket_multipart_iname, + M0_IC_GET, meta_obj->get_key().get_oid(), bl); + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): read entry from bucket multipart index rc=" << rc << dendl; + if (rc < 0) + return rc; + rgw_bucket_dir_entry ent; + bufferlist& blr = bl; + auto ent_iter = blr.cbegin(); + ent.decode(ent_iter); + + // Update the dir entry and insert it to the bucket index so + // the object will be seen when listing the bucket. 
+ bufferlist update_bl; + target_obj->get_key().get_index_key(&ent.key); // Change to offical name :) + ent.meta.size = off; + ent.meta.accounted_size = accounted_size; + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): obj size=" << ent.meta.size + << " obj accounted size=" << ent.meta.accounted_size << dendl; + ent.meta.mtime = ceph::real_clock::now(); + ent.meta.etag = etag; + ent.encode(update_bl); + encode(attrs, update_bl); + MotrObject::Meta meta_dummy; + meta_dummy.encode(update_bl); + + string bucket_index_iname = "motr.rgw.bucket.index." + meta_obj->get_bucket()->get_name(); + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): target_obj name=" << target_obj->get_name() + << " target_obj oid=" << target_obj->get_oid() << dendl; + rc = store->do_idx_op_by_name(bucket_index_iname, M0_IC_PUT, + target_obj->get_name(), update_bl); + if (rc < 0) + return rc; + + // Put into metadata cache. + store->get_obj_meta_cache()->put(dpp, target_obj->get_name(), update_bl); + + // Now we can remove it from bucket multipart index. + ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): remove from bucket multipartindex " << dendl; + return store->do_idx_op_by_name(bucket_multipart_iname, + M0_IC_DEL, meta_obj->get_key().get_oid(), bl); +} + +int MotrMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) +{ + if (!rule && !attrs) { + return 0; + } + + if (rule) { + if (!placement.empty()) { + *rule = &placement; + if (!attrs) { + /* Don't need attrs, done */ + return 0; + } + } else { + *rule = nullptr; + } + } + + std::unique_ptr meta_obj; + meta_obj = get_meta_obj(); + meta_obj->set_in_extra_data(true); + + // Read the object's the multipart_upload_info. + bufferlist bl; + string bucket_multipart_iname = + "motr.rgw.bucket." 
+ meta_obj->get_bucket()->get_name() + ".multiparts"; + int rc = this->store->do_idx_op_by_name(bucket_multipart_iname, + M0_IC_GET, meta_obj->get_key().get_oid(), bl); + if (rc < 0) { + ldpp_dout(dpp, 0) << __func__ << ": Failed to get multipart info. rc=" << rc << dendl; + return rc == -ENOENT ? -ERR_NO_SUCH_UPLOAD : rc; + } + + rgw_bucket_dir_entry ent; + bufferlist& blr = bl; + auto ent_iter = blr.cbegin(); + ent.decode(ent_iter); + + if (attrs) { + bufferlist etag_bl; + string& etag = ent.meta.etag; + ldpp_dout(dpp, 20) << "object's etag: " << ent.meta.etag << dendl; + etag_bl.append(etag.c_str(), etag.size()); + attrs->emplace(std::move(RGW_ATTR_ETAG), std::move(etag_bl)); + if (!rule || *rule != nullptr) { + /* placement was cached; don't actually read */ + return 0; + } + } + + /* Decode multipart_upload_info */ + motr_multipart_upload_info upload_info; + bufferlist mpbl; + mpbl.append(ent.meta.user_data.c_str(), ent.meta.user_data.size()); + auto mpbl_iter = mpbl.cbegin(); + upload_info.decode(mpbl_iter); + placement = upload_info.dest_placement; + *rule = &placement; + + return 0; +} + +std::unique_ptr MotrMultipartUpload::get_writer( + const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) +{ + return std::make_unique(dpp, y, this, + obj, store, owner, + ptail_placement_rule, part_num, part_num_str); +} + +int MotrMultipartWriter::prepare(optional_yield y) +{ + string part_obj_name = head_obj->get_bucket()->get_name() + "." + + head_obj->get_key().get_oid() + + ".part." 
+ std::to_string(part_num); + ldpp_dout(dpp, 20) << "bucket=" << head_obj->get_bucket()->get_name() << "part_obj_name=" << part_obj_name << dendl; + part_obj = std::make_unique(this->store, rgw_obj_key(part_obj_name), head_obj->get_bucket()); + if (part_obj == nullptr) + return -ENOMEM; + + // s3 client may retry uploading part, so the part may have already + // been created. + int rc = part_obj->create_mobj(dpp, store->cctx->_conf->rgw_max_chunk_size); + if (rc == -EEXIST) { + rc = part_obj->open_mobj(dpp); + if (rc < 0) + return rc; + } + return rc; +} + +int MotrMultipartWriter::process(bufferlist&& data, uint64_t offset) +{ + int rc = part_obj->write_mobj(dpp, std::move(data), offset); + if (rc == 0) { + actual_part_size += data.length(); + ldpp_dout(dpp, 20) << " write_mobj(): actual_part_size=" << actual_part_size << dendl; + } + return rc; +} + +int MotrMultipartWriter::complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) +{ + // Should the dir entry(object metadata) be updated? For example + // mtime. + + ldpp_dout(dpp, 20) << "MotrMultipartWriter::complete(): enter" << dendl; + // Add an entry into object_nnn_part_index. 
+ bufferlist bl; + RGWUploadPartInfo info; + info.num = part_num; + info.etag = etag; + info.size = actual_part_size; + info.accounted_size = accounted_size; + info.modified = real_clock::now(); + + bool compressed; + int rc = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); + ldpp_dout(dpp, 20) << "MotrMultipartWriter::complete(): compression rc=" << rc << dendl; + if (rc < 0) { + ldpp_dout(dpp, 1) << "cannot get compression info" << dendl; + return rc; + } + encode(info, bl); + encode(attrs, bl); + part_obj->meta.encode(bl); + + string p = "part."; + char buf[32]; + snprintf(buf, sizeof(buf), "%08d", (int)part_num); + p.append(buf); + string obj_part_iname = "motr.rgw.object." + head_obj->get_bucket()->get_name() + "." + + head_obj->get_key().get_oid() + ".parts"; + ldpp_dout(dpp, 20) << "MotrMultipartWriter::complete(): object part index = " << obj_part_iname << dendl; + rc = store->do_idx_op_by_name(obj_part_iname, M0_IC_PUT, p, bl); + if (rc < 0) { + return rc == -ENOENT ? 
-ERR_NO_SUCH_UPLOAD : rc; + } + + return 0; +} + +std::unique_ptr MotrStore::get_role(std::string name, + std::string tenant, + std::string path, + std::string trust_policy, + std::string max_session_duration_str, + std::multimap tags) +{ + RGWRole* p = nullptr; + return std::unique_ptr(p); +} + +std::unique_ptr MotrStore::get_role(const RGWRoleInfo& info) +{ + RGWRole* p = nullptr; + return std::unique_ptr(p); +} + +std::unique_ptr MotrStore::get_role(std::string id) +{ + RGWRole* p = nullptr; + return std::unique_ptr(p); +} + +int MotrStore::get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + vector>& roles) +{ + return 0; +} + +std::unique_ptr MotrStore::get_oidc_provider() +{ + RGWOIDCProvider* p = nullptr; + return std::unique_ptr(p); +} + +int MotrStore::get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + vector>& providers) +{ + return 0; +} + +std::unique_ptr MotrBucket::get_multipart_upload(const std::string& oid, + std::optional upload_id, + ACLOwner owner, ceph::real_time mtime) +{ + return std::make_unique(store, this, oid, upload_id, owner, mtime); +} + +std::unique_ptr MotrStore::get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) { + return nullptr; +} + +std::unique_ptr MotrStore::get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) { + return std::make_unique(dpp, y, + obj, this, owner, + ptail_placement_rule, olh_epoch, unique_tag); +} + +const std::string& MotrStore::get_compression_type(const rgw_placement_rule& rule) +{ + return zone.zone_params->get_compression_type(rule); +} 
+ +bool MotrStore::valid_placement(const rgw_placement_rule& rule) +{ + return zone.zone_params->valid_placement(rule); +} + +std::unique_ptr MotrStore::get_user(const rgw_user &u) +{ + ldout(cctx, 20) << "bucket's user: " << u.to_str() << dendl; + return std::make_unique(this, u); +} + +int MotrStore::get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string &key, optional_yield y, std::unique_ptr *user) +{ + int rc; + User *u; + bufferlist bl; + RGWUserInfo uinfo; + MotrAccessKey access_key; + + rc = do_idx_op_by_name(RGW_IAM_MOTR_ACCESS_KEY, + M0_IC_GET, key, bl); + if (rc < 0){ + ldout(cctx, 0) << "Access key not found: rc = " << rc << dendl; + return rc; + } + + bufferlist& blr = bl; + auto iter = blr.cbegin(); + access_key.decode(iter); + + uinfo.user_id.from_str(access_key.user_id); + ldout(cctx, 0) << "Loading user: " << uinfo.user_id.id << dendl; + rc = MotrUser().load_user_from_idx(dpp, this, uinfo, nullptr, nullptr); + if (rc < 0){ + ldout(cctx, 0) << "Failed to load user: rc = " << rc << dendl; + return rc; + } + u = new MotrUser(this, uinfo); + if (!u) + return -ENOMEM; + + user->reset(u); + return 0; +} + +int MotrStore::get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr* user) +{ + int rc; + User *u; + bufferlist bl; + RGWUserInfo uinfo; + MotrEmailInfo email_info; + rc = do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY, + M0_IC_GET, email, bl); + if (rc < 0){ + ldout(cctx, 0) << "Email Id not found: rc = " << rc << dendl; + return rc; + } + auto iter = bl.cbegin(); + email_info.decode(iter); + ldout(cctx, 0) << "Loading user: " << email_info.user_id << dendl; + uinfo.user_id.from_str(email_info.user_id); + rc = MotrUser().load_user_from_idx(dpp, this, uinfo, nullptr, nullptr); + if (rc < 0){ + ldout(cctx, 0) << "Failed to load user: rc = " << rc << dendl; + return rc; + } + u = new MotrUser(this, uinfo); + if (!u) + return -ENOMEM; + + user->reset(u); + return 0; +} + +int 
MotrStore::get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) +{ + /* Swift keys and subusers are not supported for now */ + return 0; +} + +int MotrStore::store_access_key(const DoutPrefixProvider *dpp, optional_yield y, MotrAccessKey access_key) +{ + int rc; + bufferlist bl; + access_key.encode(bl); + rc = do_idx_op_by_name(RGW_IAM_MOTR_ACCESS_KEY, + M0_IC_PUT, access_key.id, bl); + if (rc < 0){ + ldout(cctx, 0) << "Failed to store key: rc = " << rc << dendl; + return rc; + } + return rc; +} + +int MotrStore::delete_access_key(const DoutPrefixProvider *dpp, optional_yield y, std::string access_key) +{ + int rc; + bufferlist bl; + rc = do_idx_op_by_name(RGW_IAM_MOTR_ACCESS_KEY, + M0_IC_DEL, access_key, bl); + if (rc < 0){ + ldout(cctx, 0) << "Failed to delete key: rc = " << rc << dendl; + } + return rc; +} + +int MotrStore::store_email_info(const DoutPrefixProvider *dpp, optional_yield y, MotrEmailInfo& email_info ) +{ + int rc; + bufferlist bl; + email_info.encode(bl); + rc = do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY, + M0_IC_PUT, email_info.email_id, bl); + if (rc < 0) { + ldout(cctx, 0) << "Failed to store the user by email as key: rc = " << rc << dendl; + } + return rc; +} + +std::unique_ptr MotrStore::get_object(const rgw_obj_key& k) +{ + return std::make_unique(this, k); +} + + +int MotrStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) +{ + int ret; + Bucket* bp; + + bp = new MotrBucket(this, b, u); + ret = bp->load_bucket(dpp, y); + if (ret < 0) { + delete bp; + return ret; + } + + bucket->reset(bp); + return 0; +} + +int MotrStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) +{ + Bucket* bp; + + bp = new MotrBucket(this, i, u); + /* Don't need to fetch the bucket info, use the provided one */ + + bucket->reset(bp); + return 0; +} + +int MotrStore::get_bucket(const DoutPrefixProvider *dpp, User* 
u, const std::string& tenant, const std::string& name, std::unique_ptr* bucket, optional_yield y) +{ + rgw_bucket b; + + b.tenant = tenant; + b.name = name; + + return get_bucket(dpp, u, b, bucket, y); +} + +bool MotrStore::is_meta_master() +{ + return true; +} + +int MotrStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version *objv, + bufferlist& in_data, + JSONParser *jp, req_info& info, + optional_yield y) +{ + return 0; +} + +int MotrStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) +{ + return 0; +} + +std::string MotrStore::zone_unique_id(uint64_t unique_num) +{ + return ""; +} + +std::string MotrStore::zone_unique_trans_id(const uint64_t unique_num) +{ + return ""; +} + +int MotrStore::get_zonegroup(const std::string& id, std::unique_ptr* group) +{ + /* XXX: for now only one zonegroup supported */ + ZoneGroup* zg; + zg = new MotrZoneGroup(this, zone.zonegroup.get_group()); + + group->reset(zg); + return 0; +} + +int MotrStore::list_all_zones(const DoutPrefixProvider* dpp, + std::list& zone_ids) +{ + zone_ids.push_back(zone.get_id()); + return 0; +} + +int MotrStore::cluster_stat(RGWClusterStat& stats) +{ + return 0; +} + +std::unique_ptr MotrStore::get_lifecycle(void) +{ + return 0; +} + +std::unique_ptr MotrStore::get_completions(void) +{ + return 0; +} + +std::unique_ptr MotrStore::get_notification(Object* obj, Object* src_obj, req_state* s, + rgw::notify::EventType event_type, optional_yield y, const string* object_name) +{ + return std::make_unique(obj, src_obj, event_type); +} + +std::unique_ptr MotrStore::get_notification(const DoutPrefixProvider* dpp, Object* obj, + Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, + std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) +{ + return 
std::make_unique(obj, src_obj, event_type); +} + +int MotrStore::log_usage(const DoutPrefixProvider *dpp, map& usage_info) +{ + return 0; +} + +int MotrStore::log_op(const DoutPrefixProvider *dpp, string& oid, bufferlist& bl) +{ + return 0; +} + +int MotrStore::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, + const map& meta) +{ + return 0; +} + +void MotrStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, + RGWRateLimitInfo& user_ratelimit, + RGWRateLimitInfo& anon_ratelimit) +{ + return; +} + +void MotrStore::get_quota(RGWQuota& quota) +{ + // XXX: Not handled for the first pass + return; +} + +int MotrStore::set_buckets_enabled(const DoutPrefixProvider *dpp, vector& buckets, bool enabled) +{ + return 0; +} + +int MotrStore::get_sync_policy_handler(const DoutPrefixProvider *dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *phandler, + optional_yield y) +{ + return 0; +} + +RGWDataSyncStatusManager* MotrStore::get_data_sync_manager(const rgw_zone_id& source_zone) +{ + return 0; +} + +int MotrStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, + RGWUsageIter& usage_iter, + map& usage) +{ + return 0; +} + +int MotrStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) +{ + return 0; +} + +int MotrStore::get_config_key_val(string name, bufferlist *bl) +{ + return 0; +} + +int MotrStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const string& section, const string& marker, void** phandle) +{ + return 0; +} + +int MotrStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list& keys, bool* truncated) +{ + return 0; +} + +void MotrStore::meta_list_keys_complete(void* handle) +{ + return; +} + +std::string MotrStore::meta_get_marker(void* handle) +{ + return ""; +} + +int MotrStore::meta_remove(const DoutPrefixProvider *dpp, string& 
metadata_key, optional_yield y) +{ + return 0; +} + +int MotrStore::open_idx(struct m0_uint128 *id, bool create, struct m0_idx *idx) +{ + m0_idx_init(idx, &container.co_realm, id); + + if (!create) + return 0; // nothing to do more + + // create index or make sure it's created + struct m0_op *op = nullptr; + int rc = m0_entity_create(nullptr, &idx->in_entity, &op); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: m0_entity_create() failed: " << rc << dendl; + goto out; + } + + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc != 0 && rc != -EEXIST) + ldout(cctx, 0) << "ERROR: index create failed: " << rc << dendl; +out: + return rc; +} + +static void set_m0bufvec(struct m0_bufvec *bv, vector& vec) +{ + *bv->ov_buf = reinterpret_cast(vec.data()); + *bv->ov_vec.v_count = vec.size(); +} + +// idx must be opened with open_idx() beforehand +int MotrStore::do_idx_op(struct m0_idx *idx, enum m0_idx_opcode opcode, + vector& key, vector& val, bool update) +{ + int rc, rc_i; + struct m0_bufvec k, v, *vp = &v; + uint32_t flags = 0; + struct m0_op *op = nullptr; + + if (m0_bufvec_empty_alloc(&k, 1) != 0) { + ldout(cctx, 0) << "ERROR: failed to allocate key bufvec" << dendl; + return -ENOMEM; + } + + if (opcode == M0_IC_PUT || opcode == M0_IC_GET) { + rc = -ENOMEM; + if (m0_bufvec_empty_alloc(&v, 1) != 0) { + ldout(cctx, 0) << "ERROR: failed to allocate value bufvec" << dendl; + goto out; + } + } + + set_m0bufvec(&k, key); + if (opcode == M0_IC_PUT) + set_m0bufvec(&v, val); + + if (opcode == M0_IC_DEL) + vp = nullptr; + + if (opcode == M0_IC_PUT && update) + flags |= M0_OIF_OVERWRITE; + + rc = m0_idx_op(idx, opcode, &k, vp, &rc_i, flags, &op); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: failed to init index op: " << rc << dendl; + goto out; + } + + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + 
m0_op_fini(op); + m0_op_free(op); + + if (rc != 0) { + ldout(cctx, 0) << "ERROR: op failed: " << rc << dendl; + goto out; + } + + if (rc_i != 0) { + ldout(cctx, 0) << "ERROR: idx op failed: " << rc_i << dendl; + rc = rc_i; + goto out; + } + + if (opcode == M0_IC_GET) { + val.resize(*v.ov_vec.v_count); + memcpy(reinterpret_cast(val.data()), *v.ov_buf, *v.ov_vec.v_count); + } + +out: + m0_bufvec_free2(&k); + if (opcode == M0_IC_GET) + m0_bufvec_free(&v); // cleanup buffer after GET + else if (opcode == M0_IC_PUT) + m0_bufvec_free2(&v); + + return rc; +} + +// Retrieve a range of key/value pairs starting from keys[0]. +int MotrStore::do_idx_next_op(struct m0_idx *idx, + vector>& keys, + vector>& vals) +{ + int rc; + uint32_t i = 0; + int nr_kvp = vals.size(); + int *rcs = new int[nr_kvp]; + struct m0_bufvec k, v; + struct m0_op *op = nullptr; + + rc = m0_bufvec_empty_alloc(&k, nr_kvp)?: + m0_bufvec_empty_alloc(&v, nr_kvp); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: failed to allocate kv bufvecs" << dendl; + return rc; + } + + set_m0bufvec(&k, keys[0]); + + rc = m0_idx_op(idx, M0_IC_NEXT, &k, &v, rcs, 0, &op); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: failed to init index op: " << rc << dendl; + goto out; + } + + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc != 0) { + ldout(cctx, 0) << "ERROR: op failed: " << rc << dendl; + goto out; + } + + for (i = 0; i < v.ov_vec.v_nr; ++i) { + if (rcs[i] < 0) + break; + + vector& key = keys[i]; + vector& val = vals[i]; + key.resize(k.ov_vec.v_count[i]); + val.resize(v.ov_vec.v_count[i]); + memcpy(reinterpret_cast(key.data()), k.ov_buf[i], k.ov_vec.v_count[i]); + memcpy(reinterpret_cast(val.data()), v.ov_buf[i], v.ov_vec.v_count[i]); + } + +out: + k.ov_vec.v_nr = i; + v.ov_vec.v_nr = i; + m0_bufvec_free(&k); + m0_bufvec_free(&v); // cleanup buffer after GET + + delete []rcs; + return rc ?: i; +} + +// Retrieve a 
number of key/value pairs under the prefix starting +// from the marker at key_out[0]. +int MotrStore::next_query_by_name(string idx_name, + vector& key_out, + vector& val_out, + string prefix, string delim) +{ + unsigned nr_kvp = std::min(val_out.size(), 100UL); + struct m0_idx idx = {}; + vector> keys(nr_kvp); + vector> vals(nr_kvp); + struct m0_uint128 idx_id; + int i = 0, j, k = 0; + + index_name_to_motr_fid(idx_name, &idx_id); + int rc = open_motr_idx(&idx_id, &idx); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: next_query_by_name(): failed to open index: rc=" + << rc << dendl; + goto out; + } + + // Only the first element for keys needs to be set for NEXT query. + // The keys will be set will the returned keys from motr index. + ldout(cctx, 20) <<__func__<< ": next_query_by_name(): index=" << idx_name + << " prefix=" << prefix << " delim=" << delim << dendl; + keys[0].assign(key_out[0].begin(), key_out[0].end()); + for (i = 0; i < (int)val_out.size(); i += k, k = 0) { + rc = do_idx_next_op(&idx, keys, vals); + ldout(cctx, 20) << "do_idx_next_op() = " << rc << dendl; + if (rc < 0) { + ldout(cctx, 0) << "ERROR: NEXT query failed. 
" << rc << dendl; + goto out; + } + + string dir; + for (j = 0, k = 0; j < rc; ++j) { + string key(keys[j].begin(), keys[j].end()); + size_t pos = std::string::npos; + if (!delim.empty()) + pos = key.find(delim, prefix.length()); + if (pos != std::string::npos) { // DIR entry + dir.assign(key, 0, pos + 1); + if (dir.compare(0, prefix.length(), prefix) != 0) + goto out; + if (i + k == 0 || dir != key_out[i + k - 1]) // a new one + key_out[i + k++] = dir; + continue; + } + dir = ""; + if (key.compare(0, prefix.length(), prefix) != 0) + goto out; + key_out[i + k] = key; + bufferlist& vbl = val_out[i + k]; + vbl.append(reinterpret_cast(vals[j].data()), vals[j].size()); + ++k; + } + + if (rc < (int)nr_kvp) // there are no more keys to fetch + break; + + string next_key; + if (dir != "") + next_key = dir + "\xff"; // skip all dir content in 1 step + else + next_key = key_out[i + k - 1] + " "; + ldout(cctx, 0) << "do_idx_next_op(): next_key=" << next_key << dendl; + keys[0].assign(next_key.begin(), next_key.end()); + } + +out: + m0_idx_fini(&idx); + return rc < 0 ? rc : i + k; +} + +int MotrStore::delete_motr_idx_by_name(string iname) +{ + struct m0_idx idx; + struct m0_uint128 idx_id; + struct m0_op *op = nullptr; + + ldout(cctx, 20) << "delete_motr_idx_by_name=" << iname << dendl; + + index_name_to_motr_fid(iname, &idx_id); + m0_idx_init(&idx, &container.co_realm, &idx_id); + m0_entity_open(&idx.in_entity, &op); + int rc = m0_entity_delete(&idx.in_entity, &op); + if (rc < 0) + goto out; + + m0_op_launch(&op, 1); + + ldout(cctx, 70) << "waiting for op completion" << dendl; + + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc == -ENOENT) // race deletion?? 
+ rc = 0; + else if (rc < 0) + ldout(cctx, 0) << "ERROR: index create failed: " << rc << dendl; + + ldout(cctx, 20) << "delete_motr_idx_by_name rc=" << rc << dendl; + +out: + m0_idx_fini(&idx); + return rc; +} + +int MotrStore::open_motr_idx(struct m0_uint128 *id, struct m0_idx *idx) +{ + m0_idx_init(idx, &container.co_realm, id); + return 0; +} + +// The following marcos are from dix/fid_convert.h which are not exposed. +enum { + M0_DIX_FID_DEVICE_ID_OFFSET = 32, + M0_DIX_FID_DIX_CONTAINER_MASK = (1ULL << M0_DIX_FID_DEVICE_ID_OFFSET) + - 1, +}; + +// md5 is used here, a more robust way to convert index name to fid is +// needed to avoid collision. +void MotrStore::index_name_to_motr_fid(string iname, struct m0_uint128 *id) +{ + unsigned char md5[16]; // 128/8 = 16 + MD5 hash; + + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + hash.Update((const unsigned char *)iname.c_str(), iname.length()); + hash.Final(md5); + + memcpy(&id->u_hi, md5, 8); + memcpy(&id->u_lo, md5 + 8, 8); + ldout(cctx, 20) << "id = 0x" << std::hex << id->u_hi << ":0x" << std::hex << id->u_lo << dendl; + + struct m0_fid *fid = (struct m0_fid*)id; + m0_fid_tset(fid, m0_dix_fid_type.ft_id, + fid->f_container & M0_DIX_FID_DIX_CONTAINER_MASK, fid->f_key); + ldout(cctx, 20) << "converted id = 0x" << std::hex << id->u_hi << ":0x" << std::hex << id->u_lo << dendl; +} + +int MotrStore::do_idx_op_by_name(string idx_name, enum m0_idx_opcode opcode, + string key_str, bufferlist &bl, bool update) +{ + struct m0_idx idx; + vector key(key_str.begin(), key_str.end()); + vector val; + struct m0_uint128 idx_id; + + index_name_to_motr_fid(idx_name, &idx_id); + int rc = open_motr_idx(&idx_id, &idx); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: failed to open index: " << rc << dendl; + goto out; + } + + if (opcode == M0_IC_PUT) + val.assign(bl.c_str(), bl.c_str() + bl.length()); + + ldout(cctx, 20) <<__func__<< ": do_idx_op_by_name(): op=" + 
<< (opcode == M0_IC_PUT ? "PUT" : "GET") + << " idx=" << idx_name << " key=" << key_str << dendl; + rc = do_idx_op(&idx, opcode, key, val, update); + if (rc == 0 && opcode == M0_IC_GET) + // Append the returned value (blob) to the bufferlist. + bl.append(reinterpret_cast(val.data()), val.size()); + +out: + m0_idx_fini(&idx); + return rc; +} + +int MotrStore::create_motr_idx_by_name(string iname) +{ + struct m0_idx idx = {}; + struct m0_uint128 id; + + index_name_to_motr_fid(iname, &id); + m0_idx_init(&idx, &container.co_realm, &id); + + // create index or make sure it's created + struct m0_op *op = nullptr; + int rc = m0_entity_create(nullptr, &idx.in_entity, &op); + if (rc != 0) { + ldout(cctx, 0) << "ERROR: m0_entity_create() failed: " << rc << dendl; + goto out; + } + + m0_op_launch(&op, 1); + rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?: + m0_rc(op); + m0_op_fini(op); + m0_op_free(op); + + if (rc != 0 && rc != -EEXIST) + ldout(cctx, 0) << "ERROR: index create failed: " << rc << dendl; +out: + m0_idx_fini(&idx); + return rc; +} + +// If a global index is checked (if it has been create) every time +// before they're queried (put/get), which takes 2 Motr operations to +// complete the query. As the global indices' name and FID are known +// already when MotrStore is created, we move the check and creation +// in newMotrStore(). +// Similar method is used for per bucket/user index. For example, +// bucket instance index is created when creating the bucket. 
+int MotrStore::check_n_create_global_indices() +{ + int rc = 0; + + for (const auto& iname : motr_global_indices) { + rc = create_motr_idx_by_name(iname); + if (rc < 0 && rc != -EEXIST) + break; + rc = 0; + } + + return rc; +} + +std::string MotrStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) +{ + char id[M0_FID_STR_LEN]; + struct m0_confc *confc = m0_reqh2confc(&instance->m0c_reqh); + + m0_fid_print(id, ARRAY_SIZE(id), &confc->cc_root->co_id); + return std::string(id); +} + +int MotrStore::init_metadata_cache(const DoutPrefixProvider *dpp, + CephContext *cct) +{ + this->obj_meta_cache = new MotrMetaCache(dpp, cct); + this->get_obj_meta_cache()->set_enabled(true); + + this->user_cache = new MotrMetaCache(dpp, cct); + this->get_user_cache()->set_enabled(true); + + this->bucket_inst_cache = new MotrMetaCache(dpp, cct); + this->get_bucket_inst_cache()->set_enabled(true); + + return 0; +} + + int MotrLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) + { + return -ENOENT; + } + + int MotrLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) + { + return -ENOENT; + } + + int MotrLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) + { + return -ENOENT; + } + + int MotrLuaManager::add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) + { + return -ENOENT; + } + + int MotrLuaManager::remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) + { + return -ENOENT; + } + + int MotrLuaManager::list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) + { + return -ENOENT; + } +} // namespace rgw::sal + +extern "C" { + +void *newMotrStore(CephContext *cct) +{ + int rc = -1; + rgw::sal::MotrStore *store = new rgw::sal::MotrStore(cct); + + if (store) { + 
store->conf.mc_is_oostore = true; + // XXX: these params should be taken from config settings and + // cct somehow? + store->instance = nullptr; + const auto& proc_ep = g_conf().get_val("motr_my_endpoint"); + const auto& ha_ep = g_conf().get_val("motr_ha_endpoint"); + const auto& proc_fid = g_conf().get_val("motr_my_fid"); + const auto& profile = g_conf().get_val("motr_profile_fid"); + const auto& admin_proc_ep = g_conf().get_val("motr_admin_endpoint"); + const auto& admin_proc_fid = g_conf().get_val("motr_admin_fid"); + const int init_flags = cct->get_init_flags(); + ldout(cct, 0) << "INFO: motr my endpoint: " << proc_ep << dendl; + ldout(cct, 0) << "INFO: motr ha endpoint: " << ha_ep << dendl; + ldout(cct, 0) << "INFO: motr my fid: " << proc_fid << dendl; + ldout(cct, 0) << "INFO: motr profile fid: " << profile << dendl; + store->conf.mc_local_addr = proc_ep.c_str(); + store->conf.mc_process_fid = proc_fid.c_str(); + + ldout(cct, 0) << "INFO: init flags: " << init_flags << dendl; + ldout(cct, 0) << "INFO: motr admin endpoint: " << admin_proc_ep << dendl; + ldout(cct, 0) << "INFO: motr admin fid: " << admin_proc_fid << dendl; + + // HACK this is so that radosge-admin uses a different client + if (init_flags == 0) { + store->conf.mc_process_fid = admin_proc_fid.c_str(); + store->conf.mc_local_addr = admin_proc_ep.c_str(); + } else { + store->conf.mc_process_fid = proc_fid.c_str(); + store->conf.mc_local_addr = proc_ep.c_str(); + } + store->conf.mc_ha_addr = ha_ep.c_str(); + store->conf.mc_profile = profile.c_str(); + + ldout(cct, 50) << "INFO: motr profile fid: " << store->conf.mc_profile << dendl; + ldout(cct, 50) << "INFO: ha addr: " << store->conf.mc_ha_addr << dendl; + ldout(cct, 50) << "INFO: process fid: " << store->conf.mc_process_fid << dendl; + ldout(cct, 50) << "INFO: motr endpoint: " << store->conf.mc_local_addr << dendl; + + store->conf.mc_tm_recv_queue_min_len = 64; + store->conf.mc_max_rpc_msg_size = 524288; + store->conf.mc_idx_service_id = 
M0_IDX_DIX; + store->dix_conf.kc_create_meta = false; + store->conf.mc_idx_service_conf = &store->dix_conf; + + if (!g_conf().get_val("motr_tracing_enabled")) { + m0_trace_level_allow(M0_WARN); // allow errors and warnings in syslog anyway + m0_trace_set_mmapped_buffer(false); + } + + store->instance = nullptr; + rc = m0_client_init(&store->instance, &store->conf, true); + if (rc != 0) { + ldout(cct, 0) << "ERROR: m0_client_init() failed: " << rc << dendl; + goto out; + } + + m0_container_init(&store->container, nullptr, &M0_UBER_REALM, store->instance); + rc = store->container.co_realm.re_entity.en_sm.sm_rc; + if (rc != 0) { + ldout(cct, 0) << "ERROR: m0_container_init() failed: " << rc << dendl; + goto out; + } + + rc = m0_ufid_init(store->instance, &ufid_gr); + if (rc != 0) { + ldout(cct, 0) << "ERROR: m0_ufid_init() failed: " << rc << dendl; + goto out; + } + + // Create global indices if not yet. + rc = store->check_n_create_global_indices(); + if (rc != 0) { + ldout(cct, 0) << "ERROR: check_n_create_global_indices() failed: " << rc << dendl; + goto out; + } + + } + +out: + if (rc != 0) { + delete store; + return nullptr; + } + return store; +} + +} diff --git a/src/rgw/rgw_sal_motr.h b/src/rgw/rgw_sal_motr.h new file mode 100644 index 000000000..b7230f7e1 --- /dev/null +++ b/src/rgw/rgw_sal_motr.h @@ -0,0 +1,1204 @@ + +// vim: ts=2 sw=2 expandtab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * SAL implementation for the CORTX Motr backend + * + * Copyright (C) 2021 Seagate Technology LLC and/or its Affiliates + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +extern "C" { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wextern-c-compat" +#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion" +#include "motr/config.h" +#include "motr/client.h" +#pragma clang diagnostic pop +} + +#include "rgw_sal_store.h" +#include "rgw_rados.h" +#include "rgw_notify.h" +#include "rgw_oidc_provider.h" +#include "rgw_role.h" +#include "rgw_multi.h" +#include "rgw_putobj_processor.h" + +namespace rgw::sal { + +class MotrStore; + +// Global Motr indices +#define RGW_MOTR_USERS_IDX_NAME "motr.rgw.users" +#define RGW_MOTR_BUCKET_INST_IDX_NAME "motr.rgw.bucket.instances" +#define RGW_MOTR_BUCKET_HD_IDX_NAME "motr.rgw.bucket.headers" +#define RGW_IAM_MOTR_ACCESS_KEY "motr.rgw.accesskeys" +#define RGW_IAM_MOTR_EMAIL_KEY "motr.rgw.emails" + +//#define RGW_MOTR_BUCKET_ACL_IDX_NAME "motr.rgw.bucket.acls" + +// A simplified metadata cache implementation. +// Note: MotrObjMetaCache doesn't handle the IO operations to Motr. A proxy +// class can be added to handle cache and 'real' ops. +class MotrMetaCache +{ +protected: + // MGW re-uses ObjectCache to cache object's metadata as it has already + // implemented a lru cache: (1) ObjectCache internally uses a map and lru + // list to manage cache entry. POC uses object name, user name or bucket + // name as the key to lookup and insert an entry. (2) ObjectCache::data is + // a bufferlist and can be used to store any metadata structure, such as + // object's bucket dir entry, user info or bucket instance. + // + // Note from RGW: + // The Rados Gateway stores metadata and objects in an internal cache. This + // should be kept consistent by the OSD's relaying notify events between + // multiple watching RGW processes. In the event that this notification + // protocol fails, bounding the length of time that any data in the cache will + // be assumed valid will ensure that any RGW instance that falls out of sync + // will eventually recover. 
This seems to be an issue mostly for large numbers + // of RGW instances under heavy use. If you would like to turn off cache expiry, + // set this value to zero. + // + // Currently POC hasn't implemented the watch-notify menchanism yet. So the + // current implementation is similar to cortx-s3server which is based on expiry + // time. TODO: see comments on distribute_cache). + // + // Beaware: Motr object data is not cached in current POC as RGW! + // RGW caches the first chunk (4MB by default). + ObjectCache cache; + +public: + // Lookup a cache entry. + int get(const DoutPrefixProvider *dpp, const std::string& name, bufferlist& data); + + // Insert a cache entry. + int put(const DoutPrefixProvider *dpp, const std::string& name, const bufferlist& data); + + // Called when an object is deleted. Notification should be sent to other + // RGW instances. + int remove(const DoutPrefixProvider *dpp, const std::string& name); + + // Make the local cache entry invalid. + void invalid(const DoutPrefixProvider *dpp, const std::string& name); + + // TODO: Distribute_cache() and watch_cb() now are only place holder functions. + // Checkout services/svc_sys_obj_cache.h/cc for reference. + // These 2 functions are designed to notify or to act on cache notification. + // It is feasible to implement the functionality using Motr's FDMI after discussing + // with Hua. 
+ int distribute_cache(const DoutPrefixProvider *dpp, + const std::string& normal_name, + ObjectCacheInfo& obj_info, int op); + int watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl); + + void set_enabled(bool status); + + MotrMetaCache(const DoutPrefixProvider *dpp, CephContext *cct) { + cache.set_ctx(cct); + } +}; + +struct MotrUserInfo { + RGWUserInfo info; + obj_version user_version; + rgw::sal::Attrs attrs; + + void encode(bufferlist& bl) const + { + ENCODE_START(3, 3, bl); + encode(info, bl); + encode(user_version, bl); + encode(attrs, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) + { + DECODE_START(3, bl); + decode(info, bl); + decode(user_version, bl); + decode(attrs, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(MotrUserInfo); + +struct MotrEmailInfo { + std::string user_id; + std::string email_id; + + MotrEmailInfo() {} + MotrEmailInfo(std::string _user_id, std::string _email_id ) + : user_id(std::move(_user_id)), email_id(std::move(_email_id)) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(user_id, bl); + encode(email_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(user_id, bl); + decode(email_id, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(MotrEmailInfo); + +struct MotrAccessKey { + std::string id; // AccessKey + std::string key; // SecretKey + std::string user_id; // UserID + + MotrAccessKey() {} + MotrAccessKey(std::string _id, std::string _key, std::string _user_id) + : id(std::move(_id)), key(std::move(_key)), user_id(std::move(_user_id)) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(id, bl); + encode(key, bl); + encode(user_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(id, 
bl); + decode(key, bl); + decode(user_id, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(MotrAccessKey); + +class MotrNotification : public StoreNotification { + public: + MotrNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type) : + StoreNotification(_obj, _src_obj, _type) {} + ~MotrNotification() = default; + + virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override { return 0;} + virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size, + const ceph::real_time& mtime, const std::string& etag, const std::string& version) override { return 0; } +}; + +class MotrUser : public StoreUser { + private: + MotrStore *store; + struct m0_uint128 idxID = {0xe5ecb53640d4ecce, 0x6a156cd5a74aa3b8}; // MD5 of “motr.rgw.users“ + struct m0_idx idx; + + public: + std::set access_key_tracker; + MotrUser(MotrStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { } + MotrUser(MotrStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { } + MotrUser(MotrStore *_st) : store(_st) { } + MotrUser(MotrUser& _o) = default; + MotrUser() {} + + virtual std::unique_ptr clone() override { + return std::unique_ptr(new MotrUser(*this)); + } + int list_buckets(const DoutPrefixProvider *dpp, const std::string& marker, const std::string& end_marker, + uint64_t max, bool need_stats, BucketList& buckets, optional_yield y) override; + virtual int create_bucket(const DoutPrefixProvider* dpp, + const rgw_bucket& b, + const std::string& zonegroup_id, + rgw_placement_rule& placement_rule, + std::string& swift_ver_location, + const RGWQuotaInfo* pquota_info, + const RGWAccessControlPolicy& policy, + Attrs& attrs, + RGWBucketInfo& info, + obj_version& ep_objv, + bool exclusive, + bool obj_lock_enabled, + bool* existed, + req_info& req_info, + std::unique_ptr* bucket, + optional_yield y) override; + virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int 
merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override; + virtual int read_stats(const DoutPrefixProvider *dpp, + optional_yield y, RGWStorageStats* stats, + ceph::real_time *last_stats_sync = nullptr, + ceph::real_time *last_stats_update = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override; + virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool* is_truncated, RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + + virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override; + virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override; + + int create_user_info_idx(); + int load_user_from_idx(const DoutPrefixProvider *dpp, MotrStore *store, RGWUserInfo& info, std::map *attrs, RGWObjVersionTracker *objv_tr); + + friend class MotrBucket; +}; + +class MotrBucket : public StoreBucket { + private: + MotrStore *store; + RGWAccessControlPolicy acls; + + // RGWBucketInfo and other information that are shown when listing a bucket is + // represented in struct MotrBucketInfo. The structure is encoded and stored + // as the value of the global bucket instance index. + // TODO: compare pros and cons of separating the bucket_attrs (ACLs, tag etc.) + // into a different index. 
+ struct MotrBucketInfo { + RGWBucketInfo info; + + obj_version bucket_version; + ceph::real_time mtime; + + rgw::sal::Attrs bucket_attrs; + + void encode(bufferlist& bl) const + { + ENCODE_START(4, 4, bl); + encode(info, bl); + encode(bucket_version, bl); + encode(mtime, bl); + encode(bucket_attrs, bl); //rgw_cache.h example for a map + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) + { + DECODE_START(4, bl); + decode(info, bl); + decode(bucket_version, bl); + decode(mtime, bl); + decode(bucket_attrs, bl); + DECODE_FINISH(bl); + } + }; + WRITE_CLASS_ENCODER(MotrBucketInfo); + + public: + MotrBucket(MotrStore *_st) + : store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, User* _u) + : StoreBucket(_u), + store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, const rgw_bucket& _b) + : StoreBucket(_b), + store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, const RGWBucketEnt& _e) + : StoreBucket(_e), + store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, const RGWBucketInfo& _i) + : StoreBucket(_i), + store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, const rgw_bucket& _b, User* _u) + : StoreBucket(_b, _u), + store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, const RGWBucketEnt& _e, User* _u) + : StoreBucket(_e, _u), + store(_st), + acls() { + } + + MotrBucket(MotrStore *_st, const RGWBucketInfo& _i, User* _u) + : StoreBucket(_i, _u), + store(_st), + acls() { + } + + ~MotrBucket() { } + + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int list(const DoutPrefixProvider *dpp, ListParams&, int, ListResults&, optional_yield y) override; + virtual int remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override; + virtual int remove_bucket_bypass_gc(int concurrent_max, bool + keep_index_consistent, + optional_yield y, const + DoutPrefixProvider *dpp) override; + virtual RGWAccessControlPolicy& get_acl(void) 
override { return acls; } + virtual int set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy& acl, optional_yield y) override; + virtual int load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats = false) override; + int link_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y); + int unlink_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y); + int create_bucket_index(); + int create_multipart_indices(); + virtual int read_stats(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, int shard_id, + std::string *bucket_ver, std::string *master_ver, + std::map& stats, + std::string *max_marker = nullptr, + bool *syncstopped = nullptr) override; + virtual int read_stats_async(const DoutPrefixProvider *dpp, + const bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetBucketStats_CB* ctx) override; + virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int update_container_stats(const DoutPrefixProvider *dpp) override; + virtual int check_bucket_shards(const DoutPrefixProvider *dpp) override; + virtual int chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y) override; + virtual int put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time mtime) override; + virtual bool is_owner(User* user) override; + virtual int check_empty(const DoutPrefixProvider *dpp, optional_yield y) override; + virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override; + virtual int merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& attrs, optional_yield y) override; + virtual int try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime) override; + virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + bool *is_truncated, RGWUsageIter& 
usage_iter, + std::map& usage) override; + virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list& objs_to_unlink) override; + virtual int check_index(const DoutPrefixProvider *dpp, std::map& existing_stats, std::map& calculated_stats) override; + virtual int rebuild_index(const DoutPrefixProvider *dpp) override; + virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override; + virtual int purge_instance(const DoutPrefixProvider *dpp) override; + virtual std::unique_ptr clone() override { + return std::make_unique(*this); + } + virtual std::unique_ptr get_multipart_upload(const std::string& oid, + std::optional upload_id=std::nullopt, + ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override; + virtual int list_multiparts(const DoutPrefixProvider *dpp, + const std::string& prefix, + std::string& marker, + const std::string& delim, + const int& max_uploads, + std::vector>& uploads, + std::map *common_prefixes, + bool *is_truncated) override; + virtual int abort_multiparts(const DoutPrefixProvider *dpp, CephContext *cct) override; + + friend class MotrStore; +}; + +class MotrPlacementTier: public StorePlacementTier { + MotrStore* store; + RGWZoneGroupPlacementTier tier; +public: + MotrPlacementTier(MotrStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {} + virtual ~MotrPlacementTier() = default; + + virtual const std::string& get_tier_type() { return tier.tier_type; } + virtual const std::string& get_storage_class() { return tier.storage_class; } + virtual bool retain_head_object() { return tier.retain_head_object; } + RGWZoneGroupPlacementTier& get_rt() { return tier; } +}; + +class MotrZoneGroup : public StoreZoneGroup { + MotrStore* store; + const RGWZoneGroup group; + std::string empty; +public: + MotrZoneGroup(MotrStore* _store) : store(_store), group() {} + 
MotrZoneGroup(MotrStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {} + virtual ~MotrZoneGroup() = default; + + virtual const std::string& get_id() const override { return group.get_id(); }; + virtual const std::string& get_name() const override { return group.get_name(); }; + virtual int equals(const std::string& other_zonegroup) const override { + return group.equals(other_zonegroup); + }; + /** Get the endpoint from zonegroup, or from master zone if not set */ + virtual const std::string& get_endpoint() const override; + virtual bool placement_target_exists(std::string& target) const override; + virtual bool is_master_zonegroup() const override { + return group.is_master_zonegroup(); + }; + virtual const std::string& get_api_name() const override { return group.api_name; }; + virtual void get_placement_target_names(std::set& names) const override; + virtual const std::string& get_default_placement_name() const override { + return group.default_placement.name; }; + virtual int get_hostnames(std::list& names) const override { + names = group.hostnames; + return 0; + }; + virtual int get_s3website_hostnames(std::list& names) const override { + names = group.hostnames_s3website; + return 0; + }; + virtual int get_zone_count() const override { + return group.zones.size(); + } + virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr* tier); + virtual int get_zone_by_id(const std::string& id, std::unique_ptr* zone) override { + return -1; + } + virtual int get_zone_by_name(const std::string& name, std::unique_ptr* zone) override { + return -1; + } + virtual int list_zones(std::list& zone_ids) override { + zone_ids.clear(); + return 0; + } + const RGWZoneGroup& get_group() { return group; } + bool supports(std::string_view feature) const override { + return group.supports(feature); /* forward the queried feature to the wrapped RGWZoneGroup */ + } + virtual std::unique_ptr clone() override { + return std::make_unique(store, group); + } +}; + +class MotrZone : public StoreZone { + 
protected: + MotrStore* store; + RGWRealm *realm{nullptr}; + MotrZoneGroup zonegroup; + RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */ + RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */ + RGWPeriod *current_period{nullptr}; + + public: + MotrZone(MotrStore* _store) : store(_store), zonegroup(_store) { + realm = new RGWRealm(); + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); + + // XXX: only default and STANDARD supported for now + RGWZonePlacementInfo info; + RGWZoneStorageClasses sc; + sc.set_storage_class("STANDARD", nullptr, nullptr); + info.storage_classes = sc; + zone_params->placement_pools["default"] = info; + } + MotrZone(MotrStore* _store, MotrZoneGroup _zg) : store(_store), zonegroup(_zg) { + realm = new RGWRealm(); + // TODO: fetch zonegroup params (eg. id) from provisioner config. + zonegroup.set_id("0956b174-fe14-4f97-8b50-bb7ec5e1cf62"); + zonegroup.api_name = "default"; + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); + + // XXX: only default and STANDARD supported for now + RGWZonePlacementInfo info; + RGWZoneStorageClasses sc; + sc.set_storage_class("STANDARD", nullptr, nullptr); + info.storage_classes = sc; + zone_params->placement_pools["default"] = info; + } + ~MotrZone() = default; + + virtual std::unique_ptr clone() override { + return std::make_unique(store); + } + virtual ZoneGroup& get_zonegroup() override; + virtual const std::string& get_id() override; + virtual const std::string& get_name() const override; + virtual bool is_writeable() override; + virtual bool get_redirect_endpoint(std::string* endpoint) override; + virtual bool has_zonegroup_api(const std::string& api) const override; + virtual const std::string& get_current_period_id() override; + virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; } 
+ virtual const std::string& get_realm_name() { return realm->get_name(); } + virtual const std::string& get_realm_id() { return realm->get_id(); } + virtual const std::string_view get_tier_type() { return "rgw"; } + virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() { return nullptr; } + friend class MotrStore; +}; + +class MotrLuaManager : public StoreLuaManager { + MotrStore* store; + + public: + MotrLuaManager(MotrStore* _s) : store(_s) + { + } + virtual ~MotrLuaManager() = default; + + /** Get a script named with the given key from the backing store */ + virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override; + /** Put a script named with the given key to the backing store */ + virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override; + /** Delete a script named with the given key from the backing store */ + virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override; + /** Add a lua package */ + virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override; + /** Remove a lua package */ + virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override; + /** List lua packages */ + virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override; +}; + +class MotrOIDCProvider : public RGWOIDCProvider { + MotrStore* store; + public: + MotrOIDCProvider(MotrStore* _store) : store(_store) {} + ~MotrOIDCProvider() = default; + + virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override { return 0; } + virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override { return 0; } + virtual int 
delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override { return 0;} + + void encode(bufferlist& bl) const { + RGWOIDCProvider::encode(bl); + } + void decode(bufferlist::const_iterator& bl) { + RGWOIDCProvider::decode(bl); + } +}; + +class MotrObject : public StoreObject { + private: + MotrStore *store = nullptr; /* defaulted so MotrObject() = default never leaves it indeterminate */ + RGWAccessControlPolicy acls; + RGWObjCategory category{}; + + // If this object is part of a multipart uploaded one. + // TODO: do it in another class? MotrPartObject : public MotrObject + uint64_t part_off = 0; + uint64_t part_size = 0; + uint64_t part_num = 0; + + public: + + // motr object metadata stored in index + struct Meta { + struct m0_uint128 oid = {}; + struct m0_fid pver = {}; + uint64_t layout_id = 0; + + void encode(bufferlist& bl) const + { + ENCODE_START(5, 5, bl); + encode(oid.u_hi, bl); + encode(oid.u_lo, bl); + encode(pver.f_container, bl); + encode(pver.f_key, bl); + encode(layout_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) + { + DECODE_START(5, bl); + decode(oid.u_hi, bl); + decode(oid.u_lo, bl); + decode(pver.f_container, bl); + decode(pver.f_key, bl); + decode(layout_id, bl); + DECODE_FINISH(bl); + } + }; + + struct m0_obj *mobj = NULL; + Meta meta; + + struct MotrReadOp : public ReadOp { + private: + MotrObject* source; + + // The set of part objects if the source is + // a multipart uploaded object. + std::map> part_objs; + + public: + MotrReadOp(MotrObject *_source); + + virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override; + + /* + * Both `read` and `iterate` read up through index `end` + * *inclusive*. The number of bytes that could be returned is + * `end - off + 1`. 
+ */ + virtual int read(int64_t off, int64_t end, bufferlist& bl, + optional_yield y, + const DoutPrefixProvider* dpp) override; + virtual int iterate(const DoutPrefixProvider* dpp, int64_t off, + int64_t end, RGWGetDataCB* cb, + optional_yield y) override; + + virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override; + }; + + struct MotrDeleteOp : public DeleteOp { + private: + MotrObject* source; + + public: + MotrDeleteOp(MotrObject* _source); + + virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override; + }; + + MotrObject() = default; + + MotrObject(MotrStore *_st, const rgw_obj_key& _k) + : StoreObject(_k), store(_st), acls() {} + MotrObject(MotrStore *_st, const rgw_obj_key& _k, Bucket* _b) + : StoreObject(_k, _b), store(_st), acls() {} + + MotrObject(MotrObject& _o) = default; + + virtual ~MotrObject(); + + virtual int delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + bool prevent_versioning = false) override; + virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio, + bool keep_index_consistent, optional_yield y) override; + virtual int copy_object(User* user, + req_info* info, const rgw_zone_id& source_zone, + rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time* src_mtime, ceph::real_time* mtime, + const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr, + bool high_precision_time, + const char* if_match, const char* if_nomatch, + AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs, + RGWObjCategory category, uint64_t olh_epoch, + boost::optional delete_at, + std::string* version_id, std::string* tag, std::string* etag, + void (*progress_cb)(off_t, void *), void* progress_data, + const DoutPrefixProvider* dpp, optional_yield y) override; + virtual RGWAccessControlPolicy& get_acl(void) override { return 
acls; } + virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } + virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override; + virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; + virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; + virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override; + virtual bool is_expired() override; + virtual void gen_rand_obj_instance_name() override; + virtual std::unique_ptr clone() override { + return std::unique_ptr(new MotrObject(*this)); + } + virtual std::unique_ptr get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name) override; + virtual int transition(Bucket* bucket, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider* dpp, + optional_yield y) override; + virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; + virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override; + + /* Swift versioning */ + virtual int swift_versioning_restore(bool& restored, + const DoutPrefixProvider* dpp) override; + virtual int swift_versioning_copy(const DoutPrefixProvider* dpp, + optional_yield y) override; + + /* OPs */ + virtual std::unique_ptr get_read_op() override; + virtual std::unique_ptr get_delete_op() override; + + /* OMAP */ + virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool* pmore, optional_yield y) override; + virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map *m, + 
optional_yield y) override; + virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, + const std::set& keys, + Attrs* vals) override; + virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, + bool must_exist, optional_yield y) override; + virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override; + private: + //int read_attrs(const DoutPrefixProvider* dpp, Motr::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr); + + public: + bool is_opened() { return mobj != NULL; } + int create_mobj(const DoutPrefixProvider *dpp, uint64_t sz); + int open_mobj(const DoutPrefixProvider *dpp); + int delete_mobj(const DoutPrefixProvider *dpp); + void close_mobj(); + int write_mobj(const DoutPrefixProvider *dpp, bufferlist&& data, uint64_t offset); + int read_mobj(const DoutPrefixProvider* dpp, int64_t off, int64_t end, RGWGetDataCB* cb); + unsigned get_optimal_bs(unsigned len); + + int get_part_objs(const DoutPrefixProvider *dpp, + std::map>& part_objs); + int open_part_objs(const DoutPrefixProvider* dpp, + std::map>& part_objs); + int read_multipart_obj(const DoutPrefixProvider* dpp, + int64_t off, int64_t end, RGWGetDataCB* cb, + std::map>& part_objs); + int delete_part_objs(const DoutPrefixProvider* dpp); + void set_category(RGWObjCategory _category) {category = _category;} + int get_bucket_dir_ent(const DoutPrefixProvider *dpp, rgw_bucket_dir_entry& ent); + int update_version_entries(const DoutPrefixProvider *dpp); +}; + +// A placeholder locking class for multipart upload. +// TODO: implement it using Motr object locks. 
+class MPMotrSerializer : public StoreMPSerializer { + + public: + MPMotrSerializer(const DoutPrefixProvider *dpp, MotrStore* store, MotrObject* obj, const std::string& lock_name) {} + + virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override {return 0; } + virtual int unlock() override { return 0;} +}; + +class MotrAtomicWriter : public StoreWriter { + protected: + rgw::sal::MotrStore* store; + const rgw_user& owner; + const rgw_placement_rule *ptail_placement_rule; + uint64_t olh_epoch; + const std::string& unique_tag; + MotrObject obj; + MotrObject old_obj; + uint64_t total_data_size; // for total data being uploaded + bufferlist acc_data; // accumulated data + uint64_t acc_off; // accumulated data offset + + struct m0_bufvec buf; + struct m0_bufvec attr; + struct m0_indexvec ext; + + public: + MotrAtomicWriter(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + MotrStore* _store, + const rgw_user& _owner, + const rgw_placement_rule *_ptail_placement_rule, + uint64_t _olh_epoch, + const std::string& _unique_tag); + ~MotrAtomicWriter() = default; + + // prepare to start processing object data + virtual int prepare(optional_yield y) override; + + // Process a bufferlist + virtual int process(bufferlist&& data, uint64_t offset) override; + + int write(); + + // complete the operation and make its result visible to clients + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + + unsigned populate_bvec(unsigned len, bufferlist::iterator &bi); + void cleanup(); +}; + +class MotrMultipartWriter : public StoreWriter { +protected: + rgw::sal::MotrStore* store; + + // Head object. + rgw::sal::Object* head_obj; + + // Part parameters. 
+ const uint64_t part_num; + const std::string part_num_str; + std::unique_ptr part_obj; + uint64_t actual_part_size = 0; + +public: + MotrMultipartWriter(const DoutPrefixProvider *dpp, + optional_yield y, MultipartUpload* upload, + rgw::sal::Object* obj, + MotrStore* _store, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t _part_num, const std::string& part_num_str) : + StoreWriter(dpp, y), store(_store), head_obj(obj), + part_num(_part_num), part_num_str(part_num_str) + { + } + ~MotrMultipartWriter() = default; + + // prepare to start processing object data + virtual int prepare(optional_yield y) override; + + // Process a bufferlist + virtual int process(bufferlist&& data, uint64_t offset) override; + + // complete the operation and make its result visible to clients + virtual int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; +}; + +// The implementation of multipart upload in POC roughly follows the +// cortx-s3server's design. Parts are stored in separate Motr objects. +// s3server uses a few auxiliary Motr indices to manage multipart +// related metadata: (1) Bucket multipart index (bucket_nnn_multipart_index) +// which contains metadata that answers questions such as which objects have +// started multipart upload and their upload ids. This index is created during +// bucket creation. (2) Object part index (object_nnn_part_index) which stores +// metadata of a part's details (size, pvid, oid...). This index is created in +// MotrMultipartUpload::init(). (3) Extended metadata index +// (bucket_nnn_extended_metadata): once parts have been uploaded and their +// metadata saved in the part index, the user may issue a multipart completion +// request. 
When processing the completion request, the parts are read from +// object part index and for each part an entry is created in extended index. +// The entry for the object is created in bucket (object list) index. The part +// index is deleted and an entry removed from bucket_nnn_multipart_index. Like +// bucket multipart index, bucket part extended metadata index is created during +// bucket creation. +// +// The extended metadata index is used mainly due to fault-tolerance +// considerations (how to handle Motr service crash when uploading an object) +// and to avoid creating too many Motr indices (I am not sure I understand +// why many Motr indices are bad.). In our POC, to keep it simple, only 2 +// indices are maintained: bucket multipart index and object_nnn_part_index. +// +// + +class MotrMultipartPart : public StoreMultipartPart { +protected: + RGWUploadPartInfo info; + +public: + MotrObject::Meta meta; + + MotrMultipartPart(RGWUploadPartInfo _info, MotrObject::Meta _meta) : + info(_info), meta(_meta) {} + virtual ~MotrMultipartPart() = default; + + virtual uint32_t get_num() { return info.num; } + virtual uint64_t get_size() { return info.accounted_size; } + virtual const std::string& get_etag() { return info.etag; } + virtual ceph::real_time& get_mtime() { return info.modified; } + + RGWObjManifest& get_manifest() { return info.manifest; } + + friend class MotrMultipartUpload; +}; + +class MotrMultipartUpload : public StoreMultipartUpload { + MotrStore* store; + RGWMPObj mp_obj; + ACLOwner owner; + ceph::real_time mtime; + rgw_placement_rule placement; + RGWObjManifest manifest; + +public: + MotrMultipartUpload(MotrStore* _store, Bucket* _bucket, const std::string& oid, + std::optional upload_id, ACLOwner _owner, ceph::real_time _mtime) : + StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id), owner(_owner), mtime(_mtime) {} + virtual ~MotrMultipartUpload() = default; + + virtual const std::string& get_meta() const { return 
mp_obj.get_meta(); } + virtual const std::string& get_key() const { return mp_obj.get_key(); } + virtual const std::string& get_upload_id() const { return mp_obj.get_upload_id(); } + virtual const ACLOwner& get_owner() const override { return owner; } + virtual ceph::real_time& get_mtime() { return mtime; } + virtual std::unique_ptr get_meta_obj() override; + virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override; + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int num_parts, int marker, + int* next_marker, bool* truncated, + bool assume_unsorted = false) override; + virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override; + virtual int complete(const DoutPrefixProvider* dpp, + optional_yield y, CephContext* cct, + std::map& part_etags, + std::list& remove_objs, + uint64_t& accounted_size, bool& compressed, + RGWCompressionInfo& cs_info, off_t& off, + std::string& tag, ACLOwner& owner, + uint64_t olh_epoch, + rgw::sal::Object* target_obj) override; + virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override; + virtual std::unique_ptr get_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t part_num, + const std::string& part_num_str) override; + int delete_parts(const DoutPrefixProvider *dpp); +}; + +class MotrStore : public StoreDriver { + private: + MotrZone zone; + RGWSyncModuleInstanceRef sync_module; + + MotrMetaCache* obj_meta_cache = nullptr; /* caches start null: ~MotrStore() deletes all three unconditionally */ + MotrMetaCache* user_cache = nullptr; + MotrMetaCache* bucket_inst_cache = nullptr; + + public: + CephContext *cctx; + struct m0_client *instance = nullptr; + struct m0_container container; + struct m0_realm uber_realm; + struct m0_config conf = {}; + struct m0_idx_dix_config dix_conf = {}; + + MotrStore(CephContext *c): 
zone(this), cctx(c) {} + ~MotrStore() { + delete obj_meta_cache; + delete user_cache; + delete bucket_inst_cache; + } + + virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) { return 0; } + virtual const std::string get_name() const override { + return "motr"; + } + + virtual std::unique_ptr get_user(const rgw_user& u) override; + virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y, std::unique_ptr* user) override; + virtual int get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr* user) override; + virtual int get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr* user) override; + virtual std::unique_ptr get_object(const rgw_obj_key& k) override; + virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr* bucket, optional_yield y) override; + virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr* bucket) override; + virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr* bucket, optional_yield y) override; + virtual bool is_meta_master() override; + virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv, + bufferlist& in_data, JSONParser *jp, req_info& info, + optional_yield y) override; + virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv, + bufferlist& in_data, + RGWXMLDecoder::XMLParser* parser, req_info& info, + optional_yield y) override; + virtual Zone* get_zone() { return &zone; } + virtual std::string zone_unique_id(uint64_t unique_num) override; + virtual std::string zone_unique_trans_id(const uint64_t unique_num) override; + virtual 
int get_zonegroup(const std::string& id, std::unique_ptr* zonegroup) override; + virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list& zone_ids) override; + virtual int cluster_stat(RGWClusterStat& stats) override; + virtual std::unique_ptr get_lifecycle(void) override; + virtual std::unique_ptr get_completions(void) override; + virtual std::unique_ptr get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, + req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) override; + virtual std::unique_ptr get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj, + rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, + std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) override; + virtual RGWLC* get_rgwlc(void) override { return NULL; } + virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return NULL; } + + virtual int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info) override; + virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override; + virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, + const std::map& meta) override; + virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override; + virtual void get_quota(RGWQuota& quota) override; + virtual int set_buckets_enabled(const DoutPrefixProvider *dpp, std::vector& buckets, bool enabled) override; + virtual int get_sync_policy_handler(const DoutPrefixProvider *dpp, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *phandler, + optional_yield y) override; + virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override; + virtual void wakeup_meta_sync_shards(std::set& shard_ids) override { return; } + virtual void 
wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map>& shard_ids) override {} + virtual int clear_usage(const DoutPrefixProvider *dpp) override { return 0; } + virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, + RGWUsageIter& usage_iter, + std::map& usage) override; + virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override; + virtual int get_config_key_val(std::string name, bufferlist* bl) override; + virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override; + virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list& keys, bool* truncated) override; + virtual void meta_list_keys_complete(void* handle) override; + virtual std::string meta_get_marker(void *handle) override; + virtual int meta_remove(const DoutPrefixProvider *dpp, std::string& metadata_key, optional_yield y) override; + + virtual const RGWSyncModuleInstanceRef& get_sync_module() { return sync_module; } + virtual std::string get_host_id() { return ""; } + + virtual std::unique_ptr get_lua_manager() override; + virtual std::unique_ptr get_role(std::string name, + std::string tenant, + std::string path="", + std::string trust_policy="", + std::string max_session_duration_str="", + std::multimap tags={}) override; + virtual std::unique_ptr get_role(const RGWRoleInfo& info) override; + virtual std::unique_ptr get_role(std::string id) override; + virtual int get_roles(const DoutPrefixProvider *dpp, + optional_yield y, + const std::string& path_prefix, + const std::string& tenant, + std::vector>& roles) override; + virtual std::unique_ptr get_oidc_provider() override; + virtual int get_oidc_providers(const DoutPrefixProvider *dpp, + const std::string& tenant, + std::vector>& 
providers) override; + virtual std::unique_ptr get_append_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + const std::string& unique_tag, + uint64_t position, + uint64_t *cur_accounted_size) override; + virtual std::unique_ptr get_atomic_writer(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::Object* obj, + const rgw_user& owner, + const rgw_placement_rule *ptail_placement_rule, + uint64_t olh_epoch, + const std::string& unique_tag) override; + virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override; + virtual bool valid_placement(const rgw_placement_rule& rule) override; + + virtual void finalize(void) override; + + virtual CephContext *ctx(void) override { + return cctx; + } + + virtual void register_admin_apis(RGWRESTMgr* mgr) override { }; + + int open_idx(struct m0_uint128 *id, bool create, struct m0_idx *out); + void close_idx(struct m0_idx *idx) { m0_idx_fini(idx); } + int do_idx_op(struct m0_idx *, enum m0_idx_opcode opcode, + std::vector& key, std::vector& val, bool update = false); + + int do_idx_next_op(struct m0_idx *idx, + std::vector>& key_vec, + std::vector>& val_vec); + int next_query_by_name(std::string idx_name, std::vector& key_str_vec, + std::vector& val_bl_vec, + std::string prefix="", std::string delim=""); + + void index_name_to_motr_fid(std::string iname, struct m0_uint128 *fid); + int open_motr_idx(struct m0_uint128 *id, struct m0_idx *idx); + int create_motr_idx_by_name(std::string iname); + int delete_motr_idx_by_name(std::string iname); + int do_idx_op_by_name(std::string idx_name, enum m0_idx_opcode opcode, + std::string key_str, bufferlist &bl, bool update=true); + int check_n_create_global_indices(); + int store_access_key(const DoutPrefixProvider *dpp, optional_yield y, MotrAccessKey access_key); + int delete_access_key(const DoutPrefixProvider *dpp, optional_yield y, 
std::string access_key); + int store_email_info(const DoutPrefixProvider *dpp, optional_yield y, MotrEmailInfo& email_info); + + int init_metadata_cache(const DoutPrefixProvider *dpp, CephContext *cct); + MotrMetaCache* get_obj_meta_cache() {return obj_meta_cache;} + MotrMetaCache* get_user_cache() {return user_cache;} + MotrMetaCache* get_bucket_inst_cache() {return bucket_inst_cache;} +}; + +struct obj_time_weight { + real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + bool high_precision; + + obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {} + + bool compare_low_precision(const obj_time_weight& rhs) const { /* orders by mtime with tv_nsec zeroed, then zone id, then pg version */ + struct timespec l = ceph::real_clock::to_timespec(mtime); + struct timespec r = ceph::real_clock::to_timespec(rhs.mtime); + l.tv_nsec = 0; + r.tv_nsec = 0; + if (l > r) { + return false; + } + if (l < r) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + + } + + bool operator<(const obj_time_weight& rhs) const { /* falls back to the low-precision compare unless both sides are high precision */ + if (!high_precision || !rhs.high_precision) { + return compare_low_precision(rhs); + } + if (mtime > rhs.mtime) { + return false; + } + if (mtime < rhs.mtime) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + } + + void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) { + mtime = _mtime; + zone_short_id = _short_id; + pg_ver = _pg_ver; + } + + void init(RGWObjState *state) { + mtime = state->mtime; + zone_short_id = state->zone_short_id; + pg_ver = state->pg_ver; + } +}; + +inline std::ostream& operator<<(std::ostream& out, const obj_time_weight &o) { + out << o.mtime; + + 
if (o.zone_short_id != 0 || o.pg_ver != 0) { + out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]"; + } + + return out; +} + +} // namespace rgw::sal diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h new file mode 100644 index 000000000..55b43e3d9 --- /dev/null +++ b/src/rgw/rgw_sal_store.h @@ -0,0 +1,419 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "rgw_sal.h" + +namespace rgw { namespace sal { + +class StoreDriver : public Driver { + public: + StoreDriver() {} + virtual ~StoreDriver() = default; + + virtual uint64_t get_new_req_id() override { + return ceph::util::generate_random_number(); + } + + int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override {return -EOPNOTSUPP;} + int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override {return -ENOENT;} + int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override {return -ENOENT;} +}; + +class StoreUser : public User { + protected: + RGWUserInfo info; + RGWObjVersionTracker objv_tracker; + Attrs attrs; + + public: + StoreUser() : info() {} + StoreUser(const rgw_user& _u) : info() { info.user_id = _u; } + StoreUser(const RGWUserInfo& _i) : info(_i) {} + StoreUser(StoreUser& _o) = default; + virtual ~StoreUser() = default; + + virtual std::string& get_display_name() override { return 
info.display_name; } + virtual const std::string& get_tenant() override { return info.user_id.tenant; } + virtual void set_tenant(std::string& _t) override { info.user_id.tenant = _t; } + virtual const std::string& get_ns() override { return info.user_id.ns; } + virtual void set_ns(std::string& _ns) override { info.user_id.ns = _ns; } + virtual void clear_ns() override { info.user_id.ns.clear(); } + virtual const rgw_user& get_id() const override { return info.user_id; } + virtual uint32_t get_type() const override { return info.type; } + virtual int32_t get_max_buckets() const override { return info.max_buckets; } + virtual const RGWUserCaps& get_caps() const override { return info.caps; } + virtual RGWObjVersionTracker& get_version_tracker() override { return objv_tracker; } + virtual Attrs& get_attrs() override { return attrs; } + virtual void set_attrs(Attrs& _attrs) override { attrs = _attrs; } + virtual bool empty() const override { return info.user_id.id.empty(); } + virtual RGWUserInfo& get_info() override { return info; } + virtual void print(std::ostream& out) const override { out << info.user_id; } + + friend class StoreBucket; +}; + +class StoreBucket : public Bucket { + protected: + RGWBucketEnt ent; + RGWBucketInfo info; + User* owner = nullptr; + Attrs attrs; + obj_version bucket_version; + ceph::real_time mtime; + + public: + + StoreBucket() = default; + StoreBucket(User* _u) : + owner(_u) { } + StoreBucket(const rgw_bucket& _b) { ent.bucket = _b; info.bucket = _b; } + StoreBucket(const RGWBucketEnt& _e) : ent(_e) { + info.bucket = ent.bucket; + info.placement_rule = ent.placement_rule; + info.creation_time = ent.creation_time; + } + StoreBucket(const RGWBucketInfo& _i) : info(_i) { + ent.bucket = info.bucket; + ent.placement_rule = info.placement_rule; + ent.creation_time = info.creation_time; + } + StoreBucket(const rgw_bucket& _b, User* _u) : + owner(_u) { ent.bucket = _b; info.bucket = _b; } + StoreBucket(const RGWBucketEnt& _e, User* _u) : 
ent(_e), owner(_u) { + info.bucket = ent.bucket; + info.placement_rule = ent.placement_rule; + info.creation_time = ent.creation_time; + } + StoreBucket(const RGWBucketInfo& _i, User* _u) : info(_i), owner(_u) { + ent.bucket = info.bucket; + ent.placement_rule = info.placement_rule; + ent.creation_time = info.creation_time; + } + virtual ~StoreBucket() = default; + + virtual Attrs& get_attrs(void) override { return attrs; } + virtual int set_attrs(Attrs a) override { attrs = a; return 0; } + virtual void set_owner(rgw::sal::User* _owner) override { + owner = _owner; + } + virtual User* get_owner(void) override { return owner; }; + virtual ACLOwner get_acl_owner(void) override { return ACLOwner(info.owner); }; + virtual bool empty() const override { return info.bucket.name.empty(); } + virtual const std::string& get_name() const override { return info.bucket.name; } + virtual const std::string& get_tenant() const override { return info.bucket.tenant; } + virtual const std::string& get_marker() const override { return info.bucket.marker; } + virtual const std::string& get_bucket_id() const override { return info.bucket.bucket_id; } + virtual size_t get_size() const override { return ent.size; } + virtual size_t get_size_rounded() const override { return ent.size_rounded; } + virtual uint64_t get_count() const override { return ent.count; } + virtual rgw_placement_rule& get_placement_rule() override { return info.placement_rule; } + virtual ceph::real_time& get_creation_time() override { return info.creation_time; } + virtual ceph::real_time& get_modification_time() override { return mtime; } + virtual obj_version& get_version() override { return bucket_version; } + virtual void set_version(obj_version &ver) override { bucket_version = ver; } + virtual bool versioned() override { return info.versioned(); } + virtual bool versioning_enabled() override { return info.versioning_enabled(); } + virtual rgw_bucket& get_key() override { return info.bucket; } + virtual 
RGWBucketInfo& get_info() override { return info; } + virtual void print(std::ostream& out) const override { out << info.bucket; } + virtual bool operator==(const Bucket& b) const override { + if (typeid(*this) != typeid(b)) { + return false; + } + const StoreBucket& sb = dynamic_cast(b); + + return (info.bucket.tenant == sb.info.bucket.tenant) && + (info.bucket.name == sb.info.bucket.name) && + (info.bucket.bucket_id == sb.info.bucket.bucket_id); + } + virtual bool operator!=(const Bucket& b) const override { + if (typeid(*this) != typeid(b)) { + return true; /* different dynamic types are never equal; keeps != the exact negation of operator== */ + } + const StoreBucket& sb = dynamic_cast(b); + + return (info.bucket.tenant != sb.info.bucket.tenant) || + (info.bucket.name != sb.info.bucket.name) || + (info.bucket.bucket_id != sb.info.bucket.bucket_id); + } + + int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override {return 0;} + int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override {return 0;} + int remove_topics(RGWObjVersionTracker* objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) override {return 0;} + + friend class BucketList; + protected: + virtual void set_ent(RGWBucketEnt& _ent) { ent = _ent; info.bucket = ent.bucket; info.placement_rule = ent.placement_rule; } +}; + +class StoreObject : public Object { + protected: + RGWObjState state; + Bucket* bucket = nullptr; + bool delete_marker{false}; + + public: + StoreObject() = default; + StoreObject(const rgw_obj_key& _k) + { state.obj.key = _k; } + StoreObject(const rgw_obj_key& _k, Bucket* _b) + : bucket(_b) + { state.obj.init(_b->get_key(), _k); } + StoreObject(const StoreObject& _o) = default; + + virtual ~StoreObject() = default; + + virtual void set_atomic() override { state.is_atomic = true; } + virtual bool is_atomic() override { return state.is_atomic; } + virtual void 
set_prefetch_data() override { state.prefetch_data = true; } + virtual bool is_prefetch_data() override { return state.prefetch_data; } + virtual void set_compressed() override { state.compressed = true; } + virtual bool is_compressed() override { return state.compressed; } + virtual void invalidate() override { + rgw_obj obj = state.obj; + bool is_atomic = state.is_atomic; + bool prefetch_data = state.prefetch_data; + bool compressed = state.compressed; + + state = RGWObjState(); + state.obj = obj; + state.is_atomic = is_atomic; + state.prefetch_data = prefetch_data; + state.compressed = compressed; + } + + virtual bool empty() const override { return state.obj.empty(); } + virtual const std::string &get_name() const override { return state.obj.key.name; } + virtual Attrs& get_attrs(void) override { return state.attrset; } + virtual const Attrs& get_attrs(void) const override { return state.attrset; } + virtual int set_attrs(Attrs a) override { state.attrset = a; state.has_attrs = true; return 0; } + virtual bool has_attrs(void) override { return state.has_attrs; } + virtual ceph::real_time get_mtime(void) const override { return state.mtime; } + virtual uint64_t get_obj_size(void) const override { return state.size; } + virtual Bucket* get_bucket(void) const override { return bucket; } + virtual void set_bucket(Bucket* b) override { bucket = b; state.obj.bucket = b->get_key(); } + virtual std::string get_hash_source(void) override { return state.obj.index_hash_source; } + virtual void set_hash_source(std::string s) override { state.obj.index_hash_source = s; } + virtual std::string get_oid(void) const override { return state.obj.key.get_oid(); } + virtual bool get_delete_marker(void) override { return delete_marker; } + virtual bool get_in_extra_data(void) override { return state.obj.is_in_extra_data(); } + virtual void set_in_extra_data(bool i) override { state.obj.set_in_extra_data(i); } + int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); + 
virtual void set_obj_size(uint64_t s) override { state.size = s; } + virtual void set_name(const std::string& n) override { state.obj.key = n; } + virtual void set_key(const rgw_obj_key& k) override { state.obj.key = k; } + virtual rgw_obj get_obj(void) const override { return state.obj; } + virtual rgw_obj_key& get_key() override { return state.obj.key; } + virtual void set_instance(const std::string &i) override { state.obj.key.set_instance(i); } + virtual const std::string &get_instance() const override { return state.obj.key.instance; } + virtual bool have_instance(void) override { return state.obj.key.have_instance(); } + virtual void clear_instance() override { state.obj.key.instance.clear(); } + virtual int transition_to_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_bucket_dir_entry& o, + std::set& cloud_targets, + CephContext* cct, + bool update_object, + const DoutPrefixProvider* dpp, + optional_yield y) override { + /* Return failure here, so stores which don't transition to cloud will + * work with lifecycle */ + return -1; + } + + virtual void print(std::ostream& out) const override { + if (bucket) + out << bucket << ":"; + out << state.obj.key; + } +}; + +class StoreMultipartPart : public MultipartPart { + protected: + std::string oid; +public: + StoreMultipartPart() = default; + virtual ~StoreMultipartPart() = default; +}; + +class StoreMultipartUpload : public MultipartUpload { +protected: + Bucket* bucket; + std::map> parts; + jspan_context trace_ctx{false, false}; +public: + StoreMultipartUpload(Bucket* _bucket) : bucket(_bucket) {} + virtual ~StoreMultipartUpload() = default; + + virtual std::map>& get_parts() override { return parts; } + + virtual const jspan_context& get_trace() override { return trace_ctx; } + + virtual void print(std::ostream& out) const override { + out << get_meta(); + if (!get_upload_id().empty()) + out << ":" << get_upload_id(); + } +}; + +class StoreMPSerializer : public MPSerializer { +protected: + bool 
locked; + std::string oid; +public: + StoreMPSerializer() : locked(false) {} + StoreMPSerializer(std::string _oid) : locked(false), oid(_oid) {} + virtual ~StoreMPSerializer() = default; + + virtual void clear_locked() override { + locked = false; + } + virtual bool is_locked() override { return locked; } + + virtual void print(std::ostream& out) const override { out << oid; } +}; + +class StoreLCSerializer : public LCSerializer { +protected: + std::string oid; +public: + StoreLCSerializer() {} + StoreLCSerializer(std::string _oid) : oid(_oid) {} + virtual ~StoreLCSerializer() = default; + + virtual void print(std::ostream& out) const override { out << oid; } +}; + +class StoreLifecycle : public Lifecycle { +public: + struct StoreLCHead : LCHead { + time_t start_date{0}; + time_t shard_rollover_date{0}; + std::string marker; + + StoreLCHead() = default; + StoreLCHead(time_t _start_date, time_t _rollover_date, std::string& _marker) : start_date(_start_date), shard_rollover_date(_rollover_date), marker(_marker) {} + + StoreLCHead& operator=(LCHead& _h) { + start_date = _h.get_start_date(); + shard_rollover_date = _h.get_shard_rollover_date(); + marker = _h.get_marker(); + + return *this; + } + + virtual time_t& get_start_date() override { return start_date; } + virtual void set_start_date(time_t _date) override { start_date = _date; } + virtual std::string& get_marker() override { return marker; } + virtual void set_marker(const std::string& _marker) override { marker = _marker; } + virtual time_t& get_shard_rollover_date() override { return shard_rollover_date; } + virtual void set_shard_rollover_date(time_t _date) override { shard_rollover_date = _date; } + }; + + struct StoreLCEntry : LCEntry { + std::string bucket; + std::string oid; + uint64_t start_time{0}; + uint32_t status{0}; + + StoreLCEntry() = default; + StoreLCEntry(std::string& _bucket, uint64_t _time, uint32_t _status) : bucket(_bucket), start_time(_time), status(_status) {} + StoreLCEntry(std::string& 
_bucket, std::string _oid, uint64_t _time, uint32_t _status) : bucket(_bucket), oid(_oid), start_time(_time), status(_status) {} + StoreLCEntry(const StoreLCEntry& _e) = default; + + StoreLCEntry& operator=(LCEntry& _e) { + bucket = _e.get_bucket(); + oid = _e.get_oid(); + start_time = _e.get_start_time(); + status = _e.get_status(); + + return *this; + } + + virtual std::string& get_bucket() override { return bucket; } + virtual void set_bucket(const std::string& _bucket) override { bucket = _bucket; } + virtual std::string& get_oid() override { return oid; } + virtual void set_oid(const std::string& _oid) override { oid = _oid; } + virtual uint64_t get_start_time() override { return start_time; } + virtual void set_start_time(uint64_t _time) override { start_time = _time; } + virtual uint32_t get_status() override { return status; } + virtual void set_status(uint32_t _status) override { status = _status; } + virtual void print(std::ostream& out) const override { + out << bucket << ":" << oid << ":" << start_time << ":" << status; + } + }; + + StoreLifecycle() = default; + virtual ~StoreLifecycle() = default; + + virtual std::unique_ptr get_entry() override { + return std::make_unique(); + } + using Lifecycle::get_entry; +}; + +class StoreNotification : public Notification { +protected: + Object* obj; + Object* src_obj; + rgw::notify::EventType event_type; + + public: + StoreNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type) + : obj(_obj), src_obj(_src_obj), event_type(_type) + {} + + virtual ~StoreNotification() = default; +}; + +class StoreWriter : public Writer { +protected: + const DoutPrefixProvider* dpp; + +public: + StoreWriter(const DoutPrefixProvider *_dpp, optional_yield y) : dpp(_dpp) {} + virtual ~StoreWriter() = default; + +}; + +class StorePlacementTier : public PlacementTier { +public: + virtual ~StorePlacementTier() = default; +}; + +class StoreZoneGroup : public ZoneGroup { +public: + virtual ~StoreZoneGroup() = default; 
+}; + +class StoreZone : public Zone { + public: + virtual ~StoreZone() = default; +}; + +class StoreLuaManager : public LuaManager { +public: + virtual ~StoreLuaManager() = default; +}; + +} } // namespace rgw::sal diff --git a/src/rgw/rgw_signal.cc b/src/rgw/rgw_signal.cc new file mode 100644 index 000000000..4bb29d0df --- /dev/null +++ b/src/rgw/rgw_signal.cc @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_signal.h" +#include "global/signal_handler.h" +#include "common/safe_io.h" +#include "common/errno.h" +#include "rgw_main.h" +#include "rgw_log.h" + +#ifdef HAVE_SYS_PRCTL_H +#include +#endif + +#define dout_subsys ceph_subsys_rgw +#define dout_context g_ceph_context + + +static int signal_fd[2] = {0, 0}; + +namespace rgw { +namespace signal { + +void sighup_handler(int signum) { + if (rgw::AppMain::ops_log_file != nullptr) { + rgw::AppMain::ops_log_file->reopen(); + } + g_ceph_context->reopen_logs(); +} /* sighup_handler */ + +void signal_shutdown() +{ + int val = 0; + int ret = write(signal_fd[0], (char *)&val, sizeof(val)); + if (ret < 0) { + derr << "ERROR: " << __func__ << ": write() returned " + << cpp_strerror(errno) << dendl; + } +} /* signal_shutdown */ + +void wait_shutdown() +{ + int val; + int r = safe_read_exact(signal_fd[1], &val, sizeof(val)); + if (r < 0) { + derr << "safe_read_exact returned with error" << dendl; + } +} /* wait_shutdown */ + +int signal_fd_init() +{ + return socketpair(AF_UNIX, SOCK_STREAM, 0, signal_fd); +} /* signal_fd_init */ + +void signal_fd_finalize() +{ + close(signal_fd[0]); + close(signal_fd[1]); +} /* 
signal_fd_finalize */ + +void handle_sigterm(int signum) +{ + dout(1) << __func__ << dendl; + + // send a signal to make fcgi's accept(2) wake up. unfortunately the + // initial signal often isn't sufficient because we race with accept's + // check of the flag set by ShutdownPending() above. + if (signum != SIGUSR1) { + signal_shutdown(); + + // safety net in case we get stuck doing an orderly shutdown. + uint64_t secs = g_ceph_context->_conf->rgw_exit_timeout_secs; + if (secs) + alarm(secs); + dout(1) << __func__ << " set alarm for " << secs << dendl; + } +} /* handle_sigterm */ + +}} /* namespace rgw::signal */ diff --git a/src/rgw/rgw_signal.h b/src/rgw/rgw_signal.h new file mode 100644 index 000000000..68fc4f614 --- /dev/null +++ b/src/rgw/rgw_signal.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + + +namespace rgw { +namespace signal { + +void signal_shutdown(); +void wait_shutdown(); +int signal_fd_init(); +void signal_fd_finalize(); +void handle_sigterm(int signum); +void handle_sigterm(int signum); +void sighup_handler(int signum); + +} // namespace signal +} // namespace rgw diff --git a/src/rgw/rgw_string.cc b/src/rgw/rgw_string.cc new file mode 100644 index 000000000..7be82f854 --- /dev/null +++ b/src/rgw/rgw_string.cc @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_string.h" + +static bool char_eq(char c1, char c2) +{ + return c1 == c2; +} + +static bool ci_char_eq(char c1, char c2) +{ + return tolower(c1) == tolower(c2); +} + +bool match_wildcards(std::string_view pattern, std::string_view input, + uint32_t flags) +{ + const auto eq = (flags & MATCH_CASE_INSENSITIVE) ? &ci_char_eq : &char_eq; + + auto it1 = pattern.begin(); + auto it2 = input.begin(); + while (true) { + if (it1 == pattern.end()) + return it2 == input.end(); + if (*it1 == '*') { + if (it1 + 1 == pattern.end()) + return true; + if (it2 == input.end() || eq(*(it1 + 1), *it2)) + ++it1; + else + ++it2; + continue; + } + if (it2 == input.end()) + return false; + if (*it1 == '?' 
|| eq(*it1, *it2)) { + ++it1; + ++it2; + continue; + } + return false; + } + return false; +} diff --git a/src/rgw/rgw_string.h b/src/rgw/rgw_string.h new file mode 100644 index 000000000..e58a356f4 --- /dev/null +++ b/src/rgw/rgw_string.h @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +struct ltstr_nocase +{ + bool operator()(const std::string& s1, const std::string& s2) const + { + return strcasecmp(s1.c_str(), s2.c_str()) < 0; + } +}; + +static inline int stringcasecmp(const std::string& s1, const std::string& s2) +{ + return strcasecmp(s1.c_str(), s2.c_str()); +} + +static inline int stringcasecmp(const std::string& s1, const char *s2) +{ + return strcasecmp(s1.c_str(), s2); +} + +static inline int stringcasecmp(const std::string& s1, int ofs, int size, const std::string& s2) +{ + return strncasecmp(s1.c_str() + ofs, s2.c_str(), size); +} + +static inline int stringtoll(const std::string& s, int64_t *val) +{ + char *end; + + long long result = strtoll(s.c_str(), &end, 10); + if (result == LLONG_MAX) + return -EINVAL; + + if (*end) + return -EINVAL; + + *val = (int64_t)result; + + return 0; +} + +static inline int stringtoull(const std::string& s, uint64_t *val) +{ + char *end; + + unsigned long long result = strtoull(s.c_str(), &end, 10); + if (result == ULLONG_MAX) + return -EINVAL; + + if (*end) + return -EINVAL; + + *val = (uint64_t)result; + + return 0; +} + +static inline int stringtol(const std::string& s, int32_t *val) +{ + char *end; + + long result = strtol(s.c_str(), &end, 10); + if (result == LONG_MAX) + return -EINVAL; + + if (*end) + return -EINVAL; + + *val = (int32_t)result; + + return 0; +} + +static inline int stringtoul(const std::string& s, uint32_t *val) +{ + char *end; + + unsigned long result = strtoul(s.c_str(), &end, 10); + if (result == ULONG_MAX) + return -EINVAL; + + 
if (*end) + return -EINVAL; + + *val = (uint32_t)result; + + return 0; +} + +/* A converter between std::string_view and null-terminated C-strings. + * It copies memory while trying to utilize the local memory instead of + * issuing dynamic allocations. */ +template +static inline boost::container::small_vector +sview2cstr(const std::string_view& sv) +{ + boost::container::small_vector cstr; + cstr.reserve(sv.size() + sizeof('\0')); + + cstr.assign(std::begin(sv), std::end(sv)); + cstr.push_back('\0'); + + return cstr; +} + +/* std::strlen() isn't guaranteed to be computable at compile-time. Although + * newer GCCs actually do that, Clang doesn't. Please be aware this function + * IS NOT A DROP-IN REPLACEMENT FOR STRLEN -- it returns a different result + * for strings having \0 in the middle. */ +template +static inline constexpr size_t sarrlen(const char (&arr)[N]) { + return N - 1; +} + +namespace detail { + +// variadic sum() to add up string lengths for reserve() +static inline constexpr size_t sum() { return 0; } +template +constexpr size_t sum(size_t v, Args... args) { return v + sum(args...); } + +// traits for string_size() +template +struct string_traits { + static constexpr size_t size(const T& s) { return s.size(); } +}; +// specializations for char*/const char* use strlen() +template <> +struct string_traits { + static size_t size(const char* s) { return std::strlen(s); } +}; +template <> +struct string_traits : string_traits {}; +// constexpr specializations for char[]/const char[] +template +struct string_traits { + static constexpr size_t size_(const char* s, size_t i) { + return i < N ? (*(s + i) == '\0' ? 
i : size_(s, i + 1)) + : throw std::invalid_argument("Unterminated string constant."); + } + static constexpr size_t size(const char(&s)[N]) { return size_(s, 0); } +}; +template +struct string_traits : string_traits {}; + +// helpers for string_cat_reserve() +static inline void append_to(std::string& s) {} +template +void append_to(std::string& s, const std::string_view& v, const Args&... args) +{ + s.append(v.begin(), v.end()); + append_to(s, args...); +} + +// helpers for string_join_reserve() +static inline void join_next(std::string& s, const std::string_view& d) {} +template +void join_next(std::string& s, const std::string_view& d, + const std::string_view& v, const Args&... args) +{ + s.append(d.begin(), d.end()); + s.append(v.begin(), v.end()); + join_next(s, d, args...); +} + +static inline void join(std::string& s, const std::string_view& d) {} +template +void join(std::string& s, const std::string_view& d, + const std::string_view& v, const Args&... args) +{ + s.append(v.begin(), v.end()); + join_next(s, d, args...); +} + +} // namespace detail + +/// return the length of a c string, string literal, or string type +template +constexpr size_t string_size(const T& s) +{ + return detail::string_traits::size(s); +} + +/// concatenates the given string arguments, returning as a std::string that +/// gets preallocated with reserve() +template +std::string string_cat_reserve(const Args&... args) +{ + size_t total_size = detail::sum(string_size(args)...); + std::string result; + result.reserve(total_size); + detail::append_to(result, args...); + return result; +} + +/// joins the given string arguments with a delimiter, returning as a +/// std::string that gets preallocated with reserve() +template +std::string string_join_reserve(const std::string_view& delim, + const Args&... args) +{ + size_t delim_size = delim.size() * std::max(0, sizeof...(args) - 1); + size_t total_size = detail::sum(string_size(args)...) 
+ delim_size; + std::string result; + result.reserve(total_size); + detail::join(result, delim, args...); + return result; +} +template +std::string string_join_reserve(char delim, const Args&... args) +{ + return string_join_reserve(std::string_view{&delim, 1}, args...); +} + + +/// use case-insensitive comparison in match_wildcards() +static constexpr uint32_t MATCH_CASE_INSENSITIVE = 0x01; + +/// attempt to match the given input string with the pattern, which may contain +/// the wildcard characters * and ? +extern bool match_wildcards(std::string_view pattern, + std::string_view input, + uint32_t flags = 0); diff --git a/src/rgw/rgw_sts.cc b/src/rgw/rgw_sts.cc new file mode 100644 index 000000000..b55283442 --- /dev/null +++ b/src/rgw/rgw_sts.cc @@ -0,0 +1,469 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" +#include "auth/Crypto.h" +#include "include/ceph_fs.h" +#include "common/iso_8601.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_b64.h" +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_role.h" +#include "rgw_user.h" +#include "rgw_iam_policy.h" +#include "rgw_sts.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace STS { + +void Credentials::dump(Formatter *f) const +{ + encode_json("AccessKeyId", accessKeyId , f); + encode_json("Expiration", expiration , f); + encode_json("SecretAccessKey", secretAccessKey , f); + encode_json("SessionToken", sessionToken , f); +} + +int Credentials::generateCredentials(const DoutPrefixProvider *dpp, + CephContext* cct, + const uint64_t& duration, + const boost::optional& policy, + const boost::optional& roleId, + const boost::optional& role_session, + const boost::optional>& 
token_claims, + const boost::optional>>& session_princ_tags, + boost::optional user, + rgw::auth::Identity* identity) +{ + uuid_d accessKey, secretKey; + char accessKeyId_str[MAX_ACCESS_KEY_LEN], secretAccessKey_str[MAX_SECRET_KEY_LEN]; + + //AccessKeyId + gen_rand_alphanumeric_plain(cct, accessKeyId_str, sizeof(accessKeyId_str)); + accessKeyId = accessKeyId_str; + + //SecretAccessKey + gen_rand_alphanumeric_upper(cct, secretAccessKey_str, sizeof(secretAccessKey_str)); + secretAccessKey = secretAccessKey_str; + + //Expiration + real_clock::time_point t = real_clock::now(); + real_clock::time_point exp = t + std::chrono::seconds(duration); + expiration = ceph::to_iso_8601(exp); + + //Session Token - Encrypt using AES + auto* cryptohandler = cct->get_crypto_handler(CEPH_CRYPTO_AES); + if (! cryptohandler) { + ldpp_dout(dpp, 0) << "ERROR: No AES cryto handler found !" << dendl; + return -EINVAL; + } + string secret_s = cct->_conf->rgw_sts_key; + buffer::ptr secret(secret_s.c_str(), secret_s.length()); + int ret = 0; + if (ret = cryptohandler->validate_secret(secret); ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: Invalid rgw sts key, please ensure its length is 16" << dendl; + return ret; + } + string error; + std::unique_ptr keyhandler(cryptohandler->get_key_handler(secret, error)); + if (! keyhandler) { + ldpp_dout(dpp, 0) << "ERROR: No Key handler found !" << dendl; + return -EINVAL; + } + error.clear(); + //Storing policy and roleId as part of token, so that they can be extracted + // from the token itself for policy evaluation. 
+ SessionToken token; + //authentication info + token.access_key_id = accessKeyId; + token.secret_access_key = secretAccessKey; + token.expiration = expiration; + token.issued_at = ceph::to_iso_8601(t); + + //Authorization info + if (policy) + token.policy = *policy; + else + token.policy = {}; + + if (roleId) + token.roleId = *roleId; + else + token.roleId = {}; + + if (user) + token.user = *user; + else { + rgw_user u({}, {}, {}); + token.user = u; + } + + if (token_claims) { + token.token_claims = std::move(*token_claims); + } + + if (identity) { + token.acct_name = identity->get_acct_name(); + token.perm_mask = identity->get_perm_mask(); + token.is_admin = identity->is_admin_of(token.user); + token.acct_type = identity->get_identity_type(); + } else { + token.acct_name = {}; + token.perm_mask = 0; + token.is_admin = 0; + token.acct_type = TYPE_ROLE; + token.role_session = role_session.get(); + } + + if (session_princ_tags) { + token.principal_tags = std::move(*session_princ_tags); + } + buffer::list input, enc_output; + encode(token, input); + + if (ret = keyhandler->encrypt(input, enc_output, &error); ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: Encrypting session token returned an error !" 
<< dendl; + return ret; + } + + bufferlist encoded_op; + enc_output.encode_base64(encoded_op); + encoded_op.append('\0'); + sessionToken = encoded_op.c_str(); + + return ret; +} + +void AssumedRoleUser::dump(Formatter *f) const +{ + encode_json("Arn", arn , f); + encode_json("AssumeRoleId", assumeRoleId , f); +} + +int AssumedRoleUser::generateAssumedRoleUser(CephContext* cct, + rgw::sal::Driver* driver, + const string& roleId, + const rgw::ARN& roleArn, + const string& roleSessionName) +{ + string resource = std::move(roleArn.resource); + boost::replace_first(resource, "role", "assumed-role"); + resource.append("/"); + resource.append(roleSessionName); + + rgw::ARN assumed_role_arn(rgw::Partition::aws, + rgw::Service::sts, + "", roleArn.account, resource); + arn = assumed_role_arn.to_string(); + + //Assumeroleid = roleid:rolesessionname + assumeRoleId = roleId + ":" + roleSessionName; + + return 0; +} + +AssumeRoleRequestBase::AssumeRoleRequestBase( CephContext* cct, + const string& duration, + const string& iamPolicy, + const string& roleArn, + const string& roleSessionName) + : cct(cct), iamPolicy(iamPolicy), roleArn(roleArn), roleSessionName(roleSessionName) +{ + MIN_DURATION_IN_SECS = cct->_conf->rgw_sts_min_session_duration; + if (duration.empty()) { + this->duration = DEFAULT_DURATION_IN_SECS; + } else { + this->duration = strict_strtoll(duration.c_str(), 10, &this->err_msg); + } +} + +int AssumeRoleRequestBase::validate_input(const DoutPrefixProvider *dpp) const +{ + if (!err_msg.empty()) { + ldpp_dout(dpp, 0) << "ERROR: error message is empty !" << dendl; + return -EINVAL; + } + + if (duration < MIN_DURATION_IN_SECS || + duration > MAX_DURATION_IN_SECS) { + ldpp_dout(dpp, 0) << "ERROR: Incorrect value of duration: " << duration << dendl; + return -EINVAL; + } + + if (! 
iamPolicy.empty() && + (iamPolicy.size() < MIN_POLICY_SIZE || iamPolicy.size() > MAX_POLICY_SIZE)) { + ldpp_dout(dpp, 0) << "ERROR: Incorrect size of iamPolicy: " << iamPolicy.size() << dendl; + return -ERR_PACKED_POLICY_TOO_LARGE; + } + + if (! roleArn.empty() && + (roleArn.size() < MIN_ROLE_ARN_SIZE || roleArn.size() > MAX_ROLE_ARN_SIZE)) { + ldpp_dout(dpp, 0) << "ERROR: Incorrect size of roleArn: " << roleArn.size() << dendl; + return -EINVAL; + } + + if (! roleSessionName.empty()) { + if (roleSessionName.size() < MIN_ROLE_SESSION_SIZE || roleSessionName.size() > MAX_ROLE_SESSION_SIZE) { + ldpp_dout(dpp, 0) << "ERROR: Either role session name is empty or role session size is incorrect: " << roleSessionName.size() << dendl; + return -EINVAL; + } + + std::regex regex_roleSession("[A-Za-z0-9_=,.@-]+"); + if (! std::regex_match(roleSessionName, regex_roleSession)) { + ldpp_dout(dpp, 0) << "ERROR: Role session name is incorrect: " << roleSessionName << dendl; + return -EINVAL; + } + } + + return 0; +} + +int AssumeRoleWithWebIdentityRequest::validate_input(const DoutPrefixProvider *dpp) const +{ + if (! providerId.empty()) { + if (providerId.length() < MIN_PROVIDER_ID_LEN || + providerId.length() > MAX_PROVIDER_ID_LEN) { + ldpp_dout(dpp, 0) << "ERROR: Either provider id is empty or provider id length is incorrect: " << providerId.length() << dendl; + return -EINVAL; + } + } + return AssumeRoleRequestBase::validate_input(dpp); +} + +int AssumeRoleRequest::validate_input(const DoutPrefixProvider *dpp) const +{ + if (! externalId.empty()) { + if (externalId.length() < MIN_EXTERNAL_ID_LEN || + externalId.length() > MAX_EXTERNAL_ID_LEN) { + ldpp_dout(dpp, 0) << "ERROR: Either external id is empty or external id length is incorrect: " << externalId.length() << dendl; + return -EINVAL; + } + + std::regex regex_externalId("[A-Za-z0-9_=,.@:/-]+"); + if (! 
std::regex_match(externalId, regex_externalId)) { + ldpp_dout(dpp, 0) << "ERROR: Invalid external Id: " << externalId << dendl; + return -EINVAL; + } + } + if (! serialNumber.empty()){ + if (serialNumber.size() < MIN_SERIAL_NUMBER_SIZE || serialNumber.size() > MAX_SERIAL_NUMBER_SIZE) { + ldpp_dout(dpp, 0) << "Either serial number is empty or serial number length is incorrect: " << serialNumber.size() << dendl; + return -EINVAL; + } + + std::regex regex_serialNumber("[A-Za-z0-9_=/:,.@-]+"); + if (! std::regex_match(serialNumber, regex_serialNumber)) { + ldpp_dout(dpp, 0) << "Incorrect serial number: " << serialNumber << dendl; + return -EINVAL; + } + } + if (! tokenCode.empty() && tokenCode.size() == TOKEN_CODE_SIZE) { + ldpp_dout(dpp, 0) << "Either token code is empty or token code size is invalid: " << tokenCode.size() << dendl; + return -EINVAL; + } + + return AssumeRoleRequestBase::validate_input(dpp); +} + +std::tuple STSService::getRoleInfo(const DoutPrefixProvider *dpp, + const string& arn, + optional_yield y) +{ + if (auto r_arn = rgw::ARN::parse(arn); r_arn) { + auto pos = r_arn->resource.find_last_of('/'); + string roleName = r_arn->resource.substr(pos + 1); + std::unique_ptr role = driver->get_role(roleName, r_arn->account); + if (int ret = role->get(dpp, y); ret < 0) { + if (ret == -ENOENT) { + ldpp_dout(dpp, 0) << "Role doesn't exist: " << roleName << dendl; + ret = -ERR_NO_ROLE_FOUND; + } + return make_tuple(ret, nullptr); + } else { + auto path_pos = r_arn->resource.find('/'); + string path; + if (path_pos == pos) { + path = "/"; + } else { + path = r_arn->resource.substr(path_pos, ((pos - path_pos) + 1)); + } + string r_path = role->get_path(); + if (path != r_path) { + ldpp_dout(dpp, 0) << "Invalid Role ARN: Path in ARN does not match with the role path: " << path << " " << r_path << dendl; + return make_tuple(-EACCES, nullptr); + } + this->role = std::move(role); + return make_tuple(0, this->role.get()); + } + } else { + ldpp_dout(dpp, 0) << 
"Invalid role arn: " << arn << dendl; + return make_tuple(-EINVAL, nullptr); + } +} + +AssumeRoleWithWebIdentityResponse STSService::assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req) +{ + AssumeRoleWithWebIdentityResponse response; + response.assumeRoleResp.packedPolicySize = 0; + std::vector token_claims; + + if (req.getProviderId().empty()) { + response.providerId = req.getIss(); + } + response.aud = req.getAud(); + response.sub = req.getSub(); + + token_claims.emplace_back(string("iss") + ":" + req.getIss()); + token_claims.emplace_back(string("aud") + ":" + req.getAud()); + token_claims.emplace_back(string("sub") + ":" + req.getSub()); + + //Get the role info which is being assumed + boost::optional r_arn = rgw::ARN::parse(req.getRoleARN()); + if (r_arn == boost::none) { + ldpp_dout(dpp, 0) << "Error in parsing role arn: " << req.getRoleARN() << dendl; + response.assumeRoleResp.retCode = -EINVAL; + return response; + } + + string roleId = role->get_id(); + uint64_t roleMaxSessionDuration = role->get_max_session_duration(); + req.setMaxDuration(roleMaxSessionDuration); + + //Validate input + response.assumeRoleResp.retCode = req.validate_input(dpp); + if (response.assumeRoleResp.retCode < 0) { + return response; + } + + //Calculate PackedPolicySize + string policy = req.getPolicy(); + response.assumeRoleResp.packedPolicySize = (policy.size() / req.getMaxPolicySize()) * 100; + + //Generate Assumed Role User + response.assumeRoleResp.retCode = response.assumeRoleResp.user.generateAssumedRoleUser(cct, + driver, + roleId, + r_arn.get(), + req.getRoleSessionName()); + if (response.assumeRoleResp.retCode < 0) { + return response; + } + + //Generate Credentials + //Role and Policy provide the authorization info, user id and applier info are not needed + response.assumeRoleResp.retCode = response.assumeRoleResp.creds.generateCredentials(dpp, cct, req.getDuration(), + req.getPolicy(), roleId, + req.getRoleSessionName(), + 
token_claims, + req.getPrincipalTags(), + user_id, nullptr); + if (response.assumeRoleResp.retCode < 0) { + return response; + } + + response.assumeRoleResp.retCode = 0; + return response; +} + +AssumeRoleResponse STSService::assumeRole(const DoutPrefixProvider *dpp, + AssumeRoleRequest& req, + optional_yield y) +{ + AssumeRoleResponse response; + response.packedPolicySize = 0; + + //Get the role info which is being assumed + boost::optional r_arn = rgw::ARN::parse(req.getRoleARN()); + if (r_arn == boost::none) { + ldpp_dout(dpp, 0) << "Error in parsing role arn: " << req.getRoleARN() << dendl; + response.retCode = -EINVAL; + return response; + } + + string roleId = role->get_id(); + uint64_t roleMaxSessionDuration = role->get_max_session_duration(); + req.setMaxDuration(roleMaxSessionDuration); + + //Validate input + response.retCode = req.validate_input(dpp); + if (response.retCode < 0) { + return response; + } + + //Calculate PackedPolicySize + string policy = req.getPolicy(); + response.packedPolicySize = (policy.size() / req.getMaxPolicySize()) * 100; + + //Generate Assumed Role User + response.retCode = response.user.generateAssumedRoleUser(cct, driver, roleId, r_arn.get(), req.getRoleSessionName()); + if (response.retCode < 0) { + return response; + } + + //Generate Credentials + //Role and Policy provide the authorization info, user id and applier info are not needed + response.retCode = response.creds.generateCredentials(dpp, cct, req.getDuration(), + req.getPolicy(), roleId, + req.getRoleSessionName(), + boost::none, + boost::none, + user_id, nullptr); + if (response.retCode < 0) { + return response; + } + + response.retCode = 0; + return response; +} + +GetSessionTokenRequest::GetSessionTokenRequest(const string& duration, const string& serialNumber, const string& tokenCode) +{ + if (duration.empty()) { + this->duration = DEFAULT_DURATION_IN_SECS; + } else { + this->duration = stoull(duration); + } + this->serialNumber = serialNumber; + this->tokenCode = 
tokenCode; +} + +GetSessionTokenResponse STSService::getSessionToken(const DoutPrefixProvider *dpp, GetSessionTokenRequest& req) +{ + int ret; + Credentials cred; + + //Generate Credentials + if (ret = cred.generateCredentials(dpp, cct, + req.getDuration(), + boost::none, + boost::none, + boost::none, + boost::none, + boost::none, + user_id, + identity); ret < 0) { + return make_tuple(ret, cred); + } + + return make_tuple(0, cred); +} + +} diff --git a/src/rgw/rgw_sts.h b/src/rgw/rgw_sts.h new file mode 100644 index 000000000..5ee7ee444 --- /dev/null +++ b/src/rgw/rgw_sts.h @@ -0,0 +1,251 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_role.h" +#include "rgw_auth.h" +#include "rgw_web_idp.h" + +namespace STS { + +class AssumeRoleRequestBase { +protected: + static constexpr uint64_t MIN_POLICY_SIZE = 1; + static constexpr uint64_t MAX_POLICY_SIZE = 2048; + static constexpr uint64_t DEFAULT_DURATION_IN_SECS = 3600; + static constexpr uint64_t MIN_ROLE_ARN_SIZE = 2; + static constexpr uint64_t MAX_ROLE_ARN_SIZE = 2048; + static constexpr uint64_t MIN_ROLE_SESSION_SIZE = 2; + static constexpr uint64_t MAX_ROLE_SESSION_SIZE = 64; + uint64_t MIN_DURATION_IN_SECS; + uint64_t MAX_DURATION_IN_SECS; + CephContext* cct; + uint64_t duration; + std::string err_msg; + std::string iamPolicy; + std::string roleArn; + std::string roleSessionName; +public: + AssumeRoleRequestBase(CephContext* cct, + const std::string& duration, + const std::string& iamPolicy, + const std::string& roleArn, + const std::string& roleSessionName); + const std::string& getRoleARN() const { return roleArn; } + const std::string& getRoleSessionName() const { return roleSessionName; } + const std::string& getPolicy() const {return iamPolicy; } + static const uint64_t& getMaxPolicySize() { return MAX_POLICY_SIZE; } + void setMaxDuration(const uint64_t& maxDuration) { MAX_DURATION_IN_SECS = maxDuration; } + const 
uint64_t& getDuration() const { return duration; } + int validate_input(const DoutPrefixProvider *dpp) const; +}; + +class AssumeRoleWithWebIdentityRequest : public AssumeRoleRequestBase { + static constexpr uint64_t MIN_PROVIDER_ID_LEN = 4; + static constexpr uint64_t MAX_PROVIDER_ID_LEN = 2048; + std::string providerId; + std::string iamPolicy; + std::string iss; + std::string sub; + std::string aud; + std::vector> session_princ_tags; +public: + AssumeRoleWithWebIdentityRequest( CephContext* cct, + const std::string& duration, + const std::string& providerId, + const std::string& iamPolicy, + const std::string& roleArn, + const std::string& roleSessionName, + const std::string& iss, + const std::string& sub, + const std::string& aud, + std::vector> session_princ_tags) + : AssumeRoleRequestBase(cct, duration, iamPolicy, roleArn, roleSessionName), + providerId(providerId), iss(iss), sub(sub), aud(aud), session_princ_tags(session_princ_tags) {} + const std::string& getProviderId() const { return providerId; } + const std::string& getIss() const { return iss; } + const std::string& getAud() const { return aud; } + const std::string& getSub() const { return sub; } + const std::vector>& getPrincipalTags() const { return session_princ_tags; } + int validate_input(const DoutPrefixProvider *dpp) const; +}; + +class AssumeRoleRequest : public AssumeRoleRequestBase { + static constexpr uint64_t MIN_EXTERNAL_ID_LEN = 2; + static constexpr uint64_t MAX_EXTERNAL_ID_LEN = 1224; + static constexpr uint64_t MIN_SERIAL_NUMBER_SIZE = 9; + static constexpr uint64_t MAX_SERIAL_NUMBER_SIZE = 256; + static constexpr uint64_t TOKEN_CODE_SIZE = 6; + std::string externalId; + std::string serialNumber; + std::string tokenCode; +public: + AssumeRoleRequest(CephContext* cct, + const std::string& duration, + const std::string& externalId, + const std::string& iamPolicy, + const std::string& roleArn, + const std::string& roleSessionName, + const std::string& serialNumber, + const std::string& 
tokenCode) + : AssumeRoleRequestBase(cct, duration, iamPolicy, roleArn, roleSessionName), + externalId(externalId), serialNumber(serialNumber), tokenCode(tokenCode){} + int validate_input(const DoutPrefixProvider *dpp) const; +}; + +class GetSessionTokenRequest { +protected: + static constexpr uint64_t MIN_DURATION_IN_SECS = 900; + static constexpr uint64_t DEFAULT_DURATION_IN_SECS = 3600; + uint64_t duration; + std::string serialNumber; + std::string tokenCode; + +public: + GetSessionTokenRequest(const std::string& duration, const std::string& serialNumber, const std::string& tokenCode); + + const uint64_t& getDuration() const { return duration; } + static const uint64_t& getMinDuration() { return MIN_DURATION_IN_SECS; } +}; + +class AssumedRoleUser { + std::string arn; + std::string assumeRoleId; +public: + int generateAssumedRoleUser( CephContext* cct, + rgw::sal::Driver* driver, + const std::string& roleId, + const rgw::ARN& roleArn, + const std::string& roleSessionName); + const std::string& getARN() const { return arn; } + const std::string& getAssumeRoleId() const { return assumeRoleId; } + void dump(Formatter *f) const; +}; + +struct SessionToken { + std::string access_key_id; + std::string secret_access_key; + std::string expiration; + std::string policy; + std::string roleId; + rgw_user user; + std::string acct_name; + uint32_t perm_mask; + bool is_admin; + uint32_t acct_type; + std::string role_session; + std::vector token_claims; + std::string issued_at; + std::vector> principal_tags; + + SessionToken() {} + + void encode(bufferlist& bl) const { + ENCODE_START(5, 1, bl); + encode(access_key_id, bl); + encode(secret_access_key, bl); + encode(expiration, bl); + encode(policy, bl); + encode(roleId, bl); + encode(user, bl); + encode(acct_name, bl); + encode(perm_mask, bl); + encode(is_admin, bl); + encode(acct_type, bl); + encode(role_session, bl); + encode(token_claims, bl); + encode(issued_at, bl); + encode(principal_tags, bl); + ENCODE_FINISH(bl); + } + 
+ void decode(bufferlist::const_iterator& bl) { + DECODE_START(5, bl); + decode(access_key_id, bl); + decode(secret_access_key, bl); + decode(expiration, bl); + decode(policy, bl); + decode(roleId, bl); + decode(user, bl); + decode(acct_name, bl); + decode(perm_mask, bl); + decode(is_admin, bl); + decode(acct_type, bl); + if (struct_v >= 2) { + decode(role_session, bl); + } + if (struct_v >= 3) { + decode(token_claims, bl); + } + if (struct_v >= 4) { + decode(issued_at, bl); + } + if (struct_v >= 5) { + decode(principal_tags, bl); + } + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(SessionToken) + +class Credentials { + static constexpr int MAX_ACCESS_KEY_LEN = 20; + static constexpr int MAX_SECRET_KEY_LEN = 40; + std::string accessKeyId; + std::string expiration; + std::string secretAccessKey; + std::string sessionToken; +public: + int generateCredentials(const DoutPrefixProvider *dpp, + CephContext* cct, + const uint64_t& duration, + const boost::optional& policy, + const boost::optional& roleId, + const boost::optional& role_session, + const boost::optional>& token_claims, + const boost::optional>>& session_princ_tags, + boost::optional user, + rgw::auth::Identity* identity); + const std::string& getAccessKeyId() const { return accessKeyId; } + const std::string& getExpiration() const { return expiration; } + const std::string& getSecretAccessKey() const { return secretAccessKey; } + const std::string& getSessionToken() const { return sessionToken; } + void dump(Formatter *f) const; +}; + +struct AssumeRoleResponse { + int retCode; + AssumedRoleUser user; + Credentials creds; + uint64_t packedPolicySize; +}; + +struct AssumeRoleWithWebIdentityResponse { + AssumeRoleResponse assumeRoleResp; + std::string aud; + std::string providerId; + std::string sub; +}; + +using AssumeRoleResponse = struct AssumeRoleResponse ; +using GetSessionTokenResponse = std::tuple; +using AssumeRoleWithWebIdentityResponse = struct AssumeRoleWithWebIdentityResponse; + +class STSService 
{ + CephContext* cct; + rgw::sal::Driver* driver; + rgw_user user_id; + std::unique_ptr role; + rgw::auth::Identity* identity; +public: + STSService() = default; + STSService(CephContext* cct, rgw::sal::Driver* driver, rgw_user user_id, + rgw::auth::Identity* identity) + : cct(cct), driver(driver), user_id(user_id), identity(identity) {} + std::tuple getRoleInfo(const DoutPrefixProvider *dpp, const std::string& arn, optional_yield y); + AssumeRoleResponse assumeRole(const DoutPrefixProvider *dpp, AssumeRoleRequest& req, optional_yield y); + GetSessionTokenResponse getSessionToken(const DoutPrefixProvider *dpp, GetSessionTokenRequest& req); + AssumeRoleWithWebIdentityResponse assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req); +}; +} diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc new file mode 100644 index 000000000..05d4b28c1 --- /dev/null +++ b/src/rgw/rgw_swift_auth.cc @@ -0,0 +1,775 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include +#include +#include + +#include "rgw_swift_auth.h" +#include "rgw_rest.h" + +#include "common/ceph_crypto.h" +#include "common/Clock.h" + +#include "include/random.h" + +#include "rgw_client_io.h" +#include "rgw_http_client.h" +#include "rgw_sal_rados.h" +#include "include/str_list.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +#define DEFAULT_SWIFT_PREFIX "/swift" + +using namespace std; +using namespace ceph::crypto; + + +namespace rgw { +namespace auth { +namespace swift { + +/* TempURL: applier */ +void TempURLApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const /* in/out */ +{ + bool inline_exists = false; + const std::string& filename = s->info.args.get("filename"); + + s->info.args.get("inline", &inline_exists); + if (inline_exists) { + s->content_disp.override = "inline"; + } else if 
(!filename.empty()) { + std::string fenc; + url_encode(filename, fenc); + s->content_disp.override = "attachment; filename=\"" + fenc + "\""; + } else { + std::string fenc; + url_encode(s->object->get_name(), fenc); + s->content_disp.fallback = "attachment; filename=\"" + fenc + "\""; + } + + ldpp_dout(dpp, 20) << "finished applying changes to req_state for TempURL: " + << " content_disp override " << s->content_disp.override + << " content_disp fallback " << s->content_disp.fallback + << dendl; + +} + +void TempURLApplier::write_ops_log_entry(rgw_log_entry& entry) const +{ + LocalApplier::write_ops_log_entry(entry); + entry.temp_url = true; +} + +/* TempURL: engine */ +bool TempURLEngine::is_applicable(const req_state* const s) const noexcept +{ + return s->info.args.exists("temp_url_sig") || + s->info.args.exists("temp_url_expires"); +} + +void TempURLEngine::get_owner_info(const DoutPrefixProvider* dpp, const req_state* const s, + RGWUserInfo& owner_info, optional_yield y) const +{ + /* We cannot use req_state::bucket_name because it isn't available + * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). */ + const string& bucket_name = s->init_state.url_bucket; + + /* TempURL requires that bucket and object names are specified. */ + if (bucket_name.empty() || s->object->empty()) { + throw -EPERM; + } + + /* TempURL case is completely different than the Keystone auth - you may + * get account name only through extraction from URL. In turn, knowledge + * about account is neccessary to obtain its bucket tenant. Without that, + * the access would be limited to accounts with empty tenant. 
*/ + string bucket_tenant; + if (!s->account_name.empty()) { + bool found = false; + std::unique_ptr user; + + rgw_user uid(s->account_name); + if (uid.tenant.empty()) { + rgw_user tenanted_uid(uid.id, uid.id); + user = driver->get_user(tenanted_uid); + if (user->load_user(dpp, s->yield) >= 0) { + /* Succeeded */ + found = true; + } + } + + if (!found) { + user = driver->get_user(uid); + if (user->load_user(dpp, s->yield) < 0) { + throw -EPERM; + } + } + + bucket_tenant = user->get_tenant(); + } + + rgw_bucket b; + b.tenant = std::move(bucket_tenant); + b.name = std::move(bucket_name); + std::unique_ptr bucket; + int ret = driver->get_bucket(dpp, nullptr, b, &bucket, s->yield); + if (ret < 0) { + throw ret; + } + + ldpp_dout(dpp, 20) << "temp url user (bucket owner): " << bucket->get_info().owner + << dendl; + + std::unique_ptr user; + user = driver->get_user(bucket->get_info().owner); + if (user->load_user(dpp, s->yield) < 0) { + throw -EPERM; + } + + owner_info = user->get_info(); +} + +std::string TempURLEngine::convert_from_iso8601(std::string expires) const +{ + /* Swift's TempURL allows clients to send the expiration as ISO8601- + * compatible strings. Though, only plain UNIX timestamp are taken + * for the HMAC calculations. We need to make the conversion. 
/**
 * Return the sub-user part of a Swift user name.
 *
 * @param swift_user_name name of the form "user:subuser", or a bare name
 * @return everything after the first ':' (possibly empty), or the whole
 *         input when it contains no ':'.
 */
std::string extract_swift_subuser(const std::string& swift_user_name)
{
  const auto colon = swift_user_name.find(':');
  if (colon != std::string::npos) {
    return swift_user_name.substr(colon + 1);
  }
  return swift_user_name;
}
buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str); + + return dest_str; + } + + bool is_equal_to(const std::string& rhs) const { + /* never allow out-of-range exception */ + if (rhs.size() < (output_size - 1)) { + return false; + } + return rhs.compare(0 /* pos */, output_size, dest_str) == 0; + } + +}; /* TempURLEngine::SignatureHelper */ + +class TempURLEngine::PrefixableSignatureHelper + : private TempURLEngine::SignatureHelper { + using base_t = SignatureHelper; + + const std::string_view decoded_uri; + const std::string_view object_name; + std::string_view no_obj_uri; + + const boost::optional prefix; + +public: + PrefixableSignatureHelper(const std::string& _decoded_uri, + const std::string& object_name, + const boost::optional prefix) + : decoded_uri(_decoded_uri), + object_name(object_name), + prefix(prefix) { + /* Transform: v1/acct/cont/obj - > v1/acct/cont/ + * + * NOTE(rzarzynski): we really want to substr() on std::string_view, + * not std::string. Otherwise we would end with no_obj_uri referencing + * a temporary. 
*/ + no_obj_uri = \ + decoded_uri.substr(0, decoded_uri.length() - object_name.length()); + } + + const char* calc(const std::string& key, + const std::string_view& method, + const std::string_view& path, + const std::string& expires) { + if (!prefix) { + return base_t::calc(key, method, path, expires); + } else { + const auto prefixed_path = \ + string_cat_reserve("prefix:", no_obj_uri, *prefix); + return base_t::calc(key, method, prefixed_path, expires); + } + } + + bool is_equal_to(const std::string& rhs) const { + bool is_auth_ok = base_t::is_equal_to(rhs); + + if (prefix && is_auth_ok) { + const auto prefix_uri = string_cat_reserve(no_obj_uri, *prefix); + is_auth_ok = boost::algorithm::starts_with(decoded_uri, prefix_uri); + } + + return is_auth_ok; + } +}; /* TempURLEngine::PrefixableSignatureHelper */ + +TempURLEngine::result_t +TempURLEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const +{ + if (! is_applicable(s)) { + return result_t::deny(); + } + + /* NOTE(rzarzynski): RGWHTTPArgs::get(), in contrast to RGWEnv::get(), + * never returns nullptr. If the requested parameter is absent, we will + * get the empty string. */ + const std::string& temp_url_sig = s->info.args.get("temp_url_sig"); + const std::string& temp_url_expires = \ + convert_from_iso8601(s->info.args.get("temp_url_expires")); + + if (temp_url_sig.empty() || temp_url_expires.empty()) { + return result_t::deny(); + } + + /* Though, for prefixed tempurls we need to differentiate between empty + * prefix and lack of prefix. Empty prefix means allowance for whole + * container. */ + const boost::optional temp_url_prefix = \ + s->info.args.get_optional("temp_url_prefix"); + + RGWUserInfo owner_info; + try { + get_owner_info(dpp, s, owner_info, y); + } catch (...) 
{ + ldpp_dout(dpp, 5) << "cannot get user_info of account's owner" << dendl; + return result_t::reject(); + } + + if (owner_info.temp_url_keys.empty()) { + ldpp_dout(dpp, 5) << "user does not have temp url key set, aborting" << dendl; + return result_t::reject(); + } + + if (is_expired(temp_url_expires)) { + ldpp_dout(dpp, 5) << "temp url link expired" << dendl; + return result_t::reject(-EPERM); + } + + if (is_disallowed_header_present(s->info)) { + ldout(cct, 5) << "temp url rejected due to disallowed header" << dendl; + return result_t::reject(-EINVAL); + } + + /* We need to verify two paths because of compliance with Swift, Tempest + * and old versions of RadosGW. The second item will have the prefix + * of Swift API entry point removed. */ + + /* XXX can we search this ONCE? */ + const size_t pos = g_conf()->rgw_swift_url_prefix.find_last_not_of('/') + 1; + const std::string_view ref_uri = s->decoded_uri; + const std::array allowed_paths = { + ref_uri, + ref_uri.substr(pos + 1) + }; + + /* Account owner calculates the signature also against a HTTP method. */ + boost::container::static_vector allowed_methods; + if (strcmp("HEAD", s->info.method) == 0) { + /* HEAD requests are specially handled. */ + /* TODO: after getting a newer boost (with static_vector supporting + * initializers lists), get back to the good notation: + * allowed_methods = {"HEAD", "GET", "PUT" }; + * Just for now let's use emplace_back to construct the vector. */ + allowed_methods.emplace_back("HEAD"); + allowed_methods.emplace_back("GET"); + allowed_methods.emplace_back("PUT"); + } else if (strlen(s->info.method) > 0) { + allowed_methods.emplace_back(s->info.method); + } + + /* Need to try each combination of keys, allowed path and methods. 
*/ + PrefixableSignatureHelper sig_helper { + s->decoded_uri, + s->object->get_name(), + temp_url_prefix + }; + + for (const auto& kv : owner_info.temp_url_keys) { + const int temp_url_key_num = kv.first; + const string& temp_url_key = kv.second; + + if (temp_url_key.empty()) { + continue; + } + + for (const auto& path : allowed_paths) { + for (const auto& method : allowed_methods) { + const char* const local_sig = sig_helper.calc(temp_url_key, method, + path, temp_url_expires); + + ldpp_dout(dpp, 20) << "temp url signature [" << temp_url_key_num + << "] (calculated): " << local_sig + << dendl; + + if (sig_helper.is_equal_to(temp_url_sig)) { + auto apl = apl_factory->create_apl_turl(cct, s, owner_info); + return result_t::grant(std::move(apl)); + } else { + ldpp_dout(dpp, 5) << "temp url signature mismatch: " << local_sig + << " != " << temp_url_sig << dendl; + } + } + } + } + + return result_t::reject(); +} + + +/* External token */ +bool ExternalTokenEngine::is_applicable(const std::string& token) const noexcept +{ + if (token.empty()) { + return false; + } else if (g_conf()->rgw_swift_auth_url.empty()) { + return false; + } else { + return true; + } +} + +ExternalTokenEngine::result_t +ExternalTokenEngine::authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s, optional_yield y) const +{ + if (! 
is_applicable(token)) { + return result_t::deny(); + } + + std::string auth_url = g_conf()->rgw_swift_auth_url; + if (auth_url.back() != '/') { + auth_url.append("/"); + } + + auth_url.append("token"); + char url_buf[auth_url.size() + 1 + token.length() + 1]; + sprintf(url_buf, "%s/%s", auth_url.c_str(), token.c_str()); + + RGWHTTPHeadersCollector validator(cct, "GET", url_buf, { "X-Auth-Groups", "X-Auth-Ttl" }); + + ldpp_dout(dpp, 10) << "rgw_swift_validate_token url=" << url_buf << dendl; + + int ret = validator.process(y); + if (ret < 0) { + throw ret; + } + + std::string swift_user; + try { + std::vector swift_groups; + get_str_vec(validator.get_header_value("X-Auth-Groups"), + ",", swift_groups); + + if (0 == swift_groups.size()) { + return result_t::deny(-EPERM); + } else { + swift_user = std::move(swift_groups[0]); + } + } catch (const std::out_of_range&) { + /* The X-Auth-Groups header isn't present in the response. */ + return result_t::deny(-EPERM); + } + + if (swift_user.empty()) { + return result_t::deny(-EPERM); + } + + ldpp_dout(dpp, 10) << "swift user=" << swift_user << dendl; + + std::unique_ptr user; + ret = driver->get_user_by_swift(dpp, swift_user, s->yield, &user); + if (ret < 0) { + ldpp_dout(dpp, 0) << "NOTICE: couldn't map swift user" << dendl; + throw ret; + } + + auto apl = apl_factory->create_apl_local(cct, s, user->get_info(), + extract_swift_subuser(swift_user), + std::nullopt, rgw::auth::LocalApplier::NO_ACCESS_KEY); + return result_t::grant(std::move(apl)); +} + +static int build_token(const string& swift_user, + const string& key, + const uint64_t nonce, + const utime_t& expiration, + bufferlist& bl) +{ + using ceph::encode; + encode(swift_user, bl); + encode(nonce, bl); + encode(expiration, bl); + + bufferptr p(CEPH_CRYPTO_HMACSHA1_DIGESTSIZE); + + char buf[bl.length() * 2 + 1]; + buf_to_hex((const unsigned char *)bl.c_str(), bl.length(), buf); + dout(20) << "build_token token=" << buf << dendl; + + char 
k[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. + memset(k, 0, sizeof(k)); + const char *s = key.c_str(); + for (int i = 0; i < (int)key.length(); i++, s++) { + k[i % CEPH_CRYPTO_HMACSHA1_DIGESTSIZE] |= *s; + } + calc_hmac_sha1(k, sizeof(k), bl.c_str(), bl.length(), p.c_str()); + ::ceph::crypto::zeroize_for_security(k, sizeof(k)); + + bl.append(p); + + return 0; + +} + +static int encode_token(CephContext *cct, string& swift_user, string& key, + bufferlist& bl) +{ + const auto nonce = ceph::util::generate_random_number(); + + utime_t expiration = ceph_clock_now(); + expiration += cct->_conf->rgw_swift_token_expiration; + + return build_token(swift_user, key, nonce, expiration, bl); +} + + +/* AUTH_rgwtk (signed token): engine */ +bool SignedTokenEngine::is_applicable(const std::string& token) const noexcept +{ + if (token.empty()) { + return false; + } else { + return token.compare(0, 10, "AUTH_rgwtk") == 0; + } +} + +SignedTokenEngine::result_t +SignedTokenEngine::authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s) const +{ + if (! is_applicable(token)) { + return result_t::deny(-EPERM); + } + + /* Effective token string is the part after the prefix. 
*/ + const std::string etoken = token.substr(strlen("AUTH_rgwtk")); + const size_t etoken_len = etoken.length(); + + if (etoken_len & 1) { + ldpp_dout(dpp, 0) << "NOTICE: failed to verify token: odd token length=" + << etoken_len << dendl; + throw -EINVAL; + } + + ceph::bufferptr p(etoken_len/2); + int ret = hex_to_buf(etoken.c_str(), p.c_str(), etoken_len); + if (ret < 0) { + throw ret; + } + + ceph::bufferlist tok_bl; + tok_bl.append(p); + + uint64_t nonce; + utime_t expiration; + std::string swift_user; + + try { + auto iter = tok_bl.cbegin(); + + using ceph::decode; + decode(swift_user, iter); + decode(nonce, iter); + decode(expiration, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "NOTICE: failed to decode token" << dendl; + throw -EINVAL; + } + + const utime_t now = ceph_clock_now(); + if (expiration < now) { + ldpp_dout(dpp, 0) << "NOTICE: old timed out token was used now=" << now + << " token.expiration=" << expiration + << dendl; + return result_t::deny(-EPERM); + } + + std::unique_ptr user; + ret = driver->get_user_by_swift(dpp, swift_user, s->yield, &user); + if (ret < 0) { + throw ret; + } + + ldpp_dout(dpp, 10) << "swift_user=" << swift_user << dendl; + + const auto siter = user->get_info().swift_keys.find(swift_user); + if (siter == std::end(user->get_info().swift_keys)) { + return result_t::deny(-EPERM); + } + + const auto swift_key = siter->second; + + bufferlist local_tok_bl; + ret = build_token(swift_user, swift_key.key, nonce, expiration, local_tok_bl); + if (ret < 0) { + throw ret; + } + + if (local_tok_bl.length() != tok_bl.length()) { + ldpp_dout(dpp, 0) << "NOTICE: tokens length mismatch:" + << " tok_bl.length()=" << tok_bl.length() + << " local_tok_bl.length()=" << local_tok_bl.length() + << dendl; + return result_t::deny(-EPERM); + } + + if (memcmp(local_tok_bl.c_str(), tok_bl.c_str(), + local_tok_bl.length()) != 0) { + char buf[local_tok_bl.length() * 2 + 1]; + + buf_to_hex(reinterpret_cast(local_tok_bl.c_str()), + 
local_tok_bl.length(), buf); + + ldpp_dout(dpp, 0) << "NOTICE: tokens mismatch tok=" << buf << dendl; + return result_t::deny(-EPERM); + } + + auto apl = apl_factory->create_apl_local(cct, s, user->get_info(), + extract_swift_subuser(swift_user), + std::nullopt, rgw::auth::LocalApplier::NO_ACCESS_KEY); + return result_t::grant(std::move(apl)); +} + +} /* namespace swift */ +} /* namespace auth */ +} /* namespace rgw */ + + +void RGW_SWIFT_Auth_Get::execute(optional_yield y) +{ + int ret = -EPERM; + + const char *key = s->info.env->get("HTTP_X_AUTH_KEY"); + const char *user_name = s->info.env->get("HTTP_X_AUTH_USER"); + + s->prot_flags |= RGW_REST_SWIFT; + + string user_str; + std::unique_ptr user; + bufferlist bl; + RGWAccessKey *swift_key; + map::iterator siter; + + string swift_url = g_conf()->rgw_swift_url; + string swift_prefix = g_conf()->rgw_swift_url_prefix; + string tenant_path; + + /* + * We did not allow an empty Swift prefix before, but we want it now. + * So, we take rgw_swift_url_prefix = "/" to yield the empty prefix. + * The rgw_swift_url_prefix = "" is the default and yields "/swift" + * in a backwards-compatible way. 
+ */ + if (swift_prefix.size() == 0) { + swift_prefix = DEFAULT_SWIFT_PREFIX; + } else if (swift_prefix == "/") { + swift_prefix.clear(); + } else { + if (swift_prefix[0] != '/') { + swift_prefix.insert(0, "/"); + } + } + + if (swift_url.size() == 0) { + bool add_port = false; + auto server_port = s->info.env->get_optional("SERVER_PORT_SECURE"); + const char *protocol; + if (server_port) { + add_port = (*server_port != "443"); + protocol = "https"; + } else { + server_port = s->info.env->get_optional("SERVER_PORT"); + if (server_port) { + add_port = (*server_port != "80"); + } + protocol = "http"; + } + const char *host = s->info.env->get("HTTP_HOST"); + if (!host) { + dout(0) << "NOTICE: server is misconfigured, missing rgw_swift_url_prefix or rgw_swift_url, HTTP_HOST is not set" << dendl; + ret = -EINVAL; + goto done; + } + swift_url = protocol; + swift_url.append("://"); + swift_url.append(host); + if (add_port && !strchr(host, ':')) { + swift_url.append(":"); + swift_url.append(*server_port); + } + } + + if (!key || !user_name) + goto done; + + user_str = user_name; + + ret = driver->get_user_by_swift(s, user_str, s->yield, &user); + if (ret < 0) { + ret = -EACCES; + goto done; + } + + siter = user->get_info().swift_keys.find(user_str); + if (siter == user->get_info().swift_keys.end()) { + ret = -EPERM; + goto done; + } + swift_key = &siter->second; + + if (swift_key->key.compare(key) != 0) { + dout(0) << "NOTICE: RGW_SWIFT_Auth_Get::execute(): bad swift key" << dendl; + ret = -EPERM; + goto done; + } + + if (!g_conf()->rgw_swift_tenant_name.empty()) { + tenant_path = "/AUTH_"; + tenant_path.append(g_conf()->rgw_swift_tenant_name); + } else if (g_conf()->rgw_swift_account_in_url) { + tenant_path = "/AUTH_"; + tenant_path.append(user->get_id().to_str()); + } + + dump_header(s, "X-Storage-Url", swift_url + swift_prefix + "/v1" + + tenant_path); + + using rgw::auth::swift::encode_token; + if ((ret = encode_token(s->cct, swift_key->id, swift_key->key, bl)) < 0) + 
goto done; + + { + static constexpr size_t PREFIX_LEN = sizeof("AUTH_rgwtk") - 1; + char token_val[PREFIX_LEN + bl.length() * 2 + 1]; + + snprintf(token_val, PREFIX_LEN + 1, "AUTH_rgwtk"); + buf_to_hex((const unsigned char *)bl.c_str(), bl.length(), + token_val + PREFIX_LEN); + + dump_header(s, "X-Storage-Token", token_val); + dump_header(s, "X-Auth-Token", token_val); + } + + ret = STATUS_NO_CONTENT; + +done: + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); +} + +int RGWHandler_SWIFT_Auth::init(rgw::sal::Driver* driver, req_state *state, + rgw::io::BasicClient *cio) +{ + state->dialect = "swift-auth"; + state->formatter = new JSONFormatter; + state->format = RGWFormat::JSON; + + return RGWHandler::init(driver, state, cio); +} + +int RGWHandler_SWIFT_Auth::authorize(const DoutPrefixProvider *dpp, optional_yield) +{ + return 0; +} + +RGWOp *RGWHandler_SWIFT_Auth::op_get() +{ + return new RGW_SWIFT_Auth_Get; +} + diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h new file mode 100644 index 000000000..85a103dbf --- /dev/null +++ b/src/rgw/rgw_swift_auth.h @@ -0,0 +1,354 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_common.h" +#include "rgw_user.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_auth.h" +#include "rgw_auth_keystone.h" +#include "rgw_auth_filters.h" +#include "rgw_sal.h" + +#define RGW_SWIFT_TOKEN_EXPIRATION (15 * 60) + +namespace rgw { +namespace auth { +namespace swift { + +/* TempURL: applier. 
*/ +class TempURLApplier : public rgw::auth::LocalApplier { +public: + TempURLApplier(CephContext* const cct, + const RGWUserInfo& user_info) + : LocalApplier(cct, user_info, LocalApplier::NO_SUBUSER, std::nullopt, LocalApplier::NO_ACCESS_KEY) { + }; + + void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override; /* in/out */ + void write_ops_log_entry(rgw_log_entry& entry) const override; + + struct Factory { + virtual ~Factory() {} + virtual aplptr_t create_apl_turl(CephContext* cct, + const req_state* s, + const RGWUserInfo& user_info) const = 0; + }; +}; + +/* TempURL: engine */ +class TempURLEngine : public rgw::auth::Engine { + using result_t = rgw::auth::Engine::result_t; + + CephContext* const cct; + rgw::sal::Driver* driver; + const TempURLApplier::Factory* const apl_factory; + + /* Helper methods. */ + void get_owner_info(const DoutPrefixProvider* dpp, + const req_state* s, + RGWUserInfo& owner_info, + optional_yield y) const; + std::string convert_from_iso8601(std::string expires) const; + bool is_applicable(const req_state* s) const noexcept; + bool is_expired(const std::string& expires) const; + bool is_disallowed_header_present(const req_info& info) const; + + class SignatureHelper; + class PrefixableSignatureHelper; + +public: + TempURLEngine(CephContext* const cct, + rgw::sal::Driver* _driver , + const TempURLApplier::Factory* const apl_factory) + : cct(cct), + driver(_driver), + apl_factory(apl_factory) { + } + + /* Interface implementations. 
*/ + const char* get_name() const noexcept override { + return "rgw::auth::swift::TempURLEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const override; +}; + + +/* AUTH_rgwtk */ +class SignedTokenEngine : public rgw::auth::Engine { + using result_t = rgw::auth::Engine::result_t; + + CephContext* const cct; + rgw::sal::Driver* driver; + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::LocalApplier::Factory* const apl_factory; + + bool is_applicable(const std::string& token) const noexcept; + using rgw::auth::Engine::authenticate; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s) const; + +public: + SignedTokenEngine(CephContext* const cct, + rgw::sal::Driver* _driver, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::LocalApplier::Factory* const apl_factory) + : cct(cct), + driver(_driver), + extractor(extractor), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::SignedTokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, + optional_yield y) const override { + return authenticate(dpp, extractor->get_token(s), s); + } +}; + + +/* External token */ +class ExternalTokenEngine : public rgw::auth::Engine { + using result_t = rgw::auth::Engine::result_t; + + CephContext* const cct; + rgw::sal::Driver* driver; + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::LocalApplier::Factory* const apl_factory; + + bool is_applicable(const std::string& token) const noexcept; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s, optional_yield y) const; + +public: + ExternalTokenEngine(CephContext* const cct, + rgw::sal::Driver* _driver, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::LocalApplier::Factory* const 
apl_factory) + : cct(cct), + driver(_driver), + extractor(extractor), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::ExternalTokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, + optional_yield y) const override { + return authenticate(dpp, extractor->get_token(s), s, y); + } +}; + +/* SwiftAnonymous: applier. */ +class SwiftAnonymousApplier : public rgw::auth::LocalApplier { + public: + SwiftAnonymousApplier(CephContext* const cct, + const RGWUserInfo& user_info) + : LocalApplier(cct, user_info, LocalApplier::NO_SUBUSER, std::nullopt, LocalApplier::NO_ACCESS_KEY) { + } + bool is_admin_of(const rgw_user& uid) const {return false;} + bool is_owner_of(const rgw_user& uid) const {return uid.id.compare(RGW_USER_ANON_ID) == 0;} +}; + +class SwiftAnonymousEngine : public rgw::auth::AnonymousEngine { + const rgw::auth::TokenExtractor* const extractor; + + bool is_applicable(const req_state* s) const noexcept override { + return extractor->get_token(s).empty(); + } + +public: + SwiftAnonymousEngine(CephContext* const cct, + const SwiftAnonymousApplier::Factory* const apl_factory, + const rgw::auth::TokenExtractor* const extractor) + : AnonymousEngine(cct, apl_factory), + extractor(extractor) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::SwiftAnonymousEngine"; + } +}; + + +class DefaultStrategy : public rgw::auth::Strategy, + public rgw::auth::RemoteApplier::Factory, + public rgw::auth::LocalApplier::Factory, + public rgw::auth::swift::TempURLApplier::Factory { + rgw::sal::Driver* driver; + const ImplicitTenants& implicit_tenant_context; + + /* The engines. 
*/ + const rgw::auth::swift::TempURLEngine tempurl_engine; + const rgw::auth::swift::SignedTokenEngine signed_engine; + boost::optional keystone_engine; + const rgw::auth::swift::ExternalTokenEngine external_engine; + const rgw::auth::swift::SwiftAnonymousEngine anon_engine; + + using keystone_config_t = rgw::keystone::CephCtxConfig; + using keystone_cache_t = rgw::keystone::TokenCache; + using aplptr_t = rgw::auth::IdentityApplier::aplptr_t; + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + + /* The method implements TokenExtractor for X-Auth-Token present in req_state. */ + struct AuthTokenExtractor : rgw::auth::TokenExtractor { + std::string get_token(const req_state* const s) const override { + /* Returning a reference here would end in GCC complaining about a reference + * to temporary. */ + return s->info.env->get("HTTP_X_AUTH_TOKEN", ""); + } + } auth_token_extractor; + + /* The method implements TokenExtractor for X-Service-Token present in req_state. */ + struct ServiceTokenExtractor : rgw::auth::TokenExtractor { + std::string get_token(const req_state* const s) const override { + return s->info.env->get("HTTP_X_SERVICE_TOKEN", ""); + } + } service_token_extractor; + + aplptr_t create_apl_remote(CephContext* const cct, + const req_state* const s, + acl_strategy_t&& extra_acl_strategy, + const rgw::auth::RemoteApplier::AuthInfo &info) const override { + auto apl = \ + rgw::auth::add_3rdparty(driver, rgw_user(s->account_name), + rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::RemoteApplier(cct, driver, std::move(extra_acl_strategy), info, + implicit_tenant_context, + rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_SWIFT))); + /* TODO(rzarzynski): replace with static_ptr. 
*/ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_local(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info, + const std::string& subuser, + const std::optional& perm_mask, + const std::string& access_key_id) const override { + auto apl = \ + rgw::auth::add_3rdparty(driver, rgw_user(s->account_name), + rgw::auth::add_sysreq(cct, driver, s, + rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask, access_key_id))); + /* TODO(rzarzynski): replace with static_ptr. */ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_turl(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info) const override { + /* TempURL doesn't need any user account override. It's a Swift-specific + * mechanism that requires account name internally, so there is no + * business with delegating the responsibility outside. */ + return aplptr_t(new rgw::auth::swift::TempURLApplier(cct, user_info)); + } + +public: + DefaultStrategy(CephContext* const cct, + const ImplicitTenants& implicit_tenant_context, + rgw::sal::Driver* _driver) + : driver(_driver), + implicit_tenant_context(implicit_tenant_context), + tempurl_engine(cct, + driver, + static_cast(this)), + signed_engine(cct, + driver, + static_cast(&auth_token_extractor), + static_cast(this)), + external_engine(cct, + driver, + static_cast(&auth_token_extractor), + static_cast(this)), + anon_engine(cct, + static_cast(this), + static_cast(&auth_token_extractor)) { + /* When the constructor's body is being executed, all member engines + * should be initialized. Thus, we can safely add them. */ + using Control = rgw::auth::Strategy::Control; + + add_engine(Control::SUFFICIENT, tempurl_engine); + add_engine(Control::SUFFICIENT, signed_engine); + + /* The auth strategy is responsible for deciding whether a particular + * engine is disabled or not. */ + if (! 
cct->_conf->rgw_keystone_url.empty()) { + keystone_engine.emplace(cct, + static_cast(&auth_token_extractor), + static_cast(&service_token_extractor), + static_cast(this), + keystone_config_t::get_instance(), + keystone_cache_t::get_instance()); + + add_engine(Control::SUFFICIENT, *keystone_engine); + } + if (! cct->_conf->rgw_swift_auth_url.empty()) { + add_engine(Control::SUFFICIENT, external_engine); + } + + add_engine(Control::SUFFICIENT, anon_engine); + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::DefaultStrategy"; + } +}; + +} /* namespace swift */ +} /* namespace auth */ +} /* namespace rgw */ + + +class RGW_SWIFT_Auth_Get : public RGWOp { +public: + RGW_SWIFT_Auth_Get() {} + ~RGW_SWIFT_Auth_Get() override {} + + int verify_permission(optional_yield) override { return 0; } + void execute(optional_yield y) override; + const char* name() const override { return "swift_auth_get"; } + dmc::client_id dmclock_client() override { return dmc::client_id::auth; } +}; + +class RGWHandler_SWIFT_Auth : public RGWHandler_REST { +public: + RGWHandler_SWIFT_Auth() {} + ~RGWHandler_SWIFT_Auth() override {} + RGWOp *op_get() override; + + int init(rgw::sal::Driver* driver, req_state *state, rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider *dpp, optional_yield y) override; + int postauth_init(optional_yield) override { return 0; } + int read_permissions(RGWOp *op, optional_yield) override { return 0; } + + virtual RGWAccessControlPolicy *alloc_policy() { return NULL; } + virtual void free_policy(RGWAccessControlPolicy *policy) {} +}; + +class RGWRESTMgr_SWIFT_Auth : public RGWRESTMgr { +public: + RGWRESTMgr_SWIFT_Auth() = default; + ~RGWRESTMgr_SWIFT_Auth() override = default; + + RGWRESTMgr *get_resource_mgr(req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const 
rgw::auth::StrategyRegistry&, + const std::string&) override { + return new RGWHandler_SWIFT_Auth; + } +}; diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc new file mode 100644 index 000000000..b41d9c672 --- /dev/null +++ b/src/rgw/rgw_sync.cc @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_sync.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +std::ostream& RGWMetaSyncStatusManager::gen_prefix(std::ostream& out) const +{ + return out << "meta sync: "; +} + +unsigned RGWMetaSyncStatusManager::get_subsys() const +{ + return dout_subsys; +} + +void RGWRemoteMetaLog::finish() +{ + going_down = true; + stop(); +} diff --git a/src/rgw/rgw_sync_checkpoint.cc b/src/rgw/rgw_sync_checkpoint.cc new file mode 100644 index 000000000..5e05b0e12 --- /dev/null +++ b/src/rgw/rgw_sync_checkpoint.cc @@ -0,0 +1,273 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include "common/errno.h" +#include "rgw_sync_checkpoint.h" +#include "rgw_sal_rados.h" +#include "rgw_bucket_sync.h" +#include "rgw_data_sync.h" +#include "rgw_http_errors.h" +#include "cls/rgw/cls_rgw_client.h" +#include "services/svc_sys_obj.h" +#include "services/svc_zone.h" +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +namespace { + +std::string incremental_marker(const rgw_bucket_shard_sync_info& info) +{ + return BucketIndexShardsManager::get_shard_marker(info.inc_marker.position); +} + +bool operator<(const std::vector& lhs, + const BucketIndexShardsManager& rhs) +{ + for (size_t i = 0; i < lhs.size(); ++i) { + const auto& l = incremental_marker(lhs[i]); + const auto& r = rhs.get(i, ""); + if (l < r) { + return true; + } + } + return false; +} + +bool empty(const BucketIndexShardsManager& markers, int size) +{ + for (int i = 0; i < size; ++i) { + const auto& m = markers.get(i, ""); + if (!m.empty()) { + return false; + } + } + return true; +} + +std::ostream& operator<<(std::ostream& out, const std::vector& rhs) +{ + const char* separator = ""; // first entry has no comma + out << '['; + for (auto& i : rhs) { + out << std::exchange(separator, ", ") << incremental_marker(i); + } + return out << ']'; +} + +std::ostream& operator<<(std::ostream& out, const BucketIndexShardsManager& rhs) +{ + out << '['; + const char* separator = ""; // first entry has no comma + for (auto& [i, marker] : rhs.get()) { + out << std::exchange(separator, ", ") << marker; + } + return out << ']'; +} + +int bucket_source_sync_checkpoint(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* store, + const RGWBucketInfo& bucket_info, + const RGWBucketInfo& source_bucket_info, + const rgw_sync_bucket_pipe& pipe, + uint64_t latest_gen, + const BucketIndexShardsManager& remote_markers, + ceph::timespan retry_delay, + ceph::coarse_mono_time timeout_at) +{ + + const int num_shards = remote_markers.get().size(); + rgw_bucket_sync_status full_status; 
+ int r = rgw_read_bucket_full_sync_status(dpp, store, pipe, &full_status, null_yield); + if (r < 0 && r != -ENOENT) { // retry on ENOENT + return r; + } + + // wait for incremental + while (full_status.state != BucketSyncState::Incremental) { + const auto delay_until = ceph::coarse_mono_clock::now() + retry_delay; + if (delay_until > timeout_at) { + lderr(store->ctx()) << "bucket checkpoint timed out waiting to reach incremental sync" << dendl; + return -ETIMEDOUT; + } + ldout(store->ctx(), 1) << "waiting to reach incremental sync.." << dendl; + std::this_thread::sleep_until(delay_until); + + r = rgw_read_bucket_full_sync_status(dpp, store, pipe, &full_status, null_yield); + if (r < 0 && r != -ENOENT) { // retry on ENOENT + return r; + } + } + + // wait for latest_gen + while (full_status.incremental_gen < latest_gen) { + const auto delay_until = ceph::coarse_mono_clock::now() + retry_delay; + if (delay_until > timeout_at) { + lderr(store->ctx()) << "bucket checkpoint timed out waiting to reach " + "latest generation " << latest_gen << dendl; + return -ETIMEDOUT; + } + ldout(store->ctx(), 1) << "waiting to reach latest gen " << latest_gen + << ", on " << full_status.incremental_gen << ".." 
<< dendl; + std::this_thread::sleep_until(delay_until); + + r = rgw_read_bucket_full_sync_status(dpp, store, pipe, &full_status, null_yield); + if (r < 0 && r != -ENOENT) { // retry on ENOENT + return r; + } + } + + if (full_status.incremental_gen > latest_gen) { + ldpp_dout(dpp, 1) << "bucket sync caught up with source:\n" + << " local gen: " << full_status.incremental_gen << '\n' + << " remote gen: " << latest_gen << dendl; + return 0; + } + + if (empty(remote_markers, num_shards)) { + ldpp_dout(dpp, 1) << "bucket sync caught up with empty source" << dendl; + return 0; + } + + std::vector status; + status.resize(std::max(1, num_shards)); + r = rgw_read_bucket_inc_sync_status(dpp, store, pipe, + full_status.incremental_gen, &status); + if (r < 0) { + return r; + } + + while (status < remote_markers) { + const auto delay_until = ceph::coarse_mono_clock::now() + retry_delay; + if (delay_until > timeout_at) { + ldpp_dout(dpp, 0) << "bucket checkpoint timed out waiting for incremental sync to catch up" << dendl; + return -ETIMEDOUT; + } + ldpp_dout(dpp, 1) << "waiting for incremental sync to catch up:\n" + << " local status: " << status << '\n' + << " remote markers: " << remote_markers << dendl; + std::this_thread::sleep_until(delay_until); + r = rgw_read_bucket_inc_sync_status(dpp, store, pipe, + full_status.incremental_gen, &status); + if (r < 0) { + return r; + } + } + ldpp_dout(dpp, 1) << "bucket sync caught up with source:\n" + << " local status: " << status << '\n' + << " remote markers: " << remote_markers << dendl; + return 0; +} + +int source_bilog_info(const DoutPrefixProvider *dpp, + RGWSI_Zone* zone_svc, + const rgw_sync_bucket_pipe& pipe, + rgw_bucket_index_marker_info& info, + BucketIndexShardsManager& markers, + optional_yield y) +{ + ceph_assert(pipe.source.zone); + + auto& zone_conn_map = zone_svc->get_zone_conn_map(); + auto conn = zone_conn_map.find(pipe.source.zone->id); + if (conn == zone_conn_map.end()) { + return -EINVAL; + } + + return 
rgw_read_remote_bilog_info(dpp, conn->second, *pipe.source.bucket, + info, markers, y); +} + +} // anonymous namespace + +int rgw_bucket_sync_checkpoint(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* store, + const RGWBucketSyncPolicyHandler& policy, + const RGWBucketInfo& info, + std::optional opt_source_zone, + std::optional opt_source_bucket, + ceph::timespan retry_delay, + ceph::coarse_mono_time timeout_at) +{ + struct sync_source_entry { + rgw_sync_bucket_pipe pipe; + uint64_t latest_gen = 0; + BucketIndexShardsManager remote_markers; + RGWBucketInfo source_bucket_info; + }; + std::list sources; + + // fetch remote markers and bucket info in parallel + boost::asio::io_context ioctx; + + for (const auto& [source_zone_id, pipe] : policy.get_all_sources()) { + // filter by source zone/bucket + if (opt_source_zone && *opt_source_zone != *pipe.source.zone) { + continue; + } + if (opt_source_bucket && !opt_source_bucket->match(*pipe.source.bucket)) { + continue; + } + auto& entry = sources.emplace_back(); + entry.pipe = pipe; + + // fetch remote markers + spawn::spawn(ioctx, [&] (yield_context yield) { + auto y = optional_yield{ioctx, yield}; + rgw_bucket_index_marker_info info; + int r = source_bilog_info(dpp, store->svc()->zone, entry.pipe, + info, entry.remote_markers, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to fetch remote bilog markers: " + << cpp_strerror(r) << dendl; + throw std::system_error(-r, std::system_category()); + } + entry.latest_gen = info.latest_gen; + }); + // fetch source bucket info + spawn::spawn(ioctx, [&] (yield_context yield) { + auto y = optional_yield{ioctx, yield}; + int r = store->getRados()->get_bucket_instance_info( + *entry.pipe.source.bucket, entry.source_bucket_info, + nullptr, nullptr, y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to read source bucket info: " + << cpp_strerror(r) << dendl; + throw std::system_error(-r, std::system_category()); + } + }); + } + + try { + ioctx.run(); + } catch (const 
std::system_error& e) { + return -e.code().value(); + } + + // checkpoint each source sequentially + for (const auto& e : sources) { + int r = bucket_source_sync_checkpoint(dpp, store, info, e.source_bucket_info, + e.pipe, e.latest_gen, e.remote_markers, + retry_delay, timeout_at); + if (r < 0) { + ldpp_dout(dpp, 0) << "bucket sync checkpoint failed: " << cpp_strerror(r) << dendl; + return r; + } + } + ldpp_dout(dpp, 0) << "bucket checkpoint complete" << dendl; + return 0; +} + diff --git a/src/rgw/rgw_sync_checkpoint.h b/src/rgw/rgw_sync_checkpoint.h new file mode 100644 index 000000000..28df68d88 --- /dev/null +++ b/src/rgw/rgw_sync_checkpoint.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include "common/ceph_time.h" +#include "rgw_basic_types.h" + +class DoutPrefixProvider; +namespace rgw::sal { class RadosStore; } +class RGWBucketInfo; +class RGWBucketSyncPolicyHandler; + +// poll the bucket's sync status until it's caught up against all sync sources +int rgw_bucket_sync_checkpoint(const DoutPrefixProvider* dpp, + rgw::sal::RadosStore* store, + const RGWBucketSyncPolicyHandler& policy, + const RGWBucketInfo& info, + std::optional opt_source_zone, + std::optional opt_source_bucket, + ceph::timespan retry_delay, + ceph::coarse_mono_time timeout_at); diff --git a/src/rgw/rgw_sync_policy.cc b/src/rgw/rgw_sync_policy.cc new file mode 100644 index 000000000..cf28d5eec --- /dev/null +++ b/src/rgw/rgw_sync_policy.cc @@ -0,0 +1,787 @@ + + +#include "rgw_common.h" +#include "rgw_sync_policy.h" +#include "rgw_bucket.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +string rgw_sync_bucket_entity::bucket_key() const +{ + return rgw_sync_bucket_entities::bucket_key(bucket); +} + +/* Parse a tag written as "key" or "key=value" into this object. Returns + * false, leaving the object untouched, only when s is empty. */ +bool rgw_sync_pipe_filter_tag::from_str(const string& s) +{ + if (s.empty()) { + return false; + } + + auto pos = s.find('='); + if (pos == string::npos) { + key = s; + value.clear(); /* bare key: drop any value left from earlier use */ + return true; + } + + key = s.substr(0, pos); + value = s.substr(pos + 1); /* yields an empty value for a trailing '=' */ + + return true; +} + +/* Compare this tag with its textual form: "key" matches only a tag whose + * value is empty, "key=value" has to match both parts exactly. */ +bool rgw_sync_pipe_filter_tag::operator==(const string& s) const +{ + if (s.empty()) { + return false; + } + + auto pos = s.find('='); + if (pos == string::npos) { + return value.empty() && (s == key); + } + + /* text before '=' must equal key, text after it must equal value */ + return s.compare(0, pos, key) == 0 && + s.compare(pos + 1, s.size() - pos - 1, value) == 0; +} + +void rgw_sync_pipe_filter::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(prefix, bl); + encode(tags, bl); + ENCODE_FINISH(bl); +} + +void rgw_sync_pipe_filter::decode(bufferlist::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(prefix, bl); + decode(tags, bl); + DECODE_FINISH(bl); +} + +void 
rgw_sync_pipe_filter::set_prefix(std::optional opt_prefix, + bool prefix_rm) +{ + if (opt_prefix) { + prefix = *opt_prefix; + } else if (prefix_rm) { + prefix.reset(); + } +} + +void rgw_sync_pipe_filter::set_tags(std::list& tags_add, + std::list& tags_rm) +{ + for (auto& t : tags_rm) { + rgw_sync_pipe_filter_tag tag; + if (tag.from_str(t)) { + tags.erase(tag); + } + } + + for (auto& t : tags_add) { + rgw_sync_pipe_filter_tag tag; + if (tag.from_str(t)) { + tags.insert(tag); + } + } +} + +bool rgw_sync_pipe_filter::is_subset_of(const rgw_sync_pipe_filter& f) const +{ + if (f.prefix) { + if (!prefix) { + return false; + } + /* f.prefix exists, and this->prefix is either equal or bigger, + * therefore this->prefix also set */ + + if (!boost::starts_with(*prefix, *f.prefix)) { + return false; + } + } + + /* prefix is subset, now check tags. All our tags should exist in f.tags */ + + for (auto& t : tags) { + if (f.tags.find(t) == f.tags.end()) { + return false; + } + } + + return true; +} + +bool rgw_sync_pipe_filter::check_tag(const string& s) const +{ + if (tags.empty()) { /* tag filter wasn't defined */ + return true; + } + + auto iter = tags.find(rgw_sync_pipe_filter_tag(s)); + return (iter != tags.end()); +} + +bool rgw_sync_pipe_filter::check_tag(const string& k, const string& v) const +{ + if (tags.empty()) { /* tag filter wasn't defined */ + return true; + } + + auto iter = tags.find(rgw_sync_pipe_filter_tag(k, v)); + return (iter != tags.end()); +} + +bool rgw_sync_pipe_filter::has_tags() const +{ + return !tags.empty(); +} + +bool rgw_sync_pipe_filter::check_tags(const std::vector& _tags) const +{ + if (tags.empty()) { + return true; + } + + for (auto& t : _tags) { + if (check_tag(t)) { + return true; + } + } + return false; +} + +bool rgw_sync_pipe_filter::check_tags(const RGWObjTags::tag_map_t& _tags) const +{ + if (tags.empty()) { + return true; + } + + for (auto& item : _tags) { + if (check_tag(item.first, item.second)) { + return true; + } + } + return 
false; +} + +void rgw_sync_bucket_entity::apply_bucket(std::optional b) +{ + if (!b) { + return; + } + + if (!bucket || + bucket->name.empty()) { + bucket = b; + } +} + +void rgw_sync_bucket_entities::add_zones(const std::vector& new_zones) { + for (auto& z : new_zones) { + if (z == "*") { + all_zones = true; + zones.reset(); + return; + } + + if (!zones) { + zones.emplace(); + } + + zones->insert(z); + + all_zones = false; + } +} + +std::vector rgw_sync_bucket_entities::expand() const +{ + std::vector result; + rgw_bucket b = get_bucket(); + if (all_zones) { + rgw_sync_bucket_entity e; + e.all_zones = true; + e.bucket = b; + result.push_back(e); + return result; + } + + if (!zones) { + return result; + } + + for (auto& z : *zones) { + rgw_sync_bucket_entity e; + e.all_zones = false; + e.bucket = b; + e.zone = z; + result.push_back(e); + } + + return result; +} + +void rgw_sync_bucket_entities::remove_zones(const std::vector& rm_zones) { + all_zones = false; + + if (!zones) { + return; + } + + for (auto& z : rm_zones) { + zones->erase(z); + } +} + +static void set_bucket_field(std::optional source, string *field) { + if (!source) { + return; + } + if (source == "*") { + field->clear(); + return; + } + *field = *source; +} + +void rgw_sync_bucket_entities::set_bucket(std::optional tenant, + std::optional bucket_name, + std::optional bucket_id) +{ + if ((!bucket) && (tenant || bucket_name || bucket_id)) { + bucket.emplace(); + } + + if (!bucket) { + return; + } + + set_bucket_field(tenant, &bucket->tenant); + set_bucket_field(bucket_name, &bucket->name); + set_bucket_field(bucket_id, &bucket->bucket_id); + + if (bucket->tenant.empty() && + bucket->name.empty() && + bucket->bucket_id.empty()) { + bucket.reset(); + } +} + +void rgw_sync_bucket_entities::remove_bucket(std::optional tenant, + std::optional bucket_name, + std::optional bucket_id) +{ + if (!bucket) { + return; + } + + if (tenant) { + bucket->tenant.clear(); + } + if (bucket_name) { + bucket->name.clear(); 
+ } + if (bucket_id) { + bucket->bucket_id.clear(); + } + + if (bucket->tenant.empty() && + bucket->name.empty() && + bucket->bucket_id.empty()) { + bucket.reset(); + } +} + + +string rgw_sync_bucket_entities::bucket_key(std::optional b) +{ + if (!b) { + return string("*"); + } + + rgw_bucket _b = *b; + + if (_b.name.empty()) { + _b.name = "*"; + } + + return _b.get_key(); +} + +std::vector rgw_sync_bucket_pipes::expand() const +{ + std::vector result; + + auto sources = source.expand(); + auto dests = dest.expand(); + + for (auto& s : sources) { + for (auto& d : dests) { + rgw_sync_bucket_pipe pipe; + pipe.id = id; + pipe.source = s; + pipe.dest = d; + pipe.params = params; + result.push_back(pipe); + } + } + + return result; +} + + +void rgw_sync_bucket_pipes::get_potential_related_buckets(const rgw_bucket& bucket, + std::set *sources, + std::set *dests) const +{ + if (dest.match_bucket(bucket)) { + auto expanded_sources = source.expand(); + + for (auto& s : expanded_sources) { + if (s.bucket && !s.bucket->name.empty()) { + sources->insert(*s.bucket); + } + } + } + + if (source.match_bucket(bucket)) { + auto expanded_dests = dest.expand(); + + for (auto& d : expanded_dests) { + if (d.bucket && !d.bucket->name.empty()) { + dests->insert(*d.bucket); + } + } + } +} + +bool rgw_sync_data_flow_group::find_or_create_symmetrical(const string& flow_id, rgw_sync_symmetric_group **flow_group) +{ + for (auto& group : symmetrical) { + if (flow_id == group.id) { + *flow_group = &group; + return true; + } + } + + auto& group = symmetrical.emplace_back(); + *flow_group = &group; + (*flow_group)->id = flow_id; + return true; +} + +void rgw_sync_data_flow_group::remove_symmetrical(const string& flow_id, std::optional > zones) +{ + if (symmetrical.empty()) { + return; + } + + auto& groups = symmetrical; + + auto iter = groups.begin(); + + for (; iter != groups.end(); ++iter) { + if (iter->id == flow_id) { + if (!zones) { + groups.erase(iter); + if (groups.empty()) { + 
symmetrical.clear(); + } + return; + } + break; + } + } + + if (iter == groups.end()) { + return; + } + + auto& flow_group = *iter; + + for (auto& z : *zones) { + flow_group.zones.erase(z); + } + + if (flow_group.zones.empty()) { + groups.erase(iter); + } + if (groups.empty()) { + symmetrical.clear(); + } +} + +bool rgw_sync_data_flow_group::find_or_create_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone, rgw_sync_directional_rule **flow_group) +{ + for (auto& rule : directional) { + if (source_zone == rule.source_zone && + dest_zone == rule.dest_zone) { + *flow_group = &rule; + return true; + } + } + + auto& rule = directional.emplace_back(); + *flow_group = &rule; + + rule.source_zone = source_zone; + rule.dest_zone = dest_zone; + + return true; +} + +void rgw_sync_data_flow_group::remove_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone) +{ + if (directional.empty()) { + return; + } + + for (auto iter = directional.begin(); iter != directional.end(); ++iter) { + auto& rule = *iter; + if (source_zone == rule.source_zone && + dest_zone == rule.dest_zone) { + directional.erase(iter); + return; + } + } +} + +void rgw_sync_data_flow_group::init_default(const std::set& zones) +{ + symmetrical.clear(); + symmetrical.push_back(rgw_sync_symmetric_group("default", zones)); +} + +bool rgw_sync_policy_group::find_pipe(const string& pipe_id, bool create, rgw_sync_bucket_pipes **pipe) +{ + for (auto& p : pipes) { + if (pipe_id == p.id) { + *pipe = &p; + return true; + } + } + + if (!create) { + return false; + } + + auto& p = pipes.emplace_back(); + *pipe = &p; + p.id = pipe_id; + + return true; +} + +void rgw_sync_policy_group::remove_pipe(const string& pipe_id) +{ + for (auto iter = pipes.begin(); iter != pipes.end(); ++iter) { + if (pipe_id == iter->id) { + pipes.erase(iter); + return; + } + } +} + +void rgw_sync_policy_group::get_potential_related_buckets(const rgw_bucket& bucket, + std::set *sources, + std::set *dests) const 
+{ + for (auto& pipe : pipes) { + pipe.get_potential_related_buckets(bucket, sources, dests); + } +} + +void rgw_sync_policy_info::get_potential_related_buckets(const rgw_bucket& bucket, + std::set *sources, + std::set *dests) const +{ + for (auto& entry : groups) { + auto& group = entry.second; + group.get_potential_related_buckets(bucket, sources, dests); + } +} + +void rgw_sync_directional_rule::dump(Formatter *f) const +{ + encode_json("source_zone", source_zone, f); + encode_json("dest_zone", dest_zone, f); +} + +void rgw_sync_directional_rule::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("source_zone", source_zone, obj); + JSONDecoder::decode_json("dest_zone", dest_zone, obj); +} + +void rgw_sync_symmetric_group::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("zones", zones, f); +} + +void rgw_sync_symmetric_group::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("zones", zones, obj); +} + +void rgw_sync_bucket_entity::dump(Formatter *f) const +{ + encode_json("zone", zone, f); + encode_json("bucket", bucket_key(), f); +} + +void rgw_sync_bucket_entity::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("zone", zone, obj); + string s; + if (JSONDecoder::decode_json("bucket", s, obj)) { + rgw_bucket b; + int ret = rgw_bucket_parse_bucket_key(nullptr, s, &b, nullptr); + if (ret >= 0) { + bucket = b; + } else { + bucket.reset(); + } + } +} + +void rgw_sync_pipe_filter_tag::dump(Formatter *f) const +{ + encode_json("key", key, f); + encode_json("value", value, f); +} + +void rgw_sync_pipe_filter_tag::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("value", value, obj); +} + +void rgw_sync_pipe_filter::dump(Formatter *f) const +{ + encode_json("prefix", prefix, f); + encode_json("tags", tags, f); +} + +void rgw_sync_pipe_filter::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("prefix", prefix, obj); + 
JSONDecoder::decode_json("tags", tags, obj); +} + +void rgw_sync_pipe_acl_translation::dump(Formatter *f) const +{ + encode_json("owner", owner, f); +} + +void rgw_sync_pipe_acl_translation::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("owner", owner, obj); +} + +void rgw_sync_pipe_source_params::dump(Formatter *f) const +{ + encode_json("filter", filter, f); +} + +void rgw_sync_pipe_source_params::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("filter", filter, obj); +} + +void rgw_sync_pipe_dest_params::dump(Formatter *f) const +{ + encode_json("acl_translation", acl_translation, f); + encode_json("storage_class", storage_class, f); +} + +void rgw_sync_pipe_dest_params::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("acl_translation", acl_translation, obj); + JSONDecoder::decode_json("storage_class", storage_class, obj); +} + +void rgw_sync_pipe_params::dump(Formatter *f) const +{ + encode_json("source", source, f); + encode_json("dest", dest, f); + encode_json("priority", priority, f); + string s; + switch (mode) { + case MODE_SYSTEM: + s = "system"; + break; + default: + s = "user"; + } + encode_json("mode", s, f); + encode_json("user", user, f); +} + +void rgw_sync_pipe_params::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("source", source, obj); + JSONDecoder::decode_json("dest", dest, obj); + JSONDecoder::decode_json("priority", priority, obj); + string s; + JSONDecoder::decode_json("mode", s, obj); + if (s == "system") { + mode = MODE_SYSTEM; + } else { + mode = MODE_USER; + } + JSONDecoder::decode_json("user", user, obj); +} + +void rgw_sync_bucket_entities::dump(Formatter *f) const +{ + encode_json("bucket", rgw_sync_bucket_entities::bucket_key(bucket), f); + if (zones) { + encode_json("zones", zones, f); + } else if (all_zones) { + set z = { "*" }; + encode_json("zones", z, f); + } +} + +void rgw_sync_bucket_entities::decode_json(JSONObj *obj) +{ + string s; + JSONDecoder::decode_json("bucket", s, obj); + if (s == 
"*") { + bucket.reset(); + } else { + rgw_bucket b; + int ret = rgw_bucket_parse_bucket_key(nullptr, s, &b, nullptr); + if (ret < 0) { + bucket.reset(); + } else { + if (b.tenant == "*") { + b.tenant.clear(); + } + if (b.name == "*") { + b.name.clear(); + } + if (b.bucket_id == "*") { + b.bucket_id.clear(); + } + bucket = b; + } + } + JSONDecoder::decode_json("zones", zones, obj); + if (zones && zones->size() == 1) { + auto iter = zones->begin(); + if (*iter == "*") { + zones.reset(); + all_zones = true; + } + } +} + +void rgw_sync_bucket_pipe::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("source", source, f); + encode_json("dest", dest, f); + encode_json("params", params, f); +} + +void rgw_sync_bucket_pipe::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("source", source, obj); + JSONDecoder::decode_json("dest", dest, obj); + JSONDecoder::decode_json("params", params, obj); +} + +void rgw_sync_bucket_pipes::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("source", source, f); + encode_json("dest", dest, f); + encode_json("params", params, f); +} + +void rgw_sync_bucket_pipes::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("source", source, obj); + JSONDecoder::decode_json("dest", dest, obj); + JSONDecoder::decode_json("params", params, obj); +} + +void rgw_sync_data_flow_group::dump(Formatter *f) const +{ + if (!symmetrical.empty()) { + encode_json("symmetrical", symmetrical, f); + } + + if (!directional.empty()) { + encode_json("directional", directional, f); + } +} + +void rgw_sync_data_flow_group::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("symmetrical", symmetrical, obj); + JSONDecoder::decode_json("directional", directional, obj); +} + +void rgw_sync_policy_group::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("data_flow", data_flow, f); + encode_json("pipes", pipes, f); + 
string s; + switch (status) { + case rgw_sync_policy_group::Status::FORBIDDEN: + s = "forbidden"; + break; + case rgw_sync_policy_group::Status::ALLOWED: + s = "allowed"; + break; + case rgw_sync_policy_group::Status::ENABLED: + s = "enabled"; + break; + default: + s = "unknown"; + } + encode_json("status", s, f); +} + +void rgw_sync_policy_group::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("data_flow", data_flow, obj); + JSONDecoder::decode_json("pipes", pipes, obj); + string s; + JSONDecoder::decode_json("status", s, obj); + set_status(s); +} + +void rgw_sync_policy_info::dump(Formatter *f) const +{ + Formatter::ArraySection section(*f, "groups"); + for (auto& group : groups ) { + encode_json("group", group.second, f); + } +} + +void rgw_sync_policy_info::decode_json(JSONObj *obj) +{ + vector groups_vec; + + JSONDecoder::decode_json("groups", groups_vec, obj); + + for (auto& group : groups_vec) { + groups.emplace(std::make_pair(group.id, std::move(group))); + } +} + diff --git a/src/rgw/rgw_sync_policy.h b/src/rgw/rgw_sync_policy.h new file mode 100644 index 000000000..4758c426d --- /dev/null +++ b/src/rgw/rgw_sync_policy.h @@ -0,0 +1,682 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_basic_types.h" +#include "rgw_tag.h" + + +struct rgw_sync_symmetric_group { + std::string id; + std::set zones; + + rgw_sync_symmetric_group() {} + rgw_sync_symmetric_group(const std::string& _id, + const std::set _zones) : id(_id), zones(_zones) {} + + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(zones, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(zones, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_sync_symmetric_group) + +struct rgw_sync_directional_rule { + rgw_zone_id source_zone; + rgw_zone_id dest_zone; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(source_zone, bl); + encode(dest_zone, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(source_zone, bl); + decode(dest_zone, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_sync_directional_rule) + +struct rgw_sync_bucket_entity { + std::optional zone; /* define specific zones */ + std::optional bucket; /* define specific bucket */ + + static bool match_str(const std::string& s1, const std::string& s2) { /* empty std::string is wildcard */ + return (s1.empty() || + s2.empty() || + s1 == s2); + } + + bool all_zones{false}; + + rgw_sync_bucket_entity() {} + rgw_sync_bucket_entity(const rgw_zone_id& _zone, + std::optional _bucket) : zone(_zone), + bucket(_bucket.value_or(rgw_bucket())) {} + + bool specific() const { + return zone && bucket; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(all_zones, bl); + encode(zone, bl); + encode(bucket, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + 
decode(all_zones, bl); + decode(zone, bl); + decode(bucket, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + rgw_bucket get_bucket() const { + return bucket.value_or(rgw_bucket()); + } + + std::string bucket_key() const; + + bool match_zone(const rgw_zone_id& z) const { + if (all_zones) { + return true; + } + if (!zone) { + return false; + } + + return (*zone == z); + } + + void apply_zone(const rgw_zone_id& z) { + all_zones = false; + zone = z; + } + + static bool match_bucket_id(const std::string& bid1, const std::string& bid2) { + return (bid1.empty() || bid2.empty() || (bid1 == bid2)); + } + + bool match_bucket(std::optional b) const { + if (!b) { + return true; + } + + if (!bucket) { + return true; + } + + return (match_str(bucket->tenant, b->tenant) && + match_str(bucket->name, b->name) && + match_bucket_id(bucket->bucket_id, b->bucket_id)); + } + + bool match(const rgw_sync_bucket_entity& entity) const { + if (!entity.zone) { + return match_bucket(entity.bucket); + } + return (match_zone(*entity.zone) && match_bucket(entity.bucket)); + } + + const bool operator<(const rgw_sync_bucket_entity& e) const { + if (all_zones && !e.all_zones) { + return false; + } + if (!all_zones && e.all_zones) { + return true; + } + if (zone < e.zone) { + return true; + } + if (e.zone < zone) { + return false; + } + return (bucket < e.bucket); + } + + void apply_bucket(std::optional _b); +}; +WRITE_CLASS_ENCODER(rgw_sync_bucket_entity) + +struct rgw_sync_pipe_filter_tag { + std::string key; + std::string value; + + rgw_sync_pipe_filter_tag() {} + rgw_sync_pipe_filter_tag(const std::string& s) { + from_str(s); + } + rgw_sync_pipe_filter_tag(const std::string& _key, + const std::string& _value) : key(_key), + value(_value) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(key, bl); + encode(value, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, 
bl); + decode(key, bl); + decode(value, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool from_str(const std::string& s); + + bool operator<(const rgw_sync_pipe_filter_tag& t) const { + if (key < t.key) { + return true; + } + if (t.key < key) { + return false; + } + return (value < t.value); + } + + bool operator==(const std::string& s) const; +}; +WRITE_CLASS_ENCODER(rgw_sync_pipe_filter_tag) + +struct rgw_sync_pipe_filter { + std::optional prefix; + std::set tags; + + void set_prefix(std::optional opt_prefix, + bool prefix_rm); + void set_tags(std::list& tags_add, + std::list& tags_rm); + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool is_subset_of(const rgw_sync_pipe_filter& f) const; + + bool has_tags() const; + bool check_tag(const std::string& s) const; + bool check_tag(const std::string& k, const std::string& v) const; + bool check_tags(const std::vector& tags) const; + bool check_tags(const RGWObjTags::tag_map_t& tags) const; +}; +WRITE_CLASS_ENCODER(rgw_sync_pipe_filter) + +struct rgw_sync_pipe_acl_translation { + rgw_user owner; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(owner, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(owner, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool operator==(const rgw_sync_pipe_acl_translation& aclt) const { + return (owner == aclt.owner); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_pipe_acl_translation) + +struct rgw_sync_pipe_source_params { + rgw_sync_pipe_filter filter; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(filter, bl); + 
DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_sync_pipe_source_params) + +struct rgw_sync_pipe_dest_params { + std::optional acl_translation; + std::optional storage_class; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(acl_translation, bl); + encode(storage_class, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(acl_translation, bl); + decode(storage_class, bl); + DECODE_FINISH(bl); + } + + void set_storage_class(const std::string& sc) { + storage_class = sc; + } + + void set_owner(const rgw_user& owner) { + if (owner.empty()){ + acl_translation.reset(); + } else { + acl_translation.emplace(); + acl_translation->owner = owner; + } + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool operator==(const rgw_sync_pipe_dest_params& rhs) const { + return (acl_translation == rhs.acl_translation && + storage_class == rhs.storage_class); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_pipe_dest_params) + +struct rgw_sync_pipe_params { + rgw_sync_pipe_source_params source; + rgw_sync_pipe_dest_params dest; + enum Mode { + MODE_SYSTEM = 0, + MODE_USER = 1, + } mode{MODE_SYSTEM}; + int32_t priority{0}; + rgw_user user; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(source, bl); + encode(dest, bl); + encode(priority, bl); + encode((uint8_t)mode, bl); + encode(user, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(source, bl); + decode(dest, bl); + decode(priority, bl); + uint8_t m; + decode(m, bl); + mode = (Mode)m; + decode(user, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_sync_pipe_params) + +struct rgw_sync_bucket_pipe { + std::string id; + rgw_sync_bucket_entity source; + rgw_sync_bucket_entity dest; + 
+ rgw_sync_pipe_params params; + + bool specific() const { + return source.specific() && dest.specific(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(source, bl); + encode(dest, bl); + encode(params, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(source, bl); + decode(dest, bl); + decode(params, bl); + DECODE_FINISH(bl); + } + + const bool operator<(const rgw_sync_bucket_pipe& p) const { + if (id < p.id) { + return true; + } + if (id >p.id) { + return false; + } + if (source < p.source) { + return true; + } + if (p.source < source) { + return false; + } + return (dest < p.dest); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_sync_bucket_pipe) + +struct rgw_sync_bucket_entities { + std::optional bucket; /* define specific bucket */ + std::optional > zones; /* define specific zones, if not set then all zones */ + + bool all_zones{false}; + + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(bucket, bl); + encode(zones, bl); + encode(all_zones, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(bucket, bl); + decode(zones, bl); + decode(all_zones, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool match_bucket(std::optional b) const { + if (!b) { + return true; + } + + if (!bucket) { + return true; + } + + return (rgw_sync_bucket_entity::match_str(bucket->tenant, b->tenant) && + rgw_sync_bucket_entity::match_str(bucket->name, b->name) && + rgw_sync_bucket_entity::match_str(bucket->bucket_id, b->bucket_id)); + } + + void add_zones(const std::vector& new_zones); + void remove_zones(const std::vector& rm_zones); + void set_bucket(std::optional tenant, + std::optional bucket_name, + std::optional bucket_id); + void 
remove_bucket(std::optional tenant, + std::optional bucket_name, + std::optional bucket_id); + + bool match_zone(const rgw_zone_id& zone) const { + if (!zones) { + if (all_zones) { + return true; + } + return false; + } + + return (zones->find(zone) != zones->end()); + } + + std::vector expand() const; + + rgw_bucket get_bucket() const { + return bucket.value_or(rgw_bucket()); + } + + static std::string bucket_key(std::optional b); + + void set_all_zones(bool state) { + all_zones = state; + if (all_zones) { + zones.reset(); + } + } +}; +WRITE_CLASS_ENCODER(rgw_sync_bucket_entities) + +struct rgw_sync_bucket_pipes { + std::string id; + rgw_sync_bucket_entities source; + rgw_sync_bucket_entities dest; + + rgw_sync_pipe_params params; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(source, bl); + encode(dest, bl); + encode(params, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(source, bl); + decode(dest, bl); + decode(params, bl); + DECODE_FINISH(bl); + } + + bool match_source(const rgw_zone_id& zone, std::optional b) const { + return (source.match_zone(zone) && source.match_bucket(b)); + } + + bool match_dest(const rgw_zone_id& zone, std::optional b) const { + return (dest.match_zone(zone) && dest.match_bucket(b)); + } + + bool contains_zone_bucket(const rgw_zone_id& zone, std::optional b) const { + return (match_source(zone, b) || match_dest(zone, b)); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + std::vector expand() const; + + void get_potential_related_buckets(const rgw_bucket& bucket, + std::set *sources, + std::set *dests) const; +}; +WRITE_CLASS_ENCODER(rgw_sync_bucket_pipes) + +std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_entity& e); +std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_pipe& pipe); +std::ostream& operator<<(std::ostream& os, const 
rgw_sync_bucket_entities& e); +std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_pipes& pipe); + +/* + * define data flow between zones. Symmetrical: zones sync from each other. + * Directional: one zone fetches data from another. + */ +struct rgw_sync_data_flow_group { + std::vector symmetrical; + std::vector directional; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(symmetrical, bl); + encode(directional, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(symmetrical, bl); + decode(directional, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool empty() const { + return (symmetrical.empty() && directional.empty()); + } + + bool find_or_create_symmetrical(const std::string& flow_id, rgw_sync_symmetric_group **flow_group); + void remove_symmetrical(const std::string& flow_id, std::optional > zones); + bool find_or_create_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone, rgw_sync_directional_rule **flow_group); + void remove_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone); + + void init_default(const std::set& zones); +}; +WRITE_CLASS_ENCODER(rgw_sync_data_flow_group) + + +struct rgw_sync_policy_group { + std::string id; + + rgw_sync_data_flow_group data_flow; /* override data flow, howver, will not be able to + add new flows that don't exist at higher level */ + std::vector pipes; /* if not defined then applies to all + buckets (DR sync) */ + + enum Status { + UNKNOWN = 0, /* ? 
*/ + FORBIDDEN = 1, /* sync not allowed */ + ALLOWED = 2, /* sync allowed */ + ENABLED = 3, /* sync should happen */ + } status; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(data_flow, bl); + encode(pipes, bl); + encode((uint32_t)status, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(data_flow, bl); + decode(pipes, bl); + uint32_t s; + decode(s, bl); + status = (Status)s; + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool set_status(const std::string& s) { + if (s == "forbidden") { + status = rgw_sync_policy_group::Status::FORBIDDEN; + } else if (s == "allowed") { + status = rgw_sync_policy_group::Status::ALLOWED; + } else if (s == "enabled") { + status = rgw_sync_policy_group::Status::ENABLED; + } else { + status = rgw_sync_policy_group::Status::UNKNOWN; + return false; + } + + return true; + } + + bool find_pipe(const std::string& pipe_id, bool create, rgw_sync_bucket_pipes **pipe); + void remove_pipe(const std::string& pipe_id); + + void get_potential_related_buckets(const rgw_bucket& bucket, + std::set *sources, + std::set *dests) const; + +}; +WRITE_CLASS_ENCODER(rgw_sync_policy_group) + +struct rgw_sync_policy_info { + std::map groups; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(groups, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(groups, bl); + DECODE_FINISH(bl); + } + + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + + bool empty() const { + return groups.empty(); + } + + void get_potential_related_buckets(const rgw_bucket& bucket, + std::set *sources, + std::set *dests) const; +}; +WRITE_CLASS_ENCODER(rgw_sync_policy_info) + + diff --git a/src/rgw/rgw_tag.cc b/src/rgw/rgw_tag.cc new file mode 100644 index 000000000..f7e52592f --- /dev/null +++ 
b/src/rgw/rgw_tag.cc @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include +#include + +#include "rgw_tag.h" +#include "rgw_common.h" + +using namespace std; + +void RGWObjTags::add_tag(const string& key, const string& val){ + tag_map.emplace(std::make_pair(key,val)); +} + +void RGWObjTags::emplace_tag(std::string&& key, std::string&& val){ + tag_map.emplace(std::move(key), std::move(val)); +} + +int RGWObjTags::check_and_add_tag(const string&key, const string& val){ + if (tag_map.size() == max_obj_tags || + key.size() > max_tag_key_size || + val.size() > max_tag_val_size || + key.size() == 0){ + return -ERR_INVALID_TAG; + } + + add_tag(key,val); + + return 0; +} + +int RGWObjTags::set_from_string(const string& input){ + if (input.empty()) { + return 0; + } + int ret=0; + vector kvs; + boost::split(kvs, input, boost::is_any_of("&")); + for (const auto& kv: kvs){ + auto p = kv.find("="); + string key,val; + if (p != string::npos) { + ret = check_and_add_tag(url_decode(kv.substr(0,p)), + url_decode(kv.substr(p+1))); + } else { + ret = check_and_add_tag(url_decode(kv)); + } + + if (ret < 0) + return ret; + } + return ret; +} + +void RGWObjTags::dump(Formatter *f) const +{ + f->open_object_section("tagset"); + for (auto& tag: tag_map){ + f->dump_string(tag.first.c_str(), tag.second); + } + f->close_section(); +} + diff --git a/src/rgw/rgw_tag.h b/src/rgw/rgw_tag.h new file mode 100644 index 000000000..15bb25ee8 --- /dev/null +++ b/src/rgw/rgw_tag.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include + +class RGWObjTags +{ +public: + using tag_map_t = std::multimap ; + +protected: + tag_map_t tag_map; + + uint32_t max_obj_tags{10}; + static constexpr uint32_t max_tag_key_size{128}; + static constexpr uint32_t max_tag_val_size{256}; + + public: + 
RGWObjTags() = default; + RGWObjTags(uint32_t max_obj_tags):max_obj_tags(max_obj_tags) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1,1,bl); + encode(tag_map, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + decode(tag_map,bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void add_tag(const std::string& key, const std::string& val=""); + void emplace_tag(std::string&& key, std::string&& val); + int check_and_add_tag(const std::string& key, const std::string& val=""); + size_t count() const {return tag_map.size();} + int set_from_string(const std::string& input); + void clear() { tag_map.clear(); } + bool empty() const noexcept { return tag_map.empty(); } + const tag_map_t& get_tags() const {return tag_map;} + tag_map_t& get_tags() {return tag_map;} +}; +WRITE_CLASS_ENCODER(RGWObjTags) diff --git a/src/rgw/rgw_tag_s3.cc b/src/rgw/rgw_tag_s3.cc new file mode 100644 index 000000000..89436c326 --- /dev/null +++ b/src/rgw/rgw_tag_s3.cc @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include +#include + +#include "include/types.h" + +#include "rgw_tag_s3.h" + +using namespace std; + +void RGWObjTagEntry_S3::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Key", key, obj, true); + RGWXMLDecoder::decode_xml("Value", val, obj, true); +} + +void RGWObjTagEntry_S3::dump_xml(Formatter *f) const { + encode_xml("Key", key, f); + encode_xml("Value", val, f); + + if (key.empty()) { + throw RGWXMLDecoder::err("empty key"); + } + + if (val.empty()) { + throw RGWXMLDecoder::err("empty val"); + } +} + +void RGWObjTagSet_S3::decode_xml(XMLObj *obj) { + vector entries; + + bool mandatory{false}; + RGWXMLDecoder::decode_xml("Tag", entries, obj, mandatory); + + for (auto& entry : entries) { + const std::string& key = entry.get_key(); + const std::string& val = entry.get_val(); + 
add_tag(key,val); + } +} + +int RGWObjTagSet_S3::rebuild(RGWObjTags& dest) { + int ret; + for (const auto &it : tag_map){ + ret = dest.check_and_add_tag(it.first, it.second); + if (ret < 0) + return ret; + } + return 0; +} + +void RGWObjTagging_S3::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("TagSet", tagset, obj, true); +} + +void RGWObjTagSet_S3::dump_xml(Formatter *f) const { + for (const auto& tag : tag_map){ + Formatter::ObjectSection os(*f, "Tag"); + encode_xml("Key", tag.first, f); + encode_xml("Value", tag.second, f); + } +} + diff --git a/src/rgw/rgw_tag_s3.h b/src/rgw/rgw_tag_s3.h new file mode 100644 index 000000000..7cc892f1f --- /dev/null +++ b/src/rgw/rgw_tag_s3.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rgw_tag.h" +#include "rgw_xml.h" + +class RGWObjTagEntry_S3 +{ + std::string key; + std::string val; +public: + RGWObjTagEntry_S3() {} + RGWObjTagEntry_S3(const std::string &k, const std::string &v):key(k),val(v) {}; + ~RGWObjTagEntry_S3() {} + + const std::string& get_key () const { return key; } + const std::string& get_val () const { return val; } + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class RGWObjTagSet_S3: public RGWObjTags +{ +public: + int rebuild(RGWObjTags& dest); + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class RGWObjTagging_S3 +{ + RGWObjTagSet_S3 tagset; +public: + void decode_xml(XMLObj *obj); + int rebuild(RGWObjTags& dest) { + return tagset.rebuild(dest); + } +}; diff --git a/src/rgw/rgw_tar.h b/src/rgw/rgw_tar.h new file mode 100644 index 000000000..b06943a3c --- /dev/null +++ b/src/rgw/rgw_tar.h @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include 
+#include +#include + +#include +#include + +namespace rgw { +namespace tar { + +static constexpr size_t BLOCK_SIZE = 512; + + +static inline std::pair> +interpret_block(const StatusIndicator& status, ceph::bufferlist& bl); + + +class StatusIndicator { + friend std::pair> + interpret_block(const StatusIndicator& status, ceph::bufferlist& bl); + + bool is_empty; + bool is_eof; + + StatusIndicator() + : is_empty(false), + is_eof(false) { + } + + StatusIndicator(const StatusIndicator& prev_status, + const bool is_empty) + : is_empty(is_empty), + is_eof(is_empty && prev_status.empty()) { + } + +public: + bool empty() const { + return is_empty; + } + + bool eof() const { + return is_eof; + } + + static StatusIndicator create() { + return StatusIndicator(); + } +} /* class StatusIndicator */; + + +enum class FileType : char { + UNKNOWN = '\0', + + /* The tar format uses ASCII encoding. */ + NORMAL_FILE = '0', + DIRECTORY = '5' +}; /* enum class FileType */ + +class HeaderView { +protected: + /* Everything is char here (ASCII encoding), so we don't need to worry about + * the struct padding. */ + const struct header_t { + char filename[100]; + char __filemode[8]; + char __owner_id[8]; + char __group_id[8]; + char filesize[12]; + char lastmod[12]; + char checksum[8]; + char filetype; + char __padding[355]; + } *header; + + static_assert(sizeof(*header) == BLOCK_SIZE, + "The TAR header must be exactly BLOCK_SIZE length"); + + /* The label is far more important from what the code really does. 
*/ + static size_t pos2len(const size_t pos) { + return pos + 1; + } + +public: + explicit HeaderView(const char (&header)[BLOCK_SIZE]) + : header(reinterpret_cast(header)) { + } + + FileType get_filetype() const { + switch (header->filetype) { + case static_cast(FileType::NORMAL_FILE): + return FileType::NORMAL_FILE; + case static_cast(FileType::DIRECTORY): + return FileType::DIRECTORY; + default: + return FileType::UNKNOWN; + } + } + + std::string_view get_filename() const { + return std::string_view(header->filename, + std::min(sizeof(header->filename), + strlen(header->filename))); + } + + size_t get_filesize() const { + /* The string_ref is pretty suitable here because tar encodes its + * metadata in ASCII. */ + const std::string_view raw(header->filesize, sizeof(header->filesize)); + + /* We need to find where the padding ends. */ + const auto pad_ends_at = std::min(raw.find_last_not_of('\0'), + raw.find_last_not_of(' ')); + const auto trimmed = raw.substr(0, + pad_ends_at == std::string_view::npos ? 
std::string_view::npos + : pos2len(pad_ends_at)); + + size_t sum = 0, mul = 1; + for (const char c : boost::adaptors::reverse(trimmed)) { + sum += (c - '0') * mul; + mul *= 8; + } + + return sum; + } +}; /* class Header */ + + +static inline std::pair> +interpret_block(const StatusIndicator& status, ceph::bufferlist& bl) { + static constexpr std::array zero_block = {0, }; + const char (&block)[BLOCK_SIZE] = \ + reinterpret_cast(*bl.c_str()); + + if (std::memcmp(zero_block.data(), block, BLOCK_SIZE) == 0) { + return std::make_pair(StatusIndicator(status, true), boost::none); + } else { + return std::make_pair(StatusIndicator(status, false), HeaderView(block)); + } +} + +} /* namespace tar */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_token.cc b/src/rgw/rgw_token.cc new file mode 100644 index 000000000..999d46e0e --- /dev/null +++ b/src/rgw/rgw_token.cc @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include + +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/debug.h" +#include "global/global_init.h" +#include "include/ceph_assert.h" +#include "include/str_list.h" + +#include "rgw_token.h" +#include "rgw_b64.h" + +#define dout_subsys ceph_subsys_rgw + +namespace { + + using namespace rgw; + using std::get; + using std::string; + + RGWToken::token_type type{RGWToken::TOKEN_NONE}; + string access_key{""}; + string secret_key{""}; + + Formatter* token_formatter{nullptr}; + + bool verbose {false}; + bool do_encode {false}; + bool do_decode {false}; + +} + +using namespace std; + +void usage() +{ + cout << "usage: radosgw-token --encode --ttype= [options...]" << std::endl; + cout << "\t(maybe exporting RGW_ACCESS_KEY_ID and RGW_SECRET_ACCESS_KEY)" + << std::endl; + cout << "\t := ad | ldap" << std::endl; + cout << "\n"; + generic_client_usage(); +} + +int main(int argc, char **argv) +{ + auto args = argv_to_vec(argc, argv); + std::string val; + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + char *v{nullptr}; + v = getenv("RGW_ACCESS_KEY_ID"); + if (v) { + access_key = v; + } + + v = getenv("RGW_SECRET_ACCESS_KEY"); + if (v) { + secret_key = v; + } + + for (auto arg_iter = args.begin(); arg_iter != args.end();) { + if (ceph_argparse_witharg(args, arg_iter, &val, "--access", + (char*) nullptr)) { + access_key = val; + } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret", + (char*) nullptr)) { + secret_key = val; + } else if (ceph_argparse_witharg(args, arg_iter, &val, "--ttype", + (char*) nullptr)) { + for (const auto& ttype : {"ad", "ldap"}) { + if (boost::iequals(val, ttype)) { + type = RGWToken::to_type(val); + break; + } + } + 
} else if (ceph_argparse_flag(args, arg_iter, "--encode", + (char*) nullptr)) { + do_encode = true; + } else if (ceph_argparse_flag(args, arg_iter, "--decode", + (char*) nullptr)) { + do_decode = true; + } else if (ceph_argparse_flag(args, arg_iter, "--verbose", + (char*) nullptr)) { + verbose = true; + } else { + ++arg_iter; + } + } + + if ((! do_encode) || + (type == RGWToken::TOKEN_NONE)) { + return -EINVAL; + } + + token_formatter = new JSONFormatter(true /* pretty */); + + RGWToken token(type, access_key, secret_key); + if (do_encode) { + token.encode_json(token_formatter); + std::ostringstream os; + token_formatter->flush(os); + string token_str = os.str(); + if (verbose) { + std::cout << "expanded token: " << token_str << std::endl; + if (do_decode) { + RGWToken token2(token_str); + std::cout << "decoded expanded token: " << token2 << std::endl; + } + } + std::cout << to_base64(token_str) << std::endl; + } + + return 0; +} diff --git a/src/rgw/rgw_token.h b/src/rgw/rgw_token.h new file mode 100644 index 000000000..b2476596b --- /dev/null +++ b/src/rgw/rgw_token.h @@ -0,0 +1,170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include +#include +#include + +#include "common/ceph_json.h" +#include "common/Formatter.h" +#include "rgw/rgw_b64.h" + +namespace rgw { + + using std::string; + + class RGWToken { + public: + static constexpr auto type_name = "RGW_TOKEN"; + + enum token_type : uint32_t { + TOKEN_NONE, + TOKEN_AD, + TOKEN_KEYSTONE, + TOKEN_LDAP, + }; + + static enum token_type to_type(const string& s) { + if (boost::iequals(s, "ad")) + return TOKEN_AD; + if (boost::iequals(s, "ldap")) + return TOKEN_LDAP; + if (boost::iequals(s, "keystone")) + return TOKEN_KEYSTONE; + return TOKEN_NONE; + } + + static const char* from_type(enum token_type type) { + switch (type) { + case TOKEN_AD: + return "ad"; + case TOKEN_LDAP: + return "ldap"; + case TOKEN_KEYSTONE: + return "keystone"; + default: + return "none"; + }; + } + + token_type type; + string id; + string key; + + virtual uint32_t version() const { return 1; }; + + bool valid() const{ + return ((type != TOKEN_NONE) && + (! id.empty()) && + (! 
key.empty())); + } + + RGWToken() + : type(TOKEN_NONE) {}; + + RGWToken(enum token_type _type, const std::string& _id, + const std::string& _key) + : type(_type), id(_id), key(_key) {}; + + explicit RGWToken(const string& json) { + JSONParser p; + p.parse(json.c_str(), json.length()); + JSONDecoder::decode_json(RGWToken::type_name, *this, &p); + } + + RGWToken& operator=(const std::string& json) { + JSONParser p; + p.parse(json.c_str(), json.length()); + JSONDecoder::decode_json(RGWToken::type_name, *this, &p); + return *this; + } + + void encode(bufferlist& bl) const { + uint32_t ver = version(); + string typestr{from_type(type)}; + ENCODE_START(1, 1, bl); + encode(type_name, bl); + encode(ver, bl); + encode(typestr, bl); + encode(id, bl); + encode(key, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + string name; + string typestr; + uint32_t version; + DECODE_START(1, bl); + decode(name, bl); + decode(version, bl); + decode(typestr, bl); + type = to_type(typestr); + decode(id, bl); + decode(key, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter* f) const { + ::encode_json("version", uint32_t(version()), f); + ::encode_json("type", from_type(type), f); + ::encode_json("id", id, f); + ::encode_json("key", key, f); + } + + void encode_json(Formatter* f) { + RGWToken& token = *this; + f->open_object_section(type_name); + ::encode_json(type_name, token, f); + f->close_section(); + } + + void decode_json(JSONObj* obj) { + uint32_t version; + string type_name; + string typestr; + JSONDecoder::decode_json("version", version, obj); + JSONDecoder::decode_json("type", typestr, obj); + type = to_type(typestr); + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("key", key, obj); + } + + std::string encode_json_base64(Formatter* f) { + encode_json(f); + std::ostringstream os; + f->flush(os); + return to_base64(std::move(os.str())); + } + + friend inline std::ostream& operator<<(std::ostream& os, const RGWToken& token); + 
+ virtual ~RGWToken() {}; + }; + WRITE_CLASS_ENCODER(RGWToken) + + inline std::ostream& operator<<(std::ostream& os, const RGWToken& token) + { + os << "<>"; + return os; + } + +} /* namespace rgw */ diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc new file mode 100644 index 000000000..7e6513cde --- /dev/null +++ b/src/rgw/rgw_tools.cc @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include "common/errno.h" + +#include "rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw +#define dout_context g_ceph_context + +#define READ_CHUNK_LEN (512 * 1024) + +using namespace std; + +static std::map* ext_mime_map; + +void parse_mime_map_line(const char *start, const char *end) +{ + char line[end - start + 1]; + strncpy(line, start, end - start); + line[end - start] = '\0'; + char *l = line; +#define DELIMS " \t\n\r" + + while (isspace(*l)) + l++; + + char *mime = strsep(&l, DELIMS); + if (!mime) + return; + + char *ext; + do { + ext = strsep(&l, DELIMS); + if (ext && *ext) { + (*ext_mime_map)[ext] = mime; + } + } while (ext); +} + + +void parse_mime_map(const char *buf) +{ + const char *start = buf, *end = buf; + while (*end) { + while (*end && *end != '\n') { + end++; + } + parse_mime_map_line(start, end); + end++; + start = end; + } +} + +static int ext_mime_map_init(const DoutPrefixProvider *dpp, CephContext *cct, const char *ext_map) +{ + int fd = open(ext_map, O_RDONLY); + char *buf = NULL; + int ret; + if (fd < 0) { + ret = -errno; + ldpp_dout(dpp, 0) << __func__ << " failed to open file=" << ext_map + << " : " << cpp_strerror(-ret) << dendl; + return ret; + } + + struct stat st; + ret = fstat(fd, &st); + if (ret < 0) { + ret = -errno; + ldpp_dout(dpp, 0) << __func__ << " failed to stat file=" << ext_map + << " : " << cpp_strerror(-ret) << dendl; + goto done; + } + + buf = (char *)malloc(st.st_size + 1); + if (!buf) { + ret = -ENOMEM; + ldpp_dout(dpp, 0) << __func__ << " 
failed to allocate buf" << dendl; + goto done; + } + + ret = safe_read(fd, buf, st.st_size + 1); + if (ret != st.st_size) { + // huh? file size has changed? + ldpp_dout(dpp, 0) << __func__ << " raced! will retry.." << dendl; + free(buf); + close(fd); + return ext_mime_map_init(dpp, cct, ext_map); + } + buf[st.st_size] = '\0'; + + parse_mime_map(buf); + ret = 0; +done: + free(buf); + close(fd); + return ret; +} + +const char *rgw_find_mime_by_ext(string& ext) +{ + map::iterator iter = ext_mime_map->find(ext); + if (iter == ext_mime_map->end()) + return NULL; + + return iter->second.c_str(); +} + +int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct) +{ + ext_mime_map = new std::map; + ext_mime_map_init(dpp, cct, cct->_conf->rgw_mime_types_file.c_str()); + // ignore errors; missing mime.types is not fatal + return 0; +} + +void rgw_tools_cleanup() +{ + delete ext_mime_map; + ext_mime_map = nullptr; +} diff --git a/src/rgw/rgw_torrent.cc b/src/rgw/rgw_torrent.cc new file mode 100644 index 000000000..e1a1417a5 --- /dev/null +++ b/src/rgw/rgw_torrent.cc @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include + +#include "rgw_torrent.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "include/str_list.h" +#include "include/rados/librados.hpp" + +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace librados; +using namespace boost; +using ceph::crypto::SHA1; + +seed::seed() +{ + seed::info.piece_length = 0; + seed::info.len = 0; + sha_len = 0; + is_torrent = false; +} + +seed::~seed() +{ + seed::info.sha1_bl.clear(); + bl.clear(); + s = NULL; + driver = NULL; +} + +void seed::init(req_state *_req, rgw::sal::Driver* _driver) +{ + s = _req; + driver = _driver; +} + +int seed::get_torrent_file(rgw::sal::Object* object, + uint64_t &total_len, + ceph::bufferlist &bl_data, + rgw_obj &obj) +{ 
+ /* add other field if config is set */ + dencode.bencode_dict(bl); + set_announce(); + if (!comment.empty()) + { + dencode.bencode(COMMENT, comment, bl); + } + if (!create_by.empty()) + { + dencode.bencode(CREATED_BY, create_by, bl); + } + if (!encoding.empty()) + { + dencode.bencode(ENCODING, encoding, bl); + } + + string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + ldpp_dout(s, 20) << "NOTICE: head obj oid= " << oid << dendl; + + const set obj_key{RGW_OBJ_TORRENT}; + map m; + const int r = object->omap_get_vals_by_keys(s, oid, obj_key, &m); + if (r < 0) { + ldpp_dout(s, 0) << "ERROR: omap_get_vals_by_keys failed: " << r << dendl; + return r; + } + if (m.size() != 1) { + ldpp_dout(s, 0) << "ERROR: omap key " RGW_OBJ_TORRENT " not found" << dendl; + return -EINVAL; + } + bl.append(std::move(m.begin()->second)); + dencode.bencode_end(bl); + + bl_data = bl; + total_len = bl.length(); + return 0; +} + +bool seed::get_flag() +{ + return is_torrent; +} + +void seed::update(bufferlist &bl) +{ + if (!is_torrent) + { + return; + } + info.len += bl.length(); + sha1(&h, bl, bl.length()); +} + +int seed::complete(optional_yield y) +{ + uint64_t remain = info.len%info.piece_length; + uint8_t remain_len = ((remain > 0)? 
1 : 0); + sha_len = (info.len/info.piece_length + remain_len)*CEPH_CRYPTO_SHA1_DIGESTSIZE; + + int ret = 0; + /* produce torrent data */ + do_encode(); + + /* save torrent data into OMAP */ + ret = save_torrent_file(y); + if (0 != ret) + { + ldpp_dout(s, 0) << "ERROR: failed to save_torrent_file() ret= "<< ret << dendl; + return ret; + } + + return 0; +} + +off_t seed::get_data_len() +{ + return info.len; +} + +void seed::set_create_date(ceph::real_time& value) +{ + utime_t date = ceph::real_clock::to_timespec(value); + create_date = date.sec(); +} + +void seed::set_info_pieces(char *buff) +{ + info.sha1_bl.append(buff, CEPH_CRYPTO_SHA1_DIGESTSIZE); +} + +void seed::set_info_name(const string& value) +{ + info.name = value; +} + +void seed::sha1(SHA1 *h, bufferlist &bl, off_t bl_len) +{ + off_t num = bl_len/info.piece_length; + off_t remain = 0; + remain = bl_len%info.piece_length; + + char *pstr = bl.c_str(); + char sha[25]; + + /* get sha1 */ + for (off_t i = 0; i < num; i++) + { + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. + memset(sha, 0x00, sizeof(sha)); + h->Update((unsigned char *)pstr, info.piece_length); + h->Final((unsigned char *)sha); + set_info_pieces(sha); + pstr += info.piece_length; + } + + /* process remain */ + if (0 != remain) + { + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. 
+ memset(sha, 0x00, sizeof(sha)); + h->Update((unsigned char *)pstr, remain); + h->Final((unsigned char *)sha); + set_info_pieces(sha); + } + ::ceph::crypto::zeroize_for_security(sha, sizeof(sha)); +} + +int seed::get_params() +{ + is_torrent = true; + info.piece_length = g_conf()->rgw_torrent_sha_unit; + create_by = g_conf()->rgw_torrent_createby; + encoding = g_conf()->rgw_torrent_encoding; + origin = g_conf()->rgw_torrent_origin; + comment = g_conf()->rgw_torrent_comment; + announce = g_conf()->rgw_torrent_tracker; + + /* tracker and tracker list is empty, set announce to origin */ + if (announce.empty() && !origin.empty()) + { + announce = origin; + } + + return 0; +} + +void seed::set_announce() +{ + list announce_list; // used to get announce list from conf + get_str_list(announce, ",", announce_list); + + if (announce_list.empty()) + { + ldpp_dout(s, 5) << "NOTICE: announce_list is empty " << dendl; + return; + } + + list::iterator iter = announce_list.begin(); + dencode.bencode_key(ANNOUNCE, bl); + dencode.bencode_key((*iter), bl); + + dencode.bencode_key(ANNOUNCE_LIST, bl); + dencode.bencode_list(bl); + for (; iter != announce_list.end(); ++iter) + { + dencode.bencode_list(bl); + dencode.bencode_key((*iter), bl); + dencode.bencode_end(bl); + } + dencode.bencode_end(bl); +} + +void seed::do_encode() +{ + /*Only encode create_date and sha1 info*/ + /*Other field will be added if confi is set when run get torrent*/ + dencode.bencode(CREATION_DATE, create_date, bl); + + dencode.bencode_key(INFO_PIECES, bl); + dencode.bencode_dict(bl); + dencode.bencode(LENGTH, info.len, bl); + dencode.bencode(NAME, info.name, bl); + dencode.bencode(PIECE_LENGTH, info.piece_length, bl); + + char info_sha[100] = { 0 }; + sprintf(info_sha, "%" PRIu64, sha_len); + string sha_len_str = info_sha; + dencode.bencode_key(PIECES, bl); + bl.append(sha_len_str.c_str(), sha_len_str.length()); + bl.append(':'); + bl.append(info.sha1_bl.c_str(), sha_len); + dencode.bencode_end(bl); +} + +int 
seed::save_torrent_file(optional_yield y) +{ + int op_ret = 0; + string key = RGW_OBJ_TORRENT; + + op_ret = s->object->omap_set_val_by_key(s, key, bl, false, y); + if (op_ret < 0) + { + ldpp_dout(s, 0) << "ERROR: failed to omap_set() op_ret = " << op_ret << dendl; + return op_ret; + } + + return op_ret; +} diff --git a/src/rgw/rgw_torrent.h b/src/rgw/rgw_torrent.h new file mode 100644 index 000000000..bf2e2217c --- /dev/null +++ b/src/rgw/rgw_torrent.h @@ -0,0 +1,139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include "common/ceph_time.h" + +#include "rgw_common.h" + +using ceph::crypto::SHA1; + +struct req_state; + +#define RGW_OBJ_TORRENT "rgw.torrent" + +#define ANNOUNCE "announce" +#define ANNOUNCE_LIST "announce-list" +#define COMMENT "comment" +#define CREATED_BY "created by" +#define CREATION_DATE "creation date" +#define ENCODING "encoding" +#define LENGTH "length" +#define NAME "name" +#define PIECE_LENGTH "piece length" +#define PIECES "pieces" +#define INFO_PIECES "info" +#define GET_TORRENT "torrent" + +class TorrentBencode +{ +public: + TorrentBencode() {} + ~TorrentBencode() {} + + //control characters + void bencode_dict(bufferlist& bl) { bl.append('d'); } + void bencode_list(bufferlist& bl) { bl.append('l'); } + void bencode_end(bufferlist& bl) { bl.append('e'); } + + //single values + void bencode(int value, bufferlist& bl) + { + bl.append('i'); + char info[100] = { 0 }; + sprintf(info, "%d", value); + bl.append(info, strlen(info)); + bencode_end(bl); + } + + //single values + void bencode(const std::string& str, bufferlist& bl) + { + bencode_key(str, bl); + } + + //dictionary elements + void bencode(const std::string& key, int value, bufferlist& bl) + { + bencode_key(key, bl); + bencode(value, bl); + } + + //dictionary elements + void bencode(const std::string& key, const std::string& value, bufferlist& bl) + { + 
bencode_key(key, bl); + bencode(value, bl); + } + + //key len + void bencode_key(const std::string& key, bufferlist& bl) + { + int len = key.length(); + char info[100] = { 0 }; + sprintf(info, "%d:", len); + bl.append(info, strlen(info)); + bl.append(key.c_str(), len); + } +}; + +/* torrent file struct */ +class seed +{ +private: + struct + { + int piece_length; // each piece length + bufferlist sha1_bl; // save sha1 + std::string name; // file name + off_t len; // file total bytes + }info; + + std::string announce; // tracker + std::string origin; // origin + time_t create_date{0}; // time of the file created + std::string comment; // comment + std::string create_by; // app name and version + std::string encoding; // if encode use gbk rather than gtf-8 use this field + uint64_t sha_len; // sha1 length + bool is_torrent; // flag + bufferlist bl; // bufflist ready to send + + req_state *s{nullptr}; + rgw::sal::Driver* driver{nullptr}; + SHA1 h; + + TorrentBencode dencode; +public: + seed(); + ~seed(); + + int get_params(); + void init(req_state *p_req, rgw::sal::Driver* _driver); + int get_torrent_file(rgw::sal::Object* object, + uint64_t &total_len, + ceph::bufferlist &bl_data, + rgw_obj &obj); + + off_t get_data_len(); + bool get_flag(); + + void set_create_date(ceph::real_time& value); + void set_info_name(const std::string& value); + void update(bufferlist &bl); + int complete(optional_yield y); + +private: + void do_encode (); + void set_announce(); + void set_exist(bool exist); + void set_info_pieces(char *buff); + void sha1(SHA1 *h, bufferlist &bl, off_t bl_len); + int save_torrent_file(optional_yield y); +}; diff --git a/src/rgw/rgw_tracer.cc b/src/rgw/rgw_tracer.cc new file mode 100644 index 000000000..7e12bb2e6 --- /dev/null +++ b/src/rgw/rgw_tracer.cc @@ -0,0 +1,13 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "rgw_tracer.h" + +namespace tracing { +namespace rgw { + 
+tracing::Tracer tracer; + +} // namespace rgw +} // namespace tracing diff --git a/src/rgw/rgw_tracer.h b/src/rgw/rgw_tracer.h new file mode 100644 index 000000000..9cbae8b9c --- /dev/null +++ b/src/rgw/rgw_tracer.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once +#include "common/tracer.h" + +#include "rgw_common.h" + +namespace tracing { +namespace rgw { + +const auto OP = "op"; +const auto BUCKET_NAME = "bucket_name"; +const auto USER_ID = "user_id"; +const auto OBJECT_NAME = "object_name"; +const auto RETURN = "return"; +const auto UPLOAD_ID = "upload_id"; +const auto TYPE = "type"; +const auto REQUEST = "request"; +const auto MULTIPART = "multipart_upload "; + +extern tracing::Tracer tracer; + +} // namespace rgw +} // namespace tracing + +static inline void extract_span_context(const rgw::sal::Attrs& attr, jspan_context& span_ctx) { + auto trace_iter = attr.find(RGW_ATTR_TRACE); + if (trace_iter != attr.end()) { + try { + auto trace_bl_iter = trace_iter->second.cbegin(); + tracing::decode(span_ctx, trace_bl_iter); + } catch (buffer::error& err) {} + } +} diff --git a/src/rgw/rgw_url.cc b/src/rgw/rgw_url.cc new file mode 100644 index 000000000..7fd4788d7 --- /dev/null +++ b/src/rgw/rgw_url.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +namespace rgw { + +namespace { + const auto USER_GROUP_IDX = 3; + const auto PASSWORD_GROUP_IDX = 4; + const auto HOST_GROUP_IDX = 5; + + const std::string schema_re = "([[:alpha:]]+:\\/\\/)"; + const std::string user_pass_re = "(([^:\\s]+):([^@\\s]+)@)?"; + const std::string host_port_re = "([[:alnum:].:-]+)"; + const std::string path_re = "(/[[:print:]]*)?"; +} + +bool parse_url_authority(const std::string& url, std::string& host, std::string& user, std::string& password) { + const std::string re = schema_re + user_pass_re + host_port_re + path_re; 
+ const std::regex url_regex(re, std::regex::icase); + std::smatch url_match_result; + + if (std::regex_match(url, url_match_result, url_regex)) { + host = url_match_result[HOST_GROUP_IDX]; + user = url_match_result[USER_GROUP_IDX]; + password = url_match_result[PASSWORD_GROUP_IDX]; + return true; + } + + return false; +} + +bool parse_url_userinfo(const std::string& url, std::string& user, std::string& password) { + const std::string re = schema_re + user_pass_re + host_port_re + path_re; + const std::regex url_regex(re); + std::smatch url_match_result; + + if (std::regex_match(url, url_match_result, url_regex)) { + user = url_match_result[USER_GROUP_IDX]; + password = url_match_result[PASSWORD_GROUP_IDX]; + return true; + } + + return false; +} +} + diff --git a/src/rgw/rgw_url.h b/src/rgw/rgw_url.h new file mode 100644 index 000000000..089401a49 --- /dev/null +++ b/src/rgw/rgw_url.h @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +namespace rgw { +// parse a URL of the form: http|https|amqp|amqps|kafka://[user:password@][:port] +bool parse_url_authority(const std::string& url, std::string& host, std::string& user, std::string& password); +bool parse_url_userinfo(const std::string& url, std::string& user, std::string& password); +} + diff --git a/src/rgw/rgw_usage.cc b/src/rgw/rgw_usage.cc new file mode 100644 index 000000000..ca7ca20eb --- /dev/null +++ b/src/rgw/rgw_usage.cc @@ -0,0 +1,171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include + +#include "rgw_rados.h" +#include "rgw_usage.h" +#include "rgw_formats.h" +#include "rgw_sal.h" + +using namespace std; + +static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log_entry& entry, map *categories) +{ + formatter->open_array_section("categories"); + map::const_iterator uiter; + for (uiter = 
entry.usage_map.begin(); uiter != entry.usage_map.end(); ++uiter) { + if (categories && !categories->empty() && !categories->count(uiter->first)) + continue; + const rgw_usage_data& usage = uiter->second; + formatter->open_object_section("entry"); + formatter->dump_string("category", uiter->first); + formatter->dump_unsigned("bytes_sent", usage.bytes_sent); + formatter->dump_unsigned("bytes_received", usage.bytes_received); + formatter->dump_unsigned("ops", usage.ops); + formatter->dump_unsigned("successful_ops", usage.successful_ops); + formatter->close_section(); // entry + } + formatter->close_section(); // categories +} + +int RGWUsage::show(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + rgw::sal::User* user , rgw::sal::Bucket* bucket, + uint64_t start_epoch, uint64_t end_epoch, bool show_log_entries, + bool show_log_sum, + map *categories, RGWFormatterFlusher& flusher) +{ + uint32_t max_entries = 1000; + + bool is_truncated = true; + + RGWUsageIter usage_iter; + Formatter *formatter = flusher.get_formatter(); + + map usage; + + flusher.start(0); + + formatter->open_object_section("usage"); + if (show_log_entries) { + formatter->open_array_section("entries"); + } + string last_owner; + bool user_section_open = false; + map summary_map; + int ret; + + while (is_truncated) { + if (bucket) { + ret = bucket->read_usage(dpp, start_epoch, end_epoch, max_entries, &is_truncated, + usage_iter, usage); + } else if (user) { + ret = user->read_usage(dpp, start_epoch, end_epoch, max_entries, &is_truncated, + usage_iter, usage); + } else { + ret = driver->read_all_usage(dpp, start_epoch, end_epoch, max_entries, &is_truncated, + usage_iter, usage); + } + + if (ret == -ENOENT) { + ret = 0; + is_truncated = false; + } + + if (ret < 0) { + return ret; + } + + map::iterator iter; + for (iter = usage.begin(); iter != usage.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + const rgw_usage_log_entry& entry = iter->second; + + if (show_log_entries) { + if 
(ub.user.compare(last_owner) != 0) { + if (user_section_open) { + formatter->close_section(); + formatter->close_section(); + } + formatter->open_object_section("user"); + formatter->dump_string("user", ub.user); + formatter->open_array_section("buckets"); + user_section_open = true; + last_owner = ub.user; + } + formatter->open_object_section("bucket"); + formatter->dump_string("bucket", ub.bucket); + utime_t ut(entry.epoch, 0); + ut.gmtime(formatter->dump_stream("time")); + formatter->dump_int("epoch", entry.epoch); + string owner = entry.owner.to_str(); + string payer = entry.payer.to_str(); + formatter->dump_string("owner", owner); + if (!payer.empty() && payer != owner) { + formatter->dump_string("payer", payer); + } + dump_usage_categories_info(formatter, entry, categories); + formatter->close_section(); // bucket + flusher.flush(); + } + + summary_map[ub.user].aggregate(entry, categories); + } + } + if (show_log_entries) { + if (user_section_open) { + formatter->close_section(); // buckets + formatter->close_section(); //user + } + formatter->close_section(); // entries + } + + if (show_log_sum) { + formatter->open_array_section("summary"); + map::iterator siter; + for (siter = summary_map.begin(); siter != summary_map.end(); ++siter) { + const rgw_usage_log_entry& entry = siter->second; + formatter->open_object_section("user"); + formatter->dump_string("user", siter->first); + dump_usage_categories_info(formatter, entry, categories); + rgw_usage_data total_usage; + entry.sum(total_usage, *categories); + formatter->open_object_section("total"); + encode_json("bytes_sent", total_usage.bytes_sent, formatter); + encode_json("bytes_received", total_usage.bytes_received, formatter); + encode_json("ops", total_usage.ops, formatter); + encode_json("successful_ops", total_usage.successful_ops, formatter); + formatter->close_section(); // total + + formatter->close_section(); // user + + flusher.flush(); + } + + formatter->close_section(); // summary + } + + 
formatter->close_section(); // usage + flusher.flush(); + + return 0; +} + +int RGWUsage::trim(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + rgw::sal::User* user , rgw::sal::Bucket* bucket, + uint64_t start_epoch, uint64_t end_epoch) +{ + if (bucket) { + return bucket->trim_usage(dpp, start_epoch, end_epoch); + } else if (user) { + return user->trim_usage(dpp, start_epoch, end_epoch); + } else { + return driver->trim_all_usage(dpp, start_epoch, end_epoch); + } +} + +int RGWUsage::clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver) +{ + return driver->clear_usage(dpp); +} diff --git a/src/rgw/rgw_usage.h b/src/rgw/rgw_usage.h new file mode 100644 index 000000000..b12b57df0 --- /dev/null +++ b/src/rgw/rgw_usage.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include + +#include "common/Formatter.h" +#include "common/dout.h" +#include "rgw_formats.h" +#include "rgw_user.h" +#include "rgw_sal_fwd.h" + + +class RGWUsage +{ +public: + static int show(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + rgw::sal::User* user , rgw::sal::Bucket* bucket, + uint64_t start_epoch, uint64_t end_epoch, bool show_log_entries, + bool show_log_sum, + std::map *categories, RGWFormatterFlusher& flusher); + + static int trim(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + rgw::sal::User* user , rgw::sal::Bucket* bucket, + uint64_t start_epoch, uint64_t end_epoch); + + static int clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver); +}; diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc new file mode 100644 index 000000000..e5e07cbc4 --- /dev/null +++ b/src/rgw/rgw_user.cc @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_sal_rados.h" + +#include "include/types.h" +#include "rgw_user.h" + +// until everything is moved from rgw_common 
+#include "rgw_common.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/*
+ * Sync the stats of every bucket owned by `user`, then flush the user's
+ * stats. Buckets are listed in chunks of rgw_list_buckets_max_chunk.
+ *
+ * A bucket whose info cannot be loaded is logged and skipped, and a
+ * check_bucket_shards() failure is only logged. A failure to list buckets,
+ * to sync a bucket's stats or to complete the flush aborts the walk and is
+ * returned to the caller. Returns 0 on success.
+ */
+int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+                            rgw::sal::User* user, optional_yield y)
+{
+  rgw::sal::BucketList user_buckets;
+
+  CephContext *cct = driver->ctx();
+  size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+  string marker;
+  int ret;
+
+  do {
+    ret = user->list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "failed to read user buckets: ret=" << ret << dendl;
+      return ret;
+    }
+    auto& buckets = user_buckets.get_buckets();
+    for (auto i = buckets.begin(); i != buckets.end(); ++i) {
+      /* remember the last name seen so the next listing resumes after it */
+      marker = i->first;
+
+      auto& bucket = i->second;
+
+      ret = bucket->load_bucket(dpp, y);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: could not read bucket info: bucket=" << bucket << " ret=" << ret << dendl;
+        continue;
+      }
+      ret = bucket->sync_user_stats(dpp, y);
+      if (ret < 0) {
+        /* log through dpp like the rest of this function */
+        ldpp_dout(dpp, 0) << "ERROR: could not sync bucket stats: ret=" << ret << dendl;
+        return ret;
+      }
+      ret = bucket->check_bucket_shards(dpp);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR in check_bucket_shards: " << cpp_strerror(-ret)<< dendl;
+      }
+    }
+  } while (user_buckets.is_truncated());
+
+  ret = user->complete_flush_stats(dpp, y);
+  if (ret < 0) {
+    /* report through the log rather than writing to stderr directly */
+    ldpp_dout(dpp, 0) << "ERROR: failed to complete syncing user stats: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp,
+                                   rgw::sal::Driver* driver,
+                                   rgw::sal::User* user,
+                                   map& buckets_usage_map,
+                                   optional_yield y)
+{
+  CephContext *cct = driver->ctx();
+  size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+  bool done;
+  string marker;
+  int ret;
+
+  do {
+    rgw::sal::BucketList buckets;
+    ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "failed to read user buckets: ret=" << ret << dendl;
+      return 
ret; + } + auto& m = buckets.get_buckets(); + for (const auto& i : m) { + marker = i.first; + + auto& bucket_ent = i.second; + ret = bucket_ent->load_bucket(dpp, y, true /* load user stats */); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not get bucket stats: ret=" << ret << dendl; + return ret; + } + bucket_meta_entry entry; + entry.size = bucket_ent->get_size(); + entry.size_rounded = bucket_ent->get_size_rounded(); + entry.creation_time = bucket_ent->get_creation_time(); + entry.count = bucket_ent->get_count(); + buckets_usage_map.emplace(bucket_ent->get_name(), entry); + } + done = (buckets.count() < max_entries); + } while (!done); + + return 0; +} + +int rgw_validate_tenant_name(const string& t) +{ + struct tench { + static bool is_good(char ch) { + return isalnum(ch) || ch == '_'; + } + }; + std::string::const_iterator it = + std::find_if_not(t.begin(), t.end(), tench::is_good); + return (it == t.end())? 0: -ERR_INVALID_TENANT_NAME; +} + +/** + * Get the anonymous (ie, unauthenticated) user info. + */ +void rgw_get_anon_user(RGWUserInfo& info) +{ + info.user_id = RGW_USER_ANON_ID; + info.display_name.clear(); + info.access_keys.clear(); +} + diff --git a/src/rgw/rgw_user_types.h b/src/rgw/rgw_user_types.h new file mode 100644 index 000000000..c9a1a46ad --- /dev/null +++ b/src/rgw/rgw_user_types.h @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. 
Do not + * include files which can only be compiled in radosgw or OSD + * contexts (e.g., rgw_sal.h, rgw_common.h) */ + +#pragma once + +#include +#include + +#include "common/dout.h" +#include "common/Formatter.h" + +struct rgw_user { + std::string tenant; + std::string id; + std::string ns; + + rgw_user() {} + explicit rgw_user(const std::string& s) { + from_str(s); + } + rgw_user(const std::string& tenant, const std::string& id, const std::string& ns="") + : tenant(tenant), + id(id), + ns(ns) { + } + rgw_user(std::string&& tenant, std::string&& id, std::string&& ns="") + : tenant(std::move(tenant)), + id(std::move(id)), + ns(std::move(ns)) { + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(tenant, bl); + encode(id, bl); + encode(ns, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(2, bl); + decode(tenant, bl); + decode(id, bl); + if (struct_v >= 2) { + decode(ns, bl); + } + DECODE_FINISH(bl); + } + + void to_str(std::string& str) const { + if (!tenant.empty()) { + if (!ns.empty()) { + str = tenant + '$' + ns + '$' + id; + } else { + str = tenant + '$' + id; + } + } else if (!ns.empty()) { + str = '$' + ns + '$' + id; + } else { + str = id; + } + } + + void clear() { + tenant.clear(); + id.clear(); + ns.clear(); + } + + bool empty() const { + return id.empty(); + } + + std::string to_str() const { + std::string s; + to_str(s); + return s; + } + + void from_str(const std::string& str) { + size_t pos = str.find('$'); + if (pos != std::string::npos) { + tenant = str.substr(0, pos); + std::string_view sv = str; + std::string_view ns_id = sv.substr(pos + 1); + size_t ns_pos = ns_id.find('$'); + if (ns_pos != std::string::npos) { + ns = std::string(ns_id.substr(0, ns_pos)); + id = std::string(ns_id.substr(ns_pos + 1)); + } else { + ns.clear(); + id = std::string(ns_id); + } + } else { + tenant.clear(); + ns.clear(); + id = str; + } + } + + rgw_user& operator=(const std::string& 
str) { + from_str(str); + return *this; + } + + int compare(const rgw_user& u) const { + int r = tenant.compare(u.tenant); + if (r != 0) + return r; + r = ns.compare(u.ns); + if (r != 0) { + return r; + } + return id.compare(u.id); + } + int compare(const std::string& str) const { + rgw_user u(str); + return compare(u); + } + + bool operator!=(const rgw_user& rhs) const { + return (compare(rhs) != 0); + } + bool operator==(const rgw_user& rhs) const { + return (compare(rhs) == 0); + } + bool operator<(const rgw_user& rhs) const { + if (tenant < rhs.tenant) { + return true; + } else if (tenant > rhs.tenant) { + return false; + } + if (ns < rhs.ns) { + return true; + } else if (ns > rhs.ns) { + return false; + } + return (id < rhs.id); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(rgw_user) diff --git a/src/rgw/rgw_web_idp.h b/src/rgw/rgw_web_idp.h new file mode 100644 index 000000000..a9aa5b829 --- /dev/null +++ b/src/rgw/rgw_web_idp.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +namespace rgw { +namespace web_idp { + +//WebToken contains some claims from the decoded token which are of interest to us. 
// Plain value holder for the claims of a decoded web identity token that
// RGW cares about. All fields are strings; nothing in this struct says
// whether a given claim is mandatory.
struct WebTokenClaims {
  // "sub": subject of the token
  std::string sub;
  // "aud": intended audience for this token
  std::string aud;
  // "iss": issuer of this token
  std::string iss;
  // Human-readable id for the resource owner
  std::string user_name;
  // Client id
  std::string client_id;
  // "azp" claim -- presumably the OIDC "authorized party"; TODO confirm
  std::string azp;
};
redirect.hostname : default_hostname); + + *new_url = protocol + "://" + hostname + "/"; + + if (!redirect_info.replace_key_prefix_with.empty()) { + *new_url += redirect_info.replace_key_prefix_with; + if (key.size() > condition.key_prefix_equals.size()) { + *new_url += key.substr(condition.key_prefix_equals.size()); + } + } else if (!redirect_info.replace_key_with.empty()) { + *new_url += redirect_info.replace_key_with; + } else { + *new_url += key; + } + + if(redirect.http_redirect_code > 0) + *redirect_code = redirect.http_redirect_code; +} + +bool RGWBWRoutingRules::check_key_and_error_code_condition(const string &key, int error_code, RGWBWRoutingRule **rule) +{ + for (list::iterator iter = rules.begin(); iter != rules.end(); ++iter) { + if (iter->check_key_condition(key) && iter->check_error_code_condition(error_code)) { + *rule = &(*iter); + return true; + } + } + return false; +} + +bool RGWBWRoutingRules::check_key_condition(const string& key, RGWBWRoutingRule **rule) +{ + for (list::iterator iter = rules.begin(); iter != rules.end(); ++iter) { + if (iter->check_key_condition(key)) { + *rule = &(*iter); + return true; + } + } + return false; +} + +bool RGWBWRoutingRules::check_error_code_condition(const int http_error_code, RGWBWRoutingRule **rule) +{ + for (list::iterator iter = rules.begin(); iter != rules.end(); ++iter) { + if (iter->check_error_code_condition(http_error_code)) { + *rule = &(*iter); + return true; + } + } + return false; +} + +bool RGWBucketWebsiteConf::should_redirect(const string& key, const int http_error_code, RGWBWRoutingRule *redirect) +{ + RGWBWRoutingRule *rule; + if(!redirect_all.hostname.empty()) { + RGWBWRoutingRule redirect_all_rule; + redirect_all_rule.redirect_info.redirect = redirect_all; + redirect_all.http_redirect_code = 301; + *redirect = redirect_all_rule; + return true; + } else if (!routing_rules.check_key_and_error_code_condition(key, http_error_code, &rule)) { + return false; + } + + *redirect = *rule; + + return 
true; +} + +bool RGWBucketWebsiteConf::get_effective_key(const string& key, string *effective_key, bool is_file) const +{ + if (index_doc_suffix.empty()) { + return false; + } + + if (key.empty()) { + *effective_key = index_doc_suffix; + } else if (key[key.size() - 1] == '/') { + *effective_key = key + index_doc_suffix; + } else if (! is_file) { + *effective_key = key + "/" + index_doc_suffix; + } else { + *effective_key = key; + } + + return true; +} + +void RGWRedirectInfo::dump(Formatter *f) const +{ + encode_json("protocol", protocol, f); + encode_json("hostname", hostname, f); + encode_json("http_redirect_code", (int)http_redirect_code, f); +} + +void RGWRedirectInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("protocol", protocol, obj); + JSONDecoder::decode_json("hostname", hostname, obj); + int code; + JSONDecoder::decode_json("http_redirect_code", code, obj); + http_redirect_code = code; +} + +void RGWBWRedirectInfo::dump(Formatter *f) const +{ + encode_json("redirect", redirect, f); + encode_json("replace_key_prefix_with", replace_key_prefix_with, f); + encode_json("replace_key_with", replace_key_with, f); +} + +void RGWBWRedirectInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("redirect", redirect, obj); + JSONDecoder::decode_json("replace_key_prefix_with", replace_key_prefix_with, obj); + JSONDecoder::decode_json("replace_key_with", replace_key_with, obj); +} + +void RGWBWRoutingRuleCondition::dump(Formatter *f) const +{ + encode_json("key_prefix_equals", key_prefix_equals, f); + encode_json("http_error_code_returned_equals", (int)http_error_code_returned_equals, f); +} + +void RGWBWRoutingRuleCondition::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key_prefix_equals", key_prefix_equals, obj); + int code; + JSONDecoder::decode_json("http_error_code_returned_equals", code, obj); + http_error_code_returned_equals = code; +} + +void RGWBWRoutingRule::dump(Formatter *f) const +{ + encode_json("condition", condition, f); + 
encode_json("redirect_info", redirect_info, f); +} + +void RGWBWRoutingRule::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("condition", condition, obj); + JSONDecoder::decode_json("redirect_info", redirect_info, obj); +} + +void RGWBWRoutingRules::dump(Formatter *f) const +{ + encode_json("rules", rules, f); +} + +void RGWBWRoutingRules::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("rules", rules, obj); +} + +void RGWBucketWebsiteConf::dump(Formatter *f) const +{ + if (!redirect_all.hostname.empty()) { + encode_json("redirect_all", redirect_all, f); + } else { + encode_json("index_doc_suffix", index_doc_suffix, f); + encode_json("error_doc", error_doc, f); + encode_json("routing_rules", routing_rules, f); + } +} + +void RGWBucketWebsiteConf::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("redirect_all", redirect_all, obj); + JSONDecoder::decode_json("index_doc_suffix", index_doc_suffix, obj); + JSONDecoder::decode_json("error_doc", error_doc, obj); + JSONDecoder::decode_json("routing_rules", routing_rules, obj); +} + +void RGWBWRedirectInfo::dump_xml(Formatter *f) const +{ + if (!redirect.protocol.empty()) { + encode_xml("Protocol", redirect.protocol, f); + } + if (!redirect.hostname.empty()) { + encode_xml("HostName", redirect.hostname, f); + } + if (redirect.http_redirect_code > 0) { + encode_xml("HttpRedirectCode", (int)redirect.http_redirect_code, f); + } + if (!replace_key_prefix_with.empty()) { + encode_xml("ReplaceKeyPrefixWith", replace_key_prefix_with, f); + } + if (!replace_key_with.empty()) { + encode_xml("ReplaceKeyWith", replace_key_with, f); + } +} + +#define WEBSITE_HTTP_REDIRECT_CODE_MIN 300 +#define WEBSITE_HTTP_REDIRECT_CODE_MAX 400 +void RGWBWRedirectInfo::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Protocol", redirect.protocol, obj); + RGWXMLDecoder::decode_xml("HostName", redirect.hostname, obj); + int code = 0; + bool has_http_redirect_code = RGWXMLDecoder::decode_xml("HttpRedirectCode", code, obj); + if 
(has_http_redirect_code && + !(code > WEBSITE_HTTP_REDIRECT_CODE_MIN && + code < WEBSITE_HTTP_REDIRECT_CODE_MAX)) { + throw RGWXMLDecoder::err("The provided HTTP redirect code is not valid. Valid codes are 3XX except 300."); + } + redirect.http_redirect_code = code; + bool has_replace_key_prefix_with = RGWXMLDecoder::decode_xml("ReplaceKeyPrefixWith", replace_key_prefix_with, obj); + bool has_replace_key_with = RGWXMLDecoder::decode_xml("ReplaceKeyWith", replace_key_with, obj); + if (has_replace_key_prefix_with && has_replace_key_with) { + throw RGWXMLDecoder::err("You can only define ReplaceKeyPrefix or ReplaceKey but not both."); + } +} + +void RGWBWRoutingRuleCondition::dump_xml(Formatter *f) const +{ + if (!key_prefix_equals.empty()) { + encode_xml("KeyPrefixEquals", key_prefix_equals, f); + } + if (http_error_code_returned_equals > 0) { + encode_xml("HttpErrorCodeReturnedEquals", (int)http_error_code_returned_equals, f); + } +} + +#define WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MIN 400 +#define WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MAX 600 +void RGWBWRoutingRuleCondition::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("KeyPrefixEquals", key_prefix_equals, obj); + int code = 0; + bool has_http_error_code_returned_equals = RGWXMLDecoder::decode_xml("HttpErrorCodeReturnedEquals", code, obj); + if (has_http_error_code_returned_equals && + !(code >= WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MIN && + code < WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MAX)) { + throw RGWXMLDecoder::err("The provided HTTP redirect code is not valid. 
Valid codes are 4XX or 5XX."); + } + http_error_code_returned_equals = code; +} + +void RGWBWRoutingRule::dump_xml(Formatter *f) const +{ + encode_xml("Condition", condition, f); + encode_xml("Redirect", redirect_info, f); +} + +void RGWBWRoutingRule::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Condition", condition, obj); + RGWXMLDecoder::decode_xml("Redirect", redirect_info, obj); +} + +static void encode_xml(const char *name, const std::list& l, ceph::Formatter *f) +{ + do_encode_xml("RoutingRules", l, "RoutingRule", f); +} + +void RGWBucketWebsiteConf::dump_xml(Formatter *f) const +{ + if (!redirect_all.hostname.empty()) { + f->open_object_section("RedirectAllRequestsTo"); + encode_xml("HostName", redirect_all.hostname, f); + if (!redirect_all.protocol.empty()) { + encode_xml("Protocol", redirect_all.protocol, f); + } + f->close_section(); + } + if (!index_doc_suffix.empty()) { + f->open_object_section("IndexDocument"); + encode_xml("Suffix", index_doc_suffix, f); + f->close_section(); + } + if (!error_doc.empty()) { + f->open_object_section("ErrorDocument"); + encode_xml("Key", error_doc, f); + f->close_section(); + } + if (!routing_rules.rules.empty()) { + encode_xml("RoutingRules", routing_rules.rules, f); + } +} + +void decode_xml_obj(list& l, XMLObj *obj) +{ + do_decode_xml_obj(l, "RoutingRule", obj); +} + +void RGWBucketWebsiteConf::decode_xml(XMLObj *obj) { + XMLObj *o = obj->find_first("RedirectAllRequestsTo"); + if (o) { + is_redirect_all = true; + RGWXMLDecoder::decode_xml("HostName", redirect_all.hostname, o, true); + RGWXMLDecoder::decode_xml("Protocol", redirect_all.protocol, o); + } else { + o = obj->find_first("IndexDocument"); + if (o) { + is_set_index_doc = true; + RGWXMLDecoder::decode_xml("Suffix", index_doc_suffix, o); + } + o = obj->find_first("ErrorDocument"); + if (o) { + RGWXMLDecoder::decode_xml("Key", error_doc, o); + } + RGWXMLDecoder::decode_xml("RoutingRules", routing_rules.rules, obj); + } +} diff --git 
// Target of a website redirect: protocol, hostname and HTTP status code.
struct RGWRedirectInfo
{
  // Protocol to redirect to; apply_rule() falls back to the caller's
  // default protocol when this is empty.
  std::string protocol;
  // Host to redirect to; apply_rule() falls back to the caller's default
  // hostname when this is empty.
  std::string hostname;
  // HTTP status to answer with. 0 means "not set": apply_rule() and
  // dump_xml() only use values > 0.
  uint16_t http_redirect_code = 0;

  // Encoding v1 (compat v1): protocol, hostname, http_redirect_code.
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(protocol, bl);
    encode(hostname, bl);
    encode(http_redirect_code, bl);
    ENCODE_FINISH(bl);
  }
  // Mirror of encode(); fields are read in the same order.
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(protocol, bl);
    decode(hostname, bl);
    decode(http_redirect_code, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(RGWRedirectInfo)
RGWBWRoutingRuleCondition +{ + std::string key_prefix_equals; + uint16_t http_error_code_returned_equals = 0; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(key_prefix_equals, bl); + encode(http_error_code_returned_equals, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(key_prefix_equals, bl); + decode(http_error_code_returned_equals, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj *obj); + + bool check_key_condition(const std::string& key); + bool check_error_code_condition(const int error_code) { + return (uint16_t)error_code == http_error_code_returned_equals; + } +}; +WRITE_CLASS_ENCODER(RGWBWRoutingRuleCondition) + +struct RGWBWRoutingRule +{ + RGWBWRoutingRuleCondition condition; + RGWBWRedirectInfo redirect_info; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(condition, bl); + encode(redirect_info, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(condition, bl); + decode(redirect_info, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj *obj); + + bool check_key_condition(const std::string& key) { + return condition.check_key_condition(key); + } + bool check_error_code_condition(int error_code) { + return condition.check_error_code_condition(error_code); + } + + void apply_rule(const std::string& default_protocol, + const std::string& default_hostname, + const std::string& key, + std::string *redirect, + int *redirect_code); +}; +WRITE_CLASS_ENCODER(RGWBWRoutingRule) + +struct RGWBWRoutingRules +{ + std::list rules; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rules, bl); + ENCODE_FINISH(bl); + } + void 
decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(rules, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + + bool check_key_condition(const std::string& key, RGWBWRoutingRule **rule); + bool check_error_code_condition(int error_code, RGWBWRoutingRule **rule); + bool check_key_and_error_code_condition(const std::string& key, + const int error_code, + RGWBWRoutingRule **rule); +}; +WRITE_CLASS_ENCODER(RGWBWRoutingRules) + +struct RGWBucketWebsiteConf +{ + RGWRedirectInfo redirect_all; + std::string index_doc_suffix; + std::string error_doc; + std::string subdir_marker; + std::string listing_css_doc; + bool listing_enabled; + bool is_redirect_all; + bool is_set_index_doc; + RGWBWRoutingRules routing_rules; + + RGWBucketWebsiteConf() + : listing_enabled(false) { + is_redirect_all = false; + is_set_index_doc = false; + } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(index_doc_suffix, bl); + encode(error_doc, bl); + encode(routing_rules, bl); + encode(redirect_all, bl); + encode(subdir_marker, bl); + encode(listing_css_doc, bl); + encode(listing_enabled, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(index_doc_suffix, bl); + decode(error_doc, bl); + decode(routing_rules, bl); + decode(redirect_all, bl); + if (struct_v >= 2) { + decode(subdir_marker, bl); + decode(listing_css_doc, bl); + decode(listing_enabled, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + bool should_redirect(const std::string& key, + const int http_error_code, + RGWBWRoutingRule *redirect); + + bool get_effective_key(const std::string& key, + std::string *effective_key, bool is_file) const; + + const std::string& get_index_doc() const { + return index_doc_suffix; + } + + bool 
is_empty() const { + return index_doc_suffix.empty() && + error_doc.empty() && + subdir_marker.empty() && + listing_css_doc.empty() && + ! listing_enabled; + } +}; +WRITE_CLASS_ENCODER(RGWBucketWebsiteConf) diff --git a/src/rgw/rgw_worker.h b/src/rgw/rgw_worker.h new file mode 100644 index 000000000..eb2e55243 --- /dev/null +++ b/src/rgw/rgw_worker.h @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include + +#include "common/Thread.h" +#include "common/ceph_mutex.h" +#include "include/common_fwd.h" + +class RGWRados; + +class RGWRadosThread { + class Worker : public Thread, public DoutPrefixProvider { + CephContext *cct; + RGWRadosThread *processor; + ceph::mutex lock = ceph::make_mutex("RGWRadosThread::Worker"); + ceph::condition_variable cond; + + void wait() { + std::unique_lock l{lock}; + cond.wait(l); + }; + + void wait_interval(const ceph::real_clock::duration& wait_time) { + std::unique_lock l{lock}; + cond.wait_for(l, wait_time); + } + + public: + Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p) {} + void *entry() override; + void signal() { + std::lock_guard l{lock}; + cond.notify_all(); + } + + CephContext *get_cct() const { return cct; } + unsigned get_subsys() const { return ceph_subsys_rgw; } + std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw rados thread: "; } + + }; + + Worker *worker; + +protected: + CephContext *cct; + RGWRados *store; + + std::atomic down_flag = { false }; + + std::string thread_name; + + virtual uint64_t interval_msec() = 0; + virtual void stop_process() {} +public: + 
RGWRadosThread(RGWRados *_store, const std::string& thread_name = "radosgw") + : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {} + virtual ~RGWRadosThread() { + stop(); + } + + virtual int init(const DoutPrefixProvider *dpp) { return 0; } + virtual int process(const DoutPrefixProvider *dpp) = 0; + + bool going_down() { return down_flag; } + + void start(); + void stop(); + + void signal() { + if (worker) { + worker->signal(); + } + } +}; + diff --git a/src/rgw/rgw_xml.cc b/src/rgw/rgw_xml.cc new file mode 100644 index 000000000..22a62ac48 --- /dev/null +++ b/src/rgw/rgw_xml.cc @@ -0,0 +1,502 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include +#include + +#include + +#include "include/types.h" +#include "include/utime.h" + +#include "rgw_xml.h" + +using namespace std; + +XMLObjIter:: +XMLObjIter() +{ +} + +XMLObjIter:: +~XMLObjIter() +{ +} + +void XMLObjIter:: +set(const XMLObjIter::map_iter_t &_cur, const XMLObjIter::map_iter_t &_end) +{ + cur = _cur; + end = _end; +} + +XMLObj *XMLObjIter:: +get_next() +{ + XMLObj *obj = NULL; + if (cur != end) { + obj = cur->second; + ++cur; + } + return obj; +} + +bool XMLObjIter::get_name(std::string& name) const +{ + if (cur == end) { + return false; + } + + name = cur->first; + return true; +} + +ostream& operator<<(ostream &out, const XMLObj &obj) { + out << obj.obj_type << ": " << obj.data; + return out; +} + +XMLObj:: +~XMLObj() +{ +} + +bool XMLObj:: +xml_start(XMLObj *parent, const char *el, const char **attr) +{ + this->parent = parent; + obj_type = el; + for (int i = 0; attr[i]; i += 2) { + attr_map[attr[i]] = std::string(attr[i + 1]); + } + return true; +} + +bool XMLObj:: +xml_end(const char *el) +{ + return true; +} + +void XMLObj:: +xml_handle_data(const char *s, int len) +{ + data.append(s, len); +} + +const std::string& XMLObj:: +XMLObj::get_data() const +{ + return data; +} + +const std::string& 
XMLObj:: +XMLObj::get_obj_type() const +{ + return obj_type; +} + +XMLObj *XMLObj:: +XMLObj::get_parent() +{ + return parent; +} + +void XMLObj:: +add_child(const std::string& el, XMLObj *obj) +{ + children.insert(std::pair(el, obj)); +} + +bool XMLObj:: +get_attr(const std::string& name, std::string& attr) const +{ + const std::map::const_iterator iter = attr_map.find(name); + if (iter == attr_map.end()) + return false; + attr = iter->second; + return true; +} + +XMLObjIter XMLObj:: +find(const std::string& name) +{ + XMLObjIter iter; + const XMLObjIter::const_map_iter_t first = children.find(name); + XMLObjIter::const_map_iter_t last; + if (first != children.end()) { + last = children.upper_bound(name); + }else + last = children.end(); + iter.set(first, last); + return iter; +} + +XMLObjIter XMLObj::find_first() +{ + XMLObjIter iter; + const XMLObjIter::const_map_iter_t first = children.begin(); + const XMLObjIter::const_map_iter_t last = children.end(); + iter.set(first, last); + return iter; +} + +XMLObj *XMLObj:: +find_first(const std::string& name) +{ + const XMLObjIter::const_map_iter_t first = children.find(name); + if (first != children.end()) + return first->second; + return nullptr; +} + +RGWXMLParser:: +RGWXMLParser() : buf(nullptr), buf_len(0), cur_obj(nullptr), success(true), init_called(false) +{ + p = XML_ParserCreate(nullptr); +} + +RGWXMLParser:: +~RGWXMLParser() +{ + XML_ParserFree(p); + + free(buf); + std::list::const_iterator iter; + for (iter = allocated_objs.begin(); iter != allocated_objs.end(); ++iter) { + XMLObj *obj = *iter; + delete obj; + } +} + +void RGWXMLParser::call_xml_start(void* user_data, const char *el, const char **attr) { + RGWXMLParser *handler = static_cast(user_data); + XMLObj * obj = handler->alloc_obj(el); + if (!obj) { + handler->unallocated_objs.push_back(XMLObj()); + obj = &handler->unallocated_objs.back(); + } else { + handler->allocated_objs.push_back(obj); + } + if (!obj->xml_start(handler->cur_obj, el, attr)) { + 
handler->success = false; + return; + } + if (handler->cur_obj) { + handler->cur_obj->add_child(el, obj); + } else { + handler->children.insert(std::pair(el, obj)); + } + handler->cur_obj = obj; + + handler->objs.push_back(obj); +} + +void RGWXMLParser::call_xml_end(void* user_data, const char *el) { + RGWXMLParser *handler = static_cast(user_data); + XMLObj *parent_obj = handler->cur_obj->get_parent(); + if (!handler->cur_obj->xml_end(el)) { + handler->success = false; + return; + } + handler->cur_obj = parent_obj; +} + +void RGWXMLParser::call_xml_handle_data(void* user_data, const char *s, int len) +{ + RGWXMLParser *handler = static_cast(user_data); + handler->cur_obj->xml_handle_data(s, len); +} + +bool RGWXMLParser::init() +{ + if (!p) { + return false; + } + init_called = true; + XML_SetElementHandler(p, RGWXMLParser::call_xml_start, RGWXMLParser::call_xml_end); + XML_SetCharacterDataHandler(p, RGWXMLParser::call_xml_handle_data); + XML_SetUserData(p, (void *)this); + return true; +} + +bool RGWXMLParser::parse(const char *_buf, int len, int done) +{ + ceph_assert(init_called); + int pos = buf_len; + char *tmp_buf; + tmp_buf = (char *)realloc(buf, buf_len + len); + if (tmp_buf == NULL){ + free(buf); + buf = NULL; + return false; + } else { + buf = tmp_buf; + } + + memcpy(&buf[buf_len], _buf, len); + buf_len += len; + + success = true; + if (!XML_Parse(p, &buf[pos], len, done)) { + fprintf(stderr, "Parse error at line %d:\n%s\n", + (int)XML_GetCurrentLineNumber(p), + XML_ErrorString(XML_GetErrorCode(p))); + success = false; + } + + return success; +} + +void decode_xml_obj(unsigned long& val, XMLObj *obj) +{ + auto& s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtoul(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && val == ULONG_MAX) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to 
parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + + +void decode_xml_obj(long& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtol(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + +void decode_xml_obj(long long& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtoll(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + +void decode_xml_obj(unsigned long long& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtoull(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && val == ULLONG_MAX) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + +void decode_xml_obj(int& val, XMLObj *obj) +{ + long l; + decode_xml_obj(l, obj); +#if LONG_MAX > INT_MAX + 
if (l > INT_MAX || l < INT_MIN) { + throw RGWXMLDecoder::err("integer out of range"); + } +#endif + + val = (int)l; +} + +void decode_xml_obj(unsigned& val, XMLObj *obj) +{ + unsigned long l; + decode_xml_obj(l, obj); +#if ULONG_MAX > UINT_MAX + if (l > UINT_MAX) { + throw RGWXMLDecoder::err("unsigned integer out of range"); + } +#endif + + val = (unsigned)l; +} + +void decode_xml_obj(bool& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + if (strncasecmp(s.c_str(), "true", 8) == 0) { + val = true; + return; + } + if (strncasecmp(s.c_str(), "false", 8) == 0) { + val = false; + return; + } + int i; + decode_xml_obj(i, obj); + val = (bool)i; +} + +void decode_xml_obj(bufferlist& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + + bufferlist bl; + bl.append(s.c_str(), s.size()); + try { + val.decode_base64(bl); + } catch (buffer::error& err) { + throw RGWXMLDecoder::err("failed to decode base64"); + } +} + +void decode_xml_obj(utime_t& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + uint64_t epoch; + uint64_t nsec; + int r = utime_t::parse_date(s, &epoch, &nsec); + if (r == 0) { + val = utime_t(epoch, nsec); + } else { + throw RGWXMLDecoder::err("failed to decode utime_t"); + } +} + +void encode_xml(const char *name, const string& val, Formatter *f) +{ + f->dump_string(name, val); +} + +void encode_xml(const char *name, const char *val, Formatter *f) +{ + f->dump_string(name, val); +} + +void encode_xml(const char *name, bool val, Formatter *f) +{ + std::string s; + if (val) + s = "True"; + else + s = "False"; + + f->dump_string(name, s); +} + +void encode_xml(const char *name, int val, Formatter *f) +{ + f->dump_int(name, val); +} + +void encode_xml(const char *name, long val, Formatter *f) +{ + f->dump_int(name, val); +} + +void encode_xml(const char *name, unsigned val, Formatter *f) +{ + f->dump_unsigned(name, val); +} + +void encode_xml(const char *name, unsigned long val, Formatter *f) +{ + f->dump_unsigned(name, val); 
+} + +void encode_xml(const char *name, unsigned long long val, Formatter *f) +{ + f->dump_unsigned(name, val); +} + +void encode_xml(const char *name, long long val, Formatter *f) +{ + f->dump_int(name, val); +} + +void encode_xml(const char *name, const utime_t& val, Formatter *f) +{ + val.gmtime(f->dump_stream(name)); +} + +void encode_xml(const char *name, const bufferlist& bl, Formatter *f) +{ + /* need to copy data from bl, as it is const bufferlist */ + bufferlist src = bl; + + bufferlist b64; + src.encode_base64(b64); + + const std::string s(b64.c_str(), b64.length()); + + encode_xml(name, s, f); +} + diff --git a/src/rgw/rgw_xml.h b/src/rgw/rgw_xml.h new file mode 100644 index 000000000..74a8c27a0 --- /dev/null +++ b/src/rgw/rgw_xml.h @@ -0,0 +1,371 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include +#include + +class XMLObj; +class RGWXMLParser; + +class XMLObjIter { +public: + typedef std::map::iterator map_iter_t; + typedef std::map::iterator const_map_iter_t; + + XMLObjIter(); + virtual ~XMLObjIter(); + void set(const XMLObjIter::const_map_iter_t &_cur, const XMLObjIter::const_map_iter_t &_end); + XMLObj *get_next(); + bool get_name(std::string& name) const; + +private: + map_iter_t cur; + map_iter_t end; +}; + +/** + * Represents a block of XML. + * Give the class an XML blob, and it will parse the blob into + * an attr_name->value map. + * It shouldn't be the start point for any parsing. Look at RGWXMLParser for that. 
+ */ +class XMLObj +{ +private: + XMLObj *parent; + std::string obj_type; + +protected: + std::string data; + std::multimap children; + std::map attr_map; + + // invoked at the beginning of the XML tag, and populate any attributes + bool xml_start(XMLObj *parent, const char *el, const char **attr); + // callback invoked at the end of the XML tag + // if objects are created while parsing, this should be overwritten in the drived class + virtual bool xml_end(const char *el); + // callback invoked for storing the data of the XML tag + // if data manipulation is needed this could be overwritten in the drived class + virtual void xml_handle_data(const char *s, int len); + // get the parent object + XMLObj *get_parent(); + // add a child XML object + void add_child(const std::string& el, XMLObj *obj); + +public: + XMLObj() : parent(nullptr) {} + virtual ~XMLObj(); + + // get the data (as string) + const std::string& get_data() const; + // get the type of the object (as string) + const std::string& get_obj_type() const; + bool get_attr(const std::string& name, std::string& attr) const; + // return a list of sub-tags matching the name + XMLObjIter find(const std::string& name); + // return the first sub-tag + XMLObjIter find_first(); + // return the first sub-tags matching the name + XMLObj *find_first(const std::string& name); + + friend std::ostream& operator<<(std::ostream &out, const XMLObj &obj); + friend RGWXMLParser; +}; + +struct XML_ParserStruct; + +// an XML parser is an XML object without a parent (root of the tree) +// the parser could be used in 2 ways: +// +// (1) lazy object creation/intrusive API: usually used within the RGWXMLDecode namespace (as RGWXMLDecode::XMLParser) +// the parser will parse the input and store info, but will not generate the target object. The object can be allocated outside +// of the parser (stack or heap), and require to implement the decode_xml() API for the values to be populated. 
+// note that the decode_xml() calls may throw exceptions if parsing fails +// +// (2) object creation while parsing: a new class needs to be derived from RGWXMLParser and implement alloc_obj() +// API that should create a set of classes derived from XMLObj implementing xml_end() to create the actual target objects +// +// There could be a mix-and-match of the 2 types, control over that is in the alloc_obj() call +// deciding for which tags objects are allocate during parsing and for which tags object allocation is external + +class RGWXMLParser : public XMLObj +{ +private: + XML_ParserStruct *p; + char *buf; + int buf_len; + XMLObj *cur_obj; + std::vector objs; + std::list allocated_objs; + std::list unallocated_objs; + bool success; + bool init_called; + + // calls xml_start() on each parsed object + // passed as static callback to actual parser, passes itself as user_data + static void call_xml_start(void* user_data, const char *el, const char **attr); + // calls xml_end() on each parsed object + // passed as static callback to actual parser, passes itself as user_data + static void call_xml_end(void* user_data, const char *el); + // calls xml_handle_data() on each parsed object + // passed as static callback to actual parser, passes itself as user_data + static void call_xml_handle_data(void* user_data, const char *s, int len); + +protected: + // if objects are created while parsing, this should be implemented in the derived class + // and be a factory for creating the classes derived from XMLObj + // note that not all sub-tags has to be constructed here, any such tag which is not + // constructed will be lazily created when decode_xml() is invoked on it + // + // note that in case of different tags sharing the same name at different levels + // this method should not be used + virtual XMLObj *alloc_obj(const char *el) { + return nullptr; + } + +public: + RGWXMLParser(); + virtual ~RGWXMLParser() override; + + // initialize the parser, must be called before 
parsing + bool init(); + // parse the XML buffer (can be invoked multiple times for incremental parsing) + // receives the buffer to parse, its length, and boolean indication (0,1) + // whether this is the final chunk of the buffer + bool parse(const char *buf, int len, int done); + // get the XML blob being parsed + const char *get_xml() const { return buf; } +}; + +namespace RGWXMLDecoder { + struct err : std::runtime_error { + using runtime_error::runtime_error; + }; + + typedef RGWXMLParser XMLParser; + + template + bool decode_xml(const char *name, T& val, XMLObj* obj, bool mandatory = false); + + template + bool decode_xml(const char *name, std::vector& v, XMLObj* obj, bool mandatory = false); + + template + bool decode_xml(const char *name, C& container, void (*cb)(C&, XMLObj *obj), XMLObj *obj, bool mandatory = false); + + template + void decode_xml(const char *name, T& val, T& default_val, XMLObj* obj); +} + +static inline std::ostream& operator<<(std::ostream &out, RGWXMLDecoder::err& err) +{ + return out << err.what(); +} + +template +void decode_xml_obj(T& val, XMLObj *obj) +{ + val.decode_xml(obj); +} + +static inline void decode_xml_obj(std::string& val, XMLObj *obj) +{ + val = obj->get_data(); +} + +void decode_xml_obj(unsigned long long& val, XMLObj *obj); +void decode_xml_obj(long long& val, XMLObj *obj); +void decode_xml_obj(unsigned long& val, XMLObj *obj); +void decode_xml_obj(long& val, XMLObj *obj); +void decode_xml_obj(unsigned& val, XMLObj *obj); +void decode_xml_obj(int& val, XMLObj *obj); +void decode_xml_obj(bool& val, XMLObj *obj); +void decode_xml_obj(bufferlist& val, XMLObj *obj); +class utime_t; +void decode_xml_obj(utime_t& val, XMLObj *obj); + +template +void decode_xml_obj(std::optional& val, XMLObj *obj) +{ + val.emplace(); + decode_xml_obj(*val, obj); +} + +template +void do_decode_xml_obj(std::list& l, const std::string& name, XMLObj *obj) +{ + l.clear(); + + XMLObjIter iter = obj->find(name); + XMLObj *o; + + while ((o = 
iter.get_next())) { + T val; + decode_xml_obj(val, o); + l.push_back(val); + } +} + +template +bool RGWXMLDecoder::decode_xml(const char *name, T& val, XMLObj *obj, bool mandatory) +{ + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + if (!o) { + if (mandatory) { + std::string s = "missing mandatory field " + std::string(name); + throw err(s); + } + val = T(); + return false; + } + + try { + decode_xml_obj(val, o); + } catch (const err& e) { + std::string s = std::string(name) + ": "; + s.append(e.what()); + throw err(s); + } + + return true; +} + +template +bool RGWXMLDecoder::decode_xml(const char *name, std::vector& v, XMLObj *obj, bool mandatory) +{ + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + + v.clear(); + + if (!o) { + if (mandatory) { + std::string s = "missing mandatory field " + std::string(name); + throw err(s); + } + return false; + } + + do { + T val; + try { + decode_xml_obj(val, o); + } catch (const err& e) { + std::string s = std::string(name) + ": "; + s.append(e.what()); + throw err(s); + } + v.push_back(val); + } while ((o = iter.get_next())); + return true; +} + +template +bool RGWXMLDecoder::decode_xml(const char *name, C& container, void (*cb)(C&, XMLObj *), XMLObj *obj, bool mandatory) +{ + container.clear(); + + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + if (!o) { + if (mandatory) { + std::string s = "missing mandatory field " + std::string(name); + throw err(s); + } + return false; + } + + try { + decode_xml_obj(container, cb, o); + } catch (const err& e) { + std::string s = std::string(name) + ": "; + s.append(e.what()); + throw err(s); + } + + return true; +} + +template +void RGWXMLDecoder::decode_xml(const char *name, T& val, T& default_val, XMLObj *obj) +{ + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + if (!o) { + val = default_val; + return; + } + + try { + decode_xml_obj(val, o); + } catch (const err& e) { + val = default_val; + std::string s 
= std::string(name) + ": "; + s.append(e.what()); + throw err(s); + } +} + +template +static void encode_xml(const char *name, const T& val, ceph::Formatter *f) +{ + f->open_object_section(name); + val.dump_xml(f); + f->close_section(); +} + +template +static void encode_xml(const char *name, const char *ns, const T& val, ceph::Formatter *f) +{ + f->open_object_section_in_ns(name, ns); + val.dump_xml(f); + f->close_section(); +} + +void encode_xml(const char *name, const std::string& val, ceph::Formatter *f); +void encode_xml(const char *name, const char *val, ceph::Formatter *f); +void encode_xml(const char *name, bool val, ceph::Formatter *f); +void encode_xml(const char *name, int val, ceph::Formatter *f); +void encode_xml(const char *name, unsigned val, ceph::Formatter *f); +void encode_xml(const char *name, long val, ceph::Formatter *f); +void encode_xml(const char *name, unsigned long val, ceph::Formatter *f); +void encode_xml(const char *name, long long val, ceph::Formatter *f); +void encode_xml(const char *name, const utime_t& val, ceph::Formatter *f); +void encode_xml(const char *name, const bufferlist& bl, ceph::Formatter *f); +void encode_xml(const char *name, long long unsigned val, ceph::Formatter *f); + +template +static void do_encode_xml(const char *name, const std::list& l, const char *entry_name, ceph::Formatter *f) +{ + f->open_array_section(name); + for (typename std::list::const_iterator iter = l.begin(); iter != l.end(); ++iter) { + encode_xml(entry_name, *iter, f); + } + f->close_section(); +} + +template +static void encode_xml(const char *name, const std::vector& l, ceph::Formatter *f) +{ + for (typename std::vector::const_iterator iter = l.begin(); iter != l.end(); ++iter) { + encode_xml(name, *iter, f); + } +} + +template +static void encode_xml(const char *name, const std::optional& o, ceph::Formatter *f) +{ + if (!o) { + return; + } + + encode_xml(name, *o, f); +} diff --git a/src/rgw/rgw_xml_enc.cc b/src/rgw/rgw_xml_enc.cc new file 
mode 100644 index 000000000..554e953d7 --- /dev/null +++ b/src/rgw/rgw_xml_enc.cc @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Yehuda Sadeh + * Copyright (C) 2015 Robin H. Johnson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_common.h" +#include "rgw_xml.h" + +#include "common/Formatter.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + diff --git a/src/rgw/rgw_zone.cc b/src/rgw/rgw_zone.cc new file mode 100644 index 000000000..b743689ed --- /dev/null +++ b/src/rgw/rgw_zone.cc @@ -0,0 +1,1371 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include + +#include "common/errno.h" + +#include "rgw_zone.h" +#include "rgw_sal_config.h" +#include "rgw_sync.h" + +#include "services/svc_zone.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +namespace rgw_zone_defaults { + +static std::string default_bucket_index_pool_suffix = "rgw.buckets.index"; +static std::string default_storage_extra_pool_suffix = "rgw.buckets.non-ec"; +static std::string zone_info_oid_prefix = "zone_info."; + +std::string zone_names_oid_prefix = "zone_names."; +std::string region_info_oid_prefix = "region_info."; +std::string zone_group_info_oid_prefix = "zonegroup_info."; +std::string default_region_info_oid = "default.region"; +std::string default_zone_group_info_oid = "default.zonegroup"; +std::string region_map_oid = "region_map"; +std::string default_zonegroup_name = "default"; +std::string default_zone_name = "default"; +std::string zonegroup_names_oid_prefix = "zonegroups_names."; +std::string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root"; 
+std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root"; +std::string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root"; +std::string avail_pools = ".pools.avail"; +std::string default_storage_pool_suffix = "rgw.buckets.data"; + +} + +using namespace std; +using namespace rgw_zone_defaults; + +void encode_json_plain(const char *name, const RGWAccessKey& val, Formatter *f) +{ + f->open_object_section(name); + val.dump_plain(f); + f->close_section(); +} + +static void decode_zones(map& zones, JSONObj *o) +{ + RGWZone z; + z.decode_json(o); + zones[z.id] = z; +} + +static void decode_placement_targets(map& targets, JSONObj *o) +{ + RGWZoneGroupPlacementTarget t; + t.decode_json(o); + targets[t.name] = t; +} + +void RGWZone::generate_test_instances(list &o) +{ + RGWZone *z = new RGWZone; + o.push_back(z); + o.push_back(new RGWZone); +} + +void RGWZone::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("name", name, f); + encode_json("endpoints", endpoints, f); + encode_json("log_meta", log_meta, f); + encode_json("log_data", log_data, f); + encode_json("bucket_index_max_shards", bucket_index_max_shards, f); + encode_json("read_only", read_only, f); + encode_json("tier_type", tier_type, f); + encode_json("sync_from_all", sync_from_all, f); + encode_json("sync_from", sync_from, f); + encode_json("redirect_zone", redirect_zone, f); + encode_json("supported_features", supported_features, f); +} + +void RGWZone::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("name", name, obj); + if (id.empty()) { + id = name; + } + JSONDecoder::decode_json("endpoints", endpoints, obj); + JSONDecoder::decode_json("log_meta", log_meta, obj); + JSONDecoder::decode_json("log_data", log_data, obj); + JSONDecoder::decode_json("bucket_index_max_shards", bucket_index_max_shards, obj); + JSONDecoder::decode_json("read_only", read_only, obj); + JSONDecoder::decode_json("tier_type", tier_type, obj); + 
JSONDecoder::decode_json("sync_from_all", sync_from_all, true, obj); + JSONDecoder::decode_json("sync_from", sync_from, obj); + JSONDecoder::decode_json("redirect_zone", redirect_zone, obj); + JSONDecoder::decode_json("supported_features", supported_features, obj); +} + +int RGWSystemMetaObj::init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, + optional_yield y, + bool setup_obj, bool old_format) +{ + reinit_instance(_cct, _sysobj_svc); + + if (!setup_obj) + return 0; + + if (old_format && id.empty()) { + id = name; + } + + if (id.empty()) { + id = get_predefined_id(cct); + } + + if (id.empty()) { + int r; + if (name.empty()) { + name = get_predefined_name(cct); + } + if (name.empty()) { + r = use_default(dpp, y, old_format); + if (r < 0) { + return r; + } + } else if (!old_format) { + r = read_id(dpp, name, id, y); + if (r < 0) { + if (r != -ENOENT) { + ldpp_dout(dpp, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl; + } + return r; + } + } + } + + return read_info(dpp, id, y, old_format); +} + +RGWZoneGroup::~RGWZoneGroup() {} + +const string RGWZoneGroup::get_default_oid(bool old_region_format) const +{ + if (old_region_format) { + if (cct->_conf->rgw_default_region_info_oid.empty()) { + return default_region_info_oid; + } + return cct->_conf->rgw_default_region_info_oid; + } + + string default_oid = cct->_conf->rgw_default_zonegroup_info_oid; + + if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) { + default_oid = default_zone_group_info_oid; + } + + default_oid += "." 
+ realm_id; + + return default_oid; +} + +const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format) const +{ + if (old_region_format) { + return region_info_oid_prefix; + } + return zone_group_info_oid_prefix; +} + +const string& RGWZoneGroup::get_names_oid_prefix() const +{ + return zonegroup_names_oid_prefix; +} + +string RGWZoneGroup::get_predefined_id(CephContext *cct) const { + return cct->_conf.get_val("rgw_zonegroup_id"); +} + +const string& RGWZoneGroup::get_predefined_name(CephContext *cct) const { + return cct->_conf->rgw_zonegroup; +} + +rgw_pool RGWZoneGroup::get_pool(CephContext *cct_) const +{ + if (cct_->_conf->rgw_zonegroup_root_pool.empty()) { + return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL); + } + + return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool); +} + +int RGWZoneGroup::read_default_id(const DoutPrefixProvider *dpp, string& default_id, optional_yield y, + bool old_format) +{ + if (realm_id.empty()) { + /* try using default realm */ + RGWRealm realm; + int ret = realm.init(dpp, cct, sysobj_svc, y); + // no default realm exist + if (ret < 0) { + return read_id(dpp, default_zonegroup_name, default_id, y); + } + realm_id = realm.get_id(); + } + + return RGWSystemMetaObj::read_default_id(dpp, default_id, y, old_format); +} + +int RGWSystemMetaObj::use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format) +{ + return read_default_id(dpp, id, y, old_format); +} + +void RGWSystemMetaObj::reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) +{ + cct = _cct; + sysobj_svc = _sysobj_svc; + zone_svc = _sysobj_svc->get_zone_svc(); +} + +int RGWSystemMetaObj::read_info(const DoutPrefixProvider *dpp, const string& obj_id, optional_yield y, + bool old_format) +{ + rgw_pool pool(get_pool(cct)); + + bufferlist bl; + + string oid = get_info_oid_prefix(old_format) + obj_id; + + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + int ret = sysobj.rop().read(dpp, &bl, y); + if (ret < 0) { + ldpp_dout(dpp, 0) 
<< "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + using ceph::decode; + + try { + auto iter = bl.cbegin(); + decode(*this, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl; + return -EIO; + } + + return 0; +} + +void RGWZoneGroup::decode_json(JSONObj *obj) +{ + RGWSystemMetaObj::decode_json(obj); + if (id.empty()) { + derr << "old format " << dendl; + JSONDecoder::decode_json("name", name, obj); + id = name; + } + JSONDecoder::decode_json("api_name", api_name, obj); + JSONDecoder::decode_json("is_master", is_master, obj); + JSONDecoder::decode_json("endpoints", endpoints, obj); + JSONDecoder::decode_json("hostnames", hostnames, obj); + JSONDecoder::decode_json("hostnames_s3website", hostnames_s3website, obj); + JSONDecoder::decode_json("master_zone", master_zone, obj); + JSONDecoder::decode_json("zones", zones, decode_zones, obj); + JSONDecoder::decode_json("placement_targets", placement_targets, decode_placement_targets, obj); + string pr; + JSONDecoder::decode_json("default_placement", pr, obj); + default_placement.from_str(pr); + JSONDecoder::decode_json("realm_id", realm_id, obj); + JSONDecoder::decode_json("sync_policy", sync_policy, obj); + JSONDecoder::decode_json("enabled_features", enabled_features, obj); +} + +RGWZoneParams::~RGWZoneParams() {} + +void RGWZoneParams::decode_json(JSONObj *obj) +{ + RGWSystemMetaObj::decode_json(obj); + JSONDecoder::decode_json("domain_root", domain_root, obj); + JSONDecoder::decode_json("control_pool", control_pool, obj); + JSONDecoder::decode_json("gc_pool", gc_pool, obj); + JSONDecoder::decode_json("lc_pool", lc_pool, obj); + JSONDecoder::decode_json("log_pool", log_pool, obj); + JSONDecoder::decode_json("intent_log_pool", intent_log_pool, obj); + JSONDecoder::decode_json("roles_pool", roles_pool, obj); + JSONDecoder::decode_json("reshard_pool", reshard_pool, obj); + 
JSONDecoder::decode_json("usage_log_pool", usage_log_pool, obj); + JSONDecoder::decode_json("user_keys_pool", user_keys_pool, obj); + JSONDecoder::decode_json("user_email_pool", user_email_pool, obj); + JSONDecoder::decode_json("user_swift_pool", user_swift_pool, obj); + JSONDecoder::decode_json("user_uid_pool", user_uid_pool, obj); + JSONDecoder::decode_json("otp_pool", otp_pool, obj); + JSONDecoder::decode_json("system_key", system_key, obj); + JSONDecoder::decode_json("placement_pools", placement_pools, obj); + JSONDecoder::decode_json("tier_config", tier_config, obj); + JSONDecoder::decode_json("realm_id", realm_id, obj); + JSONDecoder::decode_json("notif_pool", notif_pool, obj); + +} + +void RGWZoneParams::dump(Formatter *f) const +{ + RGWSystemMetaObj::dump(f); + encode_json("domain_root", domain_root, f); + encode_json("control_pool", control_pool, f); + encode_json("gc_pool", gc_pool, f); + encode_json("lc_pool", lc_pool, f); + encode_json("log_pool", log_pool, f); + encode_json("intent_log_pool", intent_log_pool, f); + encode_json("usage_log_pool", usage_log_pool, f); + encode_json("roles_pool", roles_pool, f); + encode_json("reshard_pool", reshard_pool, f); + encode_json("user_keys_pool", user_keys_pool, f); + encode_json("user_email_pool", user_email_pool, f); + encode_json("user_swift_pool", user_swift_pool, f); + encode_json("user_uid_pool", user_uid_pool, f); + encode_json("otp_pool", otp_pool, f); + encode_json_plain("system_key", system_key, f); + encode_json("placement_pools", placement_pools, f); + encode_json("tier_config", tier_config, f); + encode_json("realm_id", realm_id, f); + encode_json("notif_pool", notif_pool, f); +} + +int RGWZoneParams::init(const DoutPrefixProvider *dpp, + CephContext *cct, RGWSI_SysObj *sysobj_svc, + optional_yield y, bool setup_obj, bool old_format) +{ + if (name.empty()) { + name = cct->_conf->rgw_zone; + } + + return RGWSystemMetaObj::init(dpp, cct, sysobj_svc, y, setup_obj, old_format); +} + +rgw_pool 
RGWZoneParams::get_pool(CephContext *cct) const +{ + if (cct->_conf->rgw_zone_root_pool.empty()) { + return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL); + } + + return rgw_pool(cct->_conf->rgw_zone_root_pool); +} + +const string RGWZoneParams::get_default_oid(bool old_format) const +{ + if (old_format) { + return cct->_conf->rgw_default_zone_info_oid; + } + + return cct->_conf->rgw_default_zone_info_oid + "." + realm_id; +} + +const string& RGWZoneParams::get_names_oid_prefix() const +{ + return zone_names_oid_prefix; +} + +const string& RGWZoneParams::get_info_oid_prefix(bool old_format) const +{ + return zone_info_oid_prefix; +} + +string RGWZoneParams::get_predefined_id(CephContext *cct) const { + return cct->_conf.get_val("rgw_zone_id"); +} + +const string& RGWZoneParams::get_predefined_name(CephContext *cct) const { + return cct->_conf->rgw_zone; +} + +int RGWZoneParams::read_default_id(const DoutPrefixProvider *dpp, string& default_id, optional_yield y, + bool old_format) +{ + if (realm_id.empty()) { + /* try using default realm */ + RGWRealm realm; + int ret = realm.init(dpp, cct, sysobj_svc, y); + //no default realm exist + if (ret < 0) { + return read_id(dpp, default_zone_name, default_id, y); + } + realm_id = realm.get_id(); + } + + return RGWSystemMetaObj::read_default_id(dpp, default_id, y, old_format); +} + + +int RGWZoneParams::set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + if (realm_id.empty()) { + /* try using default realm */ + RGWRealm realm; + int ret = realm.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldpp_dout(dpp, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl; + return -EINVAL; + } + realm_id = realm.get_id(); + } + + return RGWSystemMetaObj::set_as_default(dpp, y, exclusive); +} + +int RGWZoneParams::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + /* check for old pools config */ + rgw_raw_obj obj(domain_root, avail_pools); + auto sysobj = 
sysobj_svc->get_obj(obj); + int r = sysobj.rop().stat(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl; + /* a new system, let's set new placement info */ + RGWZonePlacementInfo default_placement; + default_placement.index_pool = name + "." + default_bucket_index_pool_suffix; + rgw_pool pool = name + "." + default_storage_pool_suffix; + default_placement.storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &pool, nullptr); + default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix; + placement_pools["default-placement"] = default_placement; + } + + r = fix_pool_names(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: fix_pool_names returned r=" << r << dendl; + return r; + } + + r = RGWSystemMetaObj::create(dpp, y, exclusive); + if (r < 0) { + return r; + } + + // try to set as default. may race with another create, so pass exclusive=true + // so we don't override an existing default + r = set_as_default(dpp, y, true); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 10) << "WARNING: failed to set zone as default, r=" << r << dendl; + } + + return 0; +} + +rgw_pool fix_zone_pool_dup(const set& pools, + const string& default_prefix, + const string& default_suffix, + const rgw_pool& suggested_pool) +{ + string suggested_name = suggested_pool.to_str(); + + string prefix = default_prefix; + string suffix = default_suffix; + + if (!suggested_pool.empty()) { + prefix = suggested_name.substr(0, suggested_name.find(".")); + suffix = suggested_name.substr(prefix.length()); + } + + rgw_pool pool(prefix + suffix); + + while (pools.count(pool)) { + pool = prefix + "_" + std::to_string(std::rand()) + suffix; + } + return pool; +} + +void add_zone_pools(const RGWZoneParams& info, + std::set& pools) +{ + pools.insert(info.domain_root); + pools.insert(info.control_pool); + pools.insert(info.gc_pool); + pools.insert(info.log_pool); + 
pools.insert(info.intent_log_pool); + pools.insert(info.usage_log_pool); + pools.insert(info.user_keys_pool); + pools.insert(info.user_email_pool); + pools.insert(info.user_swift_pool); + pools.insert(info.user_uid_pool); + pools.insert(info.otp_pool); + pools.insert(info.roles_pool); + pools.insert(info.reshard_pool); + pools.insert(info.oidc_pool); + pools.insert(info.notif_pool); + + for (const auto& [pname, placement] : info.placement_pools) { + pools.insert(placement.index_pool); + for (const auto& [sname, sc] : placement.storage_classes.get_all()) { + if (sc.data_pool) { + pools.insert(sc.data_pool.get()); + } + } + pools.insert(placement.data_extra_pool); + } +} + +namespace rgw { + +int get_zones_pool_set(const DoutPrefixProvider *dpp, + optional_yield y, + rgw::sal::ConfigStore* cfgstore, + std::string_view my_zone_id, + std::set& pools) +{ + std::array zone_names; + rgw::sal::ListResult listing; + do { + int r = cfgstore->list_zone_names(dpp, y, listing.next, + zone_names, listing); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to list zones with " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& name : listing.entries) { + RGWZoneParams info; + r = cfgstore->read_zone_by_name(dpp, y, name, info, nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << "failed to load zone " << name + << " with " << cpp_strerror(r) << dendl; + return r; + } + if (info.get_id() != my_zone_id) { + add_zone_pools(info, pools); + } + } + } while (!listing.next.empty()); + + return 0; +} + +} + +static int get_zones_pool_set(const DoutPrefixProvider *dpp, + CephContext* cct, + RGWSI_SysObj* sysobj_svc, + const list& zone_names, + const string& my_zone_id, + set& pool_names, + optional_yield y) +{ + for (const auto& name : zone_names) { + RGWZoneParams zone(name); + int r = zone.init(dpp, cct, sysobj_svc, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "Error: failed to load zone " << name + << " with " << cpp_strerror(-r) << dendl; + return r; + } + if (zone.get_id() != 
my_zone_id) { + add_zone_pools(zone, pool_names); + } + } + return 0; +} + +int RGWZoneParams::fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y) +{ + + list zones; + int r = zone_svc->list_zones(dpp, zones); + if (r < 0) { + ldpp_dout(dpp, 10) << "WARNING: driver->list_zones() returned r=" << r << dendl; + } + + set pools; + r = get_zones_pool_set(dpp, cct, sysobj_svc, zones, id, pools, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "Error: get_zones_pool_names" << r << dendl; + return r; + } + + domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root); + control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool); + gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool); + lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool); + log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool); + intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool); + usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool); + user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool); + user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool); + user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool); + user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool); + roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool); + reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool); + otp_pool = fix_zone_pool_dup(pools, name, ".rgw.otp", otp_pool); + oidc_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:oidc", oidc_pool); + notif_pool = fix_zone_pool_dup(pools, name ,".rgw.log:notif", notif_pool); + + for(auto& iter : placement_pools) { + iter.second.index_pool = fix_zone_pool_dup(pools, name, "." 
+ default_bucket_index_pool_suffix, + iter.second.index_pool); + for (auto& pi : iter.second.storage_classes.get_all()) { + if (pi.second.data_pool) { + rgw_pool& pool = pi.second.data_pool.get(); + pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix, + pool); + } + } + iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix, + iter.second.data_extra_pool); + } + + return 0; +} + +int RGWPeriodConfig::read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, + optional_yield y) +{ + const auto& pool = get_pool(sysobj_svc->ctx()); + const auto& oid = get_oid(realm_id); + bufferlist bl; + + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + int ret = sysobj.rop().read(dpp, &bl, y); + if (ret < 0) { + return ret; + } + using ceph::decode; + try { + auto iter = bl.cbegin(); + decode(*this, iter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +int RGWPeriodConfig::write(const DoutPrefixProvider *dpp, + RGWSI_SysObj *sysobj_svc, + const std::string& realm_id, optional_yield y) +{ + const auto& pool = get_pool(sysobj_svc->ctx()); + const auto& oid = get_oid(realm_id); + bufferlist bl; + using ceph::encode; + encode(*this, bl); + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + return sysobj.wop() + .set_exclusive(false) + .write(dpp, bl, y); +} + +void RGWPeriodConfig::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj); + JSONDecoder::decode_json("user_quota", quota.user_quota, obj); + JSONDecoder::decode_json("user_ratelimit", user_ratelimit, obj); + JSONDecoder::decode_json("bucket_ratelimit", bucket_ratelimit, obj); + JSONDecoder::decode_json("anonymous_ratelimit", anon_ratelimit, obj); +} + +void RGWPeriodConfig::dump(Formatter *f) const +{ + encode_json("bucket_quota", quota.bucket_quota, f); + encode_json("user_quota", quota.user_quota, f); + encode_json("user_ratelimit", 
user_ratelimit, f); + encode_json("bucket_ratelimit", bucket_ratelimit, f); + encode_json("anonymous_ratelimit", anon_ratelimit, f); +} + +std::string RGWPeriodConfig::get_oid(const std::string& realm_id) +{ + if (realm_id.empty()) { + return "period_config.default"; + } + return "period_config." + realm_id; +} + +rgw_pool RGWPeriodConfig::get_pool(CephContext *cct) +{ + const auto& pool_name = cct->_conf->rgw_period_root_pool; + if (pool_name.empty()) { + return {RGW_DEFAULT_PERIOD_ROOT_POOL}; + } + return {pool_name}; +} + +int RGWSystemMetaObj::delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format) +{ + rgw_pool pool(get_pool(cct)); + + /* check to see if obj is the default */ + RGWDefaultSystemMetaObjInfo default_info; + int ret = read_default(dpp, default_info, get_default_oid(old_format), y); + if (ret < 0 && ret != -ENOENT) + return ret; + if (default_info.default_id == id || (old_format && default_info.default_id == name)) { + string oid = get_default_oid(old_format); + rgw_raw_obj default_named_obj(pool, oid); + auto sysobj = sysobj_svc->get_obj(default_named_obj); + ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + } + if (!old_format) { + string oid = get_names_oid_prefix() + name; + rgw_raw_obj object_name(pool, oid); + auto sysobj = sysobj_svc->get_obj(object_name); + ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + string oid = get_info_oid_prefix(old_format); + if (old_format) { + oid += name; + } else { + oid += id; + } + + rgw_raw_obj object_id(pool, oid); + auto sysobj = sysobj_svc->get_obj(object_id); + ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl; + } + + return ret; +} + +void 
RGWZoneGroup::dump(Formatter *f) const
+{
+  // Base-class fields (id, name) are written first, then the zonegroup's own
+  // fields in a fixed order.
+  RGWSystemMetaObj::dump(f);
+  encode_json("api_name", api_name, f);
+  encode_json("is_master", is_master, f);
+  encode_json("endpoints", endpoints, f);
+  encode_json("hostnames", hostnames, f);
+  encode_json("hostnames_s3website", hostnames_s3website, f);
+  encode_json("master_zone", master_zone, f);
+  /* both maps go through encode_json_map for a more friendly representation */
+  encode_json_map("zones", zones, f);
+  encode_json_map("placement_targets", placement_targets, f);
+  encode_json("default_placement", default_placement, f);
+  encode_json("realm_id", realm_id, f);
+  encode_json("sync_policy", sync_policy, f);
+  encode_json("enabled_features", enabled_features, f);
+}
+
+// Load a placement target from JSON.  If "storage_classes" decodes to an
+// empty set, RGW_STORAGE_CLASS_STANDARD is inserted into it.
+void RGWZoneGroupPlacementTarget::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("name", name, obj);
+  JSONDecoder::decode_json("tags", tags, obj);
+  JSONDecoder::decode_json("storage_classes", storage_classes, obj);
+  if (storage_classes.empty()) {
+    storage_classes.insert(RGW_STORAGE_CLASS_STANDARD);
+  }
+  JSONDecoder::decode_json("tier_targets", tier_targets, obj);
+}
+
+// Write the placement info to the formatter; index_type is emitted as an
+// unsigned 32-bit integer.
+void RGWZonePlacementInfo::dump(Formatter *f) const
+{
+  encode_json("index_pool", index_pool, f);
+  encode_json("storage_classes", storage_classes, f);
+  encode_json("data_extra_pool", data_extra_pool, f);
+  encode_json("index_type", static_cast<uint32_t>(index_type), f);
+  encode_json("inline_data", inline_data, f);
+
+  /* no real need for backward compatibility of compression_type and data_pool in here,
+   * rather not clutter the output */
+}
+
+// Load the placement info from JSON, including the legacy top-level
+// "compression" and "data_pool" keys.
+void RGWZonePlacementInfo::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("index_pool", index_pool, obj);
+  JSONDecoder::decode_json("storage_classes", storage_classes, obj);
+  JSONDecoder::decode_json("data_extra_pool", data_extra_pool, obj);
+
+  // index_type is read as a plain integer and cast back to the enum below
+  uint32_t raw_index_type;
+  JSONDecoder::decode_json("index_type", raw_index_type, obj);
+  JSONDecoder::decode_json("inline_data", inline_data, obj);
+  index_type = static_cast<rgw::BucketIndexType>(raw_index_type);
+
+  /* backward compatibility, these are now defined in storage_classes:
+   * whichever legacy key is present is applied to the STANDARD class */
+  string legacy_compression;
+  const bool has_compression =
+    JSONDecoder::decode_json("compression", legacy_compression, obj);
+  rgw_pool legacy_data_pool;
+  const bool has_data_pool =
+    JSONDecoder::decode_json("data_pool", legacy_data_pool, obj);
+  if (has_data_pool || has_compression) {
+    storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD,
+                                      has_data_pool ? &legacy_data_pool : nullptr,
+                                      has_compression ? &legacy_compression : nullptr);
+  }
+}
+
+// Write the two fields shared by all system meta objects.
+void RGWSystemMetaObj::dump(Formatter *f) const
+{
+  encode_json("id", id, f);
+  encode_json("name", name, f);
+}
+
+// Read the two fields shared by all system meta objects.
+void RGWSystemMetaObj::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("id", id, obj);
+  JSONDecoder::decode_json("name", name, obj);
+}
+
+// Read object `oid` from the pool returned by get_pool(cct) and decode it
+// into `default_info`.
+// Returns 0 on success, the negative error from the read, or -EIO when the
+// payload cannot be decoded.
+int RGWSystemMetaObj::read_default(const DoutPrefixProvider *dpp,
+                                   RGWDefaultSystemMetaObjInfo& default_info,
+                                   const string& oid, optional_yield y)
+{
+  using ceph::decode;
+  auto pool = get_pool(cct);
+  bufferlist bl;
+
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
+  if (int ret = sysobj.rop().read(dpp, &bl, y); ret < 0) {
+    return ret;
+  }
+
+  try {
+    auto bl_iter = bl.cbegin();
+    decode(default_info, bl_iter);
+  } catch (buffer::error&) {
+    ldpp_dout(dpp, 0) << "error decoding data from " << pool << ":" << oid << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Write the placement target; "tier_targets" is emitted only when non-empty.
+void RGWZoneGroupPlacementTarget::dump(Formatter *f) const
+{
+  encode_json("name", name, f);
+  encode_json("tags", tags, f);
+  encode_json("storage_classes", storage_classes, f);
+  if (!tier_targets.empty()) {
+    encode_json("tier_targets", tier_targets, f);
+  }
+}
+
+// Load a placement tier; the "s3" section is read only for tier_type
+// "cloud-s3".
+void RGWZoneGroupPlacementTier::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("tier_type", tier_type, obj);
+  JSONDecoder::decode_json("storage_class", storage_class, obj);
+  JSONDecoder::decode_json("retain_head_object", retain_head_object, obj);
+
+  const bool is_cloud_s3 = (tier_type == "cloud-s3");
+  if (is_cloud_s3) {
+    JSONDecoder::decode_json("s3", t.s3, obj);
+  }
+}
+
+// Dump every storage class as a JSON field keyed by the class name.
+void RGWZoneStorageClasses::dump(Formatter *f) const
+{
+  for (const auto& [sc_name, sc] : m) {
+    encode_json(sc_name.c_str(), sc, f);
+  }
+}
+
+// Rebuild the storage-class map from a JSON object: each field name becomes a
+// storage-class key and the field's value is decoded into that entry.
+void RGWZoneStorageClasses::decode_json(JSONObj *obj)
+{
+  JSONFormattable parsed;
+  decode_json_obj(parsed, obj);
+
+  for (auto& entry : parsed.object()) {
+    const auto& sc_name = entry.first;
+    JSONObj *sc_obj = obj->find_obj(sc_name);
+    assert(sc_obj);
+
+    decode_json_obj(m[sc_name], sc_obj);
+  }
+  // re-point the cached STANDARD pointer at the (possibly new) map entry
+  standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+}
+
+// Write the placement tier; the "s3" section is emitted only for tier_type
+// "cloud-s3".
+void RGWZoneGroupPlacementTier::dump(Formatter *f) const
+{
+  encode_json("tier_type", tier_type, f);
+  encode_json("storage_class", storage_class, f);
+  encode_json("retain_head_object", retain_head_object, f);
+
+  const bool is_cloud_s3 = (tier_type == "cloud-s3");
+  if (is_cloud_s3) {
+    encode_json("s3", t.s3, f);
+  }
+}
+
+// Load the S3 tier settings.  "host_style" selects VirtualStyle only for the
+// exact string "virtual"; any other value (including none) gives PathStyle.
+void RGWZoneGroupPlacementTierS3::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("endpoint", endpoint, obj);
+  JSONDecoder::decode_json("access_key", key.id, obj);
+  JSONDecoder::decode_json("secret", key.key, obj);
+  JSONDecoder::decode_json("region", region, obj);
+
+  string style;
+  JSONDecoder::decode_json("host_style", style, obj);
+  host_style = (style == "virtual") ? VirtualStyle : PathStyle;
+
+  JSONDecoder::decode_json("target_storage_class", target_storage_class, obj);
+  JSONDecoder::decode_json("target_path", target_path, obj);
+  JSONDecoder::decode_json("acl_mappings", acl_mappings, obj);
+  JSONDecoder::decode_json("multipart_sync_threshold", multipart_sync_threshold, obj);
+  JSONDecoder::decode_json("multipart_min_part_size", multipart_min_part_size, obj);
+}
+
+// Write only the optional members that are actually set.
+void RGWZoneStorageClass::dump(Formatter *f) const
+{
+  if (data_pool) {
+    encode_json("data_pool", *data_pool, f);
+  }
+  if (compression_type) {
+    encode_json("compression_type", *compression_type, f);
+  }
+}
+
+// Read both optional members from JSON.
+void RGWZoneStorageClass::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("data_pool", data_pool, obj);
+  JSONDecoder::decode_json("compression_type", compression_type, obj);
+}
+
+void
RGWTierACLMapping::decode_json(JSONObj *obj) +{ + string s; + JSONDecoder::decode_json("type", s, obj); + if (s == "email") { + type = ACL_TYPE_EMAIL_USER; + } else if (s == "uri") { + type = ACL_TYPE_GROUP; + } else { + type = ACL_TYPE_CANON_USER; + } + + JSONDecoder::decode_json("source_id", source_id, obj); + JSONDecoder::decode_json("dest_id", dest_id, obj); +} + +void RGWZoneGroupPlacementTierS3::dump(Formatter *f) const +{ + encode_json("endpoint", endpoint, f); + encode_json("access_key", key.id, f); + encode_json("secret", key.key, f); + encode_json("region", region, f); + string s = (host_style == PathStyle ? "path" : "virtual"); + encode_json("host_style", s, f); + encode_json("target_storage_class", target_storage_class, f); + encode_json("target_path", target_path, f); + encode_json("acl_mappings", acl_mappings, f); + encode_json("multipart_sync_threshold", multipart_sync_threshold, f); + encode_json("multipart_min_part_size", multipart_min_part_size, f); +} + +void RGWTierACLMapping::dump(Formatter *f) const +{ + string s; + switch (type) { + case ACL_TYPE_EMAIL_USER: + s = "email"; + break; + case ACL_TYPE_GROUP: + s = "uri"; + break; + default: + s = "id"; + break; + } + encode_json("type", s, f); + encode_json("source_id", source_id, f); + encode_json("dest_id", dest_id, f); +} + +void RGWPeriodMap::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json_map("zonegroups", zonegroups, f); + encode_json("short_zone_ids", short_zone_ids, f); +} + +static void decode_zonegroups(map& zonegroups, JSONObj *o) +{ + RGWZoneGroup zg; + zg.decode_json(o); + zonegroups[zg.get_id()] = zg; +} + +void RGWPeriodMap::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("zonegroups", zonegroups, decode_zonegroups, obj); + /* backward compatability with region */ + if (zonegroups.empty()) { + JSONDecoder::decode_json("regions", zonegroups, obj); + } + /* backward compatability with region */ + if 
(master_zonegroup.empty()) { + JSONDecoder::decode_json("master_region", master_zonegroup, obj); + } + JSONDecoder::decode_json("short_zone_ids", short_zone_ids, obj); +} + +void RGWPeriodMap::decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(id, bl); + decode(zonegroups, bl); + decode(master_zonegroup, bl); + if (struct_v >= 2) { + decode(short_zone_ids, bl); + } + DECODE_FINISH(bl); + + zonegroups_by_api.clear(); + for (map::iterator iter = zonegroups.begin(); + iter != zonegroups.end(); ++iter) { + RGWZoneGroup& zonegroup = iter->second; + zonegroups_by_api[zonegroup.api_name] = zonegroup; + if (zonegroup.is_master_zonegroup()) { + master_zonegroup = zonegroup.get_id(); + } + } +} + +void RGWPeriodMap::encode(bufferlist& bl) const +{ + ENCODE_START(2, 1, bl); + encode(id, bl); + encode(zonegroups, bl); + encode(master_zonegroup, bl); + encode(short_zone_ids, bl); + ENCODE_FINISH(bl); +} + +int RGWSystemMetaObj::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + int ret; + + /* check to see the name is not used */ + ret = read_id(dpp, name, id, y); + if (exclusive && ret == 0) { + ldpp_dout(dpp, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl; + return -EEXIST; + } else if ( ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + if (id.empty()) { + /* create unique id */ + uuid_d new_uuid; + char uuid_str[37]; + new_uuid.generate_random(); + new_uuid.print(uuid_str); + id = uuid_str; + } + + ret = store_info(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return store_name(dpp, exclusive, y); +} + +int RGWSystemMetaObj::read_default_id(const DoutPrefixProvider *dpp, string& default_id, optional_yield y, + bool old_format) +{ + RGWDefaultSystemMetaObjInfo default_info; + + int ret = 
read_default(dpp, default_info, get_default_oid(old_format), y); + if (ret < 0) { + return ret; + } + + default_id = default_info.default_id; + + return 0; +} + +int RGWSystemMetaObj::set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + using ceph::encode; + string oid = get_default_oid(); + + rgw_pool pool(get_pool(cct)); + bufferlist bl; + + RGWDefaultSystemMetaObjInfo default_info; + default_info.default_id = id; + + encode(default_info, bl); + + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid)); + int ret = sysobj.wop() + .set_exclusive(exclusive) + .write(dpp, bl, y); + if (ret < 0) + return ret; + + return 0; +} + +int RGWSystemMetaObj::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + rgw_pool pool(get_pool(cct)); + + string oid = get_info_oid_prefix() + id; + + bufferlist bl; + using ceph::encode; + encode(*this, bl); + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid}); + return sysobj.wop() + .set_exclusive(exclusive) + .write(dpp, bl, y); +} + +int RGWSystemMetaObj::read_id(const DoutPrefixProvider *dpp, const string& obj_name, string& object_id, + optional_yield y) +{ + using ceph::decode; + rgw_pool pool(get_pool(cct)); + bufferlist bl; + + string oid = get_names_oid_prefix() + obj_name; + + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid)); + int ret = sysobj.rop().read(dpp, &bl, y); + if (ret < 0) { + return ret; + } + + RGWNameToId nameToId; + try { + auto iter = bl.cbegin(); + decode(nameToId, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl; + return -EIO; + } + object_id = nameToId.obj_id; + return 0; +} + +int RGWSystemMetaObj::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + rgw_pool pool(get_pool(cct)); + string oid = get_names_oid_prefix() + name; + + RGWNameToId nameToId; + nameToId.obj_id = id; + + bufferlist bl; + using ceph::encode; + 
encode(nameToId, bl); + auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid)); + return sysobj.wop() + .set_exclusive(exclusive) + .write(dpp, bl, y); +} + +bool RGWPeriodMap::find_zone_by_id(const rgw_zone_id& zone_id, + RGWZoneGroup *zonegroup, + RGWZone *zone) const +{ + for (auto& iter : zonegroups) { + auto& zg = iter.second; + + auto ziter = zg.zones.find(zone_id); + if (ziter != zg.zones.end()) { + *zonegroup = zg; + *zone = ziter->second; + return true; + } + } + + return false; +} + +int RGWZoneGroup::set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive) +{ + if (realm_id.empty()) { + /* try using default realm */ + RGWRealm realm; + int ret = realm.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldpp_dout(dpp, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl; + return -EINVAL; + } + realm_id = realm.get_id(); + } + + return RGWSystemMetaObj::set_as_default(dpp, y, exclusive); +} + +int RGWSystemMetaObj::write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) +{ + int ret = store_info(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl; + return ret; + } + ret = store_name(dpp, exclusive, y); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl; + return ret; + } + return 0; +} + +namespace rgw { + +int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y, + const std::set& pools, RGWZoneParams& info) +{ + info.domain_root = fix_zone_pool_dup(pools, info.name, ".rgw.meta:root", info.domain_root); + info.control_pool = fix_zone_pool_dup(pools, info.name, ".rgw.control", info.control_pool); + info.gc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:gc", info.gc_pool); + info.lc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:lc", info.lc_pool); + info.log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log", info.log_pool); + info.intent_log_pool = 
fix_zone_pool_dup(pools, info.name, ".rgw.log:intent", info.intent_log_pool); + info.usage_log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:usage", info.usage_log_pool); + info.user_keys_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.keys", info.user_keys_pool); + info.user_email_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.email", info.user_email_pool); + info.user_swift_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.swift", info.user_swift_pool); + info.user_uid_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.uid", info.user_uid_pool); + info.roles_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:roles", info.roles_pool); + info.reshard_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:reshard", info.reshard_pool); + info.otp_pool = fix_zone_pool_dup(pools, info.name, ".rgw.otp", info.otp_pool); + info.oidc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:oidc", info.oidc_pool); + info.notif_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:notif", info.notif_pool); + + for (auto& [pname, placement] : info.placement_pools) { + placement.index_pool = fix_zone_pool_dup(pools, info.name, "." + default_bucket_index_pool_suffix, placement.index_pool); + placement.data_extra_pool= fix_zone_pool_dup(pools, info.name, "." + default_storage_extra_pool_suffix, placement.data_extra_pool); + for (auto& [sname, sc] : placement.storage_classes.get_all()) { + if (sc.data_pool) { + sc.data_pool = fix_zone_pool_dup(pools, info.name, "." 
+ default_storage_pool_suffix, *sc.data_pool); + } + } + } + + return 0; +} + +int add_zone_to_group(const DoutPrefixProvider* dpp, RGWZoneGroup& zonegroup, + const RGWZoneParams& zone_params, + const bool *pis_master, const bool *pread_only, + const std::list& endpoints, + const std::string *ptier_type, + const bool *psync_from_all, + const std::list& sync_from, + const std::list& sync_from_rm, + const std::string *predirect_zone, + std::optional bucket_index_max_shards, + const rgw::zone_features::set& enable_features, + const rgw::zone_features::set& disable_features) +{ + const std::string& zone_id = zone_params.id; + const std::string& zone_name = zone_params.name; + + if (zone_id.empty()) { + ldpp_dout(dpp, -1) << __func__ << " requires a zone id" << dendl; + return -EINVAL; + } + if (zone_name.empty()) { + ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl; + return -EINVAL; + } + + // check for duplicate zone name on insert + if (!zonegroup.zones.count(zone_id)) { + for (const auto& [id, zone] : zonegroup.zones) { + if (zone.name == zone_name) { + ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name + << " (" << id << ") in zonegroup " << zonegroup.name << dendl; + return -EEXIST; + } + } + } + + rgw_zone_id& master_zone = zonegroup.master_zone; + if (pis_master) { + if (*pis_master) { + if (!master_zone.empty() && master_zone != zone_id) { + ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: " + << master_zone << dendl; + } + master_zone = zone_id; + } else if (master_zone == zone_id) { + master_zone.clear(); + } + } else if (master_zone.empty() && zonegroup.zones.empty()) { + ldpp_dout(dpp, 0) << "NOTICE: promoted " << zone_name + << " as new master_zone of zonegroup " << zonegroup.name << dendl; + master_zone = zone_id; + } + + // make sure the zone's placement targets are named in the zonegroup + for (const auto& [name, placement] : zone_params.placement_pools) { + auto target = RGWZoneGroupPlacementTarget{.name = 
name}; + zonegroup.placement_targets.emplace(name, std::move(target)); + } + + RGWZone& zone = zonegroup.zones[zone_params.id]; + zone.id = zone_params.id; + zone.name = zone_params.name; + if (!endpoints.empty()) { + zone.endpoints = endpoints; + } + if (pread_only) { + zone.read_only = *pread_only; + } + if (ptier_type) { + zone.tier_type = *ptier_type; + } + if (psync_from_all) { + zone.sync_from_all = *psync_from_all; + } + if (predirect_zone) { + zone.redirect_zone = *predirect_zone; + } + if (bucket_index_max_shards) { + zone.bucket_index_max_shards = *bucket_index_max_shards; + } + + // add/remove sync_from + for (auto add : sync_from) { + zone.sync_from.insert(add); + } + + for (const auto& rm : sync_from_rm) { + auto i = zone.sync_from.find(rm); + if (i == zone.sync_from.end()) { + ldpp_dout(dpp, 1) << "WARNING: zone \"" << rm + << "\" was not in sync_from" << dendl; + continue; + } + zone.sync_from.erase(i); + } + + // add/remove supported features + zone.supported_features.insert(enable_features.begin(), + enable_features.end()); + + for (const auto& feature : disable_features) { + if (zonegroup.enabled_features.contains(feature)) { + ldpp_dout(dpp, -1) << "ERROR: Cannot disable zone feature \"" << feature + << "\" until it's been disabled in zonegroup " << zonegroup.name << dendl; + return -EINVAL; + } + auto i = zone.supported_features.find(feature); + if (i == zone.supported_features.end()) { + ldpp_dout(dpp, 1) << "WARNING: zone feature \"" << feature + << "\" was not enabled in zone " << zone.name << dendl; + continue; + } + zone.supported_features.erase(i); + } + + const bool log_data = zonegroup.zones.size() > 1; + for (auto& [id, zone] : zonegroup.zones) { + zone.log_data = log_data; + } + + return 0; +} + +} // namespace rgw + diff --git a/src/rgw/rgw_zone_features.h b/src/rgw/rgw_zone_features.h new file mode 100644 index 000000000..5e1a435d4 --- /dev/null +++ b/src/rgw/rgw_zone_features.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* N.B., this header defines fundamental serialized types. Do not + * include files which can only be compiled in radosgw or OSD + * contexts (e.g., rgw_sal.h, rgw_common.h) */ + +#pragma once + +#include +#include + +namespace rgw::zone_features { + +// zone feature names +inline constexpr std::string_view resharding = "resharding"; +inline constexpr std::string_view compress_encrypted = "compress-encrypted"; + +// static list of features supported by this release +inline constexpr std::initializer_list supported = { + resharding, + compress_encrypted, +}; + +inline constexpr bool supports(std::string_view feature) { + for (auto i : supported) { + if (feature.compare(i) == 0) { + return true; + } + } + return false; +} + +// static list of features enabled by default on new zonegroups +inline constexpr std::initializer_list enabled = { + resharding, +}; + + +// enable string_view overloads for find() contains() etc +struct feature_less : std::less { + using is_transparent = std::true_type; +}; + +using set = boost::container::flat_set; + +} // namespace rgw::zone_features diff --git a/src/rgw/rgw_zone_types.h b/src/rgw/rgw_zone_types.h new file mode 100644 index 000000000..f2881dfef --- /dev/null +++ b/src/rgw/rgw_zone_types.h @@ -0,0 +1,625 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* N.B., this header defines fundamental serialized types. 
Do not + * introduce changes or include files which can only be compiled in + * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h) + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "include/types.h" +#include "rgw_bucket_layout.h" +#include "rgw_zone_features.h" +#include "rgw_pool_types.h" +#include "rgw_acl_types.h" +#include "rgw_placement_types.h" + +#include "common/Formatter.h" + +class JSONObj; + +namespace rgw_zone_defaults { + +extern std::string zone_names_oid_prefix; +extern std::string region_info_oid_prefix; +extern std::string realm_names_oid_prefix; +extern std::string zone_group_info_oid_prefix; +extern std::string realm_info_oid_prefix; +extern std::string default_region_info_oid; +extern std::string default_zone_group_info_oid; +extern std::string region_map_oid; +extern std::string default_realm_info_oid; +extern std::string default_zonegroup_name; +extern std::string default_zone_name; +extern std::string zonegroup_names_oid_prefix; +extern std::string RGW_DEFAULT_ZONE_ROOT_POOL; +extern std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL; +extern std::string RGW_DEFAULT_REALM_ROOT_POOL; +extern std::string RGW_DEFAULT_PERIOD_ROOT_POOL; +extern std::string avail_pools; +extern std::string default_storage_pool_suffix; + +} /* namespace rgw_zone_defaults */ + +struct RGWNameToId { + std::string obj_id; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(obj_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(obj_id, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWNameToId) + +struct RGWDefaultSystemMetaObjInfo { + std::string default_id; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(default_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + 
decode(default_id, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWDefaultSystemMetaObjInfo) + +struct RGWZoneStorageClass { + boost::optional data_pool; + boost::optional compression_type; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(data_pool, bl); + encode(compression_type, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(data_pool, bl); + decode(compression_type, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWZoneStorageClass) + +class RGWZoneStorageClasses { + std::map m; + + /* in memory only */ + RGWZoneStorageClass *standard_class; + +public: + RGWZoneStorageClasses() { + standard_class = &m[RGW_STORAGE_CLASS_STANDARD]; + } + RGWZoneStorageClasses(const RGWZoneStorageClasses& rhs) { + m = rhs.m; + standard_class = &m[RGW_STORAGE_CLASS_STANDARD]; + } + RGWZoneStorageClasses& operator=(const RGWZoneStorageClasses& rhs) { + m = rhs.m; + standard_class = &m[RGW_STORAGE_CLASS_STANDARD]; + return *this; + } + + const RGWZoneStorageClass& get_standard() const { + return *standard_class; + } + + bool find(const std::string& sc, const RGWZoneStorageClass** pstorage_class) const { + auto iter = m.find(sc); + if (iter == m.end()) { + return false; + } + *pstorage_class = &iter->second; + return true; + } + + bool exists(const std::string& sc) const { + if (sc.empty()) { + return true; + } + auto iter = m.find(sc); + return (iter != m.end()); + } + + const std::map& get_all() const { + return m; + } + + std::map& get_all() { + return m; + } + + void set_storage_class(const std::string& sc, const rgw_pool* data_pool, const std::string* compression_type) { + const std::string *psc = ≻ + if (sc.empty()) { + psc = &RGW_STORAGE_CLASS_STANDARD; + } + RGWZoneStorageClass& storage_class = m[*psc]; + if (data_pool) { + 
storage_class.data_pool = *data_pool; + } + if (compression_type) { + storage_class.compression_type = *compression_type; + } + } + + void remove_storage_class(const std::string& sc) { + if (!sc.empty()) { + m.erase(sc); + } + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(m, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(m, bl); + standard_class = &m[RGW_STORAGE_CLASS_STANDARD]; + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWZoneStorageClasses) + +struct RGWZonePlacementInfo { + rgw_pool index_pool; + rgw_pool data_extra_pool; /* if not set we should use data_pool */ + RGWZoneStorageClasses storage_classes; + rgw::BucketIndexType index_type; + bool inline_data; + + RGWZonePlacementInfo() : index_type(rgw::BucketIndexType::Normal), inline_data(true) {} + + void encode(bufferlist& bl) const { + ENCODE_START(8, 1, bl); + encode(index_pool.to_str(), bl); + rgw_pool standard_data_pool = get_data_pool(RGW_STORAGE_CLASS_STANDARD); + encode(standard_data_pool.to_str(), bl); + encode(data_extra_pool.to_str(), bl); + encode((uint32_t)index_type, bl); + std::string standard_compression_type = get_compression_type(RGW_STORAGE_CLASS_STANDARD); + encode(standard_compression_type, bl); + encode(storage_classes, bl); + encode(inline_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(8, bl); + std::string index_pool_str; + std::string data_pool_str; + decode(index_pool_str, bl); + index_pool = rgw_pool(index_pool_str); + decode(data_pool_str, bl); + rgw_pool standard_data_pool(data_pool_str); + if (struct_v >= 4) { + std::string data_extra_pool_str; + decode(data_extra_pool_str, bl); + data_extra_pool = rgw_pool(data_extra_pool_str); + } + if (struct_v >= 5) { + uint32_t it; + decode(it, bl); + index_type = (rgw::BucketIndexType)it; + } + std::string 
standard_compression_type; + if (struct_v >= 6) { + decode(standard_compression_type, bl); + } + if (struct_v >= 7) { + decode(storage_classes, bl); + } else { + storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &standard_data_pool, + (!standard_compression_type.empty() ? &standard_compression_type : nullptr)); + } + if (struct_v >= 8) { + decode(inline_data, bl); + } + DECODE_FINISH(bl); + } + const rgw_pool& get_data_extra_pool() const { + static rgw_pool no_pool; + if (data_extra_pool.empty()) { + return storage_classes.get_standard().data_pool.get_value_or(no_pool); + } + return data_extra_pool; + } + const rgw_pool& get_data_pool(const std::string& sc) const { + const RGWZoneStorageClass *storage_class; + static rgw_pool no_pool; + + if (!storage_classes.find(sc, &storage_class)) { + return storage_classes.get_standard().data_pool.get_value_or(no_pool); + } + + return storage_class->data_pool.get_value_or(no_pool); + } + const rgw_pool& get_standard_data_pool() const { + return get_data_pool(RGW_STORAGE_CLASS_STANDARD); + } + + const std::string& get_compression_type(const std::string& sc) const { + const RGWZoneStorageClass *storage_class; + static std::string no_compression; + + if (!storage_classes.find(sc, &storage_class)) { + return no_compression; + } + return storage_class->compression_type.get_value_or(no_compression); + } + + bool storage_class_exists(const std::string& sc) const { + return storage_classes.exists(sc); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + +}; +WRITE_CLASS_ENCODER(RGWZonePlacementInfo) + +struct RGWZone { + std::string id; + std::string name; + std::list endpoints; // std::vector? + bool log_meta; + bool log_data; + bool read_only; + std::string tier_type; + std::string redirect_zone; + +/** + * Represents the number of shards for the bucket index object, a value of zero + * indicates there is no sharding. 
By default (no sharding, the name of the object + * is '.dir.{marker}', with sharding, the name is '.dir.{marker}.{sharding_id}', + * sharding_id is zero-based value. It is not recommended to set a too large value + * (e.g. thousand) as it increases the cost for bucket listing. + */ + uint32_t bucket_index_max_shards; + + // pre-shard buckets on creation to enable some write-parallism by default, + // delay the need to reshard as the bucket grows, and (in multisite) get some + // bucket index sharding where dynamic resharding is not supported + static constexpr uint32_t default_bucket_index_max_shards = 11; + + bool sync_from_all; + std::set sync_from; /* list of zones to sync from */ + + rgw::zone_features::set supported_features; + + RGWZone() + : log_meta(false), log_data(false), read_only(false), + bucket_index_max_shards(default_bucket_index_max_shards), + sync_from_all(true) {} + + void encode(bufferlist& bl) const { + ENCODE_START(8, 1, bl); + encode(name, bl); + encode(endpoints, bl); + encode(log_meta, bl); + encode(log_data, bl); + encode(bucket_index_max_shards, bl); + encode(id, bl); + encode(read_only, bl); + encode(tier_type, bl); + encode(sync_from_all, bl); + encode(sync_from, bl); + encode(redirect_zone, bl); + encode(supported_features, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(8, bl); + decode(name, bl); + if (struct_v < 4) { + id = name; + } + decode(endpoints, bl); + if (struct_v >= 2) { + decode(log_meta, bl); + decode(log_data, bl); + } + if (struct_v >= 3) { + decode(bucket_index_max_shards, bl); + } + if (struct_v >= 4) { + decode(id, bl); + decode(read_only, bl); + } + if (struct_v >= 5) { + decode(tier_type, bl); + } + if (struct_v >= 6) { + decode(sync_from_all, bl); + decode(sync_from, bl); + } + if (struct_v >= 7) { + decode(redirect_zone, bl); + } + if (struct_v >= 8) { + decode(supported_features, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void 
decode_json(JSONObj *obj); + static void generate_test_instances(std::list& o); + + bool is_read_only() const { return read_only; } + + bool syncs_from(const std::string& zone_name) const { + return (sync_from_all || sync_from.find(zone_name) != sync_from.end()); + } + + bool supports(std::string_view feature) const { + return supported_features.contains(feature); + } +}; +WRITE_CLASS_ENCODER(RGWZone) + +struct RGWDefaultZoneGroupInfo { + std::string default_zonegroup; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(default_zonegroup, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(default_zonegroup, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + //todo: implement ceph-dencoder +}; +WRITE_CLASS_ENCODER(RGWDefaultZoneGroupInfo) + +struct RGWTierACLMapping { + ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER}; + std::string source_id; + std::string dest_id; + + RGWTierACLMapping() = default; + + RGWTierACLMapping(ACLGranteeTypeEnum t, + const std::string& s, + const std::string& d) : type(t), + source_id(s), + dest_id(d) {} + + void init(const JSONFormattable& config) { + const std::string& t = config["type"]; + + if (t == "email") { + type = ACL_TYPE_EMAIL_USER; + } else if (t == "uri") { + type = ACL_TYPE_GROUP; + } else { + type = ACL_TYPE_CANON_USER; + } + + source_id = config["source_id"]; + dest_id = config["dest_id"]; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode((uint32_t)type, bl); + encode(source_id, bl); + encode(dest_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + uint32_t it; + decode(it, bl); + type = (ACLGranteeTypeEnum)it; + decode(source_id, bl); + decode(dest_id, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWTierACLMapping) + +enum HostStyle { 
+ PathStyle = 0, + VirtualStyle = 1, +}; + +struct RGWZoneGroupPlacementTierS3 { +#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024) + std::string endpoint; + RGWAccessKey key; + std::string region; + HostStyle host_style{PathStyle}; + std::string target_storage_class; + + /* Should below be bucket/zone specific?? */ + std::string target_path; + std::map acl_mappings; + + uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE}; + uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE}; + + int update_params(const JSONFormattable& config); + int clear_params(const JSONFormattable& config); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(endpoint, bl); + encode(key, bl); + encode(region, bl); + encode((uint32_t)host_style, bl); // XXX kill C-style casts + encode(target_storage_class, bl); + encode(target_path, bl); + encode(acl_mappings, bl); + encode(multipart_sync_threshold, bl); + encode(multipart_min_part_size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(endpoint, bl); + decode(key, bl); + decode(region, bl); + + uint32_t it; + decode(it, bl); + host_style = (HostStyle)it; // XXX can't this be HostStyle(it)? 
+ + decode(target_storage_class, bl); + decode(target_path, bl); + decode(acl_mappings, bl); + decode(multipart_sync_threshold, bl); + decode(multipart_min_part_size, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTierS3) + +struct RGWZoneGroupPlacementTier { + std::string tier_type; + std::string storage_class; + bool retain_head_object = false; + + struct _tier { + RGWZoneGroupPlacementTierS3 s3; + } t; + + int update_params(const JSONFormattable& config); + int clear_params(const JSONFormattable& config); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(tier_type, bl); + encode(storage_class, bl); + encode(retain_head_object, bl); + if (tier_type == "cloud-s3") { + encode(t.s3, bl); + } + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(tier_type, bl); + decode(storage_class, bl); + decode(retain_head_object, bl); + if (tier_type == "cloud-s3") { + decode(t.s3, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTier) + +struct RGWZoneGroupPlacementTarget { + std::string name; + std::set tags; + std::set storage_classes; + std::map tier_targets; + + bool user_permitted(const std::list& user_tags) const { + if (tags.empty()) { + return true; + } + for (auto& rule : user_tags) { + if (tags.find(rule) != tags.end()) { + return true; + } + } + return false; + } + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(name, bl); + encode(tags, bl); + encode(storage_classes, bl); + encode(tier_targets, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(name, bl); + decode(tags, bl); + if (struct_v >= 2) { + decode(storage_classes, bl); + } + if (storage_classes.empty()) { + 
storage_classes.insert(RGW_STORAGE_CLASS_STANDARD); + } + if (struct_v >= 3) { + decode(tier_targets, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTarget) diff --git a/src/rgw/rgwam.py b/src/rgw/rgwam.py new file mode 100755 index 000000000..f07d2b423 --- /dev/null +++ b/src/rgw/rgwam.py @@ -0,0 +1,240 @@ +#!@Python3_EXECUTABLE@ +# -*- mode:python -*- +# vim: ts=4 sw=4 smarttab expandtab +# +# Processed in Makefile to add python #! line and version variable +# +# + +import subprocess +import random +import string +import json +import argparse +import sys +import socket +import base64 +import logging + +from urllib.parse import urlparse + +from ceph.rgw.rgwam_core import RGWAM, EnvArgs +from ceph.rgw.types import RGWAMEnvMgr, RGWAMException + +class RGWAMCLIMgr(RGWAMEnvMgr): + def __init__(self, common_args): + args = [] + + if common_args.conf_path: + args += [ '-c', common_args.conf_path ] + + if common_args.ceph_name: + args += [ '-n', common_args.ceph_name ] + + if common_args.ceph_keyring: + args += [ '-k', common_args.ceph_keyring ] + + self.args_prefix = args + + def tool_exec(self, prog, args): + run_cmd = [ prog ] + self.args_prefix + args + + result = subprocess.run(run_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + stdout = result.stdout.decode('utf-8') + stderr = result.stderr.decode('utf-8') + + return run_cmd, result.returncode, stdout, stderr + + def apply_rgw(self, svc_id, realm_name, zone_name, port = None): + return None + + def list_daemons(self, service_name, daemon_type = None, daemon_id = None, hostname = None, refresh = True): + return [] + +class RealmCommand: + def __init__(self, env, args): + self.env = env + self.args = args + + def parse(self): + parser = argparse.ArgumentParser( + usage='''rgwam realm + +The subcommands are: + bootstrap Bootstrap new realm + new-zone-creds Create credentials for connecting new zone +''') + 
parser.add_argument('subcommand', help='Subcommand to run') + # parse_args defaults to [1:] for args, but you need to + # exclude the rest of the args too, or validation will fail + args = parser.parse_args(self.args[0:1]) + + sub = args.subcommand.replace('-', '_') + + if not hasattr(self, sub): + print('Unrecognized subcommand:', args.subcommand) + parser.print_help() + exit(1) + # use dispatch pattern to invoke method with same name + + return getattr(self, sub) + + def bootstrap(self): + parser = argparse.ArgumentParser( + description='Bootstrap new realm', + usage='rgwam realm bootstrap []') + parser.add_argument('--realm') + parser.add_argument('--zonegroup') + parser.add_argument('--zone') + parser.add_argument('--endpoints') + parser.add_argument('--sys-uid') + parser.add_argument('--uid') + parser.add_argument('--start-radosgw', action='store_true', dest='start_radosgw', default=True) + parser.add_argument('--no-start-radosgw', action='store_false', dest='start_radosgw') + + args = parser.parse_args(self.args[1:]) + + return RGWAM(self.env).realm_bootstrap(args.realm, args.zonegroup, args.zone, args.endpoints, + args.sys_uid, args.uid, args.start_radosgw) + + def new_zone_creds(self): + parser = argparse.ArgumentParser( + description='Bootstrap new realm', + usage='rgwam realm new-zone-creds []') + parser.add_argument('--endpoints') + parser.add_argument('--sys-uid') + + args = parser.parse_args(self.args[1:]) + + return RGWAM(self.env).realm_new_zone_creds(args.endpoints, args.sys_uid) + + +class ZoneCommand: + def __init__(self, env, args): + self.env = env + self.args = args + + def parse(self): + parser = argparse.ArgumentParser( + usage='''rgwam zone + +The subcommands are: + run run radosgw daemon in current zone +''') + parser.add_argument('subcommand', help='Subcommand to run') + # parse_args defaults to [1:] for args, but you need to + # exclude the rest of the args too, or validation will fail + args = parser.parse_args(self.args[0:1]) + if not 
hasattr(self, args.subcommand): + print('Unrecognized subcommand:', args.subcommand) + parser.print_help() + exit(1) + # use dispatch pattern to invoke method with same name + return getattr(self, args.subcommand) + + def run(self): + parser = argparse.ArgumentParser( + description='Run radosgw daemon', + usage='rgwam zone run []') + parser.add_argument('--port') + parser.add_argument('--log-file') + parser.add_argument('--debug-ms') + parser.add_argument('--debug-rgw') + + args = parser.parse_args(self.args[1:]) + + return RGWAM(self.env).run_radosgw(port = args.port) + + def create(self): + parser = argparse.ArgumentParser( + description='Create new zone to join existing realm', + usage='rgwam zone create []') + parser.add_argument('--realm-token') + parser.add_argument('--zone') + parser.add_argument('--zonegroup') + parser.add_argument('--endpoints') + parser.add_argument('--start-radosgw', action='store_true', dest='start_radosgw', default=True) + parser.add_argument('--no-start-radosgw', action='store_false', dest='start_radosgw') + + args = parser.parse_args(self.args[1:]) + + return RGWAM(self.env).zone_create(args.realm_token, args.zonegroup, args.zone, args.endpoints, args.start_radosgw) + +class CommonArgs: + def __init__(self, ns): + self.conf_path = ns.conf_path + self.ceph_name = ns.ceph_name + self.ceph_keyring = ns.ceph_keyring + +class TopLevelCommand: + + def _parse(self): + parser = argparse.ArgumentParser( + description='RGW assist for multisite tool', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +The commands are: + realm bootstrap Bootstrap new realm + realm new-zone-creds Create credentials to connect new zone to realm + zone create Create new zone and connect it to existing realm + zone run Run radosgw in current zone +''') + + parser.add_argument('command', help='command to run', default=None) + parser.add_argument('-c', help='ceph conf path', dest='conf_path') + parser.add_argument('-n', help='ceph user name', 
dest='ceph_name') + parser.add_argument('-k', help='ceph keyring', dest='ceph_keyring') + + removed_args = [] + + args = sys.argv[1:] + if len(args) > 0: + if hasattr(self, args[0]): + # remove -h/--help if top command is not empty so that top level help + # doesn't override subcommand, we'll add it later + help_args = [ '-h', '--help' ] + removed_args = [arg for arg in args if arg in help_args] + args = [arg for arg in args if arg not in help_args] + + (ns, args) = parser.parse_known_args(args) + if not hasattr(self, ns.command) or ns.command[0] == '_': + print('Unrecognized command:', ns.command) + parser.print_help() + exit(1) + # use dispatch pattern to invoke method with same name + args += removed_args + return (getattr(self, ns.command), CommonArgs(ns), args) + + def realm(self, env, args): + cmd = RealmCommand(env, args).parse() + return cmd() + + def zone(self, env, args): + cmd = ZoneCommand(env, args).parse() + return cmd() + + +def main(): + logging.basicConfig(level=logging.INFO) + + log = logging.getLogger(__name__) + + (cmd, common_args, args)= TopLevelCommand()._parse() + + env = EnvArgs(RGWAMCLIMgr(common_args)) + + try: + retval, out, err = cmd(env, args) + if retval != 0: + log.error('stdout: '+ out + '\nstderr: ' + err) + sys.exit(retval) + except RGWAMException as e: + print('ERROR: ' + e.message) + + sys.exit(0) + + +if __name__ == '__main__': + main() + diff --git a/src/rgw/services/svc_bi.h b/src/rgw/services/svc_bi.h new file mode 100644 index 000000000..bd811e162 --- /dev/null +++ b/src/rgw/services/svc_bi.h @@ -0,0 +1,44 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +class RGWBucketInfo; +struct RGWBucketEnt; + + +class RGWSI_BucketIndex : public RGWServiceInstance +{ +public: + RGWSI_BucketIndex(CephContext *cct) : RGWServiceInstance(cct) {} + virtual ~RGWSI_BucketIndex() {} + + virtual int init_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) = 0; + virtual int clean_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) = 0; + + virtual int read_stats(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWBucketEnt *stats, + optional_yield y) = 0; + + virtual int handle_overwrite(const DoutPrefixProvider *dpp, + const RGWBucketInfo& info, + const RGWBucketInfo& orig_info, + optional_yield y) = 0; +}; diff --git a/src/rgw/services/svc_bi_rados.cc b/src/rgw/services/svc_bi_rados.cc new file mode 100644 index 000000000..6002b986f --- /dev/null +++ b/src/rgw/services/svc_bi_rados.cc @@ -0,0 +1,509 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_bi_rados.h" +#include "svc_bilog_rados.h" +#include "svc_zone.h" + +#include "rgw_bucket.h" +#include "rgw_zone.h" +#include "rgw_datalog.h" + +#include "cls/rgw/cls_rgw_client.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static string dir_oid_prefix = ".dir."; + +RGWSI_BucketIndex_RADOS::RGWSI_BucketIndex_RADOS(CephContext *cct) : RGWSI_BucketIndex(cct) +{ +} + +void RGWSI_BucketIndex_RADOS::init(RGWSI_Zone *zone_svc, + RGWSI_RADOS *rados_svc, + RGWSI_BILog_RADOS *bilog_svc, + RGWDataChangesLog *datalog_rados_svc) +{ + svc.zone = zone_svc; + svc.rados = rados_svc; + svc.bilog = bilog_svc; + svc.datalog_rados = datalog_rados_svc; +} + +int RGWSI_BucketIndex_RADOS::open_pool(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + RGWSI_RADOS::Pool *index_pool, + bool mostly_omap) 
+{ + *index_pool = svc.rados->pool(pool); + return index_pool->open(dpp, RGWSI_RADOS::OpenParams() + .set_mostly_omap(mostly_omap)); +} + +int RGWSI_BucketIndex_RADOS::open_bucket_index_pool(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWSI_RADOS::Pool *index_pool) +{ + const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool; + + if (!explicit_pool.empty()) { + return open_pool(dpp, explicit_pool, index_pool, false); + } + + auto& zonegroup = svc.zone->get_zonegroup(); + auto& zone_params = svc.zone->get_zone_params(); + + const rgw_placement_rule *rule = &bucket_info.placement_rule; + if (rule->empty()) { + rule = &zonegroup.default_placement; + } + auto iter = zone_params.placement_pools.find(rule->name); + if (iter == zone_params.placement_pools.end()) { + ldpp_dout(dpp, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl; + return -EINVAL; + } + + int r = open_pool(dpp, iter->second.index_pool, index_pool, true); + if (r < 0) + return r; + + return 0; +} + +int RGWSI_BucketIndex_RADOS::open_bucket_index_base(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWSI_RADOS::Pool *index_pool, + string *bucket_oid_base) +{ + const rgw_bucket& bucket = bucket_info.bucket; + int r = open_bucket_index_pool(dpp, bucket_info, index_pool); + if (r < 0) + return r; + + if (bucket.bucket_id.empty()) { + ldpp_dout(dpp, 0) << "ERROR: empty bucket_id for bucket operation" << dendl; + return -EIO; + } + + *bucket_oid_base = dir_oid_prefix; + bucket_oid_base->append(bucket.bucket_id); + + return 0; + +} + +int RGWSI_BucketIndex_RADOS::open_bucket_index(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWSI_RADOS::Pool *index_pool, + string *bucket_oid) +{ + const rgw_bucket& bucket = bucket_info.bucket; + int r = open_bucket_index_pool(dpp, bucket_info, index_pool); + if (r < 0) { + ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned " + << r << 
dendl; + return r; + } + + if (bucket.bucket_id.empty()) { + ldpp_dout(dpp, 0) << "ERROR: empty bucket id for bucket operation" << dendl; + return -EIO; + } + + *bucket_oid = dir_oid_prefix; + bucket_oid->append(bucket.bucket_id); + + return 0; +} + +static char bucket_obj_with_generation(char *buf, size_t len, const string& bucket_oid_base, uint64_t gen_id, + uint32_t shard_id) +{ + return snprintf(buf, len, "%s.%" PRIu64 ".%d", bucket_oid_base.c_str(), gen_id, shard_id); +} + +static char bucket_obj_without_generation(char *buf, size_t len, const string& bucket_oid_base, uint32_t shard_id) +{ + return snprintf(buf, len, "%s.%d", bucket_oid_base.c_str(), shard_id); +} + +static void get_bucket_index_objects(const string& bucket_oid_base, + uint32_t num_shards, uint64_t gen_id, + map *_bucket_objects, + int shard_id = -1) +{ + auto& bucket_objects = *_bucket_objects; + if (!num_shards) { + bucket_objects[0] = bucket_oid_base; + } else { + char buf[bucket_oid_base.size() + 64]; + if (shard_id < 0) { + for (uint32_t i = 0; i < num_shards; ++i) { + if (gen_id) { + bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, i); + } else { + bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, i); + } + bucket_objects[i] = buf; + } + } else { + if (std::cmp_greater(shard_id, num_shards)) { + return; + } else { + if (gen_id) { + bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, shard_id); + } else { + // for backward compatibility, gen_id(0) will not be added in the object name + bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, shard_id); + } + bucket_objects[shard_id] = buf; + } + } + } +} + +static void get_bucket_instance_ids(const RGWBucketInfo& bucket_info, + int num_shards, int shard_id, + map *result) +{ + const rgw_bucket& bucket = bucket_info.bucket; + string plain_id = bucket.name + ":" + bucket.bucket_id; + + if (!num_shards) { + (*result)[0] = plain_id; + } else { + char buf[16]; + if (shard_id < 0) { 
+ for (int i = 0; i < num_shards; ++i) { + snprintf(buf, sizeof(buf), ":%d", i); + (*result)[i] = plain_id + buf; + } + } else { + // valid shard ids are [0, num_shards); reject shard_id == num_shards as well + if (shard_id >= num_shards) { + return; + } + snprintf(buf, sizeof(buf), ":%d", shard_id); + (*result)[shard_id] = plain_id + buf; + } + } +} + +int RGWSI_BucketIndex_RADOS::open_bucket_index(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + std::optional _shard_id, + const rgw::bucket_index_layout_generation& idx_layout, + RGWSI_RADOS::Pool *index_pool, + map *bucket_objs, + map *bucket_instance_ids) +{ + int shard_id = _shard_id.value_or(-1); + string bucket_oid_base; + int ret = open_bucket_index_base(dpp, bucket_info, index_pool, &bucket_oid_base); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_base() returned " + << ret << dendl; + return ret; + } + + get_bucket_index_objects(bucket_oid_base, idx_layout.layout.normal.num_shards, + idx_layout.gen, bucket_objs, shard_id); + if (bucket_instance_ids) { + get_bucket_instance_ids(bucket_info, idx_layout.layout.normal.num_shards, + shard_id, bucket_instance_ids); + } + return 0; +} + +void RGWSI_BucketIndex_RADOS::get_bucket_index_object( + const std::string& bucket_oid_base, + const rgw::bucket_index_normal_layout& normal, + uint64_t gen_id, int shard_id, + std::string* bucket_obj) +{ + if (!normal.num_shards) { + // By default with no sharding, we use the bucket oid as itself + (*bucket_obj) = bucket_oid_base; + } else { + char buf[bucket_oid_base.size() + 64]; + if (gen_id) { + bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, shard_id); + (*bucket_obj) = buf; + ldout(cct, 10) << "bucket_obj is " << (*bucket_obj) << dendl; + } else { + // for backward compatibility, gen_id(0) will not be added in the object name + bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, shard_id); + (*bucket_obj) = buf; + } + } +} + +int RGWSI_BucketIndex_RADOS::get_bucket_index_object( + const std::string& bucket_oid_base, +
const rgw::bucket_index_normal_layout& normal, + uint64_t gen_id, const std::string& obj_key, + std::string* bucket_obj, int* shard_id) +{ + int r = 0; + switch (normal.hash_type) { + case rgw::BucketHashType::Mod: + if (!normal.num_shards) { + // By default with no sharding, we use the bucket oid as itself + (*bucket_obj) = bucket_oid_base; + if (shard_id) { + *shard_id = -1; + } + } else { + uint32_t sid = bucket_shard_index(obj_key, normal.num_shards); + char buf[bucket_oid_base.size() + 64]; + if (gen_id) { + bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, sid); + } else { + bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, sid); + } + (*bucket_obj) = buf; + if (shard_id) { + *shard_id = (int)sid; + } + } + break; + default: + r = -ENOTSUP; + } + return r; +} + +int RGWSI_BucketIndex_RADOS::open_bucket_index_shard(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const string& obj_key, + RGWSI_RADOS::Obj *bucket_obj, + int *shard_id) +{ + string bucket_oid_base; + + RGWSI_RADOS::Pool pool; + + int ret = open_bucket_index_base(dpp, bucket_info, &pool, &bucket_oid_base); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned " + << ret << dendl; + return ret; + } + + string oid; + + const auto& current_index = bucket_info.layout.current_index; + ret = get_bucket_index_object(bucket_oid_base, current_index.layout.normal, + current_index.gen, obj_key, &oid, shard_id); + if (ret < 0) { + ldpp_dout(dpp, 10) << "get_bucket_index_object() returned ret=" << ret << dendl; + return ret; + } + + *bucket_obj = svc.rados->obj(pool, oid); + + return 0; +} + +int RGWSI_BucketIndex_RADOS::open_bucket_index_shard(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, + int shard_id, + RGWSI_RADOS::Obj *bucket_obj) +{ + RGWSI_RADOS::Pool index_pool; + string bucket_oid_base; + int ret = open_bucket_index_base(dpp, bucket_info, 
&index_pool, &bucket_oid_base); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned " + << ret << dendl; + return ret; + } + + string oid; + + get_bucket_index_object(bucket_oid_base, index.layout.normal, + index.gen, shard_id, &oid); + + *bucket_obj = svc.rados->obj(index_pool, oid); + + return 0; +} + +int RGWSI_BucketIndex_RADOS::cls_bucket_head(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, + vector *headers, + map *bucket_instance_ids, + optional_yield y) +{ + RGWSI_RADOS::Pool index_pool; + map oids; + int r = open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids); + if (r < 0) + return r; + + map list_results; + for (auto& iter : oids) { + list_results.emplace(iter.first, rgw_cls_list_ret()); + } + + r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) + return r; + + map::iterator iter = list_results.begin(); + for(; iter != list_results.end(); ++iter) { + headers->push_back(std::move(iter->second.dir.header)); + } + return 0; +} + +int RGWSI_BucketIndex_RADOS::init_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) +{ + RGWSI_RADOS::Pool index_pool; + + string dir_oid = dir_oid_prefix; + int r = open_bucket_index_pool(dpp, bucket_info, &index_pool); + if (r < 0) { + return r; + } + + dir_oid.append(bucket_info.bucket.bucket_id); + + map bucket_objs; + get_bucket_index_objects(dir_oid, idx_layout.layout.normal.num_shards, idx_layout.gen, &bucket_objs); + + return CLSRGWIssueBucketIndexInit(index_pool.ioctx(), + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWSI_BucketIndex_RADOS::clean_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) +{ + RGWSI_RADOS::Pool 
index_pool; + + std::string dir_oid = dir_oid_prefix; + int r = open_bucket_index_pool(dpp, bucket_info, &index_pool); + if (r < 0) { + return r; + } + + dir_oid.append(bucket_info.bucket.bucket_id); + + std::map bucket_objs; + get_bucket_index_objects(dir_oid, idx_layout.layout.normal.num_shards, + idx_layout.gen, &bucket_objs); + + return CLSRGWIssueBucketIndexClean(index_pool.ioctx(), + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWSI_BucketIndex_RADOS::read_stats(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWBucketEnt *result, + optional_yield y) +{ + vector headers; + + result->bucket = bucket_info.bucket; + int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, &headers, nullptr, y); + if (r < 0) { + return r; + } + + result->count = 0; + result->size = 0; + result->size_rounded = 0; + + auto hiter = headers.begin(); + for (; hiter != headers.end(); ++hiter) { + RGWObjCategory category = RGWObjCategory::Main; + auto iter = (hiter->stats).find(category); + if (iter != hiter->stats.end()) { + struct rgw_bucket_category_stats& stats = iter->second; + result->count += stats.num_entries; + result->size += stats.total_size; + result->size_rounded += stats.total_size_rounded; + } + } + + result->placement_rule = std::move(bucket_info.placement_rule); + + return 0; +} + +int RGWSI_BucketIndex_RADOS::get_reshard_status(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, list *status) +{ + map bucket_objs; + + RGWSI_RADOS::Pool index_pool; + + int r = open_bucket_index(dpp, bucket_info, + std::nullopt, + bucket_info.layout.current_index, + &index_pool, + &bucket_objs, + nullptr); + if (r < 0) { + return r; + } + + for (auto i : bucket_objs) { + cls_rgw_bucket_instance_entry entry; + + int ret = cls_rgw_get_bucket_resharding(index_pool.ioctx(), i.second, &entry); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": 
cls_rgw_get_bucket_resharding() returned ret=" << ret << dendl; + return ret; + } + + status->push_back(entry); + } + + return 0; +} + +int RGWSI_BucketIndex_RADOS::handle_overwrite(const DoutPrefixProvider *dpp, + const RGWBucketInfo& info, + const RGWBucketInfo& orig_info, + optional_yield y) +{ + bool new_sync_enabled = info.datasync_flag_enabled(); + bool old_sync_enabled = orig_info.datasync_flag_enabled(); + + if (old_sync_enabled == new_sync_enabled) { + return 0; // datasync flag didn't change + } + if (info.layout.logs.empty()) { + return 0; // no bilog + } + const auto& bilog = info.layout.logs.back(); + if (bilog.layout.type != rgw::BucketLogType::InIndex) { + return -ENOTSUP; + } + const int shards_num = rgw::num_shards(bilog.layout.in_index); + + int ret; + if (!new_sync_enabled) { + ret = svc.bilog->log_stop(dpp, info, bilog, -1); + } else { + ret = svc.bilog->log_start(dpp, info, bilog, -1); + } + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed writing bilog (bucket=" << info.bucket << "); ret=" << ret << dendl; + return ret; + } + + for (int i = 0; i < shards_num; ++i) { + ret = svc.datalog_rados->add_entry(dpp, info, bilog, i, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed writing data log (info.bucket=" << info.bucket << ", shard_id=" << i << ")" << dendl; + } // datalog error is not fatal + } + + return 0; +} diff --git a/src/rgw/services/svc_bi_rados.h b/src/rgw/services/svc_bi_rados.h new file mode 100644 index 000000000..feba0cfcd --- /dev/null +++ b/src/rgw/services/svc_bi_rados.h @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_datalog.h" +#include "rgw_service.h" +#include "rgw_tools.h" + +#include "svc_bi.h" +#include "svc_rados.h" +#include "svc_tier_rados.h" + +struct rgw_bucket_dir_header; + +class RGWSI_BILog_RADOS; + +#define RGW_NO_SHARD -1 + +#define RGW_SHARDS_PRIME_0 7877 +#define RGW_SHARDS_PRIME_1 65521 + +/* + * Defined Bucket Index Namespaces + */ +#define RGW_OBJ_NS_MULTIPART "multipart" +#define RGW_OBJ_NS_SHADOW "shadow" + +class RGWSI_BucketIndex_RADOS : public RGWSI_BucketIndex +{ + friend class RGWSI_BILog_RADOS; + + int open_pool(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + RGWSI_RADOS::Pool *index_pool, + bool mostly_omap); + + int open_bucket_index_pool(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWSI_RADOS::Pool *index_pool); + int open_bucket_index_base(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWSI_RADOS::Pool *index_pool, + std::string *bucket_oid_base); + + // return the index oid for the given shard id + void get_bucket_index_object(const std::string& bucket_oid_base, + const rgw::bucket_index_normal_layout& normal, + uint64_t gen_id, int shard_id, + std::string* bucket_obj); + // return the index oid and shard id for the given object name + int get_bucket_index_object(const std::string& bucket_oid_base, + const rgw::bucket_index_normal_layout& normal, + uint64_t gen_id, const std::string& obj_key, + std::string* bucket_obj, int* shard_id); + + int cls_bucket_head(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, + std::vector *headers, + std::map *bucket_instance_ids, + optional_yield y); + +public: + + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_RADOS *rados{nullptr}; + RGWSI_BILog_RADOS *bilog{nullptr}; + RGWDataChangesLog *datalog_rados{nullptr}; + } svc; + + RGWSI_BucketIndex_RADOS(CephContext *cct); + + void init(RGWSI_Zone *zone_svc, + RGWSI_RADOS 
*rados_svc, + RGWSI_BILog_RADOS *bilog_svc, + RGWDataChangesLog *datalog_rados_svc); + + static int shards_max() { + return RGW_SHARDS_PRIME_1; + } + + static int shard_id(const std::string& key, int max_shards) { + return rgw_shard_id(key, max_shards); + } + + static uint32_t bucket_shard_index(const std::string& key, + int num_shards) { + uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size()); + uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); + return rgw_shards_mod(sid2, num_shards); + } + + static uint32_t bucket_shard_index(const rgw_obj_key& obj_key, + int num_shards) + { + std::string sharding_key; + if (obj_key.ns == RGW_OBJ_NS_MULTIPART) { + RGWMPObj mp; + mp.from_meta(obj_key.name); + sharding_key = mp.get_key(); + } else { + sharding_key = obj_key.name; + } + + return bucket_shard_index(sharding_key, num_shards); + } + + int init_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,const rgw::bucket_index_layout_generation& idx_layout) override; + int clean_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) override; + + /* RADOS specific */ + + int read_stats(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + RGWBucketEnt *stats, + optional_yield y) override; + + int get_reshard_status(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + std::list *status); + + int handle_overwrite(const DoutPrefixProvider *dpp, const RGWBucketInfo& info, + const RGWBucketInfo& orig_info, + optional_yield y) override; + + int open_bucket_index_shard(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const std::string& obj_key, + RGWSI_RADOS::Obj *bucket_obj, + int *shard_id); + + int open_bucket_index_shard(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, + int shard_id, RGWSI_RADOS::Obj *bucket_obj); + + int open_bucket_index(const DoutPrefixProvider *dpp, + const 
RGWBucketInfo& bucket_info, + RGWSI_RADOS::Pool *index_pool, + std::string *bucket_oid); + + int open_bucket_index(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + std::optional shard_id, + const rgw::bucket_index_layout_generation& idx_layout, + RGWSI_RADOS::Pool *index_pool, + std::map *bucket_objs, + std::map *bucket_instance_ids); +}; + + diff --git a/src/rgw/services/svc_bilog_rados.cc b/src/rgw/services/svc_bilog_rados.cc new file mode 100644 index 000000000..f4bb13ec1 --- /dev/null +++ b/src/rgw/services/svc_bilog_rados.cc @@ -0,0 +1,220 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_bilog_rados.h" +#include "svc_bi_rados.h" + +#include "cls/rgw/cls_rgw_client.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_BILog_RADOS::RGWSI_BILog_RADOS(CephContext *cct) : RGWServiceInstance(cct) +{ +} + +void RGWSI_BILog_RADOS::init(RGWSI_BucketIndex_RADOS *bi_rados_svc) +{ + svc.bi = bi_rados_svc; +} + +int RGWSI_BILog_RADOS::log_trim(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& log_layout, + int shard_id, + std::string_view start_marker, + std::string_view end_marker) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + + BucketIndexShardsManager start_marker_mgr; + BucketIndexShardsManager end_marker_mgr; + + const auto& current_index = rgw::log_to_index_layout(log_layout); + int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) { + return r; + } + + r = start_marker_mgr.from_string(start_marker, shard_id); + if (r < 0) { + return r; + } + + r = end_marker_mgr.from_string(end_marker, shard_id); + if (r < 0) { + return r; + } + + return CLSRGWIssueBILogTrim(index_pool.ioctx(), start_marker_mgr, end_marker_mgr, bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); +} + +int 
RGWSI_BILog_RADOS::log_start(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + const auto& current_index = rgw::log_to_index_layout(log_layout); + int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + return CLSRGWIssueResyncBucketBILog(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWSI_BILog_RADOS::log_stop(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + const auto& current_index = rgw::log_to_index_layout(log_layout); + int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + return CLSRGWIssueBucketBILogStop(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); +} + +static void build_bucket_index_marker(const string& shard_id_str, + const string& shard_marker, + string *marker) { + if (marker) { + *marker = shard_id_str; + marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR); + marker->append(shard_marker); + } +} + +int RGWSI_BILog_RADOS::log_list(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& log_layout, + int shard_id, string& marker, uint32_t max, + std::list& result, bool *truncated) +{ + ldpp_dout(dpp, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl; + result.clear(); + + RGWSI_RADOS::Pool index_pool; + map oids; + map bi_log_lists; + const auto& current_index = rgw::log_to_index_layout(log_layout); + int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &oids, nullptr); 
+ if (r < 0) + return r; + + BucketIndexShardsManager marker_mgr; + bool has_shards = (oids.size() > 1 || shard_id >= 0); + // If there are multiple shards for the bucket index object, the marker + // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}# + // {shard_marker_2}...', if there is no sharding, the bi_log_list should + // only contain one record, and the key is the bucket instance id. + r = marker_mgr.from_string(marker, shard_id); + if (r < 0) + return r; + + r = CLSRGWIssueBILogList(index_pool.ioctx(), marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) + return r; + + map::iterator> vcurrents; + map::iterator> vends; + if (truncated) { + *truncated = false; + } + map::iterator miter = bi_log_lists.begin(); + for (; miter != bi_log_lists.end(); ++miter) { + int shard_id = miter->first; + vcurrents[shard_id] = miter->second.entries.begin(); + vends[shard_id] = miter->second.entries.end(); + if (truncated) { + *truncated = (*truncated || miter->second.truncated); + } + } + + size_t total = 0; + bool has_more = true; + map::iterator>::iterator viter; + map::iterator>::iterator eiter; + while (total < max && has_more) { + has_more = false; + + viter = vcurrents.begin(); + eiter = vends.begin(); + + for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) { + assert (eiter != vends.end()); + + int shard_id = viter->first; + list::iterator& liter = viter->second; + + if (liter == eiter->second){ + continue; + } + rgw_bi_log_entry& entry = *(liter); + if (has_shards) { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + string tmp_id; + build_bucket_index_marker(buf, entry.id, &tmp_id); + entry.id.swap(tmp_id); + } + marker_mgr.add(shard_id, entry.id); + result.push_back(entry); + total++; + has_more = true; + ++liter; + } + } + + if (truncated) { + for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) { + assert (eiter != vends.end()); + 
*truncated = (*truncated || (viter->second != eiter->second)); + } + } + + // Refresh marker, if there are multiple shards, the output will look like + // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...', + // if there is no sharding, the simply marker (without oid) is returned + if (has_shards) { + marker_mgr.to_string(&marker); + } else { + if (!result.empty()) { + marker = result.rbegin()->id; + } + } + + return 0; +} + +int RGWSI_BILog_RADOS::get_log_status(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& log_layout, + int shard_id, + map *markers, + optional_yield y) +{ + vector headers; + map bucket_instance_ids; + const auto& current_index = rgw::log_to_index_layout(log_layout); + int r = svc.bi->cls_bucket_head(dpp, bucket_info, current_index, shard_id, &headers, &bucket_instance_ids, y); + if (r < 0) + return r; + + ceph_assert(headers.size() == bucket_instance_ids.size()); + + auto iter = headers.begin(); + map::iterator viter = bucket_instance_ids.begin(); + + for(; iter != headers.end(); ++iter, ++viter) { + if (shard_id >= 0) { + (*markers)[shard_id] = iter->max_marker; + } else { + (*markers)[viter->first] = iter->max_marker; + } + } + + return 0; +} + diff --git a/src/rgw/services/svc_bilog_rados.h b/src/rgw/services/svc_bilog_rados.h new file mode 100644 index 000000000..e9d5dbb5c --- /dev/null +++ b/src/rgw/services/svc_bilog_rados.h @@ -0,0 +1,60 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_rados.h" + + + + +class RGWSI_BILog_RADOS : public RGWServiceInstance +{ +public: + struct Svc { + RGWSI_BucketIndex_RADOS *bi{nullptr}; + } svc; + + RGWSI_BILog_RADOS(CephContext *cct); + + void init(RGWSI_BucketIndex_RADOS *bi_rados_svc); + + int log_start(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id); + int log_stop(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id); + + int log_trim(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& log_layout, + int shard_id, + std::string_view start_marker, + std::string_view end_marker); + int log_list(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& log_layout, + int shard_id, + std::string& marker, + uint32_t max, + std::list& result, + bool *truncated); + + int get_log_status(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_log_layout_generation& log_layout, + int shard_id, + std::map *markers, + optional_yield y); +}; diff --git a/src/rgw/services/svc_bucket.cc b/src/rgw/services/svc_bucket.cc new file mode 100644 index 000000000..b115990d2 --- /dev/null +++ b/src/rgw/services/svc_bucket.cc @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include "svc_bucket.h" + +#define dout_subsys ceph_subsys_rgw + +std::string RGWSI_Bucket::get_entrypoint_meta_key(const rgw_bucket& bucket) +{ + if (bucket.bucket_id.empty()) { + return bucket.get_key(); + } + + rgw_bucket b(bucket); + b.bucket_id.clear(); + + return b.get_key(); +} + +std::string RGWSI_Bucket::get_bi_meta_key(const rgw_bucket& bucket) +{ + return bucket.get_key(); +} + diff --git 
a/src/rgw/services/svc_bucket.h b/src/rgw/services/svc_bucket.h new file mode 100644 index 000000000..4a526e4f2 --- /dev/null +++ b/src/rgw/services/svc_bucket.h @@ -0,0 +1,111 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_bucket_types.h" + +class RGWSI_Bucket : public RGWServiceInstance +{ +public: + RGWSI_Bucket(CephContext *cct) : RGWServiceInstance(cct) {} + virtual ~RGWSI_Bucket() {} + + static std::string get_entrypoint_meta_key(const rgw_bucket& bucket); + static std::string get_bi_meta_key(const rgw_bucket& bucket); + + virtual RGWSI_Bucket_BE_Handler& get_ep_be_handler() = 0; + virtual RGWSI_BucketInstance_BE_Handler& get_bi_be_handler() = 0; + + virtual int read_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const std::string& key, + RGWBucketEntryPoint *entry_point, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, + std::map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp, + rgw_cache_entry_info *cache_info = nullptr, + boost::optional refresh_version = boost::none) = 0; + + virtual int store_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const std::string& key, + RGWBucketEntryPoint& info, + bool exclusive, + real_time mtime, + std::map *pattrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int remove_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const std::string& key, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int 
read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + RGWBucketInfo *info, + real_time *pmtime, + std::map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp, + rgw_cache_entry_info *cache_info = nullptr, + boost::optional refresh_version = boost::none) = 0; + + virtual int read_bucket_info(RGWSI_Bucket_X_Ctx& ep_ctx, + const rgw_bucket& bucket, + RGWBucketInfo *info, + real_time *pmtime, + std::map *pattrs, + boost::optional refresh_version, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + RGWBucketInfo& info, + std::optional orig_info, /* nullopt: orig_info was not fetched, + nullptr: orig_info was not found (new bucket instance */ + bool exclusive, + real_time mtime, + std::map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int remove_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + const RGWBucketInfo& bucket_info, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int read_bucket_stats(RGWSI_Bucket_X_Ctx& ctx, + const rgw_bucket& bucket, + RGWBucketEnt *ent, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int read_buckets_stats(RGWSI_Bucket_X_Ctx& ctx, + std::map& m, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; +}; + diff --git a/src/rgw/services/svc_bucket_sobj.cc b/src/rgw/services/svc_bucket_sobj.cc new file mode 100644 index 000000000..08a528015 --- /dev/null +++ b/src/rgw/services/svc_bucket_sobj.cc @@ -0,0 +1,644 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include "svc_bucket_sobj.h" +#include "svc_zone.h" +#include "svc_sys_obj.h" +#include "svc_sys_obj_cache.h" +#include "svc_bi.h" +#include "svc_meta.h" +#include "svc_meta_be_sobj.h" +#include "svc_sync_modules.h" + +#include "rgw_bucket.h" 
+#include "rgw_tools.h" +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +#define RGW_BUCKET_INSTANCE_MD_PREFIX ".bucket.meta." + +using namespace std; + +class RGWSI_Bucket_SObj_Module : public RGWSI_MBSObj_Handler_Module { + RGWSI_Bucket_SObj::Svc& svc; + + const string prefix; +public: + RGWSI_Bucket_SObj_Module(RGWSI_Bucket_SObj::Svc& _svc) : RGWSI_MBSObj_Handler_Module("bucket"), + svc(_svc) {} + + void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override { + if (pool) { + *pool = svc.zone->get_zone_params().domain_root; + } + if (oid) { + *oid = key; + } + } + + const string& get_oid_prefix() override { + return prefix; + } + + bool is_valid_oid(const string& oid) override { + return (!oid.empty() && oid[0] != '.'); + } + + string key_to_oid(const string& key) override { + return key; + } + + string oid_to_key(const string& oid) override { + /* should have been called after is_valid_oid(), + * so no need to check for validity */ + return oid; + } +}; + +class RGWSI_BucketInstance_SObj_Module : public RGWSI_MBSObj_Handler_Module { + RGWSI_Bucket_SObj::Svc& svc; + + const string prefix; +public: + RGWSI_BucketInstance_SObj_Module(RGWSI_Bucket_SObj::Svc& _svc) : RGWSI_MBSObj_Handler_Module("bucket.instance"), + svc(_svc), prefix(RGW_BUCKET_INSTANCE_MD_PREFIX) {} + + void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override { + if (pool) { + *pool = svc.zone->get_zone_params().domain_root; + } + if (oid) { + *oid = key_to_oid(key); + } + } + + const string& get_oid_prefix() override { + return prefix; + } + + bool is_valid_oid(const string& oid) override { + return (oid.compare(0, prefix.size(), RGW_BUCKET_INSTANCE_MD_PREFIX) == 0); + } + +// 'tenant/' is used in bucket instance keys for sync to avoid parsing ambiguity +// with the existing instance[:shard] format. 
once we parse the shard, the / is +// replaced with a : to match the [tenant:]instance format + string key_to_oid(const string& key) override { + string oid = prefix + key; + + // replace tenant/ with tenant: + auto c = oid.find('/', prefix.size()); + if (c != string::npos) { + oid[c] = ':'; + } + + return oid; + } + + // convert bucket instance oids back to the tenant/ format for metadata keys. + // it's safe to parse 'tenant:' only for oids, because they won't contain the + // optional :shard at the end + string oid_to_key(const string& oid) override { + /* this should have been called after oid was checked for validity */ + + if (oid.size() < prefix.size()) { /* just sanity check */ + return string(); + } + + string key = oid.substr(prefix.size()); + + // find first : (could be tenant:bucket or bucket:instance) + auto c = key.find(':'); + if (c != string::npos) { + // if we find another :, the first one was for tenant + if (key.find(':', c + 1) != string::npos) { + key[c] = '/'; + } + } + + return key; + } + + /* + * hash entry for mdlog placement. 
Use the same hash key we'd have for the bucket entry + * point, so that the log entries end up at the same log shard, so that we process them + * in order + */ + string get_hash_key(const string& key) override { + string k = "bucket:"; + int pos = key.find(':'); + if (pos < 0) + k.append(key); + else + k.append(key.substr(0, pos)); + + return k; + } +}; + +RGWSI_Bucket_SObj::RGWSI_Bucket_SObj(CephContext *cct): RGWSI_Bucket(cct) { +} + +RGWSI_Bucket_SObj::~RGWSI_Bucket_SObj() { +} + +void RGWSI_Bucket_SObj::init(RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc, + RGWSI_SysObj_Cache *_cache_svc, RGWSI_BucketIndex *_bi, + RGWSI_Meta *_meta_svc, RGWSI_MetaBackend *_meta_be_svc, + RGWSI_SyncModules *_sync_modules_svc, + RGWSI_Bucket_Sync *_bucket_sync_svc) +{ + svc.bucket = this; + svc.zone = _zone_svc; + svc.sysobj = _sysobj_svc; + svc.cache = _cache_svc; + svc.bi = _bi; + svc.meta = _meta_svc; + svc.meta_be = _meta_be_svc; + svc.sync_modules = _sync_modules_svc; + svc.bucket_sync = _bucket_sync_svc; +} + +int RGWSI_Bucket_SObj::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + binfo_cache.reset(new RGWChainedCacheImpl); + binfo_cache->init(svc.cache); + + /* create first backend handler for bucket entrypoints */ + + RGWSI_MetaBackend_Handler *ep_handler; + + int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, &ep_handler); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to create be handler: r=" << r << dendl; + return r; + } + + ep_be_handler = ep_handler; + + RGWSI_MetaBackend_Handler_SObj *ep_bh = static_cast(ep_handler); + + auto ep_module = new RGWSI_Bucket_SObj_Module(svc); + ep_be_module.reset(ep_module); + ep_bh->set_module(ep_module); + + /* create a second backend handler for bucket instance */ + + RGWSI_MetaBackend_Handler *bi_handler; + + r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, &bi_handler); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to create be handler: r=" << r << dendl; + return 
r; + } + + bi_be_handler = bi_handler; + + RGWSI_MetaBackend_Handler_SObj *bi_bh = static_cast(bi_handler); + + auto bi_module = new RGWSI_BucketInstance_SObj_Module(svc); + bi_be_module.reset(bi_module); + bi_bh->set_module(bi_module); + + return 0; +} + +int RGWSI_Bucket_SObj::read_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const string& key, + RGWBucketEntryPoint *entry_point, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, + map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version) +{ + bufferlist bl; + + auto params = RGWSI_MBSObj_GetParams(&bl, pattrs, pmtime).set_cache_info(cache_info) + .set_refresh_version(refresh_version); + + int ret = svc.meta_be->get_entry(ctx.get(), key, params, objv_tracker, y, dpp); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(*entry_point, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl; + return -EIO; + } + return 0; +} + +int RGWSI_Bucket_SObj::store_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const string& key, + RGWBucketEntryPoint& info, + bool exclusive, + real_time mtime, + map *pattrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + bufferlist bl; + encode(info, bl); + + RGWSI_MBSObj_PutParams params(bl, pattrs, mtime, exclusive); + + int ret = svc.meta_be->put(ctx.get(), key, params, objv_tracker, y, dpp); + if (ret < 0) { + return ret; + } + + return ret; +} + +int RGWSI_Bucket_SObj::remove_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const string& key, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWSI_MBSObj_RemoveParams params; + return svc.meta_be->remove(ctx.get(), key, params, objv_tracker, y, dpp); +} + +int RGWSI_Bucket_SObj::read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const 
string& key, + RGWBucketInfo *info, + real_time *pmtime, map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version) +{ + string cache_key("bi/"); + cache_key.append(key); + + if (auto e = binfo_cache->find(cache_key)) { + if (refresh_version && + e->info.objv_tracker.read_version.compare(&(*refresh_version))) { + ldpp_dout(dpp, -1) << "WARNING: The bucket info cache is inconsistent. This is " + << "a failure that should be debugged. I am a nice machine, " + << "so I will try to recover." << dendl; + binfo_cache->invalidate(key); + } else { + *info = e->info; + if (pattrs) + *pattrs = e->attrs; + if (pmtime) + *pmtime = e->mtime; + return 0; + } + } + + bucket_info_cache_entry e; + rgw_cache_entry_info ci; + + int ret = do_read_bucket_instance_info(ctx, key, + &e.info, &e.mtime, &e.attrs, + &ci, refresh_version, y, dpp); + *info = e.info; + + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: do_read_bucket_instance_info failed: " << ret << dendl; + } else { + ldpp_dout(dpp, 20) << "do_read_bucket_instance_info, bucket instance not found (key=" << key << ")" << dendl; + } + return ret; + } + + if (pmtime) { + *pmtime = e.mtime; + } + if (pattrs) { + *pattrs = e.attrs; + } + if (cache_info) { + *cache_info = ci; + } + + /* chain to only bucket instance and *not* bucket entrypoint */ + if (!binfo_cache->put(dpp, svc.cache, cache_key, &e, {&ci})) { + ldpp_dout(dpp, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl; + } + + if (refresh_version && + refresh_version->compare(&info->objv_tracker.read_version)) { + ldpp_dout(dpp, -1) << "WARNING: The OSD has the same version I have. Something may " + << "have gone squirrelly. An administrator may have forced a " + << "change; otherwise there is a problem somewhere." 
<< dendl; + } + + return 0; +} + +int RGWSI_Bucket_SObj::do_read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const string& key, + RGWBucketInfo *info, + real_time *pmtime, map *pattrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + bufferlist bl; + RGWObjVersionTracker ot; + + auto params = RGWSI_MBSObj_GetParams(&bl, pattrs, pmtime).set_cache_info(cache_info) + .set_refresh_version(refresh_version); + + int ret = svc.meta_be->get_entry(ctx.get(), key, params, &ot, y, dpp); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(*info, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl; + return -EIO; + } + info->objv_tracker = ot; + return 0; +} + +int RGWSI_Bucket_SObj::read_bucket_info(RGWSI_Bucket_X_Ctx& ctx, + const rgw_bucket& bucket, + RGWBucketInfo *info, + real_time *pmtime, + map *pattrs, + boost::optional refresh_version, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + rgw_cache_entry_info cache_info; + + if (!bucket.bucket_id.empty()) { + return read_bucket_instance_info(ctx.bi, get_bi_meta_key(bucket), + info, + pmtime, pattrs, + y, + dpp, + &cache_info, refresh_version); + } + + string bucket_entry = get_entrypoint_meta_key(bucket); + string cache_key("b/"); + cache_key.append(bucket_entry); + + if (auto e = binfo_cache->find(cache_key)) { + bool found_version = (bucket.bucket_id.empty() || + bucket.bucket_id == e->info.bucket.bucket_id); + + if (!found_version || + (refresh_version && + e->info.objv_tracker.read_version.compare(&(*refresh_version)))) { + ldpp_dout(dpp, -1) << "WARNING: The bucket info cache is inconsistent. This is " + << "a failure that should be debugged. I am a nice machine, " + << "so I will try to recover." 
<< dendl; + binfo_cache->invalidate(cache_key); + } else { + *info = e->info; + if (pattrs) + *pattrs = e->attrs; + if (pmtime) + *pmtime = e->mtime; + return 0; + } + } + + RGWBucketEntryPoint entry_point; + real_time ep_mtime; + RGWObjVersionTracker ot; + rgw_cache_entry_info entry_cache_info; + int ret = read_bucket_entrypoint_info(ctx.ep, bucket_entry, + &entry_point, &ot, &ep_mtime, pattrs, + y, + dpp, + &entry_cache_info, refresh_version); + if (ret < 0) { + /* only init these fields */ + info->bucket = bucket; + return ret; + } + + if (entry_point.has_bucket_info) { + *info = entry_point.old_bucket_info; + info->bucket.tenant = bucket.tenant; + ldpp_dout(dpp, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info->bucket << " owner " << info->owner << dendl; + return 0; + } + + /* data is in the bucket instance object, we need to get attributes from there, clear everything + * that we got + */ + if (pattrs) { + pattrs->clear(); + } + + ldpp_dout(dpp, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl; + + + /* read bucket instance info */ + + bucket_info_cache_entry e; + + ret = read_bucket_instance_info(ctx.bi, get_bi_meta_key(entry_point.bucket), + &e.info, &e.mtime, &e.attrs, + y, + dpp, + &cache_info, refresh_version); + *info = e.info; + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: read_bucket_instance_from_oid failed: " << ret << dendl; + info->bucket = bucket; + // XXX and why return anything in case of an error anyway? 
+ return ret; + } + + if (pmtime) + *pmtime = e.mtime; + if (pattrs) + *pattrs = e.attrs; + + /* chain to both bucket entry point and bucket instance */ + if (!binfo_cache->put(dpp, svc.cache, cache_key, &e, {&entry_cache_info, &cache_info})) { + ldpp_dout(dpp, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl; + } + + if (refresh_version && + refresh_version->compare(&info->objv_tracker.read_version)) { + ldpp_dout(dpp, -1) << "WARNING: The OSD has the same version I have. Something may " + << "have gone squirrelly. An administrator may have forced a " + << "change; otherwise there is a problem somewhere." << dendl; + } + + return 0; +} + + +int RGWSI_Bucket_SObj::store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const string& key, + RGWBucketInfo& info, + std::optional orig_info, + bool exclusive, + real_time mtime, + map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + bufferlist bl; + encode(info, bl); + + /* + * we might need some special handling if overwriting + */ + RGWBucketInfo shared_bucket_info; + if (!orig_info && !exclusive) { /* if exclusive, we're going to fail when try + to overwrite, so the whole check here is moot */ + /* + * we're here because orig_info wasn't passed in + * we don't have info about what was there before, so need to fetch first + */ + int r = read_bucket_instance_info(ctx, + key, + &shared_bucket_info, + nullptr, nullptr, + y, + dpp, + nullptr, boost::none); + if (r < 0) { + if (r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): read_bucket_instance_info() of key=" << key << " returned r=" << r << dendl; + return r; + } + } else { + orig_info = &shared_bucket_info; + } + } + + if (orig_info && *orig_info && !exclusive) { + int r = svc.bi->handle_overwrite(dpp, info, *(orig_info.value()), y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): svc.bi->handle_overwrite() of key=" << key << " returned r=" << r << dendl; + return r; + } + } + 
+ RGWSI_MBSObj_PutParams params(bl, pattrs, mtime, exclusive); + + int ret = svc.meta_be->put(ctx.get(), key, params, &info.objv_tracker, y, dpp); + + if (ret >= 0) { + int r = svc.bucket_sync->handle_bi_update(dpp, info, + orig_info.value_or(nullptr), + y); + if (r < 0) { + return r; + } + } else if (ret == -EEXIST) { + /* well, if it's exclusive we shouldn't overwrite it, because we might race with another + * bucket operation on this specific bucket (e.g., being synced from the master), but + * since bucket instance meta object is unique for this specific bucket instance, we don't + * need to return an error. + * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the + * master, creating a bucket, sending bucket creation to the master, we create the bucket + * locally, while in the sync thread we sync the new bucket. + */ + ret = 0; + } + + if (ret < 0) { + return ret; + } + + return ret; +} + +int RGWSI_Bucket_SObj::remove_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const string& key, + const RGWBucketInfo& info, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWSI_MBSObj_RemoveParams params; + int ret = svc.meta_be->remove_entry(dpp, ctx.get(), key, params, objv_tracker, y); + + if (ret < 0 && + ret != -ENOENT) { + return ret; + } + + int r = svc.bucket_sync->handle_bi_removal(dpp, info, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update bucket instance sync index: r=" << r << dendl; + /* returning success as index is just keeping hints, so will keep extra hints, + * but bucket removal succeeded + */ + } + + return 0; +} + +int RGWSI_Bucket_SObj::read_bucket_stats(const RGWBucketInfo& bucket_info, + RGWBucketEnt *ent, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + ent->count = 0; + ent->size = 0; + ent->size_rounded = 0; + + vector headers; + + int r = svc.bi->read_stats(dpp, bucket_info, ent, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << 
__func__ << "(): read_stats returned r=" << r << dendl; + return r; + } + + return 0; +} + +int RGWSI_Bucket_SObj::read_bucket_stats(RGWSI_Bucket_X_Ctx& ctx, + const rgw_bucket& bucket, + RGWBucketEnt *ent, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWBucketInfo bucket_info; + int ret = read_bucket_info(ctx, bucket, &bucket_info, nullptr, nullptr, boost::none, y, dpp); + if (ret < 0) { + return ret; + } + + return read_bucket_stats(bucket_info, ent, y, dpp); +} + +int RGWSI_Bucket_SObj::read_buckets_stats(RGWSI_Bucket_X_Ctx& ctx, + map& m, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + map::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt& ent = iter->second; + int r = read_bucket_stats(ctx, ent.bucket, &ent, y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): read_bucket_stats returned r=" << r << dendl; + return r; + } + } + + return m.size(); +} diff --git a/src/rgw/services/svc_bucket_sobj.h b/src/rgw/services/svc_bucket_sobj.h new file mode 100644 index 000000000..8e9fe063c --- /dev/null +++ b/src/rgw/services/svc_bucket_sobj.h @@ -0,0 +1,180 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_meta_be.h" +#include "svc_bucket_types.h" +#include "svc_bucket.h" +#include "svc_bucket_sync.h" + +class RGWSI_Zone; +class RGWSI_SysObj; +class RGWSI_SysObj_Cache; +class RGWSI_Meta; +class RGWSI_SyncModules; + +struct rgw_cache_entry_info; + +template +class RGWChainedCacheImpl; + +class RGWSI_Bucket_SObj : public RGWSI_Bucket +{ + struct bucket_info_cache_entry { + RGWBucketInfo info; + real_time mtime; + std::map attrs; + }; + + using RGWChainedCacheImpl_bucket_info_cache_entry = RGWChainedCacheImpl; + std::unique_ptr binfo_cache; + + RGWSI_Bucket_BE_Handler ep_be_handler; + std::unique_ptr ep_be_module; + RGWSI_BucketInstance_BE_Handler bi_be_handler; + std::unique_ptr bi_be_module; + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + + int do_read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + RGWBucketInfo *info, + real_time *pmtime, + std::map *pattrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version, + optional_yield y, + const DoutPrefixProvider *dpp); + + int read_bucket_stats(const RGWBucketInfo& bucket_info, + RGWBucketEnt *ent, + optional_yield y, + const DoutPrefixProvider *dpp); + +public: + struct Svc { + RGWSI_Bucket_SObj *bucket{nullptr}; + RGWSI_BucketIndex *bi{nullptr}; + RGWSI_Zone *zone{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + RGWSI_SysObj_Cache *cache{nullptr}; + RGWSI_Meta *meta{nullptr}; + RGWSI_MetaBackend *meta_be{nullptr}; + RGWSI_SyncModules *sync_modules{nullptr}; + RGWSI_Bucket_Sync *bucket_sync{nullptr}; + } svc; + + RGWSI_Bucket_SObj(CephContext *cct); + ~RGWSI_Bucket_SObj(); + + RGWSI_Bucket_BE_Handler& get_ep_be_handler() override { + return ep_be_handler; + } + + RGWSI_BucketInstance_BE_Handler& get_bi_be_handler() override { + return bi_be_handler; + } + + void init(RGWSI_Zone *_zone_svc, + RGWSI_SysObj *_sysobj_svc, + RGWSI_SysObj_Cache *_cache_svc, + RGWSI_BucketIndex *_bi, + 
RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc, + RGWSI_SyncModules *_sync_modules_svc, + RGWSI_Bucket_Sync *_bucket_sync_svc); + + + int read_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const std::string& key, + RGWBucketEntryPoint *entry_point, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, + std::map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp, + rgw_cache_entry_info *cache_info = nullptr, + boost::optional refresh_version = boost::none) override; + + int store_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const std::string& key, + RGWBucketEntryPoint& info, + bool exclusive, + real_time mtime, + std::map *pattrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int remove_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx, + const std::string& key, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + RGWBucketInfo *info, + real_time *pmtime, + std::map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp, + rgw_cache_entry_info *cache_info = nullptr, + boost::optional refresh_version = boost::none) override; + + int read_bucket_info(RGWSI_Bucket_X_Ctx& ep_ctx, + const rgw_bucket& bucket, + RGWBucketInfo *info, + real_time *pmtime, + std::map *pattrs, + boost::optional refresh_version, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + RGWBucketInfo& info, + std::optional orig_info, /* nullopt: orig_info was not fetched, + nullptr: orig_info was not found (new bucket instance */ + bool exclusive, + real_time mtime, + std::map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int remove_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx, + const std::string& key, + const RGWBucketInfo& bucket_info, + 
RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int read_bucket_stats(RGWSI_Bucket_X_Ctx& ctx, + const rgw_bucket& bucket, + RGWBucketEnt *ent, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int read_buckets_stats(RGWSI_Bucket_X_Ctx& ctx, + std::map& m, + optional_yield y, + const DoutPrefixProvider *dpp) override; +}; + diff --git a/src/rgw/services/svc_bucket_sync.h b/src/rgw/services/svc_bucket_sync.h new file mode 100644 index 000000000..7975e062b --- /dev/null +++ b/src/rgw/services/svc_bucket_sync.h @@ -0,0 +1,55 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_bucket_types.h" + +class RGWBucketSyncPolicyHandler; +using RGWBucketSyncPolicyHandlerRef = std::shared_ptr; + + +class RGWSI_Bucket_Sync : public RGWServiceInstance +{ +public: + RGWSI_Bucket_Sync(CephContext *cct) : RGWServiceInstance(cct) {} + + virtual int get_policy_handler(RGWSI_Bucket_X_Ctx& ctx, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *handler, + optional_yield y, + const DoutPrefixProvider *dpp) = 0; + + virtual int handle_bi_update(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + RGWBucketInfo *orig_bucket_info, + optional_yield y) = 0; + virtual int handle_bi_removal(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + optional_yield y) = 0; + + virtual int get_bucket_sync_hints(const DoutPrefixProvider *dpp, + const rgw_bucket& bucket, + std::set *sources, + std::set *dests, + optional_yield y) = 0; +}; + + diff --git a/src/rgw/services/svc_bucket_sync_sobj.cc b/src/rgw/services/svc_bucket_sync_sobj.cc new file mode 100644 index 000000000..ea3398a3f --- /dev/null +++ b/src/rgw/services/svc_bucket_sync_sobj.cc @@ -0,0 +1,903 @@ +#include "svc_bucket_sync_sobj.h" +#include "svc_zone.h" +#include "svc_sys_obj_cache.h" +#include "svc_bucket_sobj.h" + +#include "rgw_bucket_sync.h" +#include "rgw_zone.h" +#include "rgw_sync_policy.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static string bucket_sync_sources_oid_prefix = "bucket.sync-source-hints"; +static string bucket_sync_targets_oid_prefix = "bucket.sync-target-hints"; + +class RGWSI_Bucket_Sync_SObj_HintIndexManager { + CephContext *cct; + + struct { + RGWSI_Zone *zone; + RGWSI_SysObj *sysobj; + } svc; + +public: + RGWSI_Bucket_Sync_SObj_HintIndexManager(RGWSI_Zone *_zone_svc, + RGWSI_SysObj *_sysobj_svc) { + svc.zone = _zone_svc; + svc.sysobj = _sysobj_svc; + + cct = svc.zone->ctx(); + } + + rgw_raw_obj get_sources_obj(const rgw_bucket& 
bucket) const; + rgw_raw_obj get_dests_obj(const rgw_bucket& bucket) const; + + template + int update_hints(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + C1& added_dests, + C2& removed_dests, + C1& added_sources, + C2& removed_sources, + optional_yield y); +}; + +RGWSI_Bucket_Sync_SObj::RGWSI_Bucket_Sync_SObj(CephContext *cct) : RGWSI_Bucket_Sync(cct) { +} +RGWSI_Bucket_Sync_SObj::~RGWSI_Bucket_Sync_SObj() { +} + +void RGWSI_Bucket_Sync_SObj::init(RGWSI_Zone *_zone_svc, + RGWSI_SysObj *_sysobj_svc, + RGWSI_SysObj_Cache *_cache_svc, + RGWSI_Bucket_SObj *bucket_sobj_svc) +{ + svc.zone = _zone_svc; + svc.sysobj = _sysobj_svc; + svc.cache = _cache_svc; + svc.bucket_sobj = bucket_sobj_svc; + + hint_index_mgr.reset(new RGWSI_Bucket_Sync_SObj_HintIndexManager(svc.zone, svc.sysobj)); +} + +int RGWSI_Bucket_Sync_SObj::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + sync_policy_cache.reset(new RGWChainedCacheImpl); + sync_policy_cache->init(svc.cache); + + return 0; +} + +void RGWSI_Bucket_Sync_SObj::get_hint_entities(RGWSI_Bucket_X_Ctx& ctx, + const std::set& zones, + const std::set& buckets, + std::set *hint_entities, + optional_yield y, const DoutPrefixProvider *dpp) +{ + vector hint_buckets; + + hint_buckets.reserve(buckets.size()); + + for (auto& b : buckets) { + RGWBucketInfo hint_bucket_info; + int ret = svc.bucket_sobj->read_bucket_info(ctx, b, &hint_bucket_info, + nullptr, nullptr, boost::none, + y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 20) << "could not init bucket info for hint bucket=" << b << " ... 
skipping" << dendl; + continue; + } + + hint_buckets.emplace_back(std::move(hint_bucket_info.bucket)); + } + + for (auto& zone : zones) { + for (auto& b : hint_buckets) { + hint_entities->insert(rgw_sync_bucket_entity(zone, b)); + } + } +} + +int RGWSI_Bucket_Sync_SObj::resolve_policy_hints(RGWSI_Bucket_X_Ctx& ctx, + rgw_sync_bucket_entity& self_entity, + RGWBucketSyncPolicyHandlerRef& handler, + RGWBucketSyncPolicyHandlerRef& zone_policy_handler, + std::map& temp_map, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + set source_zones; + set target_zones; + + zone_policy_handler->reflect(dpp, nullptr, nullptr, + nullptr, nullptr, + &source_zones, + &target_zones, + false); /* relaxed: also get all zones that we allow to sync to/from */ + + std::set hint_entities; + + get_hint_entities(ctx, source_zones, handler->get_source_hints(), &hint_entities, y, dpp); + get_hint_entities(ctx, target_zones, handler->get_target_hints(), &hint_entities, y, dpp); + + std::set resolved_sources; + std::set resolved_dests; + + for (auto& hint_entity : hint_entities) { + if (!hint_entity.zone || + !hint_entity.bucket) { + continue; /* shouldn't really happen */ + } + + auto& zid = *hint_entity.zone; + auto& hint_bucket = *hint_entity.bucket; + + RGWBucketSyncPolicyHandlerRef hint_bucket_handler; + + auto iter = temp_map.find(optional_zone_bucket(zid, hint_bucket)); + if (iter != temp_map.end()) { + hint_bucket_handler = iter->second; + } else { + int r = do_get_policy_handler(ctx, zid, hint_bucket, temp_map, &hint_bucket_handler, y, dpp); + if (r < 0) { + ldpp_dout(dpp, 20) << "could not get bucket sync policy handler for hint bucket=" << hint_bucket << " ... 
skipping" << dendl; + continue; + } + } + + hint_bucket_handler->get_pipes(&resolved_dests, + &resolved_sources, + self_entity); /* flipping resolved dests and sources as these are + relative to the remote entity */ + } + + handler->set_resolved_hints(std::move(resolved_sources), std::move(resolved_dests)); + + return 0; +} + +int RGWSI_Bucket_Sync_SObj::do_get_policy_handler(RGWSI_Bucket_X_Ctx& ctx, + std::optional zone, + std::optional _bucket, + std::map& temp_map, + RGWBucketSyncPolicyHandlerRef *handler, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + if (!_bucket) { + *handler = svc.zone->get_sync_policy_handler(zone); + return 0; + } + + auto bucket = *_bucket; + + if (bucket.bucket_id.empty()) { + RGWBucketEntryPoint ep_info; + int ret = svc.bucket_sobj->read_bucket_entrypoint_info(ctx.ep, + RGWSI_Bucket::get_entrypoint_meta_key(bucket), + &ep_info, + nullptr, /* objv_tracker */ + nullptr, /* mtime */ + nullptr, /* attrs */ + y, + dpp, + nullptr, /* cache_info */ + boost::none /* refresh_version */); + if (ret < 0) { + if (ret != -ENOENT) { + ldout(cct, 0) << "ERROR: svc.bucket->read_bucket_info(bucket=" << bucket << ") returned r=" << ret << dendl; + } + return ret; + } + + bucket = ep_info.bucket; + } + + string zone_key; + string bucket_key; + + if (zone && *zone != svc.zone->zone_id()) { + zone_key = zone->id; + } + + bucket_key = RGWSI_Bucket::get_bi_meta_key(bucket); + + string cache_key("bi/" + zone_key + "/" + bucket_key); + + if (auto e = sync_policy_cache->find(cache_key)) { + *handler = e->handler; + return 0; + } + + bucket_sync_policy_cache_entry e; + rgw_cache_entry_info cache_info; + + RGWBucketInfo bucket_info; + map attrs; + + int r = svc.bucket_sobj->read_bucket_instance_info(ctx.bi, + bucket_key, + &bucket_info, + nullptr, + &attrs, + y, + dpp, + &cache_info); + if (r < 0) { + if (r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: svc.bucket->read_bucket_instance_info(key=" << bucket_key << ") returned r=" << r << dendl; + } + 
return r; + } + + auto zone_policy_handler = svc.zone->get_sync_policy_handler(zone); + if (!zone_policy_handler) { + ldpp_dout(dpp, 20) << "ERROR: could not find policy handler for zone=" << zone << dendl; + return -ENOENT; + } + + e.handler.reset(zone_policy_handler->alloc_child(bucket_info, std::move(attrs))); + + r = e.handler->init(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 20) << "ERROR: failed to init bucket sync policy handler: r=" << r << dendl; + return r; + } + + temp_map.emplace(optional_zone_bucket{zone, bucket}, e.handler); + + rgw_sync_bucket_entity self_entity(zone.value_or(svc.zone->zone_id()), bucket); + + r = resolve_policy_hints(ctx, self_entity, + e.handler, + zone_policy_handler, + temp_map, y, dpp); + if (r < 0) { + ldpp_dout(dpp, 20) << "ERROR: failed to resolve policy hints: bucket_key=" << bucket_key << ", r=" << r << dendl; + return r; + } + + if (!sync_policy_cache->put(dpp, svc.cache, cache_key, &e, {&cache_info})) { + ldpp_dout(dpp, 20) << "couldn't put bucket_sync_policy cache entry, might have raced with data changes" << dendl; + } + + *handler = e.handler; + + return 0; +} + +int RGWSI_Bucket_Sync_SObj::get_policy_handler(RGWSI_Bucket_X_Ctx& ctx, + std::optional zone, + std::optional _bucket, + RGWBucketSyncPolicyHandlerRef *handler, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + std::map temp_map; + return do_get_policy_handler(ctx, zone, _bucket, temp_map, handler, y, dpp); +} + +static bool diff_sets(std::set& orig_set, + std::set& new_set, + vector *added, + vector *removed) +{ + auto oiter = orig_set.begin(); + auto niter = new_set.begin(); + + while (oiter != orig_set.end() && + niter != new_set.end()) { + if (*oiter == *niter) { + ++oiter; + ++niter; + continue; + } else if (*oiter < *niter) { + removed->push_back(*oiter); + ++oiter; + } else { + added->push_back(*niter); + ++niter; + } + } + for (; oiter != orig_set.end(); ++oiter) { + removed->push_back(*oiter); + } + for (; niter != new_set.end(); ++niter) { + 
added->push_back(*niter); + } + + return !(removed->empty() && added->empty()); +} + + +class RGWSI_BS_SObj_HintIndexObj +{ + friend class RGWSI_Bucket_Sync_SObj; + + CephContext *cct; + struct { + RGWSI_SysObj *sysobj; + } svc; + + rgw_raw_obj obj; + RGWSysObj sysobj; + + RGWObjVersionTracker ot; + + bool has_data{false}; + +public: + struct bi_entry { + rgw_bucket bucket; + map sources; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(bucket, bl); + encode(sources, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(bucket, bl); + decode(sources, bl); + DECODE_FINISH(bl); + } + + bool add(const rgw_bucket& info_source, + const obj_version& info_source_ver) { + auto& ver = sources[info_source]; + + if (ver == info_source_ver) { /* already updated */ + return false; + } + + if (info_source_ver.tag == ver.tag && + info_source_ver.ver < ver.ver) { + return false; + } + + ver = info_source_ver; + + return true; + } + + bool remove(const rgw_bucket& info_source, + const obj_version& info_source_ver) { + auto iter = sources.find(info_source); + if (iter == sources.end()) { + return false; + } + + auto& ver = iter->second; + + if (info_source_ver.tag == ver.tag && + info_source_ver.ver < ver.ver) { + return false; + } + + sources.erase(info_source); + return true; + } + + bool empty() const { + return sources.empty(); + } + }; + + struct single_instance_info { + map entries; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(entries, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(entries, bl); + DECODE_FINISH(bl); + } + + bool add_entry(const rgw_bucket& info_source, + const obj_version& info_source_ver, + const rgw_bucket& bucket) { + auto& entry = entries[bucket]; + + if (!entry.add(info_source, info_source_ver)) { + return false; + } + + entry.bucket = bucket; + + return true; + } + + bool 
remove_entry(const rgw_bucket& info_source, + const obj_version& info_source_ver, + const rgw_bucket& bucket) { + auto iter = entries.find(bucket); + if (iter == entries.end()) { + return false; + } + + if (!iter->second.remove(info_source, info_source_ver)) { + return false; + } + + if (iter->second.empty()) { + entries.erase(iter); + } + + return true; + } + + void clear() { + entries.clear(); + } + + bool empty() const { + return entries.empty(); + } + + void get_entities(std::set *result) const { + for (auto& iter : entries) { + result->insert(iter.first); + } + } + }; + + struct info_map { + map instances; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(instances, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(instances, bl); + DECODE_FINISH(bl); + } + + bool empty() const { + return instances.empty(); + } + + void clear() { + instances.clear(); + } + + void get_entities(const rgw_bucket& bucket, + std::set *result) const { + auto iter = instances.find(bucket); + if (iter == instances.end()) { + return; + } + iter->second.get_entities(result); + } + } info; + + RGWSI_BS_SObj_HintIndexObj(RGWSI_SysObj *_sysobj_svc, + const rgw_raw_obj& _obj) : cct(_sysobj_svc->ctx()), + obj(_obj), + sysobj(_sysobj_svc->get_obj(obj)) + { + svc.sysobj = _sysobj_svc; + } + + template + int update(const DoutPrefixProvider *dpp, + const rgw_bucket& entity, + const RGWBucketInfo& info_source, + C1 *add, + C2 *remove, + optional_yield y); + +private: + template + void update_entries(const rgw_bucket& info_source, + const obj_version& info_source_ver, + C1 *add, + C2 *remove, + single_instance_info *instance); + + int read(const DoutPrefixProvider *dpp, optional_yield y); + int flush(const DoutPrefixProvider *dpp, optional_yield y); + + void invalidate() { + has_data = false; + info.clear(); + } + + void get_entities(const rgw_bucket& bucket, + std::set *result) const { + 
info.get_entities(bucket, result); + } +}; +WRITE_CLASS_ENCODER(RGWSI_BS_SObj_HintIndexObj::bi_entry) +WRITE_CLASS_ENCODER(RGWSI_BS_SObj_HintIndexObj::single_instance_info) +WRITE_CLASS_ENCODER(RGWSI_BS_SObj_HintIndexObj::info_map) + +template +int RGWSI_BS_SObj_HintIndexObj::update(const DoutPrefixProvider *dpp, + const rgw_bucket& entity, + const RGWBucketInfo& info_source, + C1 *add, + C2 *remove, + optional_yield y) +{ + int r = 0; + + auto& info_source_ver = info_source.objv_tracker.read_version; + +#define MAX_RETRIES 25 + + for (int i = 0; i < MAX_RETRIES; ++i) { + if (!has_data) { + r = read(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: cannot update hint index: failed to read: r=" << r << dendl; + return r; + } + } + + auto& instance = info.instances[entity]; + + update_entries(info_source.bucket, + info_source_ver, + add, remove, + &instance); + + if (instance.empty()) { + info.instances.erase(entity); + } + + r = flush(dpp, y); + if (r >= 0) { + return 0; + } + + if (r != -ECANCELED) { + ldpp_dout(dpp, 0) << "ERROR: failed to flush hint index: obj=" << obj << " r=" << r << dendl; + return r; + } + + invalidate(); + } + ldpp_dout(dpp, 0) << "ERROR: failed to flush hint index: too many retries (obj=" << obj << "), likely a bug" << dendl; + + return -EIO; +} + +template +void RGWSI_BS_SObj_HintIndexObj::update_entries(const rgw_bucket& info_source, + const obj_version& info_source_ver, + C1 *add, + C2 *remove, + single_instance_info *instance) +{ + if (remove) { + for (auto& bucket : *remove) { + instance->remove_entry(info_source, info_source_ver, bucket); + } + } + + if (add) { + for (auto& bucket : *add) { + instance->add_entry(info_source, info_source_ver, bucket); + } + } +} + +int RGWSI_BS_SObj_HintIndexObj::read(const DoutPrefixProvider *dpp, optional_yield y) { + RGWObjVersionTracker _ot; + bufferlist bl; + int r = sysobj.rop() + .set_objv_tracker(&_ot) /* forcing read of current version */ + .read(dpp, &bl, y); + if (r < 0 && r != -ENOENT) 
{ + ldpp_dout(dpp, 0) << "ERROR: failed reading data (obj=" << obj << "), r=" << r << dendl; + return r; + } + + ot = _ot; + + if (r >= 0) { + auto iter = bl.cbegin(); + try { + decode(info, iter); + has_data = true; + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to decode entries, ignoring" << dendl; + info.clear(); + } + } else { + info.clear(); + } + + return 0; +} + +int RGWSI_BS_SObj_HintIndexObj::flush(const DoutPrefixProvider *dpp, optional_yield y) { + int r; + + if (!info.empty()) { + bufferlist bl; + encode(info, bl); + + r = sysobj.wop() + .set_objv_tracker(&ot) /* use the version tracker filled in by read() */ + .write(dpp, bl, y); + + } else { /* remove */ + r = sysobj.wop() + .set_objv_tracker(&ot) + .remove(dpp, y); + } + + if (r < 0) { + return r; + } + + return 0; +} + +rgw_raw_obj RGWSI_Bucket_Sync_SObj_HintIndexManager::get_sources_obj(const rgw_bucket& bucket) const +{ + rgw_bucket b = bucket; + b.bucket_id.clear(); + return rgw_raw_obj(svc.zone->get_zone_params().log_pool, + bucket_sync_sources_oid_prefix + "." + b.get_key()); +} + +rgw_raw_obj RGWSI_Bucket_Sync_SObj_HintIndexManager::get_dests_obj(const rgw_bucket& bucket) const +{ + rgw_bucket b = bucket; + b.bucket_id.clear(); + return rgw_raw_obj(svc.zone->get_zone_params().log_pool, + bucket_sync_targets_oid_prefix + "." 
+ b.get_key()); +} + +template +int RGWSI_Bucket_Sync_SObj_HintIndexManager::update_hints(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + C1& added_dests, + C2& removed_dests, + C1& added_sources, + C2& removed_sources, + optional_yield y) +{ + C1 self_entity = { bucket_info.bucket }; + + if (!added_dests.empty() || + !removed_dests.empty()) { + /* update our dests */ + RGWSI_BS_SObj_HintIndexObj index(svc.sysobj, + get_dests_obj(bucket_info.bucket)); + int r = index.update(dpp, bucket_info.bucket, + bucket_info, + &added_dests, + &removed_dests, + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << bucket_info.bucket << " r=" << r << dendl; + return r; + } + + /* update dest buckets */ + for (auto& dest_bucket : added_dests) { + RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj, + get_sources_obj(dest_bucket)); + int r = dep_index.update(dpp, dest_bucket, + bucket_info, + &self_entity, + static_cast(nullptr), + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << dest_bucket << " r=" << r << dendl; + return r; + } + } + /* update removed dest buckets */ + for (auto& dest_bucket : removed_dests) { + RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj, + get_sources_obj(dest_bucket)); + int r = dep_index.update(dpp, dest_bucket, + bucket_info, + static_cast(nullptr), + &self_entity, + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << dest_bucket << " r=" << r << dendl; + return r; + } + } + } + + if (!added_sources.empty() || + !removed_sources.empty()) { + RGWSI_BS_SObj_HintIndexObj index(svc.sysobj, + get_sources_obj(bucket_info.bucket)); + /* update our sources */ + int r = index.update(dpp, bucket_info.bucket, + bucket_info, + &added_sources, + &removed_sources, + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << bucket_info.bucket << " r=" << r << dendl; + return r; 
+ } + + /* update added sources buckets */ + for (auto& source_bucket : added_sources) { + RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj, + get_dests_obj(source_bucket)); + int r = dep_index.update(dpp, source_bucket, + bucket_info, + &self_entity, + static_cast(nullptr), + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << source_bucket << " r=" << r << dendl; + return r; + } + } + /* update removed dest buckets */ + for (auto& source_bucket : removed_sources) { + RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj, + get_dests_obj(source_bucket)); + int r = dep_index.update(dpp, source_bucket, + bucket_info, + static_cast(nullptr), + &self_entity, + y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << source_bucket << " r=" << r << dendl; + return r; + } + } + } + + return 0; +} + +int RGWSI_Bucket_Sync_SObj::handle_bi_removal(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + optional_yield y) +{ + std::set sources_set; + std::set dests_set; + + if (bucket_info.sync_policy) { + bucket_info.sync_policy->get_potential_related_buckets(bucket_info.bucket, + &sources_set, + &dests_set); + } + + std::vector removed_sources; + removed_sources.reserve(sources_set.size()); + for (auto& e : sources_set) { + removed_sources.push_back(e); + } + + std::vector removed_dests; + removed_dests.reserve(dests_set.size()); + for (auto& e : dests_set) { + removed_dests.push_back(e); + } + + std::vector added_sources; + std::vector added_dests; + + return hint_index_mgr->update_hints(dpp, bucket_info, + added_dests, + removed_dests, + added_sources, + removed_sources, + y); +} + +int RGWSI_Bucket_Sync_SObj::handle_bi_update(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + RGWBucketInfo *orig_bucket_info, + optional_yield y) +{ + std::set orig_sources; + std::set orig_dests; + + if (orig_bucket_info && + orig_bucket_info->sync_policy) { + 
orig_bucket_info->sync_policy->get_potential_related_buckets(bucket_info.bucket, + &orig_sources, + &orig_dests); + } + + std::set sources; + std::set dests; + if (bucket_info.sync_policy) { + bucket_info.sync_policy->get_potential_related_buckets(bucket_info.bucket, + &sources, + &dests); + } + + std::vector removed_sources; + std::vector added_sources; + bool found = diff_sets(orig_sources, sources, &added_sources, &removed_sources); + ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": orig_sources=" << orig_sources << " new_sources=" << sources << dendl; + ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": potential sources added=" << added_sources << " removed=" << removed_sources << dendl; + + std::vector removed_dests; + std::vector added_dests; + found = found || diff_sets(orig_dests, dests, &added_dests, &removed_dests); + + ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": orig_dests=" << orig_dests << " new_dests=" << dests << dendl; + ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": potential dests added=" << added_dests << " removed=" << removed_dests << dendl; + + if (!found) { + return 0; + } + + return hint_index_mgr->update_hints(dpp, bucket_info, + dests, /* set all dests, not just the ones that were added */ + removed_dests, + sources, /* set all sources, not just that the ones that were added */ + removed_sources, + y); +} + +int RGWSI_Bucket_Sync_SObj::get_bucket_sync_hints(const DoutPrefixProvider *dpp, + const rgw_bucket& bucket, + std::set *sources, + std::set *dests, + optional_yield y) +{ + if (!sources && !dests) { + return 0; + } + + if (sources) { + RGWSI_BS_SObj_HintIndexObj index(svc.sysobj, + hint_index_mgr->get_sources_obj(bucket)); + int r = index.read(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to update sources index for bucket=" << bucket << " r=" << r << dendl; + return r; + } + + index.get_entities(bucket, 
sources); + + if (!bucket.bucket_id.empty()) { + rgw_bucket b = bucket; + b.bucket_id.clear(); + index.get_entities(b, sources); + } + } + + if (dests) { + RGWSI_BS_SObj_HintIndexObj index(svc.sysobj, + hint_index_mgr->get_dests_obj(bucket)); + int r = index.read(dpp, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read targets index for bucket=" << bucket << " r=" << r << dendl; + return r; + } + + index.get_entities(bucket, dests); + + if (!bucket.bucket_id.empty()) { + rgw_bucket b = bucket; + b.bucket_id.clear(); + index.get_entities(b, dests); + } + } + + return 0; +} diff --git a/src/rgw/services/svc_bucket_sync_sobj.h b/src/rgw/services/svc_bucket_sync_sobj.h new file mode 100644 index 000000000..779df7b99 --- /dev/null +++ b/src/rgw/services/svc_bucket_sync_sobj.h @@ -0,0 +1,123 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_meta_be.h" +#include "svc_bucket_sync.h" + +class RGWSI_Zone; +class RGWSI_SysObj_Cache; +class RGWSI_Bucket_SObj; + +template +class RGWChainedCacheImpl; + +class RGWSI_Bucket_Sync_SObj_HintIndexManager; + +struct rgw_sync_bucket_entity; + +class RGWSI_Bucket_Sync_SObj : public RGWSI_Bucket_Sync +{ + struct bucket_sync_policy_cache_entry { + std::shared_ptr handler; + }; + + std::unique_ptr > sync_policy_cache; + + std::unique_ptr hint_index_mgr; + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + + struct optional_zone_bucket { + std::optional zone; + std::optional bucket; + + optional_zone_bucket(const std::optional& _zone, + const std::optional& _bucket) : zone(_zone), bucket(_bucket) {} + + bool operator<(const optional_zone_bucket& ozb) const { + if (zone < ozb.zone) { + return true; + } + if (zone > ozb.zone) { + return false; + } + return bucket < ozb.bucket; + } + }; + + void get_hint_entities(RGWSI_Bucket_X_Ctx& ctx, + const std::set& zone_names, + const std::set& buckets, + std::set *hint_entities, + optional_yield y, const DoutPrefixProvider *); + int resolve_policy_hints(RGWSI_Bucket_X_Ctx& ctx, + rgw_sync_bucket_entity& self_entity, + RGWBucketSyncPolicyHandlerRef& handler, + RGWBucketSyncPolicyHandlerRef& zone_policy_handler, + std::map& temp_map, + optional_yield y, + const DoutPrefixProvider *dpp); + int do_get_policy_handler(RGWSI_Bucket_X_Ctx& ctx, + std::optional zone, + std::optional _bucket, + std::map& temp_map, + RGWBucketSyncPolicyHandlerRef *handler, + optional_yield y, + const DoutPrefixProvider *dpp); +public: + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + RGWSI_SysObj_Cache *cache{nullptr}; + RGWSI_Bucket_SObj *bucket_sobj{nullptr}; + } svc; + + RGWSI_Bucket_Sync_SObj(CephContext *cct); + ~RGWSI_Bucket_Sync_SObj(); + + void init(RGWSI_Zone *_zone_svc, + RGWSI_SysObj *_sysobj_svc, + RGWSI_SysObj_Cache 
*_cache_svc, + RGWSI_Bucket_SObj *_bucket_sobj_svc); + + + int get_policy_handler(RGWSI_Bucket_X_Ctx& ctx, + std::optional zone, + std::optional bucket, + RGWBucketSyncPolicyHandlerRef *handler, + optional_yield y, + const DoutPrefixProvider *dpp); + + int handle_bi_update(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + RGWBucketInfo *orig_bucket_info, + optional_yield y) override; + int handle_bi_removal(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + optional_yield y) override; + + int get_bucket_sync_hints(const DoutPrefixProvider *dpp, + const rgw_bucket& bucket, + std::set *sources, + std::set *dests, + optional_yield y) override; +}; + diff --git a/src/rgw/services/svc_bucket_types.h b/src/rgw/services/svc_bucket_types.h new file mode 100644 index 000000000..30e5309d5 --- /dev/null +++ b/src/rgw/services/svc_bucket_types.h @@ -0,0 +1,38 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "common/ptr_wrapper.h" + +#include "svc_meta_be.h" +#include "svc_meta_be_types.h" + +class RGWSI_MetaBackend_Handler; + +using RGWSI_Bucket_BE_Handler = ptr_wrapper; +using RGWSI_BucketInstance_BE_Handler = ptr_wrapper; + + +using RGWSI_Bucket_EP_Ctx = ptr_wrapper; +using RGWSI_Bucket_BI_Ctx = ptr_wrapper; + +struct RGWSI_Bucket_X_Ctx { + RGWSI_Bucket_EP_Ctx ep; + RGWSI_Bucket_BI_Ctx bi; +}; + diff --git a/src/rgw/services/svc_cls.cc b/src/rgw/services/svc_cls.cc new file mode 100644 index 000000000..342146bfe --- /dev/null +++ b/src/rgw/services/svc_cls.cc @@ -0,0 +1,478 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include "svc_cls.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw_zone.h" + +#include "cls/otp/cls_otp_client.h" +#include "cls/log/cls_log_client.h" +#include "cls/lock/cls_lock_client.h" + + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static string log_lock_name = "rgw_log_lock"; + +int RGWSI_Cls::do_start(optional_yield y, const DoutPrefixProvider *dpp) +{ + int r = mfa.do_start(y, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start mfa service" << dendl; + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::get_mfa_obj(const DoutPrefixProvider *dpp, const rgw_user& user, std::optional *obj) +{ + string oid = get_mfa_oid(user); + rgw_raw_obj o(zone_svc->get_zone_params().otp_pool, oid); + + obj->emplace(rados_svc->obj(o)); + int r = (*obj)->open(dpp); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to open rados context for " << o << dendl; + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::get_mfa_ref(const DoutPrefixProvider *dpp, const rgw_user& user, rgw_rados_ref *ref) +{ + std::optional obj; + int r = get_mfa_obj(dpp, user, &obj); + if (r < 0) { + return r; + } + *ref = obj->get_ref(); + return 0; +} + +int RGWSI_Cls::MFA::check_mfa(const DoutPrefixProvider *dpp, const 
rgw_user& user, const string& otp_id, const string& pin, optional_yield y) +{ + rgw_rados_ref ref; + int r = get_mfa_ref(dpp, user, &ref); + if (r < 0) { + return r; + } + + rados::cls::otp::otp_check_t result; + + r = rados::cls::otp::OTP::check(cct, ref.pool.ioctx(), ref.obj.oid, otp_id, pin, &result); + if (r < 0) + return r; + + ldpp_dout(dpp, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl; + + return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 0 : -EACCES); +} + +void RGWSI_Cls::MFA::prepare_mfa_write(librados::ObjectWriteOperation *op, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime) +{ + RGWObjVersionTracker ot; + + if (objv_tracker) { + ot = *objv_tracker; + } + + if (ot.write_version.tag.empty()) { + if (ot.read_version.tag.empty()) { + ot.generate_new_write_ver(cct); + } else { + ot.write_version = ot.read_version; + ot.write_version.ver++; + } + } + + ot.prepare_op_for_write(op); + struct timespec mtime_ts = real_clock::to_timespec(mtime); + op->mtime2(&mtime_ts); +} + +int RGWSI_Cls::MFA::create_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const rados::cls::otp::otp_info_t& config, + RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime, optional_yield y) +{ + std::optional obj; + int r = get_mfa_obj(dpp, user, &obj); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + prepare_mfa_write(&op, objv_tracker, mtime); + rados::cls::otp::OTP::create(&op, config); + r = obj->operate(dpp, &op, y); + if (r < 0) { + ldpp_dout(dpp, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl; + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::remove_mfa(const DoutPrefixProvider *dpp, + const rgw_user& user, const string& id, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime, + optional_yield y) +{ + std::optional obj; + int r = get_mfa_obj(dpp, user, &obj); + if (r < 0) { + return r; + } + + 
librados::ObjectWriteOperation op; + prepare_mfa_write(&op, objv_tracker, mtime); + rados::cls::otp::OTP::remove(&op, id); + r = obj->operate(dpp, &op, y); + if (r < 0) { + ldpp_dout(dpp, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl; + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::get_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result, + optional_yield y) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(dpp, user, &ref); + if (r < 0) { + return r; + } + + r = rados::cls::otp::OTP::get(nullptr, ref.pool.ioctx(), ref.obj.oid, id, result); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::list_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, list *result, + optional_yield y) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(dpp, user, &ref); + if (r < 0) { + return r; + } + + r = rados::cls::otp::OTP::get_all(nullptr, ref.pool.ioctx(), ref.obj.oid, result); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::otp_get_current_time(const DoutPrefixProvider *dpp, const rgw_user& user, ceph::real_time *result, + optional_yield y) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(dpp, user, &ref); + if (r < 0) { + return r; + } + + r = rados::cls::otp::OTP::get_current_time(ref.pool.ioctx(), ref.obj.oid, result); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::set_mfa(const DoutPrefixProvider *dpp, const string& oid, const list& entries, + bool reset_obj, RGWObjVersionTracker *objv_tracker, + const real_time& mtime, + optional_yield y) +{ + rgw_raw_obj o(zone_svc->get_zone_params().otp_pool, oid); + auto obj = rados_svc->obj(o); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to open rados context for " << o << dendl; + return r; + } + librados::ObjectWriteOperation op; + if (reset_obj) { + op.remove(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.create(false); + } + prepare_mfa_write(&op, 
objv_tracker, mtime); + rados::cls::otp::OTP::set(&op, entries); + r = obj.operate(dpp, &op, y); + if (r < 0) { + ldpp_dout(dpp, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl; + return r; + } + + return 0; +} + +int RGWSI_Cls::MFA::list_mfa(const DoutPrefixProvider *dpp, const string& oid, list *result, + RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime, + optional_yield y) +{ + rgw_raw_obj o(zone_svc->get_zone_params().otp_pool, oid); + auto obj = rados_svc->obj(o); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to open rados context for " << o << dendl; + return r; + } + auto& ref = obj.get_ref(); + librados::ObjectReadOperation op; + struct timespec mtime_ts; + if (pmtime) { + op.stat2(nullptr, &mtime_ts, nullptr); + } + objv_tracker->prepare_op_for_read(&op); + r = rados::cls::otp::OTP::get_all(&op, ref.pool.ioctx(), ref.obj.oid, result); + if (r < 0) { + return r; + } + if (pmtime) { + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + } + + return 0; +} + +void RGWSI_Cls::TimeLog::prepare_entry(cls_log_entry& entry, + const real_time& ut, + const string& section, + const string& key, + bufferlist& bl) +{ + cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl); +} + +int RGWSI_Cls::TimeLog::init_obj(const DoutPrefixProvider *dpp, const string& oid, RGWSI_RADOS::Obj& obj) +{ + rgw_raw_obj o(zone_svc->get_zone_params().log_pool, oid); + obj = rados_svc->obj(o); + return obj.open(dpp); + +} +int RGWSI_Cls::TimeLog::add(const DoutPrefixProvider *dpp, + const string& oid, + const real_time& ut, + const string& section, + const string& key, + bufferlist& bl, + optional_yield y) +{ + RGWSI_RADOS::Obj obj; + + int r = init_obj(dpp, oid, obj); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + utime_t t(ut); + cls_log_add(op, t, section, key, bl); + + return obj.operate(dpp, &op, y); +} + +int RGWSI_Cls::TimeLog::add(const DoutPrefixProvider *dpp, + const string& 
oid, + std::list& entries, + librados::AioCompletion *completion, + bool monotonic_inc, + optional_yield y) +{ + RGWSI_RADOS::Obj obj; + + int r = init_obj(dpp, oid, obj); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + cls_log_add(op, entries, monotonic_inc); + + if (!completion) { + r = obj.operate(dpp, &op, y); + } else { + r = obj.aio_operate(completion, &op); + } + return r; +} + +int RGWSI_Cls::TimeLog::list(const DoutPrefixProvider *dpp, + const string& oid, + const real_time& start_time, + const real_time& end_time, + int max_entries, std::list& entries, + const string& marker, + string *out_marker, + bool *truncated, + optional_yield y) +{ + RGWSI_RADOS::Obj obj; + + int r = init_obj(dpp, oid, obj); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + + utime_t st(start_time); + utime_t et(end_time); + + cls_log_list(op, st, et, marker, max_entries, entries, + out_marker, truncated); + + bufferlist obl; + + int ret = obj.operate(dpp, &op, &obl, y); + if (ret < 0) + return ret; + + return 0; +} + +int RGWSI_Cls::TimeLog::info(const DoutPrefixProvider *dpp, + const string& oid, + cls_log_header *header, + optional_yield y) +{ + RGWSI_RADOS::Obj obj; + + int r = init_obj(dpp, oid, obj); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + + cls_log_info(op, header); + + bufferlist obl; + + int ret = obj.operate(dpp, &op, &obl, y); + if (ret < 0) + return ret; + + return 0; +} + +int RGWSI_Cls::TimeLog::info_async(const DoutPrefixProvider *dpp, + RGWSI_RADOS::Obj& obj, + const string& oid, + cls_log_header *header, + librados::AioCompletion *completion) +{ + int r = init_obj(dpp, oid, obj); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + + cls_log_info(op, header); + + int ret = obj.aio_operate(completion, &op, nullptr); + if (ret < 0) + return ret; + + return 0; +} + +int RGWSI_Cls::TimeLog::trim(const DoutPrefixProvider *dpp, + const string& oid, + const real_time& start_time, 
+ const real_time& end_time, + const string& from_marker, + const string& to_marker, + librados::AioCompletion *completion, + optional_yield y) +{ + RGWSI_RADOS::Obj obj; + + int r = init_obj(dpp, oid, obj); + if (r < 0) { + return r; + } + + utime_t st(start_time); + utime_t et(end_time); + + librados::ObjectWriteOperation op; + cls_log_trim(op, st, et, from_marker, to_marker); + + if (!completion) { + r = obj.operate(dpp, &op, y); + } else { + r = obj.aio_operate(completion, &op); + } + return r; +} + +int RGWSI_Cls::Lock::lock_exclusive(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + const string& oid, + timespan& duration, + string& zone_id, + string& owner_id, + std::optional lock_name) +{ + auto p = rados_svc->pool(pool); + int r = p.open(dpp); + if (r < 0) { + return r; + } + + uint64_t msec = std::chrono::duration_cast(duration).count(); + utime_t ut(msec / 1000, msec % 1000); + + rados::cls::lock::Lock l(lock_name.value_or(log_lock_name)); + l.set_duration(ut); + l.set_cookie(owner_id); + l.set_tag(zone_id); + l.set_may_renew(true); + + return l.lock_exclusive(&p.ioctx(), oid); +} + +int RGWSI_Cls::Lock::unlock(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + const string& oid, + string& zone_id, + string& owner_id, + std::optional lock_name) +{ + auto p = rados_svc->pool(pool); + int r = p.open(dpp); + if (r < 0) { + return r; + } + + rados::cls::lock::Lock l(lock_name.value_or(log_lock_name)); + l.set_tag(zone_id); + l.set_cookie(owner_id); + + return l.unlock(&p.ioctx(), oid); +} + diff --git a/src/rgw/services/svc_cls.h b/src/rgw/services/svc_cls.h new file mode 100644 index 000000000..d1d1d659b --- /dev/null +++ b/src/rgw/services/svc_cls.h @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "cls/otp/cls_otp_types.h" +#include "cls/log/cls_log_types.h" + +#include "rgw_service.h" + +#include "svc_rados.h" + + +class RGWSI_Cls : public RGWServiceInstance +{ + RGWSI_Zone *zone_svc{nullptr}; + RGWSI_RADOS *rados_svc{nullptr}; + + class ClsSubService : public RGWServiceInstance { + friend class RGWSI_Cls; + + RGWSI_Cls *cls_svc{nullptr}; + RGWSI_Zone *zone_svc{nullptr}; + RGWSI_RADOS *rados_svc{nullptr}; + + void init(RGWSI_Cls *_cls_svc, RGWSI_Zone *_zone_svc, RGWSI_RADOS *_rados_svc) { + cls_svc = _cls_svc; + zone_svc = _cls_svc->zone_svc; + rados_svc = _cls_svc->rados_svc; + } + + public: + ClsSubService(CephContext *cct) : RGWServiceInstance(cct) {} + }; + +public: + class MFA : public ClsSubService { + int get_mfa_obj(const DoutPrefixProvider *dpp, const rgw_user& user, std::optional *obj); + int get_mfa_ref(const DoutPrefixProvider *dpp, const rgw_user& user, rgw_rados_ref *ref); + + void prepare_mfa_write(librados::ObjectWriteOperation *op, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime); + + public: + MFA(CephContext *cct): ClsSubService(cct) {} + + std::string get_mfa_oid(const rgw_user& user) { + return std::string("user:") + user.to_str(); + } + + int check_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& otp_id, const std::string& pin, optional_yield y); + int create_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const rados::cls::otp::otp_info_t& config, + RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime, optional_yield y); + int remove_mfa(const DoutPrefixProvider *dpp, + const rgw_user& user, const std::string& id, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime, + optional_yield y); + int 
get_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& id, rados::cls::otp::otp_info_t *result, optional_yield y); + int list_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, std::list *result, optional_yield y); + int otp_get_current_time(const DoutPrefixProvider *dpp, const rgw_user& user, ceph::real_time *result, optional_yield y); + int set_mfa(const DoutPrefixProvider *dpp, const std::string& oid, const std::list& entries, + bool reset_obj, RGWObjVersionTracker *objv_tracker, + const real_time& mtime, optional_yield y); + int list_mfa(const DoutPrefixProvider *dpp, const std::string& oid, std::list *result, + RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime, optional_yield y); + } mfa; + + class TimeLog : public ClsSubService { + int init_obj(const DoutPrefixProvider *dpp, const std::string& oid, RGWSI_RADOS::Obj& obj); + public: + TimeLog(CephContext *cct): ClsSubService(cct) {} + + void prepare_entry(cls_log_entry& entry, + const real_time& ut, + const std::string& section, + const std::string& key, + bufferlist& bl); + int add(const DoutPrefixProvider *dpp, + const std::string& oid, + const real_time& ut, + const std::string& section, + const std::string& key, + bufferlist& bl, + optional_yield y); + int add(const DoutPrefixProvider *dpp, + const std::string& oid, + std::list& entries, + librados::AioCompletion *completion, + bool monotonic_inc, + optional_yield y); + int list(const DoutPrefixProvider *dpp, + const std::string& oid, + const real_time& start_time, + const real_time& end_time, + int max_entries, std::list& entries, + const std::string& marker, + std::string *out_marker, + bool *truncated, + optional_yield y); + int info(const DoutPrefixProvider *dpp, + const std::string& oid, + cls_log_header *header, + optional_yield y); + int info_async(const DoutPrefixProvider *dpp, + RGWSI_RADOS::Obj& obj, + const std::string& oid, + cls_log_header *header, + librados::AioCompletion *completion); + int 
trim(const DoutPrefixProvider *dpp, + const std::string& oid, + const real_time& start_time, + const real_time& end_time, + const std::string& from_marker, + const std::string& to_marker, + librados::AioCompletion *completion, + optional_yield y); + } timelog; + + class Lock : public ClsSubService { + int init_obj(const std::string& oid, RGWSI_RADOS::Obj& obj); + public: + Lock(CephContext *cct): ClsSubService(cct) {} + int lock_exclusive(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + const std::string& oid, + timespan& duration, + std::string& zone_id, + std::string& owner_id, + std::optional lock_name = std::nullopt); + int unlock(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + const std::string& oid, + std::string& zone_id, + std::string& owner_id, + std::optional lock_name = std::nullopt); + } lock; + + RGWSI_Cls(CephContext *cct): RGWServiceInstance(cct), mfa(cct), timelog(cct), lock(cct) {} + + void init(RGWSI_Zone *_zone_svc, RGWSI_RADOS *_rados_svc) { + rados_svc = _rados_svc; + zone_svc = _zone_svc; + + mfa.init(this, zone_svc, rados_svc); + timelog.init(this, zone_svc, rados_svc); + lock.init(this, zone_svc, rados_svc); + } + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; +}; + diff --git a/src/rgw/services/svc_config_key.h b/src/rgw/services/svc_config_key.h new file mode 100644 index 000000000..1c068b795 --- /dev/null +++ b/src/rgw/services/svc_config_key.h @@ -0,0 +1,31 @@ + + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +class RGWSI_ConfigKey : public RGWServiceInstance +{ +public: + RGWSI_ConfigKey(CephContext *cct) : RGWServiceInstance(cct) {} + virtual ~RGWSI_ConfigKey() {} + + virtual int get(const std::string& key, bool secure, bufferlist *result) = 0; +}; + diff --git a/src/rgw/services/svc_config_key_rados.cc b/src/rgw/services/svc_config_key_rados.cc new file mode 100644 index 000000000..5edb02ea7 --- /dev/null +++ b/src/rgw/services/svc_config_key_rados.cc @@ -0,0 +1,50 @@ + +#include "svc_rados.h" +#include "svc_config_key_rados.h" + +using namespace std; + +RGWSI_ConfigKey_RADOS::~RGWSI_ConfigKey_RADOS(){} + +int RGWSI_ConfigKey_RADOS::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + maybe_insecure_mon_conn = !svc.rados->check_secure_mon_conn(dpp); + + return 0; +} + +void RGWSI_ConfigKey_RADOS::warn_if_insecure() +{ + if (!maybe_insecure_mon_conn || + warned_insecure.test_and_set()) { + return; + } + + string s = "rgw is configured to optionally allow insecure connections to the monitors (auth_supported, ms_mon_client_mode), ssl certificates stored at the monitor configuration could leak"; + + svc.rados->clog_warn(s); + + lderr(ctx()) << __func__ << "(): WARNING: " << s << dendl; +} + +int RGWSI_ConfigKey_RADOS::get(const string& key, bool secure, bufferlist *result) +{ + string cmd = + "{" + "\"prefix\": \"config-key get\", " + "\"key\": \"" + key + "\"" + "}"; + + bufferlist inbl; + auto handle = svc.rados->handle(); + int ret = handle.mon_command(cmd, inbl, result, nullptr); + if (ret < 0) { + return ret; + } + + if (secure) { + warn_if_insecure(); + } + + return 0; +} diff --git a/src/rgw/services/svc_config_key_rados.h b/src/rgw/services/svc_config_key_rados.h new file mode 100644 index 000000000..b3b995ac7 --- /dev/null +++ b/src/rgw/services/svc_config_key_rados.h @@ -0,0 +1,54 @@ + + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + 
+/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include + +#include "rgw_service.h" + +#include "svc_config_key.h" + +class RGWSI_RADOS; + +class RGWSI_ConfigKey_RADOS : public RGWSI_ConfigKey +{ + bool maybe_insecure_mon_conn{false}; + std::atomic_flag warned_insecure = ATOMIC_FLAG_INIT; + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + + void warn_if_insecure(); + +public: + struct Svc { + RGWSI_RADOS *rados{nullptr}; + } svc; + + void init(RGWSI_RADOS *rados_svc) { + svc.rados = rados_svc; + } + + RGWSI_ConfigKey_RADOS(CephContext *cct) : RGWSI_ConfigKey(cct) {} + + virtual ~RGWSI_ConfigKey_RADOS() override; + + int get(const std::string& key, bool secure, bufferlist *result) override; +}; + + diff --git a/src/rgw/services/svc_finisher.cc b/src/rgw/services/svc_finisher.cc new file mode 100644 index 000000000..4883c7c50 --- /dev/null +++ b/src/rgw/services/svc_finisher.cc @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/Finisher.h" + +#include "svc_finisher.h" + +using namespace std; + +int RGWSI_Finisher::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + finisher = new Finisher(cct); + finisher->start(); + + return 0; +} + +void RGWSI_Finisher::shutdown() +{ + if (finalized) { + return; + } + + if (finisher) { + finisher->stop(); + + map cbs; + cbs.swap(shutdown_cbs); /* move cbs out, in case caller unregisters */ + for (auto& iter : cbs) { + iter.second->call(); + } + delete finisher; + } + + finalized = true; +} + +RGWSI_Finisher::~RGWSI_Finisher() +{ + shutdown(); +} + +void RGWSI_Finisher::register_caller(ShutdownCB *cb, int *phandle) +{ + 
*phandle = ++handles_counter; + shutdown_cbs[*phandle] = cb; +} + +void RGWSI_Finisher::unregister_caller(int handle) +{ + shutdown_cbs.erase(handle); +} + +void RGWSI_Finisher::schedule_context(Context *c) +{ + finisher->queue(c); +} + diff --git a/src/rgw/services/svc_finisher.h b/src/rgw/services/svc_finisher.h new file mode 100644 index 000000000..911b48f2b --- /dev/null +++ b/src/rgw/services/svc_finisher.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" + +class Context; +class Finisher; + +class RGWSI_Finisher : public RGWServiceInstance +{ + friend struct RGWServices_Def; +public: + class ShutdownCB; + +private: + Finisher *finisher{nullptr}; + bool finalized{false}; + + void shutdown() override; + + std::map shutdown_cbs; + std::atomic handles_counter{0}; + +protected: + void init() {} + int do_start(optional_yield y, const DoutPrefixProvider *dpp) override; + +public: + RGWSI_Finisher(CephContext *cct): RGWServiceInstance(cct) {} + ~RGWSI_Finisher(); + + class ShutdownCB { + public: + virtual ~ShutdownCB() {} + virtual void call() = 0; + }; + + void register_caller(ShutdownCB *cb, int *phandle); + void unregister_caller(int handle); + + void schedule_context(Context *c); +}; diff --git a/src/rgw/services/svc_mdlog.cc b/src/rgw/services/svc_mdlog.cc new file mode 100644 index 000000000..09a68d3d7 --- /dev/null +++ b/src/rgw/services/svc_mdlog.cc @@ -0,0 +1,549 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_mdlog.h" +#include "svc_rados.h" +#include "svc_zone.h" +#include "svc_sys_obj.h" + +#include "rgw_tools.h" +#include "rgw_mdlog.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_zone.h" + +#include "common/errno.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +using Svc = RGWSI_MDLog::Svc; +using Cursor 
= RGWPeriodHistory::Cursor; + +RGWSI_MDLog::RGWSI_MDLog(CephContext *cct, bool _run_sync) : RGWServiceInstance(cct), run_sync(_run_sync) { +} + +RGWSI_MDLog::~RGWSI_MDLog() { +} + +int RGWSI_MDLog::init(RGWSI_RADOS *_rados_svc, RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc, RGWSI_Cls *_cls_svc) +{ + svc.zone = _zone_svc; + svc.sysobj = _sysobj_svc; + svc.mdlog = this; + svc.rados = _rados_svc; + svc.cls = _cls_svc; + + return 0; +} + +int RGWSI_MDLog::do_start(optional_yield y, const DoutPrefixProvider *dpp) +{ + auto& current_period = svc.zone->get_current_period(); + + current_log = get_log(current_period.get_id()); + + period_puller.reset(new RGWPeriodPuller(svc.zone, svc.sysobj)); + period_history.reset(new RGWPeriodHistory(cct, period_puller.get(), + current_period)); + + if (run_sync && + svc.zone->need_to_sync()) { + // initialize the log period history + svc.mdlog->init_oldest_log_period(y, dpp); + } + return 0; +} + +int RGWSI_MDLog::read_history(RGWMetadataLogHistory *state, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) const +{ + auto& pool = svc.zone->get_zone_params().log_pool; + const auto& oid = RGWMetadataLogHistory::oid; + bufferlist bl; + int ret = rgw_get_system_obj(svc.sysobj, pool, oid, bl, objv_tracker, nullptr, y, dpp); + if (ret < 0) { + return ret; + } + if (bl.length() == 0) { + /* bad history object, remove it */ + rgw_raw_obj obj(pool, oid); + auto sysobj = svc.sysobj->get_obj(obj); + ret = sysobj.wop().remove(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: meta history is empty, but cannot remove it (" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + return -ENOENT; + } + try { + auto p = bl.cbegin(); + state->decode(p); + } catch (buffer::error& e) { + ldpp_dout(dpp, 1) << "failed to decode the mdlog history: " + << e.what() << dendl; + return -EIO; + } + return 0; +} + +int RGWSI_MDLog::write_history(const DoutPrefixProvider *dpp, + const RGWMetadataLogHistory& state, 
+ RGWObjVersionTracker *objv_tracker, + optional_yield y, bool exclusive) +{ + bufferlist bl; + state.encode(bl); + + auto& pool = svc.zone->get_zone_params().log_pool; + const auto& oid = RGWMetadataLogHistory::oid; + return rgw_put_system_obj(dpp, svc.sysobj, pool, oid, bl, + exclusive, objv_tracker, real_time{}, y); +} + +namespace mdlog { + +using Cursor = RGWPeriodHistory::Cursor; + +namespace { +template +class SysObjReadCR : public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + RGWAsyncRadosProcessor *async_rados; + RGWSI_SysObj *svc; + + rgw_raw_obj obj; + T *result; + /// on ENOENT, call handle_data() with an empty object instead of failing + const bool empty_on_enoent; + RGWObjVersionTracker *objv_tracker; + RGWAsyncGetSystemObj *req{nullptr}; + +public: + SysObjReadCR(const DoutPrefixProvider *_dpp, + RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc, + const rgw_raw_obj& _obj, + T *_result, bool empty_on_enoent = true, + RGWObjVersionTracker *objv_tracker = nullptr) + : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados), svc(_svc), + obj(_obj), result(_result), + empty_on_enoent(empty_on_enoent), objv_tracker(objv_tracker) {} + + ~SysObjReadCR() override { + try { + request_cleanup(); + } catch (const boost::container::length_error_t& e) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": reference counted object mismatched, \"" << e.what() << + "\"" << dendl; + } + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) { + req = new RGWAsyncGetSystemObj(dpp, this, stack->create_completion_notifier(), svc, + objv_tracker, obj, false, false); + async_rados->queue(req); + return 0; + } + + int request_complete() { + int ret = req->get_ret_status(); + retcode = ret; + if (ret == -ENOENT && empty_on_enoent) { + *result = T(); + } else { + if (ret < 0) { + return ret; + } + if (objv_tracker) { // copy the updated version + 
*objv_tracker = req->objv_tracker; + } + try { + auto iter = req->bl.cbegin(); + if (iter.end()) { + // allow successful reads with empty buffers. ReadSyncStatus + // coroutines depend on this to be able to read without + // locking, because the cls lock from InitSyncStatus will + // create an empty object if it didn't exist + *result = T(); + } else { + decode(*result, iter); + } + } catch (buffer::error& err) { + return -EIO; + } + } + return handle_data(*result); + } + + virtual int handle_data(T& data) { + return 0; + } +}; + +template +class SysObjWriteCR : public RGWSimpleCoroutine { + const DoutPrefixProvider *dpp; + RGWAsyncRadosProcessor *async_rados; + RGWSI_SysObj *svc; + bufferlist bl; + rgw_raw_obj obj; + RGWObjVersionTracker *objv_tracker; + bool exclusive; + RGWAsyncPutSystemObj *req{nullptr}; + +public: + SysObjWriteCR(const DoutPrefixProvider *_dpp, + RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc, + const rgw_raw_obj& _obj, const T& _data, + RGWObjVersionTracker *objv_tracker = nullptr, + bool exclusive = false) + : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados), + svc(_svc), obj(_obj), objv_tracker(objv_tracker), exclusive(exclusive) { + encode(_data, bl); + } + + ~SysObjWriteCR() override { + try { + request_cleanup(); + } catch (const boost::container::length_error_t& e) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": reference counted object mismatched, \"" << e.what() << + "\"" << dendl; + } + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request(const DoutPrefixProvider *dpp) override { + req = new RGWAsyncPutSystemObj(dpp, this, stack->create_completion_notifier(), + svc, objv_tracker, obj, exclusive, std::move(bl)); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + if (objv_tracker) { // copy the updated version + *objv_tracker = req->objv_tracker; + } + return req->get_ret_status(); + } +}; +} + +/// read the 
mdlog history and use it to initialize the given cursor +class ReadHistoryCR : public RGWCoroutine { + const DoutPrefixProvider *dpp; + Svc svc; + Cursor *cursor; + RGWObjVersionTracker *objv_tracker; + RGWMetadataLogHistory state; + RGWAsyncRadosProcessor *async_processor; + + public: + ReadHistoryCR(const DoutPrefixProvider *dpp, + const Svc& svc, + Cursor *cursor, + RGWObjVersionTracker *objv_tracker) + : RGWCoroutine(svc.zone->ctx()), dpp(dpp), svc(svc), + cursor(cursor), + objv_tracker(objv_tracker), + async_processor(svc.rados->get_async_processor()) + {} + + int operate(const DoutPrefixProvider *dpp) { + reenter(this) { + yield { + rgw_raw_obj obj{svc.zone->get_zone_params().log_pool, + RGWMetadataLogHistory::oid}; + constexpr bool empty_on_enoent = false; + + using ReadCR = SysObjReadCR; + call(new ReadCR(dpp, async_processor, svc.sysobj, obj, + &state, empty_on_enoent, objv_tracker)); + } + if (retcode < 0) { + ldpp_dout(dpp, 1) << "failed to read mdlog history: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + *cursor = svc.mdlog->period_history->lookup(state.oldest_realm_epoch); + if (!*cursor) { + return set_cr_error(cursor->get_error()); + } + + ldpp_dout(dpp, 10) << "read mdlog history with oldest period id=" + << state.oldest_period_id << " realm_epoch=" + << state.oldest_realm_epoch << dendl; + return set_cr_done(); + } + return 0; + } +}; + +/// write the given cursor to the mdlog history +class WriteHistoryCR : public RGWCoroutine { + const DoutPrefixProvider *dpp; + Svc svc; + Cursor cursor; + RGWObjVersionTracker *objv; + RGWMetadataLogHistory state; + RGWAsyncRadosProcessor *async_processor; + + public: + WriteHistoryCR(const DoutPrefixProvider *dpp, + Svc& svc, + const Cursor& cursor, + RGWObjVersionTracker *objv) + : RGWCoroutine(svc.zone->ctx()), dpp(dpp), svc(svc), + cursor(cursor), objv(objv), + async_processor(svc.rados->get_async_processor()) + {} + + int operate(const DoutPrefixProvider *dpp) { + reenter(this) 
{ + state.oldest_period_id = cursor.get_period().get_id(); + state.oldest_realm_epoch = cursor.get_epoch(); + + yield { + rgw_raw_obj obj{svc.zone->get_zone_params().log_pool, + RGWMetadataLogHistory::oid}; + + using WriteCR = SysObjWriteCR; + call(new WriteCR(dpp, async_processor, svc.sysobj, obj, state, objv)); + } + if (retcode < 0) { + ldpp_dout(dpp, 1) << "failed to write mdlog history: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + ldpp_dout(dpp, 10) << "wrote mdlog history with oldest period id=" + << state.oldest_period_id << " realm_epoch=" + << state.oldest_realm_epoch << dendl; + return set_cr_done(); + } + return 0; + } +}; + +/// update the mdlog history to reflect trimmed logs +class TrimHistoryCR : public RGWCoroutine { + const DoutPrefixProvider *dpp; + Svc svc; + const Cursor cursor; //< cursor to trimmed period + RGWObjVersionTracker *objv; //< to prevent racing updates + Cursor next; //< target cursor for oldest log period + Cursor existing; //< existing cursor read from disk + + public: + TrimHistoryCR(const DoutPrefixProvider *dpp, const Svc& svc, Cursor cursor, RGWObjVersionTracker *objv) + : RGWCoroutine(svc.zone->ctx()), dpp(dpp), svc(svc), + cursor(cursor), objv(objv), next(cursor) { + next.next(); // advance past cursor + } + + int operate(const DoutPrefixProvider *dpp) { + reenter(this) { + // read an existing history, and write the new history if it's newer + yield call(new ReadHistoryCR(dpp, svc, &existing, objv)); + if (retcode < 0) { + return set_cr_error(retcode); + } + // reject older trims with ECANCELED + if (cursor.get_epoch() < existing.get_epoch()) { + ldpp_dout(dpp, 4) << "found oldest log epoch=" << existing.get_epoch() + << ", rejecting trim at epoch=" << cursor.get_epoch() << dendl; + return set_cr_error(-ECANCELED); + } + // overwrite with updated history + yield call(new WriteHistoryCR(dpp, svc, next, objv)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + 
} + return 0; + } +}; + +} // mdlog namespace + +// traverse all the way back to the beginning of the period history, and +// return a cursor to the first period in a fully attached history +Cursor RGWSI_MDLog::find_oldest_period(const DoutPrefixProvider *dpp, optional_yield y) +{ + auto cursor = period_history->get_current(); + + while (cursor) { + // advance to the period's predecessor + if (!cursor.has_prev()) { + auto& predecessor = cursor.get_period().get_predecessor(); + if (predecessor.empty()) { + // this is the first period, so our logs must start here + ldpp_dout(dpp, 10) << "find_oldest_period returning first " + "period " << cursor.get_period().get_id() << dendl; + return cursor; + } + // pull the predecessor and add it to our history + RGWPeriod period; + int r = period_puller->pull(dpp, predecessor, period, y); + if (r < 0) { + return cursor; + } + auto prev = period_history->insert(std::move(period)); + if (!prev) { + return prev; + } + ldpp_dout(dpp, 20) << "find_oldest_period advancing to " + "predecessor period " << predecessor << dendl; + ceph_assert(cursor.has_prev()); + } + cursor.prev(); + } + ldpp_dout(dpp, 10) << "find_oldest_period returning empty cursor" << dendl; + return cursor; +} + +Cursor RGWSI_MDLog::init_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp) +{ + // read the mdlog history + RGWMetadataLogHistory state; + RGWObjVersionTracker objv; + int ret = read_history(&state, &objv, y, dpp); + + if (ret == -ENOENT) { + // initialize the mdlog history and write it + ldpp_dout(dpp, 10) << "initializing mdlog history" << dendl; + auto cursor = find_oldest_period(dpp, y); + if (!cursor) { + return cursor; + } + // write the initial history + state.oldest_realm_epoch = cursor.get_epoch(); + state.oldest_period_id = cursor.get_period().get_id(); + + constexpr bool exclusive = true; // don't overwrite + int ret = write_history(dpp, state, &objv, y, exclusive); + if (ret < 0 && ret != -EEXIST) { + ldpp_dout(dpp, 1) << 
"failed to write mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + return cursor; + } else if (ret < 0) { + ldpp_dout(dpp, 1) << "failed to read mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + + // if it's already in the history, return it + auto cursor = period_history->lookup(state.oldest_realm_epoch); + if (cursor) { + return cursor; + } else { + cursor = find_oldest_period(dpp, y); + state.oldest_realm_epoch = cursor.get_epoch(); + state.oldest_period_id = cursor.get_period().get_id(); + ldpp_dout(dpp, 10) << "rewriting mdlog history" << dendl; + ret = write_history(dpp, state, &objv, y); + if (ret < 0 && ret != -ECANCELED) { + ldpp_dout(dpp, 1) << "failed to write mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + return cursor; + } + + // pull the oldest period by id + RGWPeriod period; + ret = period_puller->pull(dpp, state.oldest_period_id, period, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "failed to read period id=" << state.oldest_period_id + << " for mdlog history: " << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + // verify its realm_epoch + if (period.get_realm_epoch() != state.oldest_realm_epoch) { + ldpp_dout(dpp, 1) << "inconsistent mdlog history: read period id=" + << period.get_id() << " with realm_epoch=" << period.get_realm_epoch() + << ", expected realm_epoch=" << state.oldest_realm_epoch << dendl; + return Cursor{-EINVAL}; + } + // attach the period to our history + return period_history->attach(dpp, std::move(period), y); +} + +Cursor RGWSI_MDLog::read_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp) const +{ + RGWMetadataLogHistory state; + int ret = read_history(&state, nullptr, y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 1) << "failed to read mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + + ldpp_dout(dpp, 10) << "read mdlog history with oldest period id=" + << state.oldest_period_id << " 
realm_epoch=" + << state.oldest_realm_epoch << dendl; + + return period_history->lookup(state.oldest_realm_epoch); +} + +RGWCoroutine* RGWSI_MDLog::read_oldest_log_period_cr(const DoutPrefixProvider *dpp, + Cursor *period, RGWObjVersionTracker *objv) const +{ + return new mdlog::ReadHistoryCR(dpp, svc, period, objv); +} + +RGWCoroutine* RGWSI_MDLog::trim_log_period_cr(const DoutPrefixProvider *dpp, + Cursor period, RGWObjVersionTracker *objv) const +{ + return new mdlog::TrimHistoryCR(dpp, svc, period, objv); +} + +RGWMetadataLog* RGWSI_MDLog::get_log(const std::string& period) +{ + // construct the period's log in place if it doesn't exist + auto insert = md_logs.emplace(std::piecewise_construct, + std::forward_as_tuple(period), + std::forward_as_tuple(cct, svc.zone, svc.cls, period)); + return &insert.first->second; +} + +int RGWSI_MDLog::add_entry(const DoutPrefixProvider *dpp, const string& hash_key, const string& section, const string& key, bufferlist& bl) +{ + ceph_assert(current_log); // must have called init() + return current_log->add_entry(dpp, hash_key, section, key, bl); +} + +int RGWSI_MDLog::get_shard_id(const string& hash_key, int *shard_id) +{ + ceph_assert(current_log); // must have called init() + return current_log->get_shard_id(hash_key, shard_id); +} + +int RGWSI_MDLog::pull_period(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, + optional_yield y) +{ + return period_puller->pull(dpp, period_id, period, y); +} + diff --git a/src/rgw/services/svc_mdlog.h b/src/rgw/services/svc_mdlog.h new file mode 100644 index 000000000..703d6f605 --- /dev/null +++ b/src/rgw/services/svc_mdlog.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "rgw_service.h" +#include "rgw_period_history.h" +#include "rgw_period_puller.h" + +#include "svc_meta_be.h" + + +class RGWMetadataLog; +class RGWMetadataLogHistory; +class RGWCoroutine; + +class RGWSI_Zone; +class RGWSI_SysObj; +class RGWSI_RADOS; + +namespace mdlog { + class ReadHistoryCR; + class WriteHistoryCR; +} + +class RGWSI_MDLog : public RGWServiceInstance +{ + friend class mdlog::ReadHistoryCR; + friend class mdlog::WriteHistoryCR; + + // maintain a separate metadata log for each period + std::map md_logs; + + // use the current period's log for mutating operations + RGWMetadataLog* current_log{nullptr}; + + bool run_sync; + + // pulls missing periods for period_history + std::unique_ptr period_puller; + // maintains a connected history of periods + std::unique_ptr period_history; + +public: + RGWSI_MDLog(CephContext *cct, bool run_sync); + virtual ~RGWSI_MDLog(); + + struct Svc { + RGWSI_RADOS *rados{nullptr}; + RGWSI_Zone *zone{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + RGWSI_MDLog *mdlog{nullptr}; + RGWSI_Cls *cls{nullptr}; + } svc; + + int init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc, + RGWSI_SysObj *_sysobj_svc, + RGWSI_Cls *_cls_svc); + + int do_start(optional_yield y, const DoutPrefixProvider *dpp) override; + + // traverse all the way back to the beginning of the period history, and + // return a cursor to the first period in a fully attached history + RGWPeriodHistory::Cursor find_oldest_period(const DoutPrefixProvider *dpp, optional_yield y); + + /// initialize the oldest log period if it doesn't exist, and attach it to + /// our current history + RGWPeriodHistory::Cursor init_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp); + + /// read the oldest log period, 
and return a cursor to it in our existing + /// period history + RGWPeriodHistory::Cursor read_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp) const; + + /// read the oldest log period asynchronously and write its result to the + /// given cursor pointer + RGWCoroutine* read_oldest_log_period_cr(const DoutPrefixProvider *dpp, + RGWPeriodHistory::Cursor *period, + RGWObjVersionTracker *objv) const; + + /// try to advance the oldest log period when the given period is trimmed, + /// using a rados lock to provide atomicity + RGWCoroutine* trim_log_period_cr(const DoutPrefixProvider *dpp, + RGWPeriodHistory::Cursor period, + RGWObjVersionTracker *objv) const; + int read_history(RGWMetadataLogHistory *state, RGWObjVersionTracker *objv_tracker,optional_yield y, const DoutPrefixProvider *dpp) const; + int write_history(const DoutPrefixProvider *dpp, + const RGWMetadataLogHistory& state, + RGWObjVersionTracker *objv_tracker, + optional_yield y, bool exclusive = false); + + int add_entry(const DoutPrefixProvider *dpp, const std::string& hash_key, const std::string& section, const std::string& key, bufferlist& bl); + + int get_shard_id(const std::string& hash_key, int *shard_id); + + RGWPeriodHistory *get_period_history() { + return period_history.get(); + } + + int pull_period(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y); + + /// find or create the metadata log for the given period + RGWMetadataLog* get_log(const std::string& period); +}; + diff --git a/src/rgw/services/svc_meta.cc b/src/rgw/services/svc_meta.cc new file mode 100644 index 000000000..735c39f85 --- /dev/null +++ b/src/rgw/services/svc_meta.cc @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include "svc_meta.h" + +#include "rgw_metadata.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_Meta::RGWSI_Meta(CephContext *cct) : 
RGWServiceInstance(cct) { +} + +RGWSI_Meta::~RGWSI_Meta() {} + +void RGWSI_Meta::init(RGWSI_SysObj *_sysobj_svc, + RGWSI_MDLog *_mdlog_svc, + vector& _be_svc) +{ + sysobj_svc = _sysobj_svc; + mdlog_svc = _mdlog_svc; + + for (auto& be : _be_svc) { + be_svc[be->get_type()] = be; + } +} + +int RGWSI_Meta::create_be_handler(RGWSI_MetaBackend::Type be_type, + RGWSI_MetaBackend_Handler **phandler) +{ + auto iter = be_svc.find(be_type); + if (iter == be_svc.end()) { + ldout(cct, 0) << __func__ << "(): ERROR: backend type not found" << dendl; + return -EINVAL; + } + + auto handler = iter->second->alloc_be_handler(); + + be_handlers.emplace_back(handler); + *phandler = handler; + + return 0; +} + diff --git a/src/rgw/services/svc_meta.h b/src/rgw/services/svc_meta.h new file mode 100644 index 000000000..b398e27fd --- /dev/null +++ b/src/rgw/services/svc_meta.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "svc_meta_be.h" + +#include "rgw_service.h" + + +class RGWMetadataLog; +class RGWCoroutine; + + +class RGWSI_Meta : public RGWServiceInstance +{ + RGWSI_SysObj *sysobj_svc{nullptr}; + RGWSI_MDLog *mdlog_svc{nullptr}; + + std::map be_svc; + + std::vector > be_handlers; + +public: + RGWSI_Meta(CephContext *cct); + ~RGWSI_Meta(); + + void init(RGWSI_SysObj *_sysobj_svc, + RGWSI_MDLog *_mdlog_svc, + std::vector& _be_svc); + + int create_be_handler(RGWSI_MetaBackend::Type be_type, + RGWSI_MetaBackend_Handler **phandler); +}; + diff --git a/src/rgw/services/svc_meta_be.cc b/src/rgw/services/svc_meta_be.cc new file mode 100644 index 000000000..2cb0365c8 --- /dev/null +++ b/src/rgw/services/svc_meta_be.cc @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include "svc_meta_be.h" + +#include "rgw_mdlog.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_MetaBackend::Context::~Context() {} // needed, even though destructor is pure virtual +RGWSI_MetaBackend::Module::~Module() {} // ditto +RGWSI_MetaBackend::PutParams::~PutParams() {} // ... +RGWSI_MetaBackend::GetParams::~GetParams() {} // ... +RGWSI_MetaBackend::RemoveParams::~RemoveParams() {} // ... 
+ +int RGWSI_MetaBackend::pre_modify(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + optional_yield y) +{ + /* if write version has not been set, and there's a read version, set it so that we can + * log it + */ + if (objv_tracker && + objv_tracker->read_version.ver && !objv_tracker->write_version.ver) { + objv_tracker->write_version = objv_tracker->read_version; + objv_tracker->write_version.ver++; + } + + return 0; +} + +int RGWSI_MetaBackend::post_modify(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, int ret, + optional_yield y) +{ + return ret; +} + +int RGWSI_MetaBackend::prepare_mutate(RGWSI_MetaBackend::Context *ctx, + const string& key, + const real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + real_time orig_mtime; + + int ret = call_with_get_params(&orig_mtime, [&](GetParams& params) { + return get_entry(ctx, key, params, objv_tracker, y, dpp); + }); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + + if (objv_tracker->write_version.tag.empty()) { + if (objv_tracker->read_version.tag.empty()) { + objv_tracker->generate_new_write_ver(cct); + } else { + objv_tracker->write_version = objv_tracker->read_version; + objv_tracker->write_version.ver++; + } + } + return 0; +} + +int RGWSI_MetaBackend::do_mutate(RGWSI_MetaBackend::Context *ctx, + const string& key, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + optional_yield y, + std::function f, + bool generic_prepare, + const DoutPrefixProvider *dpp) +{ + int ret; + + if (generic_prepare) { + ret = prepare_mutate(ctx, key, mtime, objv_tracker, y, dpp); + if (ret < 0 || + ret == STATUS_NO_APPLY) { + return ret; + } + } + + RGWMetadataLogData log_data; + 
ret = pre_modify(dpp, ctx, key, log_data, objv_tracker, op_type, y); + if (ret < 0) { + return ret; + } + + ret = f(); + + /* cascading ret into post_modify() */ + + ret = post_modify(dpp, ctx, key, log_data, objv_tracker, ret, y); + if (ret < 0) + return ret; + + return 0; +} + +int RGWSI_MetaBackend::get(Context *ctx, + const string& key, + GetParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs) +{ + return get_entry(ctx, key, params, objv_tracker, y, dpp, get_raw_attrs); +} + +int RGWSI_MetaBackend::put(Context *ctx, + const string& key, + PutParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + std::function f = [&]() { + return put_entry(dpp, ctx, key, params, objv_tracker, y); + }; + + return do_mutate(ctx, key, params.mtime, objv_tracker, + MDLOG_STATUS_WRITE, + y, + f, + false, + dpp); +} + +int RGWSI_MetaBackend::remove(Context *ctx, + const string& key, + RemoveParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + std::function f = [&]() { + return remove_entry(dpp, ctx, key, params, objv_tracker, y); + }; + + return do_mutate(ctx, key, params.mtime, objv_tracker, + MDLOG_STATUS_REMOVE, + y, + f, + false, + dpp); +} + +int RGWSI_MetaBackend::mutate(Context *ctx, + const std::string& key, + MutateParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + std::function f, + const DoutPrefixProvider *dpp) +{ + return do_mutate(ctx, key, params.mtime, objv_tracker, + params.op_type, y, + f, + false, + dpp); +} + +int RGWSI_MetaBackend_Handler::call(std::optional bectx_params, + std::function f) +{ + return be->call(bectx_params, [&](RGWSI_MetaBackend::Context *ctx) { + ctx->init(this); + Op op(be, ctx); + return f(&op); + }); +} + +RGWSI_MetaBackend_Handler::Op_ManagedCtx::Op_ManagedCtx(RGWSI_MetaBackend_Handler *handler) : Op(handler->be, 
handler->be->alloc_ctx()) +{ + auto c = ctx(); + c->init(handler); + pctx.reset(c); +} + diff --git a/src/rgw/services/svc_meta_be.h b/src/rgw/services/svc_meta_be.h new file mode 100644 index 000000000..97267a4e7 --- /dev/null +++ b/src/rgw/services/svc_meta_be.h @@ -0,0 +1,294 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "svc_meta_be_params.h" + +#include "rgw_service.h" +#include "rgw_mdlog_types.h" + +class RGWMetadataLogData; + +class RGWSI_MDLog; +class RGWSI_Meta; +class RGWObjVersionTracker; +class RGWSI_MetaBackend_Handler; + +class RGWSI_MetaBackend : public RGWServiceInstance +{ + friend class RGWSI_Meta; +public: + class Module; + class Context; +protected: + RGWSI_MDLog *mdlog_svc{nullptr}; + + void base_init(RGWSI_MDLog *_mdlog_svc) { + mdlog_svc = _mdlog_svc; + } + + int prepare_mutate(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + const ceph::real_time& mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp); + + virtual int do_mutate(Context *ctx, + const std::string& key, + const ceph::real_time& mtime, RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + optional_yield y, + std::function f, + bool generic_prepare, + const DoutPrefixProvider *dpp); + + virtual int pre_modify(const DoutPrefixProvider *dpp, + Context *ctx, + const std::string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + optional_yield y); + virtual int post_modify(const DoutPrefixProvider *dpp, + Context *ctx, + const std::string& key, + 
RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, int ret, + optional_yield y); +public: + class Module { + /* + * Backend specialization module + */ + public: + virtual ~Module() = 0; + }; + + using ModuleRef = std::shared_ptr; + + struct Context { /* + * A single metadata operation context. Will be holding info about + * backend and operation itself; operation might span multiple backend + * calls. + */ + virtual ~Context() = 0; + + virtual void init(RGWSI_MetaBackend_Handler *h) = 0; + }; + + virtual Context *alloc_ctx() = 0; + + struct PutParams { + ceph::real_time mtime; + + PutParams() {} + PutParams(const ceph::real_time& _mtime) : mtime(_mtime) {} + virtual ~PutParams() = 0; + }; + + struct GetParams { + GetParams() {} + GetParams(ceph::real_time *_pmtime) : pmtime(_pmtime) {} + virtual ~GetParams(); + + ceph::real_time *pmtime{nullptr}; + }; + + struct RemoveParams { + virtual ~RemoveParams() = 0; + + ceph::real_time mtime; + }; + + struct MutateParams { + ceph::real_time mtime; + RGWMDLogStatus op_type; + + MutateParams() {} + MutateParams(const ceph::real_time& _mtime, + RGWMDLogStatus _op_type) : mtime(_mtime), op_type(_op_type) {} + virtual ~MutateParams() {} + }; + + enum Type { + MDBE_SOBJ = 0, + MDBE_OTP = 1, + }; + + RGWSI_MetaBackend(CephContext *cct) : RGWServiceInstance(cct) {} + virtual ~RGWSI_MetaBackend() {} + + virtual Type get_type() = 0; + + virtual RGWSI_MetaBackend_Handler *alloc_be_handler() = 0; + virtual int call_with_get_params(ceph::real_time *pmtime, std::function) = 0; + + /* these should be implemented by backends */ + virtual int get_entry(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWSI_MetaBackend::GetParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs=false) = 0; + virtual int put_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWSI_MetaBackend::PutParams& params, 
+ RGWObjVersionTracker *objv_tracker, + optional_yield y) = 0; + virtual int remove_entry(const DoutPrefixProvider *dpp, + Context *ctx, + const std::string& key, + RGWSI_MetaBackend::RemoveParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y) = 0; + + virtual int list_init(const DoutPrefixProvider *dpp, RGWSI_MetaBackend::Context *ctx, const std::string& marker) = 0; + virtual int list_next(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + int max, std::list *keys, + bool *truncated) = 0; + virtual int list_get_marker(RGWSI_MetaBackend::Context *ctx, + std::string *marker) = 0; + + int call(std::function f) { + return call(std::nullopt, f); + } + + virtual int call(std::optional opt, + std::function f) = 0; + + virtual int get_shard_id(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + int *shard_id) = 0; + + /* higher level */ + virtual int get(Context *ctx, + const std::string& key, + GetParams ¶ms, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs=false); + + virtual int put(Context *ctx, + const std::string& key, + PutParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp); + + virtual int remove(Context *ctx, + const std::string& key, + RemoveParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp); + + virtual int mutate(Context *ctx, + const std::string& key, + MutateParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + std::function f, + const DoutPrefixProvider *dpp); +}; + +class RGWSI_MetaBackend_Handler { + RGWSI_MetaBackend *be{nullptr}; + +public: + class Op { + friend class RGWSI_MetaBackend_Handler; + + RGWSI_MetaBackend *be; + RGWSI_MetaBackend::Context *be_ctx; + + Op(RGWSI_MetaBackend *_be, + RGWSI_MetaBackend::Context *_ctx) : be(_be), be_ctx(_ctx) {} + + public: + RGWSI_MetaBackend::Context *ctx() { + return 
be_ctx; + } + + int get(const std::string& key, + RGWSI_MetaBackend::GetParams ¶ms, + RGWObjVersionTracker *objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) { + return be->get(be_ctx, key, params, objv_tracker, y, dpp); + } + + int put(const std::string& key, + RGWSI_MetaBackend::PutParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) { + return be->put(be_ctx, key, params, objv_tracker, y, dpp); + } + + int remove(const std::string& key, + RGWSI_MetaBackend::RemoveParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) { + return be->remove(be_ctx, key, params, objv_tracker, y, dpp); + } + + int mutate(const std::string& key, + RGWSI_MetaBackend::MutateParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + std::function f, + const DoutPrefixProvider *dpp) { + return be->mutate(be_ctx, key, params, objv_tracker, y, f, dpp); + } + + int list_init(const DoutPrefixProvider *dpp, const std::string& marker) { + return be->list_init(dpp, be_ctx, marker); + } + int list_next(const DoutPrefixProvider *dpp, int max, std::list *keys, + bool *truncated) { + return be->list_next(dpp, be_ctx, max, keys, truncated); + } + int list_get_marker(std::string *marker) { + return be->list_get_marker(be_ctx, marker); + } + + int get_shard_id(const std::string& key, int *shard_id) { + return be->get_shard_id(be_ctx, key, shard_id); + } + }; + + class Op_ManagedCtx : public Op { + std::unique_ptr pctx; + public: + Op_ManagedCtx(RGWSI_MetaBackend_Handler *handler); + }; + + RGWSI_MetaBackend_Handler(RGWSI_MetaBackend *_be) : be(_be) {} + virtual ~RGWSI_MetaBackend_Handler() {} + + int call(std::function f) { + return call(std::nullopt, f); + } + + virtual int call(std::optional bectx_params, + std::function f); +}; + diff --git a/src/rgw/services/svc_meta_be_otp.cc b/src/rgw/services/svc_meta_be_otp.cc new file mode 100644 index 000000000..3cabeb9d0 --- 
/dev/null +++ b/src/rgw/services/svc_meta_be_otp.cc @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_meta_be_otp.h" + +#include "rgw_tools.h" +#include "rgw_metadata.h" +#include "rgw_mdlog.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_MetaBackend_OTP::RGWSI_MetaBackend_OTP(CephContext *cct) : RGWSI_MetaBackend_SObj(cct) { +} + +RGWSI_MetaBackend_OTP::~RGWSI_MetaBackend_OTP() { +} + +string RGWSI_MetaBackend_OTP::get_meta_key(const rgw_user& user) +{ + return string("otp:user:") + user.to_str(); +} + +RGWSI_MetaBackend_Handler *RGWSI_MetaBackend_OTP::alloc_be_handler() +{ + return new RGWSI_MetaBackend_Handler_OTP(this); +} + +RGWSI_MetaBackend::Context *RGWSI_MetaBackend_OTP::alloc_ctx() +{ + return new Context_OTP; +} + +int RGWSI_MetaBackend_OTP::call_with_get_params(ceph::real_time *pmtime, std::function cb) +{ + otp_devices_list_t devices; + RGWSI_MBOTP_GetParams params; + params.pdevices = &devices; + params.pmtime = pmtime; + return cb(params); +} + +int RGWSI_MetaBackend_OTP::get_entry(RGWSI_MetaBackend::Context *_ctx, + const string& key, + RGWSI_MetaBackend::GetParams& _params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs) +{ + RGWSI_MBOTP_GetParams& params = static_cast(_params); + + int r = cls_svc->mfa.list_mfa(dpp, key, params.pdevices, objv_tracker, params.pmtime, y); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_MetaBackend_OTP::put_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + const string& key, + RGWSI_MetaBackend::PutParams& _params, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_MBOTP_PutParams& params = static_cast(_params); + + return cls_svc->mfa.set_mfa(dpp, key, params.devices, true, objv_tracker, params.mtime, y); +} + diff --git a/src/rgw/services/svc_meta_be_otp.h 
b/src/rgw/services/svc_meta_be_otp.h new file mode 100644 index 000000000..7bd9cf652 --- /dev/null +++ b/src/rgw/services/svc_meta_be_otp.h @@ -0,0 +1,89 @@ + + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_cls.h" +#include "svc_meta_be.h" +#include "svc_meta_be_sobj.h" +#include "svc_sys_obj.h" + + +using RGWSI_MBOTP_Handler_Module = RGWSI_MBSObj_Handler_Module; +using RGWSI_MetaBackend_Handler_OTP = RGWSI_MetaBackend_Handler_SObj; + +using otp_devices_list_t = std::list; + +struct RGWSI_MBOTP_GetParams : public RGWSI_MetaBackend::GetParams { + otp_devices_list_t *pdevices{nullptr}; +}; + +struct RGWSI_MBOTP_PutParams : public RGWSI_MetaBackend::PutParams { + otp_devices_list_t devices; +}; + +using RGWSI_MBOTP_RemoveParams = RGWSI_MBSObj_RemoveParams; + +class RGWSI_MetaBackend_OTP : public RGWSI_MetaBackend_SObj +{ + RGWSI_Cls *cls_svc{nullptr}; + +public: + struct Context_OTP : public RGWSI_MetaBackend_SObj::Context_SObj { + otp_devices_list_t devices; + }; + + RGWSI_MetaBackend_OTP(CephContext *cct); + virtual ~RGWSI_MetaBackend_OTP(); + + RGWSI_MetaBackend::Type get_type() { + return MDBE_OTP; + } + + static std::string get_meta_key(const rgw_user& user); + + void init(RGWSI_SysObj *_sysobj_svc, + RGWSI_MDLog *_mdlog_svc, + RGWSI_Cls *_cls_svc) { + RGWSI_MetaBackend_SObj::init(_sysobj_svc, _mdlog_svc); + cls_svc = _cls_svc; + } + + RGWSI_MetaBackend_Handler *alloc_be_handler() override; + RGWSI_MetaBackend::Context *alloc_ctx() override; + + int call_with_get_params(ceph::real_time *pmtime, std::function cb) override; + + int 
get_entry(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWSI_MetaBackend::GetParams& _params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs=false); + int put_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWSI_MetaBackend::PutParams& _params, + RGWObjVersionTracker *objv_tracker, + optional_yield y); +}; + + diff --git a/src/rgw/services/svc_meta_be_params.h b/src/rgw/services/svc_meta_be_params.h new file mode 100644 index 000000000..445f6e188 --- /dev/null +++ b/src/rgw/services/svc_meta_be_params.h @@ -0,0 +1,25 @@ + + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include + +struct RGWSI_MetaBackend_CtxParams_SObj {}; + +using RGWSI_MetaBackend_CtxParams = std::variant; diff --git a/src/rgw/services/svc_meta_be_sobj.cc b/src/rgw/services/svc_meta_be_sobj.cc new file mode 100644 index 000000000..c0ff402fc --- /dev/null +++ b/src/rgw/services/svc_meta_be_sobj.cc @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_meta_be_sobj.h" +#include "svc_meta_be_params.h" +#include "svc_mdlog.h" + +#include "rgw_tools.h" +#include "rgw_metadata.h" +#include "rgw_mdlog.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_MetaBackend_SObj::RGWSI_MetaBackend_SObj(CephContext *cct) : RGWSI_MetaBackend(cct) { +} + +RGWSI_MetaBackend_SObj::~RGWSI_MetaBackend_SObj() { +} + +RGWSI_MetaBackend_Handler *RGWSI_MetaBackend_SObj::alloc_be_handler() +{ + return new RGWSI_MetaBackend_Handler_SObj(this); +} + +RGWSI_MetaBackend::Context *RGWSI_MetaBackend_SObj::alloc_ctx() +{ + return new Context_SObj; +} + +int RGWSI_MetaBackend_SObj::pre_modify(const DoutPrefixProvider *dpp, RGWSI_MetaBackend::Context *_ctx, + const string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + optional_yield y) +{ + auto ctx = static_cast(_ctx); + int ret = RGWSI_MetaBackend::pre_modify(dpp, ctx, key, log_data, + objv_tracker, op_type, + y); + if (ret < 0) { + return ret; + } + + /* if write version has not been set, and there's a read version, set it so that we can + * log it + */ + if (objv_tracker) { + log_data.read_version = objv_tracker->read_version; + log_data.write_version = objv_tracker->write_version; + } + + log_data.status = op_type; + + bufferlist logbl; + encode(log_data, logbl); + + ret = mdlog_svc->add_entry(dpp, ctx->module->get_hash_key(key), ctx->module->get_section(), key, logbl); + if (ret < 0) + return ret; + + return 0; +} + +int 
RGWSI_MetaBackend_SObj::post_modify(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + const string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, int ret, + optional_yield y) +{ + auto ctx = static_cast(_ctx); + if (ret >= 0) + log_data.status = MDLOG_STATUS_COMPLETE; + else + log_data.status = MDLOG_STATUS_ABORT; + + bufferlist logbl; + encode(log_data, logbl); + + int r = mdlog_svc->add_entry(dpp, ctx->module->get_hash_key(key), ctx->module->get_section(), key, logbl); + if (ret < 0) + return ret; + + if (r < 0) + return r; + + return RGWSI_MetaBackend::post_modify(dpp, ctx, key, log_data, objv_tracker, ret, y); +} + +int RGWSI_MetaBackend_SObj::get_shard_id(RGWSI_MetaBackend::Context *_ctx, + const std::string& key, + int *shard_id) +{ + auto ctx = static_cast(_ctx); + *shard_id = mdlog_svc->get_shard_id(ctx->module->get_hash_key(key), shard_id); + return 0; +} + +int RGWSI_MetaBackend_SObj::call(std::optional opt, + std::function f) +{ + RGWSI_MetaBackend_SObj::Context_SObj ctx; + return f(&ctx); +} + +void RGWSI_MetaBackend_SObj::Context_SObj::init(RGWSI_MetaBackend_Handler *h) +{ + RGWSI_MetaBackend_Handler_SObj *handler = static_cast(h); + module = handler->module; +} + +int RGWSI_MetaBackend_SObj::call_with_get_params(ceph::real_time *pmtime, std::function cb) +{ + bufferlist bl; + RGWSI_MBSObj_GetParams params; + params.pmtime = pmtime; + params.pbl = &bl; + return cb(params); +} + +int RGWSI_MetaBackend_SObj::get_entry(RGWSI_MetaBackend::Context *_ctx, + const string& key, + GetParams& _params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs) +{ + RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast(_ctx); + RGWSI_MBSObj_GetParams& params = static_cast(_params); + + rgw_pool pool; + string oid; + ctx->module->get_pool_and_oid(key, &pool, &oid); + + int ret = 0; + ret = rgw_get_system_obj(sysobj_svc, pool, oid, *params.pbl, + objv_tracker, 
params.pmtime, + y, dpp, + params.pattrs, params.cache_info, + params.refresh_version, get_raw_attrs); + + return ret; +} + +int RGWSI_MetaBackend_SObj::put_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + const string& key, + PutParams& _params, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast(_ctx); + RGWSI_MBSObj_PutParams& params = static_cast(_params); + + rgw_pool pool; + string oid; + ctx->module->get_pool_and_oid(key, &pool, &oid); + + return rgw_put_system_obj(dpp, sysobj_svc, pool, oid, params.bl, params.exclusive, + objv_tracker, params.mtime, y, params.pattrs); +} + +int RGWSI_MetaBackend_SObj::remove_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + const string& key, + RemoveParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast(_ctx); + + rgw_pool pool; + string oid; + ctx->module->get_pool_and_oid(key, &pool, &oid); + rgw_raw_obj k(pool, oid); + + auto sysobj = sysobj_svc->get_obj(k); + return sysobj.wop() + .set_objv_tracker(objv_tracker) + .remove(dpp, y); +} + +int RGWSI_MetaBackend_SObj::list_init(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + const string& marker) +{ + RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast(_ctx); + + rgw_pool pool; + + string no_key; + ctx->module->get_pool_and_oid(no_key, &pool, nullptr); + + ctx->list.pool = sysobj_svc->get_pool(pool); + ctx->list.op.emplace(ctx->list.pool->op()); + + string prefix = ctx->module->get_oid_prefix(); + ctx->list.op->init(dpp, marker, prefix); + + return 0; +} + +int RGWSI_MetaBackend_SObj::list_next(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + int max, list *keys, + bool *truncated) +{ + RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast(_ctx); + + vector oids; + + keys->clear(); + + int ret = ctx->list.op->get_next(dpp, max, 
&oids, truncated); + if (ret < 0 && ret != -ENOENT) + return ret; + if (ret == -ENOENT) { + if (truncated) + *truncated = false; + return 0; + } + + auto module = ctx->module; + + for (auto& o : oids) { + if (!module->is_valid_oid(o)) { + continue; + } + keys->emplace_back(module->oid_to_key(o)); + } + + return 0; +} + +int RGWSI_MetaBackend_SObj::list_get_marker(RGWSI_MetaBackend::Context *_ctx, + string *marker) +{ + RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast(_ctx); + + return ctx->list.op->get_marker(marker); +} + diff --git a/src/rgw/services/svc_meta_be_sobj.h b/src/rgw/services/svc_meta_be_sobj.h new file mode 100644 index 000000000..304afc8bf --- /dev/null +++ b/src/rgw/services/svc_meta_be_sobj.h @@ -0,0 +1,194 @@ + + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_meta_be.h" +#include "svc_sys_obj.h" + + +class RGWSI_MBSObj_Handler_Module : public RGWSI_MetaBackend::Module { +protected: + std::string section; +public: + RGWSI_MBSObj_Handler_Module(const std::string& _section) : section(_section) {} + virtual void get_pool_and_oid(const std::string& key, rgw_pool *pool, std::string *oid) = 0; + virtual const std::string& get_oid_prefix() = 0; + virtual std::string key_to_oid(const std::string& key) = 0; + virtual bool is_valid_oid(const std::string& oid) = 0; + virtual std::string oid_to_key(const std::string& oid) = 0; + + const std::string& get_section() { + return section; + } + + /* key to use for hashing entries for log shard placement */ + virtual std::string get_hash_key(const std::string& key) { + return section + ":" + key; + } +}; + +struct RGWSI_MBSObj_GetParams : public RGWSI_MetaBackend::GetParams { + bufferlist *pbl{nullptr}; + std::map *pattrs{nullptr}; + rgw_cache_entry_info *cache_info{nullptr}; + boost::optional refresh_version; + + RGWSI_MBSObj_GetParams() {} + RGWSI_MBSObj_GetParams(bufferlist *_pbl, + std::map *_pattrs, + ceph::real_time *_pmtime) : RGWSI_MetaBackend::GetParams(_pmtime), + pbl(_pbl), + pattrs(_pattrs) {} + + RGWSI_MBSObj_GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) { + cache_info = _cache_info; + return *this; + } + RGWSI_MBSObj_GetParams& set_refresh_version(boost::optional& _refresh_version) { + refresh_version = _refresh_version; + return *this; + } +}; + +struct RGWSI_MBSObj_PutParams : public RGWSI_MetaBackend::PutParams { + bufferlist bl; + std::map *pattrs{nullptr}; + bool exclusive{false}; + + RGWSI_MBSObj_PutParams() {} + RGWSI_MBSObj_PutParams(std::map *_pattrs, + const ceph::real_time& _mtime) : RGWSI_MetaBackend::PutParams(_mtime), + pattrs(_pattrs) {} + RGWSI_MBSObj_PutParams(bufferlist& _bl, + std::map *_pattrs, + const ceph::real_time& _mtime, + bool _exclusive) : 
RGWSI_MetaBackend::PutParams(_mtime), + bl(_bl), + pattrs(_pattrs), + exclusive(_exclusive) {} +}; + +struct RGWSI_MBSObj_RemoveParams : public RGWSI_MetaBackend::RemoveParams { +}; + +class RGWSI_MetaBackend_SObj : public RGWSI_MetaBackend +{ +protected: + RGWSI_SysObj *sysobj_svc{nullptr}; + +public: + struct Context_SObj : public RGWSI_MetaBackend::Context { + RGWSI_MBSObj_Handler_Module *module{nullptr}; + struct _list { + std::optional pool; + std::optional op; + } list; + + void init(RGWSI_MetaBackend_Handler *h) override; + }; + + RGWSI_MetaBackend_SObj(CephContext *cct); + virtual ~RGWSI_MetaBackend_SObj(); + + RGWSI_MetaBackend::Type get_type() { + return MDBE_SOBJ; + } + + void init(RGWSI_SysObj *_sysobj_svc, + RGWSI_MDLog *_mdlog_svc) { + base_init(_mdlog_svc); + sysobj_svc = _sysobj_svc; + } + + RGWSI_MetaBackend_Handler *alloc_be_handler() override; + RGWSI_MetaBackend::Context *alloc_ctx() override; + + + int call_with_get_params(ceph::real_time *pmtime, std::function cb) override; + + int pre_modify(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + optional_yield y); + int post_modify(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, int ret, + optional_yield y); + + int get_entry(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWSI_MetaBackend::GetParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp, + bool get_raw_attrs=false) override; + int put_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const std::string& key, + RGWSI_MetaBackend::PutParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y) override; + int remove_entry(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + 
const std::string& key, + RGWSI_MetaBackend::RemoveParams& params, + RGWObjVersionTracker *objv_tracker, + optional_yield y) override; + + int list_init(const DoutPrefixProvider *dpp, RGWSI_MetaBackend::Context *_ctx, const std::string& marker) override; + int list_next(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *_ctx, + int max, std::list *keys, + bool *truncated) override; + int list_get_marker(RGWSI_MetaBackend::Context *ctx, + std::string *marker) override; + + int get_shard_id(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + int *shard_id) override; + + int call(std::optional opt, + std::function f) override; +}; + + +class RGWSI_MetaBackend_Handler_SObj : public RGWSI_MetaBackend_Handler { + friend class RGWSI_MetaBackend_SObj::Context_SObj; + + RGWSI_MBSObj_Handler_Module *module{nullptr}; + +public: + RGWSI_MetaBackend_Handler_SObj(RGWSI_MetaBackend *be) : + RGWSI_MetaBackend_Handler(be) {} + + void set_module(RGWSI_MBSObj_Handler_Module *_module) { + module = _module; + } + + RGWSI_MBSObj_Handler_Module *get_module() { + return module; + } +}; diff --git a/src/rgw/services/svc_meta_be_types.h b/src/rgw/services/svc_meta_be_types.h new file mode 100644 index 000000000..4a88a8e0b --- /dev/null +++ b/src/rgw/services/svc_meta_be_types.h @@ -0,0 +1,26 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +enum RGWSI_META_BE_TYPES { + SOBJ = 1, + OTP = 2, + BUCKET = 3, + BI = 4, + USER = 5, +}; + diff --git a/src/rgw/services/svc_notify.cc b/src/rgw/services/svc_notify.cc new file mode 100644 index 000000000..43f84ed0a --- /dev/null +++ b/src/rgw/services/svc_notify.cc @@ -0,0 +1,515 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/random.h" +#include "include/Context.h" +#include "common/errno.h" + +#include "rgw_cache.h" +#include "svc_notify.h" +#include "svc_finisher.h" +#include "svc_zone.h" +#include "svc_rados.h" + +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +static string notify_oid_prefix = "notify"; + +RGWSI_Notify::~RGWSI_Notify() +{ + shutdown(); +} + + +class RGWWatcher : public DoutPrefixProvider , public librados::WatchCtx2 { + CephContext *cct; + RGWSI_Notify *svc; + int index; + RGWSI_RADOS::Obj obj; + uint64_t watch_handle; + int register_ret{0}; + bool unregister_done{false}; + librados::AioCompletion *register_completion{nullptr}; + + class C_ReinitWatch : public Context { + RGWWatcher *watcher; + public: + explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {} + void finish(int r) override { + watcher->reinit(); + } + }; + + CephContext *get_cct() const override { return cct; } + unsigned get_subsys() const override { return dout_subsys; } + std::ostream& gen_prefix(std::ostream& out) const override { + return out << "rgw watcher librados: "; + } + +public: + RGWWatcher(CephContext *_cct, RGWSI_Notify *s, int i, RGWSI_RADOS::Obj& o) : cct(_cct), svc(s), index(i), obj(o), watch_handle(0) {} + void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) override { + ldpp_dout(this, 10) << "RGWWatcher::handle_notify() " + << " notify_id " << notify_id + << " cookie " << cookie + << " notifier " << notifier_id + << " bl.length()=" << bl.length() << 
dendl; + + if (unlikely(svc->inject_notify_timeout_probability == 1) || + (svc->inject_notify_timeout_probability > 0 && + (svc->inject_notify_timeout_probability > + ceph::util::generate_random_number(0.0, 1.0)))) { + ldpp_dout(this, 0) + << "RGWWatcher::handle_notify() dropping notification! " + << "If this isn't what you want, set " + << "rgw_inject_notify_timeout_probability to zero!" << dendl; + return; + } + + svc->watch_cb(this, notify_id, cookie, notifier_id, bl); + + bufferlist reply_bl; // empty reply payload + obj.notify_ack(notify_id, cookie, reply_bl); + } + void handle_error(uint64_t cookie, int err) override { + ldpp_dout(this, -1) << "RGWWatcher::handle_error cookie " << cookie + << " err " << cpp_strerror(err) << dendl; + svc->remove_watcher(index); + svc->schedule_context(new C_ReinitWatch(this)); + } + + void reinit() { + if(!unregister_done) { + int ret = unregister_watch(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl; + } + } + int ret = register_watch(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: register_watch() returned ret=" << ret << dendl; + svc->schedule_context(new C_ReinitWatch(this)); + return; + } + } + + int unregister_watch() { + int r = svc->unwatch(obj, watch_handle); + unregister_done = true; + if (r < 0) { + return r; + } + svc->remove_watcher(index); + return 0; + } + + int register_watch_async() { + if (register_completion) { + register_completion->release(); + register_completion = nullptr; + } + register_completion = librados::Rados::aio_create_completion(nullptr, nullptr); + register_ret = obj.aio_watch(register_completion, &watch_handle, this); + if (register_ret < 0) { + register_completion->release(); + return register_ret; + } + return 0; + } + + int register_watch_finish() { + if (register_ret < 0) { + return register_ret; + } + if (!register_completion) { + return -EINVAL; + } + register_completion->wait_for_complete(); + int r = 
register_completion->get_return_value(); + register_completion->release(); + register_completion = nullptr; + if (r < 0) { + return r; + } + svc->add_watcher(index); + unregister_done = false; + return 0; + } + + int register_watch() { + int r = obj.watch(&watch_handle, this); + if (r < 0) { + return r; + } + svc->add_watcher(index); + unregister_done = false; + return 0; + } +}; + + +class RGWSI_Notify_ShutdownCB : public RGWSI_Finisher::ShutdownCB +{ + RGWSI_Notify *svc; +public: + RGWSI_Notify_ShutdownCB(RGWSI_Notify *_svc) : svc(_svc) {} + void call() override { + svc->shutdown(); + } +}; + +string RGWSI_Notify::get_control_oid(int i) +{ + char buf[notify_oid_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", notify_oid_prefix.c_str(), i); + + return string(buf); +} + +// do not call pick_obj_control before init_watch +RGWSI_RADOS::Obj RGWSI_Notify::pick_control_obj(const string& key) +{ + uint32_t r = ceph_str_hash_linux(key.c_str(), key.size()); + + int i = r % num_watchers; + return notify_objs[i]; +} + +int RGWSI_Notify::init_watch(const DoutPrefixProvider *dpp, optional_yield y) +{ + num_watchers = cct->_conf->rgw_num_control_oids; + + bool compat_oid = (num_watchers == 0); + + if (num_watchers <= 0) + num_watchers = 1; + + watchers = new RGWWatcher *[num_watchers]; + + int error = 0; + + notify_objs.resize(num_watchers); + + for (int i=0; i < num_watchers; i++) { + string notify_oid; + + if (!compat_oid) { + notify_oid = get_control_oid(i); + } else { + notify_oid = notify_oid_prefix; + } + + notify_objs[i] = rados_svc->handle().obj({control_pool, notify_oid}); + auto& notify_obj = notify_objs[i]; + + int r = notify_obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: notify_obj.open() returned r=" << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + op.create(false); + r = notify_obj.operate(dpp, &op, y); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, 0) << "ERROR: notify_obj.operate() returned r=" << r << dendl; + 
return r; + } + + RGWWatcher *watcher = new RGWWatcher(cct, this, i, notify_obj); + watchers[i] = watcher; + + r = watcher->register_watch_async(); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: register_watch_aio() returned " << r << dendl; + error = r; + continue; + } + } + + for (int i = 0; i < num_watchers; ++i) { + int r = watchers[i]->register_watch_finish(); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: async watch returned " << r << dendl; + error = r; + } + } + + if (error < 0) { + return error; + } + + return 0; +} + +void RGWSI_Notify::finalize_watch() +{ + for (int i = 0; i < num_watchers; i++) { + RGWWatcher *watcher = watchers[i]; + if (watchers_set.find(i) != watchers_set.end()) + watcher->unregister_watch(); + delete watcher; + } + + delete[] watchers; +} + +int RGWSI_Notify::do_start(optional_yield y, const DoutPrefixProvider *dpp) +{ + int r = zone_svc->start(y, dpp); + if (r < 0) { + return r; + } + + assert(zone_svc->is_started()); /* otherwise there's an ordering problem */ + + r = rados_svc->start(y, dpp); + if (r < 0) { + return r; + } + r = finisher_svc->start(y, dpp); + if (r < 0) { + return r; + } + + inject_notify_timeout_probability = + cct->_conf.get_val("rgw_inject_notify_timeout_probability"); + max_notify_retries = cct->_conf.get_val("rgw_max_notify_retries"); + + control_pool = zone_svc->get_zone_params().control_pool; + + int ret = init_watch(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl; + return ret; + } + + shutdown_cb = new RGWSI_Notify_ShutdownCB(this); + int handle; + finisher_svc->register_caller(shutdown_cb, &handle); + finisher_handle = handle; + + return 0; +} + +void RGWSI_Notify::shutdown() +{ + if (finalized) { + return; + } + + if (finisher_handle) { + finisher_svc->unregister_caller(*finisher_handle); + } + finalize_watch(); + + delete shutdown_cb; + + finalized = true; +} + +int RGWSI_Notify::unwatch(RGWSI_RADOS::Obj& obj, uint64_t 
watch_handle) +{ + int r = obj.unwatch(watch_handle); + if (r < 0) { + ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl; + return r; + } + r = rados_svc->handle().watch_flush(); + if (r < 0) { + ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl; + return r; + } + return 0; +} + +void RGWSI_Notify::add_watcher(int i) +{ + ldout(cct, 20) << "add_watcher() i=" << i << dendl; + std::unique_lock l{watchers_lock}; + watchers_set.insert(i); + if (watchers_set.size() == (size_t)num_watchers) { + ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl; + _set_enabled(true); + } +} + +void RGWSI_Notify::remove_watcher(int i) +{ + ldout(cct, 20) << "remove_watcher() i=" << i << dendl; + std::unique_lock l{watchers_lock}; + size_t orig_size = watchers_set.size(); + watchers_set.erase(i); + if (orig_size == (size_t)num_watchers && + watchers_set.size() < orig_size) { /* actually removed */ + ldout(cct, 2) << "removed watcher, disabling cache" << dendl; + _set_enabled(false); + } +} + +int RGWSI_Notify::watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + std::shared_lock l{watchers_lock}; + if (cb) { + return cb->watch_cb(dpp, notify_id, cookie, notifier_id, bl); + } + return 0; +} + +void RGWSI_Notify::set_enabled(bool status) +{ + std::unique_lock l{watchers_lock}; + _set_enabled(status); +} + +void RGWSI_Notify::_set_enabled(bool status) +{ + enabled = status; + if (cb) { + cb->set_enabled(status); + } +} + +int RGWSI_Notify::distribute(const DoutPrefixProvider *dpp, const string& key, + const RGWCacheNotifyInfo& cni, + optional_yield y) +{ + /* The RGW uses the control pool to store the watch notify objects. + The precedence in RGWSI_Notify::do_start is to call to zone_svc->start and later to init_watch(). + The first time, RGW starts in the cluster, the RGW will try to create zone and zonegroup system object. 
+ In that case RGW will try to distribute the cache before it ran init_watch, + which will lead to division by 0 in pick_obj_control (num_watchers is 0). + */ + if (num_watchers > 0) { + RGWSI_RADOS::Obj notify_obj = pick_control_obj(key); + + ldpp_dout(dpp, 10) << "distributing notification oid=" << notify_obj.get_ref().obj + << " cni=" << cni << dendl; + return robust_notify(dpp, notify_obj, cni, y); + } + return 0; +} + +namespace librados { + +static std::ostream& operator<<(std::ostream& out, const notify_timeout_t& t) +{ + return out << t.notifier_id << ':' << t.cookie; +} + +} // namespace librados + +using timeout_vector = std::vector; + +static timeout_vector decode_timeouts(const bufferlist& bl) +{ + using ceph::decode; + auto p = bl.begin(); + + // decode and discard the acks + uint32_t num_acks; + decode(num_acks, p); + for (auto i = 0u; i < num_acks; ++i) { + std::pair id; + decode(id, p); + // discard the payload + uint32_t blen; + decode(blen, p); + p += blen; + } + + // decode and return the timeouts + uint32_t num_timeouts; + decode(num_timeouts, p); + + timeout_vector timeouts; + for (auto i = 0u; i < num_timeouts; ++i) { + std::pair id; + decode(id, p); + timeouts.push_back({id.first, id.second}); + } + return timeouts; +} + +int RGWSI_Notify::robust_notify(const DoutPrefixProvider *dpp, + RGWSI_RADOS::Obj& notify_obj, + const RGWCacheNotifyInfo& cni, + optional_yield y) +{ + bufferlist bl, rbl; + encode(cni, bl); + + // First, try to send, without being fancy about it. + auto r = notify_obj.notify(dpp, bl, 0, &rbl, y); + + if (r < 0) { + timeout_vector timeouts; + try { + timeouts = decode_timeouts(rbl); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 0) << "robust_notify failed to decode notify response: " + << e.what() << dendl; + } + + ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " Watchers " << timeouts << " did not respond." 
+ << " Notify failed on object " << cni.obj << ": " + << cpp_strerror(-r) << dendl; + } + + // If we timed out, get serious. + if (r == -ETIMEDOUT) { + RGWCacheNotifyInfo info; + info.op = INVALIDATE_OBJ; + info.obj = cni.obj; + bufferlist retrybl; + encode(info, retrybl); + + for (auto tries = 0u; + r == -ETIMEDOUT && tries < max_notify_retries; + ++tries) { + ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " Invalidating obj=" << info.obj << " tries=" + << tries << dendl; + r = notify_obj.notify(dpp, retrybl, 0, &rbl, y); + if (r < 0) { + timeout_vector timeouts; + try { + timeouts = decode_timeouts(rbl); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 0) << "robust_notify failed to decode notify response: " + << e.what() << dendl; + } + + ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " Watchers " << timeouts << " did not respond." + << " Invalidation attempt " << tries << " failed: " + << cpp_strerror(-r) << dendl; + } + } + } + return r; +} + +void RGWSI_Notify::register_watch_cb(CB *_cb) +{ + std::unique_lock l{watchers_lock}; + cb = _cb; + _set_enabled(enabled); +} + +void RGWSI_Notify::schedule_context(Context *c) +{ + finisher_svc->schedule_context(c); +} diff --git a/src/rgw/services/svc_notify.h b/src/rgw/services/svc_notify.h new file mode 100644 index 000000000..f7329136e --- /dev/null +++ b/src/rgw/services/svc_notify.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" + +#include "svc_rados.h" + + +class Context; + +class RGWSI_Zone; +class RGWSI_Finisher; + +class RGWWatcher; +class RGWSI_Notify_ShutdownCB; +struct RGWCacheNotifyInfo; + +class RGWSI_Notify : public RGWServiceInstance +{ + friend class RGWWatcher; + friend class RGWSI_Notify_ShutdownCB; + friend class RGWServices_Def; + +public: + class CB; + +private: + RGWSI_Zone *zone_svc{nullptr}; + RGWSI_RADOS *rados_svc{nullptr}; + 
RGWSI_Finisher *finisher_svc{nullptr}; + + ceph::shared_mutex watchers_lock = ceph::make_shared_mutex("watchers_lock"); + rgw_pool control_pool; + + int num_watchers{0}; + RGWWatcher **watchers{nullptr}; + std::set watchers_set; + std::vector notify_objs; + + bool enabled{false}; + + double inject_notify_timeout_probability{0}; + uint64_t max_notify_retries = 10; + + std::string get_control_oid(int i); + RGWSI_RADOS::Obj pick_control_obj(const std::string& key); + + CB *cb{nullptr}; + + std::optional finisher_handle; + RGWSI_Notify_ShutdownCB *shutdown_cb{nullptr}; + + bool finalized{false}; + + int init_watch(const DoutPrefixProvider *dpp, optional_yield y); + void finalize_watch(); + + void init(RGWSI_Zone *_zone_svc, + RGWSI_RADOS *_rados_svc, + RGWSI_Finisher *_finisher_svc) { + zone_svc = _zone_svc; + rados_svc = _rados_svc; + finisher_svc = _finisher_svc; + } + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + void shutdown() override; + + int unwatch(RGWSI_RADOS::Obj& obj, uint64_t watch_handle); + void add_watcher(int i); + void remove_watcher(int i); + + int watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl); + void _set_enabled(bool status); + void set_enabled(bool status); + + int robust_notify(const DoutPrefixProvider *dpp, RGWSI_RADOS::Obj& notify_obj, + const RGWCacheNotifyInfo& bl, optional_yield y); + + void schedule_context(Context *c); +public: + RGWSI_Notify(CephContext *cct): RGWServiceInstance(cct) {} + + virtual ~RGWSI_Notify() override; + + class CB { + public: + virtual ~CB() {} + virtual int watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + virtual void set_enabled(bool status) = 0; + }; + + int distribute(const DoutPrefixProvider *dpp, const std::string& key, const RGWCacheNotifyInfo& bl, + optional_yield y); + + void register_watch_cb(CB *cb); +}; diff --git 
a/src/rgw/services/svc_otp.cc b/src/rgw/services/svc_otp.cc new file mode 100644 index 000000000..81d8d5711 --- /dev/null +++ b/src/rgw/services/svc_otp.cc @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_otp.h" +#include "svc_zone.h" +#include "svc_meta.h" +#include "svc_meta_be_sobj.h" + +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +class RGW_MB_Handler_Module_OTP : public RGWSI_MBSObj_Handler_Module { + RGWSI_Zone *zone_svc; + string prefix; +public: + RGW_MB_Handler_Module_OTP(RGWSI_Zone *_zone_svc) : RGWSI_MBSObj_Handler_Module("otp"), + zone_svc(_zone_svc) {} + + void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override { + if (pool) { + *pool = zone_svc->get_zone_params().otp_pool; + } + + if (oid) { + *oid = key; + } + } + + const string& get_oid_prefix() override { + return prefix; + } + + bool is_valid_oid(const string& oid) override { + return true; + } + + string key_to_oid(const string& key) override { + return key; + } + + string oid_to_key(const string& oid) override { + return oid; + } +}; + +RGWSI_OTP::RGWSI_OTP(CephContext *cct): RGWServiceInstance(cct) { +} + +RGWSI_OTP::~RGWSI_OTP() { +} + +void RGWSI_OTP::init(RGWSI_Zone *_zone_svc, + RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc) +{ + svc.otp = this; + svc.zone = _zone_svc; + svc.meta = _meta_svc; + svc.meta_be = _meta_be_svc; +} + +int RGWSI_OTP::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + /* create first backend handler for bucket entrypoints */ + + RGWSI_MetaBackend_Handler *_otp_be_handler; + + int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_OTP, &_otp_be_handler); + if (r < 0) { + ldout(ctx(), 0) << "ERROR: failed to create be handler: r=" << r << dendl; + return r; + } + + be_handler = _otp_be_handler; + + RGWSI_MetaBackend_Handler_OTP *otp_be_handler = static_cast(_otp_be_handler); + + 
auto otp_be_module = new RGW_MB_Handler_Module_OTP(svc.zone); + be_module.reset(otp_be_module); + otp_be_handler->set_module(otp_be_module); + + return 0; +} + +int RGWSI_OTP::read_all(RGWSI_OTP_BE_Ctx& ctx, + const string& key, + otp_devices_list_t *devices, + real_time *pmtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWSI_MBOTP_GetParams params; + params.pdevices = devices; + params.pmtime = pmtime; + + int ret = svc.meta_be->get_entry(ctx.get(), key, params, objv_tracker, y, dpp); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWSI_OTP::read_all(RGWSI_OTP_BE_Ctx& ctx, + const rgw_user& uid, + otp_devices_list_t *devices, + real_time *pmtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + return read_all(ctx, + uid.to_str(), + devices, + pmtime, + objv_tracker, + y, + dpp); +} + +int RGWSI_OTP::store_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const string& key, + const otp_devices_list_t& devices, + real_time mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_MBOTP_PutParams params; + params.mtime = mtime; + params.devices = devices; + + int ret = svc.meta_be->put_entry(dpp, ctx.get(), key, params, objv_tracker, y); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWSI_OTP::store_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const rgw_user& uid, + const otp_devices_list_t& devices, + real_time mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + return store_all(dpp, ctx, + uid.to_str(), + devices, + mtime, + objv_tracker, + y); +} + +int RGWSI_OTP::remove_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const string& key, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_MBOTP_RemoveParams params; + + int ret = svc.meta_be->remove_entry(dpp, ctx.get(), key, params, objv_tracker, y); + if (ret < 0) { + return ret; + } + + return 
0; +} + +int RGWSI_OTP::remove_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const rgw_user& uid, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + return remove_all(dpp,ctx, + uid.to_str(), + objv_tracker, + y); +} diff --git a/src/rgw/services/svc_otp.h b/src/rgw/services/svc_otp.h new file mode 100644 index 000000000..e639c2c92 --- /dev/null +++ b/src/rgw/services/svc_otp.h @@ -0,0 +1,95 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include "cls/otp/cls_otp_types.h" + +#include "rgw_service.h" + +#include "svc_otp_types.h" +#include "svc_meta_be_otp.h" + +class RGWSI_Zone; + +class RGWSI_OTP : public RGWServiceInstance +{ + RGWSI_OTP_BE_Handler be_handler; + std::unique_ptr be_module; + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + +public: + struct Svc { + RGWSI_OTP *otp{nullptr}; + RGWSI_Zone *zone{nullptr}; + RGWSI_Meta *meta{nullptr}; + RGWSI_MetaBackend *meta_be{nullptr}; + } svc; + + RGWSI_OTP(CephContext *cct); + ~RGWSI_OTP(); + + RGWSI_OTP_BE_Handler& get_be_handler() { + return be_handler; + } + + void init(RGWSI_Zone *_zone_svc, + RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc); + + int read_all(RGWSI_OTP_BE_Ctx& ctx, + const std::string& key, + otp_devices_list_t *devices, + real_time *pmtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp); + int read_all(RGWSI_OTP_BE_Ctx& ctx, + const rgw_user& uid, + otp_devices_list_t *devices, + real_time *pmtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp); + 
int store_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const std::string& key, + const otp_devices_list_t& devices, + real_time mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y); + int store_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const rgw_user& uid, + const otp_devices_list_t& devices, + real_time mtime, + RGWObjVersionTracker *objv_tracker, + optional_yield y); + int remove_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const std::string& key, + RGWObjVersionTracker *objv_tracker, + optional_yield y); + int remove_all(const DoutPrefixProvider *dpp, + RGWSI_OTP_BE_Ctx& ctx, + const rgw_user& uid, + RGWObjVersionTracker *objv_tracker, + optional_yield y); +}; + + diff --git a/src/rgw/services/svc_otp_types.h b/src/rgw/services/svc_otp_types.h new file mode 100644 index 000000000..60e2a79d6 --- /dev/null +++ b/src/rgw/services/svc_otp_types.h @@ -0,0 +1,29 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "common/ptr_wrapper.h" + +#include "svc_meta_be.h" +#include "svc_meta_be_types.h" + +class RGWSI_MetaBackend_Handler; + +using RGWSI_OTP_BE_Handler = ptr_wrapper; +using RGWSI_OTP_BE_Ctx = ptr_wrapper; + diff --git a/src/rgw/services/svc_quota.cc b/src/rgw/services/svc_quota.cc new file mode 100644 index 000000000..3108a1173 --- /dev/null +++ b/src/rgw/services/svc_quota.cc @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_quota.h" +#include "svc_zone.h" + +#include "rgw_zone.h" + +const RGWQuotaInfo& RGWSI_Quota::get_bucket_quota() const +{ + return zone_svc->get_current_period().get_config().quota.bucket_quota; +} + +const RGWQuotaInfo& RGWSI_Quota::get_user_quota() const +{ + return zone_svc->get_current_period().get_config().quota.user_quota; +} + diff --git a/src/rgw/services/svc_quota.h b/src/rgw/services/svc_quota.h new file mode 100644 index 000000000..81aa0e1bd --- /dev/null +++ b/src/rgw/services/svc_quota.h @@ -0,0 +1,22 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" + + +class RGWSI_Quota : public RGWServiceInstance +{ + RGWSI_Zone *zone_svc{nullptr}; + +public: + RGWSI_Quota(CephContext *cct): RGWServiceInstance(cct) {} + + void init(RGWSI_Zone *_zone_svc) { + zone_svc = _zone_svc; + } + + const RGWQuotaInfo& get_bucket_quota() const; + const RGWQuotaInfo& get_user_quota() const; +}; diff --git a/src/rgw/services/svc_rados.cc b/src/rgw/services/svc_rados.cc new file mode 100644 index 000000000..99f400f42 --- /dev/null +++ b/src/rgw/services/svc_rados.cc @@ -0,0 +1,445 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_rados.h" + +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "osd/osd_types.h" +#include "rgw_tools.h" 
+#include "rgw_cr_rados.h" + +#include "auth/AuthRegistry.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_RADOS::RGWSI_RADOS(CephContext *cct) : RGWServiceInstance(cct) +{ +} + +RGWSI_RADOS::~RGWSI_RADOS() +{ +} + +int RGWSI_RADOS::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + int ret = rados.init_with_context(cct); + if (ret < 0) { + return ret; + } + ret = rados.connect(); + if (ret < 0) { + return ret; + } + + async_processor.reset(new RGWAsyncRadosProcessor(cct, cct->_conf->rgw_num_async_rados_threads)); + async_processor->start(); + + return 0; +} + +void RGWSI_RADOS::shutdown() +{ + if (async_processor) { + async_processor->stop(); + } + rados.shutdown(); +} + +void RGWSI_RADOS::stop_processor() +{ + if (async_processor) { + async_processor->stop(); + } +} + +librados::Rados* RGWSI_RADOS::get_rados_handle() +{ + return &rados; +} + +std::string RGWSI_RADOS::cluster_fsid() +{ + std::string fsid; + (void) get_rados_handle()->cluster_fsid(&fsid); + return fsid; +} + +uint64_t RGWSI_RADOS::instance_id() +{ + return get_rados_handle()->get_instance_id(); +} + +int RGWSI_RADOS::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, + const OpenParams& params) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, + params.create, + params.mostly_omap); +} + +int RGWSI_RADOS::pool_iterate(const DoutPrefixProvider *dpp, + librados::IoCtx& io_ctx, + librados::NObjectIterator& iter, + uint32_t num, vector& objs, + RGWAccessListFilter *filter, + bool *is_truncated) +{ + if (iter == io_ctx.nobjects_end()) + return -ENOENT; + + uint32_t i; + + for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) { + rgw_bucket_dir_entry e; + + string oid = iter->get_oid(); + ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + + // fill it in with initial values; we may correct later + if (filter && !filter->filter(oid, oid)) + continue; + + e.key = oid; + 
objs.push_back(e); + } + + if (is_truncated) + *is_truncated = (iter != io_ctx.nobjects_end()); + + return objs.size(); +} + +RGWSI_RADOS::Obj::Obj(Pool& pool, const string& oid) : rados_svc(pool.rados_svc) +{ + ref.pool = pool; + ref.obj = rgw_raw_obj(pool.get_pool(), oid); +} + +void RGWSI_RADOS::Obj::init(const rgw_raw_obj& obj) +{ + ref.pool = RGWSI_RADOS::Pool(rados_svc, obj.pool); + ref.obj = obj; +} + +int RGWSI_RADOS::Obj::open(const DoutPrefixProvider *dpp) +{ + int r = ref.pool.open(dpp); + if (r < 0) { + return r; + } + + ref.pool.ioctx().locator_set_key(ref.obj.loc); + + return 0; +} + +int RGWSI_RADOS::Obj::operate(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation *op, + optional_yield y, int flags) +{ + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, y, flags); +} + +int RGWSI_RADOS::Obj::operate(const DoutPrefixProvider *dpp, librados::ObjectReadOperation *op, + bufferlist *pbl, optional_yield y, int flags) +{ + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, pbl, y, flags); +} + +int RGWSI_RADOS::Obj::aio_operate(librados::AioCompletion *c, librados::ObjectWriteOperation *op) +{ + return ref.pool.ioctx().aio_operate(ref.obj.oid, c, op); +} + +int RGWSI_RADOS::Obj::aio_operate(librados::AioCompletion *c, librados::ObjectReadOperation *op, + bufferlist *pbl) +{ + return ref.pool.ioctx().aio_operate(ref.obj.oid, c, op, pbl); +} + +int RGWSI_RADOS::Obj::watch(uint64_t *handle, librados::WatchCtx2 *ctx) +{ + return ref.pool.ioctx().watch2(ref.obj.oid, handle, ctx); +} + +int RGWSI_RADOS::Obj::aio_watch(librados::AioCompletion *c, uint64_t *handle, librados::WatchCtx2 *ctx) +{ + return ref.pool.ioctx().aio_watch(ref.obj.oid, c, handle, ctx); +} + +int RGWSI_RADOS::Obj::unwatch(uint64_t handle) +{ + return ref.pool.ioctx().unwatch2(handle); +} + +int RGWSI_RADOS::Obj::notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms, + bufferlist *pbl, optional_yield y) +{ + return rgw_rados_notify(dpp, 
ref.pool.ioctx(), ref.obj.oid, bl, timeout_ms, pbl, y); +} + +void RGWSI_RADOS::Obj::notify_ack(uint64_t notify_id, + uint64_t cookie, + bufferlist& bl) +{ + ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, bl); +} + +uint64_t RGWSI_RADOS::Obj::get_last_version() +{ + return ref.pool.ioctx().get_last_version(); +} + +int RGWSI_RADOS::Pool::create(const DoutPrefixProvider *dpp) +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + int r = rad->pool_create(pool.name.c_str()); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: pool_create returned " << r << dendl; + return r; + } + librados::IoCtx io_ctx; + r = rad->ioctx_create(pool.name.c_str(), io_ctx); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: ioctx_create returned " << r << dendl; + return r; + } + r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: application_enable returned " << r << dendl; + return r; + } + return 0; +} + +int RGWSI_RADOS::Pool::create(const DoutPrefixProvider *dpp, const vector& pools, vector *retcodes) +{ + vector completions; + vector rets; + + librados::Rados *rad = rados_svc->get_rados_handle(); + for (auto iter = pools.begin(); iter != pools.end(); ++iter) { + librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion(); + completions.push_back(c); + auto& pool = *iter; + int ret = rad->pool_create_async(pool.name.c_str(), c); + rets.push_back(ret); + } + + vector::iterator riter; + vector::iterator citer; + + bool error = false; + ceph_assert(rets.size() == completions.size()); + for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) { + int r = *riter; + librados::PoolAsyncCompletion *c = *citer; + if (r == 0) { + c->wait(); + r = c->get_return_value(); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: async pool_create returned " << r << dendl; + error = true; + } + } + c->release(); + retcodes->push_back(r); + } + if (error) { + return 
0; + } + + std::vector io_ctxs; + retcodes->clear(); + for (auto pool : pools) { + io_ctxs.emplace_back(); + int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back()); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: ioctx_create returned " << ret << dendl; + error = true; + } + retcodes->push_back(ret); + } + if (error) { + return 0; + } + + completions.clear(); + for (auto &io_ctx : io_ctxs) { + librados::PoolAsyncCompletion *c = + librados::Rados::pool_async_create_completion(); + completions.push_back(c); + int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW, + false, c); + ceph_assert(ret == 0); + } + + retcodes->clear(); + for (auto c : completions) { + c->wait(); + int ret = c->get_return_value(); + if (ret == -EOPNOTSUPP) { + ret = 0; + } else if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: async application_enable returned " << ret + << dendl; + error = true; + } + c->release(); + retcodes->push_back(ret); + } + return 0; +} + +int RGWSI_RADOS::Pool::lookup() +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + int ret = rad->pool_lookup(pool.name.c_str()); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWSI_RADOS::Pool::open(const DoutPrefixProvider *dpp, const OpenParams& params) +{ + return rados_svc->open_pool_ctx(dpp, pool, state.ioctx, params); +} + +int RGWSI_RADOS::Pool::List::init(const DoutPrefixProvider *dpp, const string& marker, RGWAccessListFilter *filter) +{ + if (ctx.initialized) { + return -EINVAL; + } + + if (!pool) { + return -EINVAL; + } + + int r = pool->rados_svc->open_pool_ctx(dpp, pool->pool, ctx.ioctx); + if (r < 0) { + return r; + } + + librados::ObjectCursor oc; + if (!oc.from_str(marker)) { + ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl; + return -EINVAL; + } + + try { + ctx.iter = ctx.ioctx.nobjects_begin(oc); + ctx.filter = filter; + ctx.initialized = true; + return 0; + } catch (const std::system_error& e) { + r = -e.code().value(); + ldpp_dout(dpp, 10) 
<< "nobjects_begin threw " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +int RGWSI_RADOS::Pool::List::get_next(const DoutPrefixProvider *dpp, + int max, + std::vector *oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + return -EINVAL; + } + vector objs; + int r = pool->rados_svc->pool_iterate(dpp, ctx.ioctx, ctx.iter, max, objs, ctx.filter, is_truncated); + if (r < 0) { + if(r != -ENOENT) { + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl; + } + return r; + } + + for (auto& o : objs) { + oids->push_back(o.key.name); + } + + return oids->size(); +} + +RGWSI_RADOS::Obj RGWSI_RADOS::Handle::obj(const rgw_raw_obj& o) +{ + return RGWSI_RADOS::Obj(rados_svc, o); +} +int RGWSI_RADOS::Handle::watch_flush() +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + return rad->watch_flush(); +} + +int RGWSI_RADOS::Handle::mon_command(std::string cmd, + const bufferlist& inbl, + bufferlist *outbl, + std::string *outs) +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + return rad->mon_command(cmd, inbl, outbl, outs); +} + +int RGWSI_RADOS::Pool::List::get_marker(string *marker) +{ + if (!ctx.initialized) { + return -EINVAL; + } + + *marker = ctx.iter.get_cursor().to_str(); + return 0; +} + +int RGWSI_RADOS::clog_warn(const string& msg) +{ + string cmd = + "{" + "\"prefix\": \"log\", " + "\"level\": \"warn\", " + "\"logtext\": [\"" + msg + "\"]" + "}"; + + bufferlist inbl; + auto h = handle(); + return h.mon_command(cmd, inbl, nullptr, nullptr); +} + +bool RGWSI_RADOS::check_secure_mon_conn(const DoutPrefixProvider *dpp) const +{ + AuthRegistry reg(cct); + + reg.refresh_config(); + + std::vector methods; + std::vector modes; + + reg.get_supported_methods(CEPH_ENTITY_TYPE_MON, &methods, &modes); + ldpp_dout(dpp, 20) << __func__ << "(): auth registy 
supported: methods=" << methods << " modes=" << modes << dendl; + + for (auto method : methods) { + if (!reg.is_secure_method(method)) { + ldpp_dout(dpp, 20) << __func__ << "(): method " << method << " is insecure" << dendl; + return false; + } + } + + for (auto mode : modes) { + if (!reg.is_secure_mode(mode)) { + ldpp_dout(dpp, 20) << __func__ << "(): mode " << mode << " is insecure" << dendl; + return false; + } + } + + return true; +} + diff --git a/src/rgw/services/svc_rados.h b/src/rgw/services/svc_rados.h new file mode 100644 index 000000000..ede029aa8 --- /dev/null +++ b/src/rgw/services/svc_rados.h @@ -0,0 +1,252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" + +#include "include/rados/librados.hpp" +#include "common/async/yield_context.h" + +class RGWAsyncRadosProcessor; + +class RGWAccessListFilter { +public: + virtual ~RGWAccessListFilter() {} + virtual bool filter(const std::string& name, std::string& key) = 0; +}; + +struct RGWAccessListFilterPrefix : public RGWAccessListFilter { + std::string prefix; + + explicit RGWAccessListFilterPrefix(const std::string& _prefix) : prefix(_prefix) {} + bool filter(const std::string& name, std::string& key) override { + return (prefix.compare(key.substr(0, prefix.size())) == 0); + } +}; + +class RGWSI_RADOS : public RGWServiceInstance +{ + librados::Rados rados; + std::unique_ptr async_processor; + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + +public: + struct OpenParams { + bool create{true}; + bool mostly_omap{false}; + + OpenParams() {} + + OpenParams& set_create(bool _create) { + create = _create; + return *this; + } + OpenParams& set_mostly_omap(bool _mostly_omap) { + mostly_omap = _mostly_omap; + return *this; + } + }; + +private: + int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, + const OpenParams& params = {}); + int 
pool_iterate(const DoutPrefixProvider *dpp, + librados::IoCtx& ioctx, + librados::NObjectIterator& iter, + uint32_t num, std::vector& objs, + RGWAccessListFilter *filter, + bool *is_truncated); + +public: + RGWSI_RADOS(CephContext *cct); + ~RGWSI_RADOS(); + librados::Rados* get_rados_handle(); + + void init() {} + void shutdown() override; + void stop_processor(); + + std::string cluster_fsid(); + uint64_t instance_id(); + bool check_secure_mon_conn(const DoutPrefixProvider *dpp) const; + + RGWAsyncRadosProcessor *get_async_processor() { + return async_processor.get(); + } + + int clog_warn(const std::string& msg); + + class Handle; + + class Pool { + friend class RGWSI_RADOS; + friend Handle; + friend class Obj; + + RGWSI_RADOS *rados_svc{nullptr}; + rgw_pool pool; + + struct State { + librados::IoCtx ioctx; + } state; + + Pool(RGWSI_RADOS *_rados_svc, + const rgw_pool& _pool) : rados_svc(_rados_svc), + pool(_pool) {} + + Pool(RGWSI_RADOS *_rados_svc) : rados_svc(_rados_svc) {} + public: + Pool() {} + + int create(const DoutPrefixProvider *dpp); + int create(const DoutPrefixProvider *dpp, const std::vector& pools, std::vector *retcodes); + int lookup(); + int open(const DoutPrefixProvider *dpp, const OpenParams& params = {}); + + const rgw_pool& get_pool() { + return pool; + } + + librados::IoCtx& ioctx() & { + return state.ioctx; + } + + librados::IoCtx&& ioctx() && { + return std::move(state.ioctx); + } + + struct List { + Pool *pool{nullptr}; + + struct Ctx { + bool initialized{false}; + librados::IoCtx ioctx; + librados::NObjectIterator iter; + RGWAccessListFilter *filter{nullptr}; + } ctx; + + List() {} + List(Pool *_pool) : pool(_pool) {} + + int init(const DoutPrefixProvider *dpp, const std::string& marker, RGWAccessListFilter *filter = nullptr); + int get_next(const DoutPrefixProvider *dpp, int max, + std::vector *oids, + bool *is_truncated); + + int get_marker(std::string *marker); + }; + + List op() { + return List(this); + } + + friend List; + }; + + + 
struct rados_ref { + RGWSI_RADOS::Pool pool; + rgw_raw_obj obj; + }; + + class Obj { + friend class RGWSI_RADOS; + friend class Handle; + + RGWSI_RADOS *rados_svc{nullptr}; + rados_ref ref; + + void init(const rgw_raw_obj& obj); + + Obj(RGWSI_RADOS *_rados_svc, const rgw_raw_obj& _obj) + : rados_svc(_rados_svc) { + init(_obj); + } + + Obj(Pool& pool, const std::string& oid); + + public: + Obj() {} + + int open(const DoutPrefixProvider *dpp); + + int operate(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation *op, optional_yield y, + int flags = 0); + int operate(const DoutPrefixProvider *dpp, librados::ObjectReadOperation *op, bufferlist *pbl, + optional_yield y, int flags = 0); + int aio_operate(librados::AioCompletion *c, librados::ObjectWriteOperation *op); + int aio_operate(librados::AioCompletion *c, librados::ObjectReadOperation *op, + bufferlist *pbl); + + int watch(uint64_t *handle, librados::WatchCtx2 *ctx); + int aio_watch(librados::AioCompletion *c, uint64_t *handle, librados::WatchCtx2 *ctx); + int unwatch(uint64_t handle); + int notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms, + bufferlist *pbl, optional_yield y); + void notify_ack(uint64_t notify_id, + uint64_t cookie, + bufferlist& bl); + + uint64_t get_last_version(); + + rados_ref& get_ref() { return ref; } + const rados_ref& get_ref() const { return ref; } + + const rgw_raw_obj& get_raw_obj() const { + return ref.obj; + } + }; + + class Handle { + friend class RGWSI_RADOS; + + RGWSI_RADOS *rados_svc{nullptr}; + + Handle(RGWSI_RADOS *_rados_svc) : rados_svc(_rados_svc) {} + public: + Obj obj(const rgw_raw_obj& o); + + Pool pool(const rgw_pool& p) { + return Pool(rados_svc, p); + } + + int watch_flush(); + + int mon_command(std::string cmd, + const bufferlist& inbl, + bufferlist *outbl, + std::string *outs); + }; + + Handle handle() { + return Handle(this); + } + + Obj obj(const rgw_raw_obj& o) { + return Obj(this, o); + } + + Obj obj(Pool& pool, const 
std::string& oid) { + return Obj(pool, oid); + } + + Pool pool() { + return Pool(this); + } + + Pool pool(const rgw_pool& p) { + return Pool(this, p); + } + + friend Obj; + friend Pool; + friend Pool::List; +}; + +using rgw_rados_ref = RGWSI_RADOS::rados_ref; + +inline std::ostream& operator<<(std::ostream& out, const RGWSI_RADOS::Obj& obj) { + return out << obj.get_raw_obj(); +} diff --git a/src/rgw/services/svc_role_rados.cc b/src/rgw/services/svc_role_rados.cc new file mode 100644 index 000000000..a84022497 --- /dev/null +++ b/src/rgw/services/svc_role_rados.cc @@ -0,0 +1,82 @@ +#include "svc_role_rados.h" +#include "svc_meta_be_sobj.h" +#include "svc_meta.h" +#include "rgw_role.h" +#include "rgw_zone.h" +#include "svc_zone.h" +#include "rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw + +class RGWSI_Role_Module : public RGWSI_MBSObj_Handler_Module { + RGWSI_Role_RADOS::Svc& svc; + const std::string prefix; +public: + RGWSI_Role_Module(RGWSI_Role_RADOS::Svc& _svc): RGWSI_MBSObj_Handler_Module("roles"), + svc(_svc), + prefix(role_oid_prefix) {} + + void get_pool_and_oid(const std::string& key, + rgw_pool *pool, + std::string *oid) override + { + if (pool) { + *pool = svc.zone->get_zone_params().roles_pool; + } + + if (oid) { + *oid = key_to_oid(key); + } + } + + bool is_valid_oid(const std::string& oid) override { + return boost::algorithm::starts_with(oid, prefix); + } + + std::string key_to_oid(const std::string& key) override { + return prefix + key; + } + + // This is called after `is_valid_oid` and is assumed to be a valid oid + std::string oid_to_key(const std::string& oid) override { + return oid.substr(prefix.size()); + } + + const std::string& get_oid_prefix() { + return prefix; + } +}; + +RGWSI_MetaBackend_Handler* RGWSI_Role_RADOS::get_be_handler() +{ + return be_handler; +} + +void RGWSI_Role_RADOS::init(RGWSI_Zone *_zone_svc, + RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc, + RGWSI_SysObj *_sysobj_svc) +{ + svc.zone = _zone_svc; + 
svc.meta = _meta_svc; + svc.meta_be = _meta_be_svc; + svc.sysobj = _sysobj_svc; +} + +int RGWSI_Role_RADOS::do_start(optional_yield y, const DoutPrefixProvider *dpp) +{ + + int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, + &be_handler); + if (r < 0) { + ldout(ctx(), 0) << "ERROR: failed to create be_handler for Roles: r=" + << r <(be_handler); + be_module.reset(module); + bh->set_module(module); + return 0; +} diff --git a/src/rgw/services/svc_role_rados.h b/src/rgw/services/svc_role_rados.h new file mode 100644 index 000000000..d4d3530c2 --- /dev/null +++ b/src/rgw/services/svc_role_rados.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "rgw_service.h" +#include "rgw_role.h" +#include "svc_meta_be.h" + +class RGWSI_Role_RADOS: public RGWServiceInstance +{ + public: + struct Svc { + RGWSI_Zone *zone{nullptr}; + RGWSI_Meta *meta{nullptr}; + RGWSI_MetaBackend *meta_be{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + } svc; + + RGWSI_Role_RADOS(CephContext *cct) : RGWServiceInstance(cct) {} + ~RGWSI_Role_RADOS() {} + + void init(RGWSI_Zone *_zone_svc, + RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc, + RGWSI_SysObj *_sysobj_svc); + + RGWSI_MetaBackend_Handler * get_be_handler(); + int do_start(optional_yield y, const DoutPrefixProvider *dpp) override; + +private: + RGWSI_MetaBackend_Handler *be_handler; + std::unique_ptr be_module; +}; + +static const std::string role_name_oid_prefix = "role_names."; +static const std::string role_oid_prefix = "roles."; +static const std::string role_path_oid_prefix = "role_paths."; diff --git a/src/rgw/services/svc_sync_modules.cc b/src/rgw/services/svc_sync_modules.cc new file mode 100644 index 000000000..ba9e7d172 --- /dev/null +++ b/src/rgw/services/svc_sync_modules.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_sync_modules.h" +#include "svc_zone.h" + +#include "rgw_sync_module.h" +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +void RGWSI_SyncModules::init(RGWSI_Zone *zone_svc) +{ + svc.zone = zone_svc; + sync_modules_manager = new RGWSyncModulesManager(); + rgw_register_sync_modules(sync_modules_manager); +} + +int RGWSI_SyncModules::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + auto& zone_public_config = svc.zone->get_zone(); + + int ret = sync_modules_manager->create_instance(dpp, cct, zone_public_config.tier_type, svc.zone->get_zone_params().tier_config, &sync_module); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to start sync module instance, ret=" << ret << dendl; + if (ret == 
-ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: " << zone_public_config.tier_type + << " sync module does not exist. valid sync modules: " + << sync_modules_manager->get_registered_module_names() + << dendl; + } + return ret; + } + + ldpp_dout(dpp, 20) << "started sync module instance, tier type = " << zone_public_config.tier_type << dendl; + + return 0; +} + +RGWSI_SyncModules::~RGWSI_SyncModules() +{ + delete sync_modules_manager; +} + diff --git a/src/rgw/services/svc_sync_modules.h b/src/rgw/services/svc_sync_modules.h new file mode 100644 index 000000000..ea78f5817 --- /dev/null +++ b/src/rgw/services/svc_sync_modules.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" +#include "rgw_sync_module.h" + +class RGWSI_Zone; + +class RGWSyncModulesManager; + +class RGWSI_SyncModules : public RGWServiceInstance +{ + RGWSyncModulesManager *sync_modules_manager{nullptr}; + RGWSyncModuleInstanceRef sync_module; + + struct Svc { + RGWSI_Zone *zone{nullptr}; + } svc; + +public: + RGWSI_SyncModules(CephContext *cct): RGWServiceInstance(cct) {} + ~RGWSI_SyncModules(); + + RGWSyncModulesManager *get_manager() { + return sync_modules_manager; + } + + void init(RGWSI_Zone *zone_svc); + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + + RGWSyncModuleInstanceRef& get_sync_module() { return sync_module; } +}; diff --git a/src/rgw/services/svc_sys_obj.cc b/src/rgw/services/svc_sys_obj.cc new file mode 100644 index 000000000..310e60514 --- /dev/null +++ b/src/rgw/services/svc_sys_obj.cc @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_sys_obj.h" +#include "svc_sys_obj_core.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +RGWSI_SysObj::Obj RGWSI_SysObj::get_obj(const 
rgw_raw_obj& obj) +{ + return Obj(core_svc, obj); +} + +RGWSI_SysObj::Obj::ROp::ROp(Obj& _source) : source(_source) { + state.emplace(); +} + +int RGWSI_SysObj::Obj::ROp::stat(optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->stat(*state, obj, attrs, raw_attrs, + lastmod, obj_size, objv_tracker, y, dpp); +} + +int RGWSI_SysObj::Obj::ROp::read(const DoutPrefixProvider *dpp, + int64_t ofs, int64_t end, bufferlist *bl, + optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->read(dpp, *state, + objv_tracker, + obj, bl, ofs, end, + lastmod, obj_size, + attrs, + raw_attrs, + cache_info, + refresh_version, y); +} + +int RGWSI_SysObj::Obj::ROp::get_attr(const DoutPrefixProvider *dpp, + const char *name, bufferlist *dest, + optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->get_attr(dpp, obj, name, dest, y); +} + +int RGWSI_SysObj::Obj::WOp::remove(const DoutPrefixProvider *dpp, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->remove(dpp, objv_tracker, obj, y); +} + +int RGWSI_SysObj::Obj::WOp::write(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->write(dpp, obj, pmtime, attrs, exclusive, + bl, objv_tracker, mtime, y); +} + +int RGWSI_SysObj::Obj::WOp::write_data(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->write_data(dpp, obj, bl, exclusive, objv_tracker, y); +} + +int RGWSI_SysObj::Obj::WOp::write_attrs(const DoutPrefixProvider *dpp, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + 
return svc->set_attrs(dpp, obj, attrs, nullptr, objv_tracker, exclusive, y); +} + +int RGWSI_SysObj::Obj::WOp::write_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& bl, + optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + map m; + m[name] = bl; + + return svc->set_attrs(dpp, obj, m, nullptr, objv_tracker, exclusive, y); +} + +int RGWSI_SysObj::Pool::list_prefixed_objs(const DoutPrefixProvider *dpp, const string& prefix, std::function cb) +{ + return core_svc->pool_list_prefixed_objs(dpp, pool, prefix, cb); +} + +int RGWSI_SysObj::Pool::Op::init(const DoutPrefixProvider *dpp, const string& marker, const string& prefix) +{ + return source.core_svc->pool_list_objects_init(dpp, source.pool, marker, prefix, &ctx); +} + +int RGWSI_SysObj::Pool::Op::get_next(const DoutPrefixProvider *dpp, int max, vector *oids, bool *is_truncated) +{ + return source.core_svc->pool_list_objects_next(dpp, ctx, max, oids, is_truncated); +} + +int RGWSI_SysObj::Pool::Op::get_marker(string *marker) +{ + return source.core_svc->pool_list_objects_get_marker(ctx, marker); +} + +int RGWSI_SysObj::Obj::OmapOp::get_all(const DoutPrefixProvider *dpp, std::map *m, + optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_get_all(dpp, obj, m, y); +} + +int RGWSI_SysObj::Obj::OmapOp::get_vals(const DoutPrefixProvider *dpp, + const string& marker, uint64_t count, + std::map *m, + bool *pmore, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_get_vals(dpp, obj, marker, count, m, pmore, y); +} + +int RGWSI_SysObj::Obj::OmapOp::set(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& bl, + optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_set(dpp, obj, key, bl, must_exist, y); +} + +int RGWSI_SysObj::Obj::OmapOp::set(const 
DoutPrefixProvider *dpp, const map& m, + optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_set(dpp, obj, m, must_exist, y); +} + +int RGWSI_SysObj::Obj::OmapOp::del(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_del(dpp, obj, key, y); +} + +int RGWSI_SysObj::Obj::WNOp::notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms, + bufferlist *pbl, optional_yield y) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->notify(dpp, obj, bl, timeout_ms, pbl, y); +} + +RGWSI_Zone *RGWSI_SysObj::get_zone_svc() +{ + return core_svc->get_zone_svc(); +} diff --git a/src/rgw/services/svc_sys_obj.h b/src/rgw/services/svc_sys_obj.h new file mode 100644 index 000000000..f3e217dbd --- /dev/null +++ b/src/rgw/services/svc_sys_obj.h @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/static_ptr.h" + +#include "rgw_service.h" + +#include "svc_rados.h" +#include "svc_sys_obj_types.h" +#include "svc_sys_obj_core_types.h" + + +class RGWSI_Zone; +class RGWSI_SysObj; + +struct rgw_cache_entry_info; + +class RGWSI_SysObj : public RGWServiceInstance +{ + friend struct RGWServices_Def; + +public: + class Obj { + friend class ROp; + + RGWSI_SysObj_Core *core_svc; + rgw_raw_obj obj; + + public: + Obj(RGWSI_SysObj_Core *_core_svc, const rgw_raw_obj& _obj) + : core_svc(_core_svc), obj(_obj) {} + + rgw_raw_obj& get_obj() { + return obj; + } + + struct ROp { + Obj& source; + + ceph::static_ptr state; + + RGWObjVersionTracker *objv_tracker{nullptr}; + std::map *attrs{nullptr}; + bool raw_attrs{false}; + boost::optional refresh_version{boost::none}; + ceph::real_time *lastmod{nullptr}; + uint64_t *obj_size{nullptr}; + rgw_cache_entry_info 
*cache_info{nullptr}; + + ROp& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + ROp& set_last_mod(ceph::real_time *_lastmod) { + lastmod = _lastmod; + return *this; + } + + ROp& set_obj_size(uint64_t *_obj_size) { + obj_size = _obj_size; + return *this; + } + + ROp& set_attrs(std::map *_attrs) { + attrs = _attrs; + return *this; + } + + ROp& set_raw_attrs(bool ra) { + raw_attrs = ra; + return *this; + } + + ROp& set_refresh_version(boost::optional& rf) { + refresh_version = rf; + return *this; + } + + ROp& set_cache_info(rgw_cache_entry_info *ci) { + cache_info = ci; + return *this; + } + + ROp(Obj& _source); + + int stat(optional_yield y, const DoutPrefixProvider *dpp); + int read(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, bufferlist *pbl, optional_yield y); + int read(const DoutPrefixProvider *dpp, bufferlist *pbl, optional_yield y) { /* whole-object read: ofs=0, end=-1 */ + return read(dpp, 0, -1, pbl, y); + } + int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist *dest, optional_yield y); + }; + + struct WOp { + Obj& source; + + RGWObjVersionTracker *objv_tracker{nullptr}; + std::map attrs; + ceph::real_time mtime; + ceph::real_time *pmtime{nullptr}; + bool exclusive{false}; + + WOp& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + WOp& set_attrs(std::map& _attrs) { /* lvalue overload: stores a copy of the caller's map */ + attrs = _attrs; + return *this; + } + + WOp& set_attrs(std::map&& _attrs) { /* rvalue overload: takes over the caller's map; a plain assignment here would copy, since a named rvalue reference is an lvalue */ + attrs.swap(_attrs); + return *this; + } + + WOp& set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + return *this; + } + + WOp& set_pmtime(ceph::real_time *_pmtime) { + pmtime = _pmtime; + return *this; + } + + WOp& set_exclusive(bool _exclusive = true) { + exclusive = _exclusive; + return *this; + } + + WOp(Obj& _source) : source(_source) {} + + int remove(const DoutPrefixProvider *dpp, optional_yield y); + int write(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y); + + int 
write_data(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y); /* write data only */ + int write_attrs(const DoutPrefixProvider *dpp, optional_yield y); /* write attrs only */ + int write_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& bl, + optional_yield y); /* write attrs only */ + }; + + struct OmapOp { + Obj& source; + + bool must_exist{false}; + + OmapOp& set_must_exist(bool _must_exist = true) { + must_exist = _must_exist; + return *this; + } + + OmapOp(Obj& _source) : source(_source) {} + + int get_all(const DoutPrefixProvider *dpp, std::map *m, optional_yield y); + int get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count, + std::map *m, + bool *pmore, optional_yield y); + int set(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& bl, optional_yield y); + int set(const DoutPrefixProvider *dpp, const std::map& m, optional_yield y); + int del(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y); + }; + + struct WNOp { + Obj& source; + + WNOp(Obj& _source) : source(_source) {} + + int notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms, bufferlist *pbl, + optional_yield y); + }; + ROp rop() { + return ROp(*this); + } + + WOp wop() { + return WOp(*this); + } + + OmapOp omap() { + return OmapOp(*this); + } + + WNOp wn() { + return WNOp(*this); + } + }; + + class Pool { + friend class Op; + friend class RGWSI_SysObj_Core; + + RGWSI_SysObj_Core *core_svc; + rgw_pool pool; + + protected: + using ListImplInfo = RGWSI_SysObj_Pool_ListInfo; + + struct ListCtx { + ceph::static_ptr impl; /* update this if creating new backend types */ + }; + + public: + Pool(RGWSI_SysObj_Core *_core_svc, + const rgw_pool& _pool) : core_svc(_core_svc), + pool(_pool) {} + + rgw_pool& get_pool() { + return pool; + } + + struct Op { + Pool& source; + ListCtx ctx; + + Op(Pool& _source) : source(_source) {} + + int init(const DoutPrefixProvider *dpp, const std::string& 
marker, const std::string& prefix); + int get_next(const DoutPrefixProvider *dpp, int max, std::vector *oids, bool *is_truncated); + int get_marker(std::string *marker); + }; + + int list_prefixed_objs(const DoutPrefixProvider *dpp, const std::string& prefix, std::function cb); + + template + int list_prefixed_objs(const DoutPrefixProvider *dpp, const std::string& prefix, + Container *result) { + return list_prefixed_objs(dpp, prefix, [&](const std::string& val) { + result->push_back(val); + }); + } + + Op op() { + return Op(*this); + } + }; + + friend class Obj; + friend class Obj::ROp; + friend class Obj::WOp; + friend class Pool; + friend class Pool::Op; + +protected: + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_SysObj_Core *core_svc{nullptr}; + + void init(RGWSI_RADOS *_rados_svc, + RGWSI_SysObj_Core *_core_svc) { + rados_svc = _rados_svc; + core_svc = _core_svc; + } + +public: + RGWSI_SysObj(CephContext *cct): RGWServiceInstance(cct) {} + + Obj get_obj(const rgw_raw_obj& obj); + + Pool get_pool(const rgw_pool& pool) { + return Pool(core_svc, pool); + } + + RGWSI_Zone *get_zone_svc(); +}; + +using RGWSysObj = RGWSI_SysObj::Obj; diff --git a/src/rgw/services/svc_sys_obj_cache.cc b/src/rgw/services/svc_sys_obj_cache.cc new file mode 100644 index 000000000..d1b7a3dbb --- /dev/null +++ b/src/rgw/services/svc_sys_obj_cache.cc @@ -0,0 +1,670 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "common/admin_socket.h" + +#include "svc_sys_obj_cache.h" +#include "svc_zone.h" +#include "svc_notify.h" + +#include "rgw_zone.h" +#include "rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +class RGWSI_SysObj_Cache_CB : public RGWSI_Notify::CB +{ + RGWSI_SysObj_Cache *svc; +public: + RGWSI_SysObj_Cache_CB(RGWSI_SysObj_Cache *_svc) : svc(_svc) {} + int watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) { + 
return svc->watch_cb(dpp, notify_id, cookie, notifier_id, bl); + } + + void set_enabled(bool status) { + svc->set_enabled(status); + } +}; + +int RGWSI_SysObj_Cache::do_start(optional_yield y, const DoutPrefixProvider *dpp) +{ + int r = asocket.start(); + if (r < 0) { + return r; + } + + r = RGWSI_SysObj_Core::do_start(y, dpp); + if (r < 0) { + return r; + } + + r = notify_svc->start(y, dpp); + if (r < 0) { + return r; + } + + assert(notify_svc->is_started()); + + cb.reset(new RGWSI_SysObj_Cache_CB(this)); + + notify_svc->register_watch_cb(cb.get()); + + return 0; +} + +void RGWSI_SysObj_Cache::shutdown() +{ + asocket.shutdown(); + RGWSI_SysObj_Core::shutdown(); +} + +static string normal_name(rgw_pool& pool, const std::string& oid) { + std::string buf; + buf.reserve(pool.name.size() + pool.ns.size() + oid.size() + 2); + buf.append(pool.name).append("+").append(pool.ns).append("+").append(oid); + return buf; +} + +void RGWSI_SysObj_Cache::normalize_pool_and_obj(const rgw_pool& src_pool, const string& src_obj, rgw_pool& dst_pool, string& dst_obj) +{ + if (src_obj.size()) { + dst_pool = src_pool; + dst_obj = src_obj; + } else { + dst_pool = zone_svc->get_zone_params().domain_root; + dst_obj = src_pool.name; + } +} + + +int RGWSI_SysObj_Cache::remove(const DoutPrefixProvider *dpp, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + optional_yield y) + +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + + string name = normal_name(pool, oid); + cache.invalidate_remove(dpp, name); + + ObjectCacheInfo info; + int r = distribute_cache(dpp, name, obj, info, INVALIDATE_OBJ, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to distribute cache: r=" << r << dendl; + } + + return RGWSI_SysObj_Core::remove(dpp, objv_tracker, obj, y); +} + +int RGWSI_SysObj_Cache::read(const DoutPrefixProvider *dpp, + RGWSI_SysObj_Obj_GetObjState& read_state, + RGWObjVersionTracker *objv_tracker, + const 
rgw_raw_obj& obj, + bufferlist *obl, off_t ofs, off_t end, + ceph::real_time* pmtime, uint64_t* psize, + map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version, + optional_yield y) +{ + rgw_pool pool; + string oid; + if (ofs != 0) { + return RGWSI_SysObj_Core::read(dpp, read_state, objv_tracker, obj, obl, + ofs, end, pmtime, psize, attrs, raw_attrs, + cache_info, refresh_version, y); + } + + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + string name = normal_name(pool, oid); + + ObjectCacheInfo info; + + uint32_t flags = (end != 0 ? CACHE_FLAG_DATA : 0); + if (objv_tracker) + flags |= CACHE_FLAG_OBJV; + if (pmtime || psize) + flags |= CACHE_FLAG_META; + if (attrs) + flags |= CACHE_FLAG_XATTRS; + + int r = cache.get(dpp, name, info, flags, cache_info); + if (r == 0 && + (!refresh_version || !info.version.compare(&(*refresh_version)))) { + if (info.status < 0) + return info.status; + + bufferlist& bl = info.data; + + bufferlist::iterator i = bl.begin(); + + obl->clear(); + + i.copy_all(*obl); + if (objv_tracker) + objv_tracker->read_version = info.version; + if (pmtime) { + *pmtime = info.meta.mtime; + } + if (psize) { + *psize = info.meta.size; + } + if (attrs) { + if (raw_attrs) { + *attrs = info.xattrs; + } else { + rgw_filter_attrset(info.xattrs, RGW_ATTR_PREFIX, attrs); + } + } + return obl->length(); + } + if(r == -ENODATA) + return -ENOENT; + + // if we only ask for one of mtime or size, ask for the other too so we can + // satisfy CACHE_FLAG_META + uint64_t size = 0; + real_time mtime; + if (pmtime) { + if (!psize) { + psize = &size; + } + } else if (psize) { + if (!pmtime) { + pmtime = &mtime; + } + } + + map unfiltered_attrset; + r = RGWSI_SysObj_Core::read(dpp, read_state, objv_tracker, + obj, obl, ofs, end, pmtime, psize, + (attrs ? 
&unfiltered_attrset : nullptr), + true, /* cache unfiltered attrs */ + cache_info, + refresh_version, y); + if (r < 0) { + if (r == -ENOENT) { // only update ENOENT, we'd rather retry other errors + info.status = r; + cache.put(dpp, name, info, cache_info); + } + return r; + } + + if (obl->length() == end + 1) { + /* in this case, most likely object contains more data, we can't cache it */ + flags &= ~CACHE_FLAG_DATA; + } else { + /* copy the bytes just read into the cache entry's data buffer */ + bufferlist& bl = info.data; + bl.clear(); + bufferlist::iterator o = obl->begin(); + o.copy_all(bl); + } + + info.status = 0; + info.flags = flags; + if (objv_tracker) { + info.version = objv_tracker->read_version; + } + if (pmtime) { + info.meta.mtime = *pmtime; + } + if (psize) { + info.meta.size = *psize; + } + if (attrs) { + info.xattrs = std::move(unfiltered_attrset); + if (raw_attrs) { + *attrs = info.xattrs; + } else { + rgw_filter_attrset(info.xattrs, RGW_ATTR_PREFIX, attrs); + } + } + cache.put(dpp, name, info, cache_info); + return r; +} + +int RGWSI_SysObj_Cache::get_attr(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + const char *attr_name, + bufferlist *dest, + optional_yield y) +{ + rgw_pool pool; + string oid; + + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + string name = normal_name(pool, oid); + + ObjectCacheInfo info; + + uint32_t flags = CACHE_FLAG_XATTRS; + + int r = cache.get(dpp, name, info, flags, nullptr); + if (r == 0) { + if (info.status < 0) + return info.status; + + auto iter = info.xattrs.find(attr_name); + if (iter == info.xattrs.end()) { + return -ENODATA; + } + + *dest = iter->second; + return dest->length(); + } else if (r == -ENODATA) { + return -ENOENT; + } + /* don't try to cache this one */ + return RGWSI_SysObj_Core::get_attr(dpp, obj, attr_name, dest, y); +} + +int RGWSI_SysObj_Cache::set_attrs(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + map& attrs, + map *rmattrs, + RGWObjVersionTracker *objv_tracker, + bool exclusive, optional_yield y) +{ + 
rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + ObjectCacheInfo info; + info.xattrs = attrs; + if (rmattrs) { + info.rm_xattrs = *rmattrs; + } + info.status = 0; + info.flags = CACHE_FLAG_MODIFY_XATTRS; + int ret = RGWSI_SysObj_Core::set_attrs(dpp, obj, attrs, rmattrs, objv_tracker, exclusive, y); + string name = normal_name(pool, oid); + if (ret >= 0) { + if (objv_tracker && objv_tracker->read_version.ver) { + info.version = objv_tracker->read_version; + info.flags |= CACHE_FLAG_OBJV; + } + cache.put(dpp, name, info, NULL); + int r = distribute_cache(dpp, name, obj, info, UPDATE_OBJ, y); + if (r < 0) + ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << obj << dendl; + } else { + cache.invalidate_remove(dpp, name); + } + + return ret; +} + +int RGWSI_SysObj_Cache::write(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + real_time *pmtime, + map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime, + optional_yield y) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + ObjectCacheInfo info; + info.xattrs = attrs; + info.status = 0; + info.data = data; + info.flags = CACHE_FLAG_XATTRS | CACHE_FLAG_DATA | CACHE_FLAG_META; + ceph::real_time result_mtime; + int ret = RGWSI_SysObj_Core::write(dpp, obj, &result_mtime, attrs, + exclusive, data, + objv_tracker, set_mtime, y); + if (pmtime) { + *pmtime = result_mtime; + } + if (objv_tracker && objv_tracker->read_version.ver) { + info.version = objv_tracker->read_version; + info.flags |= CACHE_FLAG_OBJV; + } + info.meta.mtime = result_mtime; + info.meta.size = data.length(); + string name = normal_name(pool, oid); + if (ret >= 0) { + cache.put(dpp, name, info, NULL); + int r = distribute_cache(dpp, name, obj, info, UPDATE_OBJ, y); + if (r < 0) + ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << obj << dendl; + } else { + cache.invalidate_remove(dpp, 
name); + } + + return ret; +} + +int RGWSI_SysObj_Cache::write_data(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + const bufferlist& data, + bool exclusive, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + + ObjectCacheInfo info; + info.data = data; + info.meta.size = data.length(); + info.status = 0; + info.flags = CACHE_FLAG_DATA; + + int ret = RGWSI_SysObj_Core::write_data(dpp, obj, data, exclusive, objv_tracker, y); + string name = normal_name(pool, oid); + if (ret >= 0) { + if (objv_tracker && objv_tracker->read_version.ver) { + info.version = objv_tracker->read_version; + info.flags |= CACHE_FLAG_OBJV; + } + cache.put(dpp, name, info, NULL); + int r = distribute_cache(dpp, name, obj, info, UPDATE_OBJ, y); + if (r < 0) + ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << obj << dendl; + } else { + cache.invalidate_remove(dpp, name); + } + + return ret; +} + +int RGWSI_SysObj_Cache::raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, + uint64_t *psize, real_time *pmtime, + map *attrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + + string name = normal_name(pool, oid); + + uint64_t size; + real_time mtime; + + ObjectCacheInfo info; + uint32_t flags = CACHE_FLAG_META | CACHE_FLAG_XATTRS; + if (objv_tracker) + flags |= CACHE_FLAG_OBJV; + int r = cache.get(dpp, name, info, flags, NULL); + if (r == 0) { + if (info.status < 0) + return info.status; + + size = info.meta.size; + mtime = info.meta.mtime; + if (objv_tracker) + objv_tracker->read_version = info.version; + goto done; + } + if (r == -ENODATA) { + return -ENOENT; + } + r = RGWSI_SysObj_Core::raw_stat(dpp, obj, &size, &mtime, &info.xattrs, + objv_tracker, y); + if (r < 0) { + if (r == -ENOENT) { + info.status = r; + cache.put(dpp, name, info, NULL); + } + return r; + } 
+ info.status = 0; + info.meta.mtime = mtime; + info.meta.size = size; + info.flags = CACHE_FLAG_META | CACHE_FLAG_XATTRS; + if (objv_tracker) { + info.flags |= CACHE_FLAG_OBJV; + info.version = objv_tracker->read_version; + } + cache.put(dpp, name, info, NULL); +done: + if (psize) + *psize = size; + if (pmtime) + *pmtime = mtime; + if (attrs) + *attrs = info.xattrs; + return 0; +} + +int RGWSI_SysObj_Cache::distribute_cache(const DoutPrefixProvider *dpp, + const string& normal_name, + const rgw_raw_obj& obj, + ObjectCacheInfo& obj_info, int op, + optional_yield y) +{ + RGWCacheNotifyInfo info; + info.op = op; + info.obj_info = obj_info; + info.obj = obj; + return notify_svc->distribute(dpp, normal_name, info, y); +} + +int RGWSI_SysObj_Cache::watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + RGWCacheNotifyInfo info; + + try { + auto iter = bl.cbegin(); + decode(info, iter); + } catch (buffer::end_of_buffer& err) { + ldpp_dout(dpp, 0) << "ERROR: got bad notification" << dendl; + return -EIO; + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: buffer::error" << dendl; + return -EIO; + } + + rgw_pool pool; + string oid; + normalize_pool_and_obj(info.obj.pool, info.obj.oid, pool, oid); + string name = normal_name(pool, oid); + + switch (info.op) { + case UPDATE_OBJ: + cache.put(dpp, name, info.obj_info, NULL); + break; + case INVALIDATE_OBJ: + cache.invalidate_remove(dpp, name); + break; + default: + ldpp_dout(dpp, 0) << "WARNING: got unknown notification op: " << info.op << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWSI_SysObj_Cache::set_enabled(bool status) +{ + cache.set_enabled(status); +} + +bool RGWSI_SysObj_Cache::chain_cache_entry(const DoutPrefixProvider *dpp, + std::initializer_list cache_info_entries, + RGWChainedCache::Entry *chained_entry) +{ + return cache.chain_cache_entry(dpp, cache_info_entries, chained_entry); +} + +void 
RGWSI_SysObj_Cache::register_chained_cache(RGWChainedCache *cc) +{ + cache.chain_cache(cc); +} + +void RGWSI_SysObj_Cache::unregister_chained_cache(RGWChainedCache *cc) +{ + cache.unchain_cache(cc); +} + +static void cache_list_dump_helper(Formatter* f, + const std::string& name, + const ceph::real_time mtime, + const std::uint64_t size) +{ + f->dump_string("name", name); + f->dump_string("mtime", ceph::to_iso_8601(mtime)); + f->dump_unsigned("size", size); +} + +class RGWSI_SysObj_Cache_ASocketHook : public AdminSocketHook { + RGWSI_SysObj_Cache *svc; + + static constexpr std::string_view admin_commands[][2] = { + { "cache list name=filter,type=CephString,req=false", + "cache list [filter_str]: list object cache, possibly matching substrings" }, + { "cache inspect name=target,type=CephString,req=true", + "cache inspect target: print cache element" }, + { "cache erase name=target,type=CephString,req=true", + "cache erase target: erase element from cache" }, + { "cache zap", + "cache zap: erase all elements from cache" } + }; + +public: + RGWSI_SysObj_Cache_ASocketHook(RGWSI_SysObj_Cache *_svc) : svc(_svc) {} + + int start(); + void shutdown(); + + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) override; +}; + +int RGWSI_SysObj_Cache_ASocketHook::start() +{ + auto admin_socket = svc->ctx()->get_admin_socket(); + for (auto cmd : admin_commands) { + int r = admin_socket->register_command(cmd[0], this, cmd[1]); + if (r < 0) { + ldout(svc->ctx(), 0) << "ERROR: fail to register admin socket command (r=" << r + << ")" << dendl; + return r; + } + } + return 0; +} + +void RGWSI_SysObj_Cache_ASocketHook::shutdown() +{ + auto admin_socket = svc->ctx()->get_admin_socket(); + admin_socket->unregister_commands(this); +} + +int RGWSI_SysObj_Cache_ASocketHook::call( + std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) +{ 
+ if (command == "cache list"sv) { + std::optional filter; + if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) { + filter = boost::get(i->second); + } + f->open_array_section("cache_entries"); + svc->asocket.call_list(filter, f); + f->close_section(); + return 0; + } else if (command == "cache inspect"sv) { + const auto& target = boost::get(cmdmap.at("target")); + if (svc->asocket.call_inspect(target, f)) { + return 0; + } else { + ss << "Unable to find entry "s + target + ".\n"; + return -ENOENT; + } + } else if (command == "cache erase"sv) { + const auto& target = boost::get(cmdmap.at("target")); + if (svc->asocket.call_erase(target)) { + return 0; + } else { + ss << "Unable to find entry "s + target + ".\n"; + return -ENOENT; + } + } else if (command == "cache zap"sv) { + svc->asocket.call_zap(); + return 0; + } + return -ENOSYS; +} + +RGWSI_SysObj_Cache::ASocketHandler::ASocketHandler(const DoutPrefixProvider *_dpp, RGWSI_SysObj_Cache *_svc) : dpp(_dpp), svc(_svc) +{ + hook.reset(new RGWSI_SysObj_Cache_ASocketHook(_svc)); +} + +RGWSI_SysObj_Cache::ASocketHandler::~ASocketHandler() +{ +} + +int RGWSI_SysObj_Cache::ASocketHandler::start() +{ + return hook->start(); +} + +void RGWSI_SysObj_Cache::ASocketHandler::shutdown() +{ + return hook->shutdown(); +} + +void RGWSI_SysObj_Cache::ASocketHandler::call_list(const std::optional& filter, Formatter* f) +{ + svc->cache.for_each( + [&filter, f] (const string& name, const ObjectCacheEntry& entry) { + if (!filter || name.find(*filter) != name.npos) { + cache_list_dump_helper(f, name, entry.info.meta.mtime, + entry.info.meta.size); + } + }); +} + +int RGWSI_SysObj_Cache::ASocketHandler::call_inspect(const std::string& target, Formatter* f) +{ + if (const auto entry = svc->cache.get(dpp, target)) { + f->open_object_section("cache_entry"); + f->dump_string("name", target.c_str()); + entry->dump(f); + f->close_section(); + return true; + } else { + return false; + } +} + +int 
RGWSI_SysObj_Cache::ASocketHandler::call_erase(const std::string& target) +{ + return svc->cache.invalidate_remove(dpp, target); +} + +int RGWSI_SysObj_Cache::ASocketHandler::call_zap() +{ + svc->cache.invalidate_all(); + return 0; +} diff --git a/src/rgw/services/svc_sys_obj_cache.h b/src/rgw/services/svc_sys_obj_cache.h new file mode 100644 index 000000000..f7950843f --- /dev/null +++ b/src/rgw/services/svc_sys_obj_cache.h @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "common/RWLock.h" +#include "rgw_service.h" +#include "rgw_cache.h" + +#include "svc_sys_obj_core.h" + +class RGWSI_Notify; + +class RGWSI_SysObj_Cache_CB; +class RGWSI_SysObj_Cache_ASocketHook; + +class RGWSI_SysObj_Cache : public RGWSI_SysObj_Core +{ + friend class RGWSI_SysObj_Cache_CB; + friend class RGWServices_Def; + friend class ASocketHandler; + + RGWSI_Notify *notify_svc{nullptr}; + ObjectCache cache; + + std::shared_ptr cb; + + void normalize_pool_and_obj(const rgw_pool& src_pool, const std::string& src_obj, rgw_pool& dst_pool, std::string& dst_obj); +protected: + void init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc, + RGWSI_Notify *_notify_svc) { + core_init(_rados_svc, _zone_svc); + notify_svc = _notify_svc; + } + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + void shutdown() override; + + int raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, + uint64_t *psize, real_time *pmtime, + std::map *attrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y) override; + + int read(const DoutPrefixProvider *dpp, + RGWSI_SysObj_Obj_GetObjState& read_state, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + bufferlist *bl, off_t ofs, off_t end, + ceph::real_time* pmtime, uint64_t* psize, + std::map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional, + optional_yield y) override; + + int 
get_attr(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const char *name, bufferlist *dest, + optional_yield y) override; + + int set_attrs(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + std::map& attrs, + std::map *rmattrs, + RGWObjVersionTracker *objv_tracker, + bool exclusive, optional_yield y) override; + + int remove(const DoutPrefixProvider *dpp, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + optional_yield y) override; + + int write(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + real_time *pmtime, + std::map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime, + optional_yield y) override; + + int write_data(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + const bufferlist& bl, + bool exclusive, + RGWObjVersionTracker *objv_tracker, + optional_yield y); + + int distribute_cache(const DoutPrefixProvider *dpp, const std::string& normal_name, const rgw_raw_obj& obj, + ObjectCacheInfo& obj_info, int op, + optional_yield y); + + int watch_cb(const DoutPrefixProvider *dpp, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl); + + void set_enabled(bool status); + +public: + RGWSI_SysObj_Cache(const DoutPrefixProvider *dpp, CephContext *cct) : RGWSI_SysObj_Core(cct), asocket(dpp, this) { + cache.set_ctx(cct); + } + + bool chain_cache_entry(const DoutPrefixProvider *dpp, + std::initializer_list cache_info_entries, + RGWChainedCache::Entry *chained_entry); + void register_chained_cache(RGWChainedCache *cc); + void unregister_chained_cache(RGWChainedCache *cc); + + class ASocketHandler { + const DoutPrefixProvider *dpp; + RGWSI_SysObj_Cache *svc; + + std::unique_ptr hook; + + public: + ASocketHandler(const DoutPrefixProvider *dpp, RGWSI_SysObj_Cache *_svc); + ~ASocketHandler(); + + int start(); + void shutdown(); + + // `call_list` must iterate over all cache entries and call + // `cache_list_dump_helper` with the 
supplied Formatter on any that + // include `filter` as a substring. + // + void call_list(const std::optional& filter, Formatter* f); + + // `call_inspect` must look up the requested target and, if found, + // dump it to the supplied Formatter and return true. If not found, + // it must return false. + // + int call_inspect(const std::string& target, Formatter* f); + + // `call_erase` must erase the requested target and return true. If + // the requested target does not exist, it should return false. + int call_erase(const std::string& target); + + // `call_zap` must erase the cache. + int call_zap(); + } asocket; +}; + +template +class RGWChainedCacheImpl : public RGWChainedCache { + RGWSI_SysObj_Cache *svc{nullptr}; + ceph::timespan expiry; + RWLock lock; + + std::unordered_map> entries; + +public: + RGWChainedCacheImpl() : lock("RGWChainedCacheImpl::lock") {} + ~RGWChainedCacheImpl() { + if (!svc) { + return; + } + svc->unregister_chained_cache(this); + } + + void unregistered() override { + svc = nullptr; + } + + void init(RGWSI_SysObj_Cache *_svc) { + if (!_svc) { + return; + } + svc = _svc; + svc->register_chained_cache(this); + expiry = std::chrono::seconds(svc->ctx()->_conf.get_val( + "rgw_cache_expiry_interval")); + } + + boost::optional find(const std::string& key) { + std::shared_lock rl{lock}; + auto iter = entries.find(key); + if (iter == entries.end()) { + return boost::none; + } + if (expiry.count() && + (ceph::coarse_mono_clock::now() - iter->second.second) > expiry) { /* entry is older than the expiry interval: report a miss */ + return boost::none; + } + + return iter->second.first; + } + + bool put(const DoutPrefixProvider *dpp, RGWSI_SysObj_Cache *svc, const std::string& key, T *entry, + std::initializer_list cache_info_entries) { + if (!svc) { + return false; + } + + Entry chain_entry(this, key, entry); + + /* we need the svc cache to call us under its lock to maintain lock ordering */ + return svc->chain_cache_entry(dpp, cache_info_entries, &chain_entry); + } + + void chain_cb(const std::string& 
key, void *data) override { + T *entry = static_cast(data); + std::unique_lock wl{lock}; + entries[key].first = *entry; + if (expiry.count() > 0) { + entries[key].second = ceph::coarse_mono_clock::now(); + } + } + + void invalidate(const std::string& key) override { + std::unique_lock wl{lock}; + entries.erase(key); + } + + void invalidate_all() override { + std::unique_lock wl{lock}; + entries.clear(); + } +}; /* RGWChainedCacheImpl */ diff --git a/src/rgw/services/svc_sys_obj_core.cc b/src/rgw/services/svc_sys_obj_core.cc new file mode 100644 index 000000000..303089691 --- /dev/null +++ b/src/rgw/services/svc_sys_obj_core.cc @@ -0,0 +1,666 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_sys_obj_core.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +int RGWSI_SysObj_Core_GetObjState::get_rados_obj(const DoutPrefixProvider *dpp, + RGWSI_RADOS *rados_svc, + RGWSI_Zone *zone_svc, + const rgw_raw_obj& obj, + RGWSI_RADOS::Obj **pobj) +{ + if (!has_rados_obj) { + if (obj.oid.empty()) { + ldpp_dout(dpp, 0) << "ERROR: obj.oid is empty" << dendl; + return -EINVAL; + } + + rados_obj = rados_svc->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + has_rados_obj = true; + } + *pobj = &rados_obj; + return 0; +} + +int RGWSI_SysObj_Core::get_rados_obj(const DoutPrefixProvider *dpp, + RGWSI_Zone *zone_svc, + const rgw_raw_obj& obj, + RGWSI_RADOS::Obj *pobj) +{ + if (obj.oid.empty()) { + ldpp_dout(dpp, 0) << "ERROR: obj.oid is empty" << dendl; + return -EINVAL; + } + + *pobj = rados_svc->obj(obj); + int r = pobj->open(dpp); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_SysObj_Core::raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, + uint64_t *psize, real_time *pmtime, + map *attrs, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_RADOS::Obj 
rados_obj; + int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj); + if (r < 0) { + return r; + } + + uint64_t size = 0; + struct timespec mtime_ts; + + librados::ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + op.getxattrs(attrs, nullptr); + if (psize || pmtime) { + op.stat2(&size, &mtime_ts, nullptr); + } + bufferlist outbl; + r = rados_obj.operate(dpp, &op, &outbl, y); + if (r < 0) + return r; + + if (psize) + *psize = size; + if (pmtime) + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + + return 0; +} + +int RGWSI_SysObj_Core::stat(RGWSI_SysObj_Obj_GetObjState& _state, + const rgw_raw_obj& obj, + map *attrs, + bool raw_attrs, + real_time *lastmod, + uint64_t *obj_size, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + uint64_t size = 0; + ceph::real_time mtime; + std::map attrset; + + int r = raw_stat(dpp, obj, &size, &mtime, &attrset, objv_tracker, y); + if (r < 0) + return r; + + if (attrs) { + if (raw_attrs) { + *attrs = std::move(attrset); + } else { + rgw_filter_attrset(attrset, RGW_ATTR_PREFIX, attrs); + } + if (cct->_conf->subsys.should_gather()) { + map::iterator iter; + for (iter = attrs->begin(); iter != attrs->end(); ++iter) { + ldpp_dout(dpp, 20) << "Read xattr: " << iter->first << dendl; + } + } + } + + if (obj_size) + *obj_size = size; + if (lastmod) + *lastmod = mtime; + + return 0; +} + +int RGWSI_SysObj_Core::read(const DoutPrefixProvider *dpp, + RGWSI_SysObj_Obj_GetObjState& _read_state, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + bufferlist *bl, off_t ofs, off_t end, + ceph::real_time* pmtime, uint64_t* psize, + map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional, + optional_yield y) +{ + auto& read_state = static_cast(_read_state); + + uint64_t len; + struct timespec mtime_ts; + librados::ObjectReadOperation op; + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (objv_tracker) 
{ + objv_tracker->prepare_op_for_read(&op); + } + if (psize || pmtime) { + op.stat2(psize, &mtime_ts, nullptr); + } + + ldpp_dout(dpp, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl; + op.read(ofs, len, bl, nullptr); + + map unfiltered_attrset; + + if (attrs) { + if (raw_attrs) { + op.getxattrs(attrs, nullptr); + } else { + op.getxattrs(&unfiltered_attrset, nullptr); + } + } + + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj); + if (r < 0) { + ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + r = rados_obj.operate(dpp, &op, nullptr, y); + if (r < 0) { + ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl; + return r; + } + ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl; + + uint64_t op_ver = rados_obj.get_last_version(); + + if (read_state.last_ver > 0 && + read_state.last_ver != op_ver) { + ldpp_dout(dpp, 5) << "raced with an object write, abort" << dendl; + return -ECANCELED; + } + + if (pmtime) { + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + } + if (attrs && !raw_attrs) { + rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs); + } + + read_state.last_ver = op_ver; + + return bl->length(); +} + +/** + * Get an attribute for a system object. + * obj: the object to get attr + * name: name of the attr to retrieve + * dest: bufferlist to store the result in + * Returns: 0 on success, -ERR# otherwise. 
+ */
+int RGWSI_SysObj_Core::get_attr(const DoutPrefixProvider *dpp,
+                                const rgw_raw_obj& obj,
+                                const char *name,
+                                bufferlist *dest,
+                                optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectReadOperation op;
+
+  // per-op result slot; only the overall result of operate() is reported
+  int rval;
+  op.getxattr(name, dest, &rval);
+
+  r = rados_obj.operate(dpp, &op, nullptr, y);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+/**
+ * Set and/or remove xattrs on a system object in one write operation.
+ * obj: the object to modify
+ * attrs: attributes to set; entries with an empty value are skipped
+ * rmattrs: optional attributes to remove (only the keys are used)
+ * objv_tracker: optional version tracker; prepared on the op and
+ *               apply_write() is called once the op succeeded
+ * exclusive: if true, the op starts with an exclusive create
+ * Returns: 0 on success or when the op is empty, -ERR# otherwise.
+ */
+int RGWSI_SysObj_Core::set_attrs(const DoutPrefixProvider *dpp,
+                                 const rgw_raw_obj& obj,
+                                 map<string, bufferlist>& attrs,
+                                 map<string, bufferlist> *rmattrs,
+                                 RGWObjVersionTracker *objv_tracker,
+                                 bool exclusive, optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+
+  if (exclusive) {
+    op.create(true); // exclusive create
+  }
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+
+  if (rmattrs) {
+    for (const auto& [name, unused_val] : *rmattrs) {
+      (void)unused_val;
+      op.rmxattr(name.c_str());
+    }
+  }
+
+  for (auto& [name, bl] : attrs) {
+    // an empty value means "nothing to store" for this attribute
+    if (!bl.length())
+      continue;
+
+    op.setxattr(name.c_str(), bl);
+  }
+
+  // nothing was queued: skip the round trip
+  if (!op.size())
+    return 0;
+
+  r = rados_obj.operate(dpp, &op, y);
+  if (r < 0)
+    return r;
+
+  if (objv_tracker) {
+    objv_tracker->apply_write();
+  }
+  return 0;
+}
+
+/**
+ * Read omap entries of a system object, starting after `marker`.
+ * count: maximum number of entries to read
+ * m: map the entries are inserted into
+ * pmore: if not null, receives the "more entries" flag of the last read
+ * The reads are repeated until `count` entries were fetched, the object
+ * reports no more entries, or a read comes back empty.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWSI_SysObj_Core::omap_get_vals(const DoutPrefixProvider *dpp,
+                                     const rgw_raw_obj& obj,
+                                     const string& marker,
+                                     uint64_t count,
+                                     std::map<string, bufferlist> *m,
+                                     bool *pmore,
+                                     optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  string start_after = marker;
+  bool more = false;
+
+  do {
+    librados::ObjectReadOperation op;
+
+    std::map<string, bufferlist> t;
+    int rval;
+    op.omap_get_vals2(start_after, count, &t, &more, &rval);
+
+    r = rados_obj.operate(dpp, &op, nullptr, y);
+    if (r < 0) {
+      return r;
+    }
+    if (t.empty()) {
+      break;
+    }
+    count -= t.size();
+    // continue after the last key of this batch
+    start_after = t.rbegin()->first;
+    m->insert(t.begin(), t.end());
+  } while (more && count > 0);
+
+  if (pmore) {
+    *pmore = more;
+  }
+  return 0;
+}
+
+/**
+ * Read every omap entry of a system object into *m, fetching
+ * MAX_OMAP_GET_ENTRIES entries per read.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWSI_SysObj_Core::omap_get_all(const DoutPrefixProvider *dpp,
+                                    const rgw_raw_obj& obj,
+                                    std::map<string, bufferlist> *m,
+                                    optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+#define MAX_OMAP_GET_ENTRIES 1024
+  const int count = MAX_OMAP_GET_ENTRIES;
+  string start_after;
+  bool more = false;
+
+  do {
+    librados::ObjectReadOperation op;
+
+    std::map<string, bufferlist> t;
+    int rval;
+    op.omap_get_vals2(start_after, count, &t, &more, &rval);
+
+    r = rados_obj.operate(dpp, &op, nullptr, y);
+    if (r < 0) {
+      return r;
+    }
+    if (t.empty()) {
+      break;
+    }
+    start_after = t.rbegin()->first;
+    m->insert(t.begin(), t.end());
+  } while (more);
+  return 0;
+}
+
+/**
+ * Set a single omap key on a system object.
+ * must_exist: if true, the op asserts that the object already exists
+ * Returns: the result of the write operation.
+ */
+int RGWSI_SysObj_Core::omap_set(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const std::string& key,
+                                bufferlist& bl, bool must_exist,
+                                optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  ldpp_dout(dpp, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
+
+  map<string, bufferlist> m;
+  m[key] = bl;
+  librados::ObjectWriteOperation op;
+  if (must_exist)
+    op.assert_exists();
+  op.omap_set(m);
+  r = rados_obj.operate(dpp, &op, y);
+  return r;
+}
+
+/**
+ * Set several omap keys on a system object in one write operation.
+ * must_exist: if true, the op asserts that the object already exists
+ * Returns: the result of the write operation.
+ */
+int RGWSI_SysObj_Core::omap_set(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                                const std::map<std::string, bufferlist>& m,
+                                bool must_exist, optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  if (must_exist)
+    op.assert_exists();
+  op.omap_set(m);
+  r = rados_obj.operate(dpp, &op, y);
+  return r;
+}
+
+/**
+ * Remove a single omap key from a system object.
+ * Returns: the result of the write operation.
+ */
+int RGWSI_SysObj_Core::omap_del(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const std::string& key,
+                                optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  set<string> k;
+  k.insert(key);
+
+  librados::ObjectWriteOperation op;
+
+  op.omap_rm_keys(k);
+
+  r = rados_obj.operate(dpp, &op, y);
+  return r;
+}
+
+/**
+ * Send a notify with payload `bl` on a system object.
+ * timeout_ms and pbl are handed through to RGWSI_RADOS::Obj::notify().
+ * Returns: the result of the notify call.
+ */
+int RGWSI_SysObj_Core::notify(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, bufferlist& bl,
+                              uint64_t timeout_ms, bufferlist *pbl,
+                              optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  r = rados_obj.notify(dpp, bl, timeout_ms, pbl, y);
+  return r;
+}
+
+/**
+ * Remove a system object.
+ * objv_tracker: optional version tracker, prepared on the op before the
+ *               remove is queued
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWSI_SysObj_Core::remove(const DoutPrefixProvider *dpp,
+                              RGWObjVersionTracker *objv_tracker,
+                              const rgw_raw_obj& obj,
+                              optional_yield y)
+{
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+
+  op.remove();
+  r = rados_obj.operate(dpp, &op, y);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+int RGWSI_SysObj_Core::write(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + real_time *pmtime, + map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime, + optional_yield y) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj); + if (r < 0) { + ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + + if (exclusive) { + op.create(true); // exclusive create + } else { + op.remove(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.create(false); + } + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + + if (real_clock::is_zero(set_mtime)) { + set_mtime = real_clock::now(); + } + + struct timespec mtime_ts = real_clock::to_timespec(set_mtime); + op.mtime2(&mtime_ts); + op.write_full(data); + + bufferlist acl_bl; + + for (map::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + } + + r = rados_obj.operate(dpp, &op, y); + if (r < 0) { + return r; + } + + if (objv_tracker) { + objv_tracker->apply_write(); + } + + if (pmtime) { + *pmtime = set_mtime; + } + + return 0; +} + + +int RGWSI_SysObj_Core::write_data(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + const bufferlist& bl, + bool exclusive, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj); + if (r < 0) { + ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + + if (exclusive) { + op.create(true); + } + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + op.write_full(bl); + r = rados_obj.operate(dpp, &op, y); + if (r < 0) + return r; + + if 
(objv_tracker) { + objv_tracker->apply_write(); + } + return 0; +} + +int RGWSI_SysObj_Core::pool_list_prefixed_objs(const DoutPrefixProvider *dpp, + const rgw_pool& pool, const string& prefix, + std::function cb) +{ + bool is_truncated; + + auto rados_pool = rados_svc->pool(pool); + + auto op = rados_pool.op(); + + RGWAccessListFilterPrefix filter(prefix); + + int r = op.init(dpp, string(), &filter); + if (r < 0) { + return r; + } + + do { + vector oids; +#define MAX_OBJS_DEFAULT 1000 + int r = op.get_next(dpp, MAX_OBJS_DEFAULT, &oids, &is_truncated); + if (r < 0) { + return r; + } + for (auto& val : oids) { + if (val.size() > prefix.size()) { + cb(val.substr(prefix.size())); + } + } + } while (is_truncated); + + return 0; +} + +int RGWSI_SysObj_Core::pool_list_objects_init(const DoutPrefixProvider *dpp, + const rgw_pool& pool, + const string& marker, + const string& prefix, + RGWSI_SysObj::Pool::ListCtx *_ctx) +{ + _ctx->impl.emplace(prefix); + + auto& ctx = static_cast(*_ctx->impl); + + ctx.pool = rados_svc->pool(pool); + ctx.op = ctx.pool.op(); + + int r = ctx.op.init(dpp, marker, &ctx.filter); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl; + return r; + } + return 0; +} + +int RGWSI_SysObj_Core::pool_list_objects_next(const DoutPrefixProvider *dpp, + RGWSI_SysObj::Pool::ListCtx& _ctx, + int max, + vector *oids, + bool *is_truncated) +{ + if (!_ctx.impl) { + return -EINVAL; + } + auto& ctx = static_cast(*_ctx.impl); + int r = ctx.op.get_next(dpp, max, oids, is_truncated); + if (r < 0) { + if(r != -ENOENT) + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl; + return r; + } + + return oids->size(); +} + +int RGWSI_SysObj_Core::pool_list_objects_get_marker(RGWSI_SysObj::Pool::ListCtx& _ctx, + string *marker) +{ + if (!_ctx.impl) { + return -EINVAL; + } + + auto& ctx = static_cast(*_ctx.impl); + return ctx.op.get_marker(marker); +} diff --git 
a/src/rgw/services/svc_sys_obj_core.h b/src/rgw/services/svc_sys_obj_core.h
new file mode 100644
index 000000000..d02a37eee
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_core.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+#include "svc_sys_obj.h"
+#include "svc_sys_obj_core_types.h"
+
+
+class RGWSI_Zone;
+
+struct rgw_cache_entry_info;
+
+// Service that performs system-object operations (stat, read, write,
+// xattrs, omap, notify, pool listing) on raw objects.
+// NOTE(review): several declarations below appear to have lost their
+// template argument lists (e.g. `std::map *attrs`, `boost::optional,`,
+// `std::function cb`) -- confirm against the upstream header.
+class RGWSI_SysObj_Core : public RGWServiceInstance
+{
+  friend class RGWServices_Def;
+  friend class RGWSI_SysObj;
+
+protected:
+  RGWSI_RADOS *rados_svc{nullptr};
+  RGWSI_Zone *zone_svc{nullptr};
+
+  using GetObjState = RGWSI_SysObj_Core_GetObjState;
+  using PoolListImplInfo = RGWSI_SysObj_Core_PoolListImplInfo;
+
+  // Stores the services this instance depends on; no other setup is done.
+  void core_init(RGWSI_RADOS *_rados_svc,
+                 RGWSI_Zone *_zone_svc) {
+    rados_svc = _rados_svc;
+    zone_svc = _zone_svc;
+  }
+  int get_rados_obj(const DoutPrefixProvider *dpp, RGWSI_Zone *zone_svc, const rgw_raw_obj& obj, RGWSI_RADOS::Obj *pobj);
+
+  // Fetches the xattrs of obj and, if psize or pmtime is given, its
+  // size and mtime.
+  virtual int raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                       uint64_t *psize, real_time *pmtime,
+                       std::map *attrs,
+                       RGWObjVersionTracker *objv_tracker,
+                       optional_yield y);
+
+  // Reads the byte range [ofs, end] of obj into bl (a negative end is
+  // passed on as length 0) and optionally its size, mtime and xattrs.
+  // Returns bl->length() on success; -ECANCELED if the object version
+  // differs from the one recorded in read_state by an earlier read.
+  virtual int read(const DoutPrefixProvider *dpp,
+                   RGWSI_SysObj_Obj_GetObjState& read_state,
+                   RGWObjVersionTracker *objv_tracker,
+                   const rgw_raw_obj& obj,
+                   bufferlist *bl, off_t ofs, off_t end,
+                   ceph::real_time* pmtime, uint64_t* psize,
+                   std::map *attrs,
+                   bool raw_attrs,
+                   rgw_cache_entry_info *cache_info,
+                   boost::optional,
+                   optional_yield y);
+
+  // Removes obj; objv_tracker, if given, is prepared on the write op.
+  virtual int remove(const DoutPrefixProvider *dpp,
+                     RGWObjVersionTracker *objv_tracker,
+                     const rgw_raw_obj& obj,
+                     optional_yield y);
+
+  // Writes data, mtime and xattrs of obj in one operation. A zero
+  // set_mtime means "now"; the stored mtime is returned via pmtime.
+  virtual int write(const DoutPrefixProvider *dpp,
+                    const rgw_raw_obj& obj,
+                    real_time *pmtime,
+                    std::map& attrs,
+                    bool exclusive,
+                    const bufferlist& data,
+                    RGWObjVersionTracker *objv_tracker,
+                    real_time set_mtime,
+                    optional_yield y);
+
+  // Replaces the data of obj only (write_full), optionally with an
+  // exclusive create.
+  virtual int write_data(const DoutPrefixProvider *dpp,
+                         const rgw_raw_obj& obj,
+                         const bufferlist& bl,
+                         bool exclusive,
+                         RGWObjVersionTracker *objv_tracker,
+                         optional_yield y);
+
+  // Reads the xattr `name` of obj into dest.
+  virtual int get_attr(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                       const char *name, bufferlist *dest,
+                       optional_yield y);
+
+  // Sets the xattrs in attrs (empty values are skipped) and removes
+  // the ones named in rmattrs, all in one write operation.
+  virtual int set_attrs(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                        std::map& attrs,
+                        std::map *rmattrs,
+                        RGWObjVersionTracker *objv_tracker,
+                        bool exclusive, optional_yield y);
+
+  // omap accessors. omap_get_vals reads up to `count` entries after
+  // `marker`; omap_get_all reads everything in batches. must_exist
+  // makes omap_set assert that the object exists.
+  virtual int omap_get_all(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::map *m,
+                           optional_yield y);
+  virtual int omap_get_vals(const DoutPrefixProvider *dpp,
+                            const rgw_raw_obj& obj,
+                            const std::string& marker,
+                            uint64_t count,
+                            std::map *m,
+                            bool *pmore,
+                            optional_yield y);
+  virtual int omap_set(const DoutPrefixProvider *dpp,
+                       const rgw_raw_obj& obj, const std::string& key,
+                       bufferlist& bl, bool must_exist,
+                       optional_yield y);
+  virtual int omap_set(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                       const std::map& m, bool must_exist,
+                       optional_yield y);
+  virtual int omap_del(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const std::string& key,
+                       optional_yield y);
+
+  // Sends a notify with payload bl on obj.
+  virtual int notify(const DoutPrefixProvider *dpp,
+                     const rgw_raw_obj& obj, bufferlist& bl,
+                     uint64_t timeout_ms, bufferlist *pbl,
+                     optional_yield y);
+
+  // Lists the pool through a prefix filter and calls cb with each name,
+  // stripped of the prefix.
+  virtual int pool_list_prefixed_objs(const DoutPrefixProvider *dpp,
+                                      const rgw_pool& pool,
+                                      const std::string& prefix,
+                                      std::function cb);
+
+  // Incremental pool listing: init sets up ctx, next returns the number
+  // of names fetched, get_marker reports the current position. next and
+  // get_marker return -EINVAL for a context that was never initialised.
+  virtual int pool_list_objects_init(const DoutPrefixProvider *dpp,
+                                     const rgw_pool& pool,
+                                     const std::string& marker,
+                                     const std::string& prefix,
+                                     RGWSI_SysObj::Pool::ListCtx *ctx);
+  virtual int pool_list_objects_next(const DoutPrefixProvider *dpp,
+                                     RGWSI_SysObj::Pool::ListCtx& ctx,
+                                     int max,
+                                     std::vector *oids,
+                                     bool *is_truncated);
+
+  virtual int pool_list_objects_get_marker(RGWSI_SysObj::Pool::ListCtx& _ctx,
+                                           std::string *marker);
+
+  // Wrapper around raw_stat(): returns size and mtime and, unless
+  // raw_attrs is set, only the xattrs that pass the RGW_ATTR_PREFIX filter.
+  int stat(RGWSI_SysObj_Obj_GetObjState& state,
+           const rgw_raw_obj& obj,
+           std::map *attrs,
+           bool raw_attrs,
+           real_time *lastmod,
+           uint64_t *obj_size,
+           RGWObjVersionTracker *objv_tracker,
+           optional_yield y,
+           const DoutPrefixProvider *dpp);
+
+public:
+  RGWSI_SysObj_Core(CephContext *cct): RGWServiceInstance(cct) {}
+
+  RGWSI_Zone *get_zone_svc() {
+    return zone_svc;
+  }
+};
diff --git a/src/rgw/services/svc_sys_obj_core_types.h b/src/rgw/services/svc_sys_obj_core_types.h
new file mode 100644
index 000000000..74f489d91
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_core_types.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+#include "svc_sys_obj_types.h"
+
+
+
+// Per-read state of RGWSI_SysObj_Core.
+struct RGWSI_SysObj_Core_GetObjState : public RGWSI_SysObj_Obj_GetObjState {
+  RGWSI_RADOS::Obj rados_obj;
+  bool has_rados_obj{false};
+  // object version seen by the previous read; 0 means "none yet"
+  uint64_t last_ver{0};
+
+  RGWSI_SysObj_Core_GetObjState() {}
+
+  int get_rados_obj(const DoutPrefixProvider *dpp,
+                    RGWSI_RADOS *rados_svc,
+                    RGWSI_Zone *zone_svc,
+                    const rgw_raw_obj& obj,
+                    RGWSI_RADOS::Obj **pobj);
+};
+
+// State of an incremental pool listing: the pool handle, its list
+// operation and the prefix filter applied to it.
+struct RGWSI_SysObj_Core_PoolListImplInfo : public RGWSI_SysObj_Pool_ListInfo {
+  RGWSI_RADOS::Pool pool;
+  RGWSI_RADOS::Pool::List op;
+  RGWAccessListFilterPrefix filter;
+
+  RGWSI_SysObj_Core_PoolListImplInfo(const std::string& prefix) : op(pool.op()), filter(prefix) {}
+};
diff --git a/src/rgw/services/svc_sys_obj_types.h b/src/rgw/services/svc_sys_obj_types.h
new file mode 100644
index 000000000..b5bc2d40d
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_types.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#pragma once
+
+
+#include "rgw_service.h"
+
+
+// Empty base type for per-read state objects.
+struct RGWSI_SysObj_Obj_GetObjState {
+};
+
+// Empty base type for pool-listing state objects.
+struct RGWSI_SysObj_Pool_ListInfo {
+};
diff --git a/src/rgw/services/svc_tier_rados.cc
b/src/rgw/services/svc_tier_rados.cc new file mode 100644 index 000000000..ca87e8ace --- /dev/null +++ b/src/rgw/services/svc_tier_rados.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_tier_rados.h" + +using namespace std; + +const std::string MP_META_SUFFIX = ".meta"; + +MultipartMetaFilter::~MultipartMetaFilter() {} + +bool MultipartMetaFilter::filter(const string& name, string& key) { + // the length of the suffix so we can skip past it + static const size_t MP_META_SUFFIX_LEN = MP_META_SUFFIX.length(); + + size_t len = name.size(); + + // make sure there's room for suffix plus at least one more + // character + if (len <= MP_META_SUFFIX_LEN) + return false; + + size_t pos = name.find(MP_META_SUFFIX, len - MP_META_SUFFIX_LEN); + if (pos == string::npos) + return false; + + pos = name.rfind('.', pos - 1); + if (pos == string::npos) + return false; + + key = name.substr(0, pos); + + return true; +} + + diff --git a/src/rgw/services/svc_tier_rados.h b/src/rgw/services/svc_tier_rados.h new file mode 100644 index 000000000..a2036b933 --- /dev/null +++ b/src/rgw/services/svc_tier_rados.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include + +#include "rgw_service.h" + +#include "svc_rados.h" + +extern const std::string MP_META_SUFFIX; + +class RGWMPObj { + std::string oid; + std::string prefix; + std::string meta; + std::string upload_id; +public: + RGWMPObj() {} + RGWMPObj(const std::string& _oid, const std::string& _upload_id) { + init(_oid, _upload_id, _upload_id); + } + RGWMPObj(const std::string& _oid, std::optional _upload_id) { + if (_upload_id) { + init(_oid, *_upload_id, *_upload_id); + } else { + from_meta(_oid); + } + } + void init(const std::string& _oid, const std::string& _upload_id) { + init(_oid, _upload_id, _upload_id); + } + void init(const std::string& _oid, const std::string& _upload_id, const std::string& part_unique_str) { + if (_oid.empty()) { + clear(); + return; + } + oid = _oid; + upload_id = _upload_id; + prefix = oid + "."; + meta = prefix + upload_id + MP_META_SUFFIX; + prefix.append(part_unique_str); + } + const std::string& get_meta() const { return meta; } + std::string get_part(int num) const { + char buf[16]; + snprintf(buf, 16, ".%d", num); + std::string s = prefix; + s.append(buf); + return s; + } + std::string get_part(const std::string& part) const { + std::string s = prefix; + s.append("."); + s.append(part); + return s; + } + const std::string& get_upload_id() const { + return upload_id; + } + const std::string& get_key() const { + return oid; + } + bool from_meta(const std::string& meta) { + int end_pos = meta.rfind('.'); // search for ".meta" + if (end_pos < 0) + return false; + int mid_pos = meta.rfind('.', end_pos - 1); // . 
+ if (mid_pos < 0) + return false; + oid = meta.substr(0, mid_pos); + upload_id = meta.substr(mid_pos + 1, end_pos - mid_pos - 1); + init(oid, upload_id, upload_id); + return true; + } + void clear() { + oid = ""; + prefix = ""; + meta = ""; + upload_id = ""; + } + friend std::ostream& operator<<(std::ostream& out, const RGWMPObj& obj) { + return out << "RGWMPObj:{ prefix=" << std::quoted(obj.prefix) << + ", meta=" << std::quoted(obj.meta) << " }"; + } +}; // class RGWMPObj + +/** + * A filter to a) test whether an object name is a multipart meta + * object, and b) filter out just the key used to determine the bucket + * index shard. + * + * Objects for multipart meta have names adorned with an upload id and + * other elements -- specifically a ".", MULTIPART_UPLOAD_ID_PREFIX, + * unique id, and MP_META_SUFFIX. This filter will return true when + * the name provided is such. It will also extract the key used for + * bucket index shard calculation from the adorned name. + */ +class MultipartMetaFilter : public RGWAccessListFilter { +public: + MultipartMetaFilter() {} + + virtual ~MultipartMetaFilter() override; + + /** + * @param name [in] The object name as it appears in the bucket index. + * @param key [out] An output parameter that will contain the bucket + * index key if this entry is in the form of a multipart meta object. 
+ * @return true if the name provided is in the form of a multipart meta + * object, false otherwise + */ + bool filter(const std::string& name, std::string& key) override; +}; + +class RGWSI_Tier_RADOS : public RGWServiceInstance +{ + RGWSI_Zone *zone_svc{nullptr}; + +public: + RGWSI_Tier_RADOS(CephContext *cct): RGWServiceInstance(cct) {} + + void init(RGWSI_Zone *_zone_svc) { + zone_svc = _zone_svc; + } + + static inline bool raw_obj_to_obj(const rgw_bucket& bucket, const rgw_raw_obj& raw_obj, rgw_obj *obj) { + ssize_t pos = raw_obj.oid.find('_', bucket.marker.length()); + if (pos < 0) { + return false; + } + + if (!rgw_obj_key::parse_raw_oid(raw_obj.oid.substr(pos + 1), &obj->key)) { + return false; + } + obj->bucket = bucket; + + return true; + } +}; + diff --git a/src/rgw/services/svc_user.cc b/src/rgw/services/svc_user.cc new file mode 100644 index 000000000..9a07c207b --- /dev/null +++ b/src/rgw/services/svc_user.cc @@ -0,0 +1,11 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + + +#include "svc_user.h" + +RGWSI_User::RGWSI_User(CephContext *cct): RGWServiceInstance(cct) { +} + +RGWSI_User::~RGWSI_User() { +} diff --git a/src/rgw/services/svc_user.h b/src/rgw/services/svc_user.h new file mode 100644 index 000000000..1cb459d31 --- /dev/null +++ b/src/rgw/services/svc_user.h @@ -0,0 +1,127 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+#pragma once
+
+#include "svc_meta_be.h"
+
+#include "rgw_service.h"
+
+class RGWUserBuckets;
+class RGWGetUserStats_CB;
+
+// Abstract interface of the user service: apart from the two static
+// key helpers, every operation is pure virtual.
+// NOTE(review): some parameter types below appear to have lost their
+// template argument lists (`std::map * const pattrs`, `std::map *attrs`)
+// -- confirm against the upstream header.
+class RGWSI_User : public RGWServiceInstance
+{
+public:
+  RGWSI_User(CephContext *cct);
+  virtual ~RGWSI_User();
+
+  // The metadata key of a user is its string form (rgw_user::to_str()).
+  static std::string get_meta_key(const rgw_user& user) {
+    return user.to_str();
+  }
+
+  // Inverse of get_meta_key(): builds an rgw_user from a metadata key.
+  static rgw_user user_from_meta_key(const std::string& key) {
+    return rgw_user(key);
+  }
+
+  virtual RGWSI_MetaBackend_Handler *get_be_handler() = 0;
+
+  /* base svc_user interfaces */
+
+  virtual int read_user_info(RGWSI_MetaBackend::Context *ctx,
+                             const rgw_user& user,
+                             RGWUserInfo *info,
+                             RGWObjVersionTracker * const objv_tracker,
+                             real_time * const pmtime,
+                             rgw_cache_entry_info * const cache_info,
+                             std::map * const pattrs,
+                             optional_yield y,
+                             const DoutPrefixProvider *dpp) = 0;
+
+  virtual int store_user_info(RGWSI_MetaBackend::Context *ctx,
+                              const RGWUserInfo& info,
+                              RGWUserInfo *old_info,
+                              RGWObjVersionTracker *objv_tracker,
+                              const real_time& mtime,
+                              bool exclusive,
+                              std::map *attrs,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp) = 0;
+
+  virtual int remove_user_info(RGWSI_MetaBackend::Context *ctx,
+                               const RGWUserInfo& info,
+                               RGWObjVersionTracker *objv_tracker,
+                               optional_yield y,
+                               const DoutPrefixProvider *dpp) = 0;
+
+  /* lookups through the secondary keys of a user */
+  virtual int get_user_info_by_email(RGWSI_MetaBackend::Context *ctx,
+                             const std::string& email, RGWUserInfo *info,
+                             RGWObjVersionTracker *objv_tracker,
+                             real_time *pmtime,
+                             optional_yield y,
+                             const DoutPrefixProvider *dpp) = 0;
+  virtual int get_user_info_by_swift(RGWSI_MetaBackend::Context *ctx,
+                             const std::string& swift_name,
+                             RGWUserInfo *info,        /* out */
+                             RGWObjVersionTracker * const objv_tracker,
+                             real_time * const pmtime,
+                             optional_yield y,
+                             const DoutPrefixProvider *dpp) = 0;
+  virtual int get_user_info_by_access_key(RGWSI_MetaBackend::Context *ctx,
+                                  const std::string& access_key,
+                                  RGWUserInfo *info,
+                                  RGWObjVersionTracker* objv_tracker,
+                                  real_time *pmtime,
+                                  optional_yield y,
+                                  const DoutPrefixProvider *dpp) = 0;
+
+  /* per-user bucket list */
+  virtual int add_bucket(const DoutPrefixProvider *dpp,
+                         const rgw_user& user,
+                         const rgw_bucket& bucket,
+                         ceph::real_time creation_time,
+                         optional_yield y) = 0;
+  virtual int remove_bucket(const DoutPrefixProvider *dpp,
+                            const rgw_user& user,
+                            const rgw_bucket& _bucket, optional_yield) = 0;
+  virtual int list_buckets(const DoutPrefixProvider *dpp,
+                           const rgw_user& user,
+                           const std::string& marker,
+                           const std::string& end_marker,
+                           uint64_t max,
+                           RGWUserBuckets *buckets,
+                           bool *is_truncated,
+                           optional_yield y) = 0;
+
+  /* per-user usage statistics */
+  virtual int flush_bucket_stats(const DoutPrefixProvider *dpp,
+                                 const rgw_user& user,
+                                 const RGWBucketEnt& ent, optional_yield y) = 0;
+  virtual int complete_flush_stats(const DoutPrefixProvider *dpp,
+                                   const rgw_user& user, optional_yield y) = 0;
+  virtual int reset_bucket_stats(const DoutPrefixProvider *dpp,
+                                 const rgw_user& user,
+                                 optional_yield y) = 0;
+  virtual int read_stats(const DoutPrefixProvider *dpp,
+                         RGWSI_MetaBackend::Context *ctx,
+                         const rgw_user& user, RGWStorageStats *stats,
+                         ceph::real_time *last_stats_sync,    /* last time a full stats sync completed */
+                         ceph::real_time *last_stats_update,  /* last time a stats update was done */
+                         optional_yield y) = 0;
+
+  virtual int read_stats_async(const DoutPrefixProvider *dpp,
+                               const rgw_user& user, RGWGetUserStats_CB *cb) = 0;
+};
+
diff --git a/src/rgw/services/svc_user_rados.cc b/src/rgw/services/svc_user_rados.cc
new file mode 100644
index 000000000..c99af9354
--- /dev/null
+++ b/src/rgw/services/svc_user_rados.cc
@@ -0,0 +1,968 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+// NOTE(review): the header name of the next #include is missing,
+// apparently lost together with its angle brackets -- confirm against
+// the upstream file.
+#include
+
+#include "svc_user.h"
+#include "svc_user_rados.h"
+#include "svc_zone.h"
+#include "svc_sys_obj.h"
+#include "svc_sys_obj_cache.h"
+#include "svc_meta.h"
+#include "svc_meta_be_sobj.h"
+#include "svc_sync_modules.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_tools.h" +#include "rgw_zone.h" +#include "rgw_rados.h" + +#include "cls/user/cls_user_client.h" + +#define dout_subsys ceph_subsys_rgw + +#define RGW_BUCKETS_OBJ_SUFFIX ".buckets" + +using namespace std; + +class RGWSI_User_Module : public RGWSI_MBSObj_Handler_Module { + RGWSI_User_RADOS::Svc& svc; + + const string prefix; +public: + RGWSI_User_Module(RGWSI_User_RADOS::Svc& _svc) : RGWSI_MBSObj_Handler_Module("user"), + svc(_svc) {} + + void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override { + if (pool) { + *pool = svc.zone->get_zone_params().user_uid_pool; + } + if (oid) { + *oid = key; + } + } + + const string& get_oid_prefix() override { + return prefix; + } + + bool is_valid_oid(const string& oid) override { + // filter out the user.buckets objects + return !boost::algorithm::ends_with(oid, RGW_BUCKETS_OBJ_SUFFIX); + } + + string key_to_oid(const string& key) override { + return key; + } + + string oid_to_key(const string& oid) override { + return oid; + } +}; + +RGWSI_User_RADOS::RGWSI_User_RADOS(CephContext *cct): RGWSI_User(cct) { +} + +RGWSI_User_RADOS::~RGWSI_User_RADOS() { +} + +void RGWSI_User_RADOS::init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc, + RGWSI_SysObj_Cache *_cache_svc, RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc, + RGWSI_SyncModules *_sync_modules_svc) +{ + svc.user = this; + svc.rados = _rados_svc; + svc.zone = _zone_svc; + svc.sysobj = _sysobj_svc; + svc.cache = _cache_svc; + svc.meta = _meta_svc; + svc.meta_be = _meta_be_svc; + svc.sync_modules = _sync_modules_svc; +} + +int RGWSI_User_RADOS::do_start(optional_yield, const DoutPrefixProvider *dpp) +{ + uinfo_cache.reset(new RGWChainedCacheImpl); + uinfo_cache->init(svc.cache); + + int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, &be_handler); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to create be handler: r=" << r << dendl; + return r; + } + + 
RGWSI_MetaBackend_Handler_SObj *bh = static_cast(be_handler); + + auto module = new RGWSI_User_Module(svc); + be_module.reset(module); + bh->set_module(module); + return 0; +} + +rgw_raw_obj RGWSI_User_RADOS::get_buckets_obj(const rgw_user& user) const +{ + string oid = user.to_str() + RGW_BUCKETS_OBJ_SUFFIX; + return rgw_raw_obj(svc.zone->get_zone_params().user_uid_pool, oid); +} + +int RGWSI_User_RADOS::read_user_info(RGWSI_MetaBackend::Context *ctx, + const rgw_user& user, + RGWUserInfo *info, + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime, + rgw_cache_entry_info * const cache_info, + map * const pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + if(user.id == RGW_USER_ANON_ID) { + ldpp_dout(dpp, 20) << "RGWSI_User_RADOS::read_user_info(): anonymous user" << dendl; + return -ENOENT; + } + bufferlist bl; + RGWUID user_id; + + RGWSI_MBSObj_GetParams params(&bl, pattrs, pmtime); + params.set_cache_info(cache_info); + + int ret = svc.meta_be->get_entry(ctx, get_meta_key(user), params, objv_tracker, y, dpp); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(user_id, iter); + if (user_id.user_id != user) { + ldpp_dout(dpp, -1) << "ERROR: rgw_get_user_info_by_uid(): user id mismatch: " << user_id.user_id << " != " << user << dendl; + return -EIO; + } + if (!iter.end()) { + decode(*info, iter); + } + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; + return -EIO; + } + + return 0; +} + +class PutOperation +{ + RGWSI_User_RADOS::Svc& svc; + RGWSI_MetaBackend_SObj::Context_SObj *ctx; + RGWUID ui; + const RGWUserInfo& info; + RGWUserInfo *old_info; + RGWObjVersionTracker *objv_tracker; + const real_time& mtime; + bool exclusive; + map *pattrs; + RGWObjVersionTracker ot; + string err_msg; + optional_yield y; + + void set_err_msg(string msg) { + if (!err_msg.empty()) { + err_msg = std::move(msg); + } + } + +public: + 
PutOperation(RGWSI_User_RADOS::Svc& svc, + RGWSI_MetaBackend::Context *_ctx, + const RGWUserInfo& info, + RGWUserInfo *old_info, + RGWObjVersionTracker *objv_tracker, + const real_time& mtime, + bool exclusive, + map *pattrs, + optional_yield y) : + svc(svc), info(info), old_info(old_info), + objv_tracker(objv_tracker), mtime(mtime), + exclusive(exclusive), pattrs(pattrs), y(y) { + ctx = static_cast(_ctx); + ui.user_id = info.user_id; + } + + int prepare(const DoutPrefixProvider *dpp) { + if (objv_tracker) { + ot = *objv_tracker; + } + + if (ot.write_version.tag.empty()) { + if (ot.read_version.tag.empty()) { + ot.generate_new_write_ver(svc.meta_be->ctx()); + } else { + ot.write_version = ot.read_version; + ot.write_version.ver++; + } + } + + for (auto iter = info.swift_keys.begin(); iter != info.swift_keys.end(); ++iter) { + if (old_info && old_info->swift_keys.count(iter->first) != 0) + continue; + auto& k = iter->second; + /* check if swift mapping exists */ + RGWUserInfo inf; + int r = svc.user->get_user_info_by_swift(ctx, k.id, &inf, nullptr, nullptr, y, dpp); + if (r >= 0 && inf.user_id != info.user_id && + (!old_info || inf.user_id != old_info->user_id)) { + ldpp_dout(dpp, 0) << "WARNING: can't store user info, swift id (" << k.id + << ") already mapped to another user (" << info.user_id << ")" << dendl; + return -EEXIST; + } + } + + /* check if access keys already exist */ + for (auto iter = info.access_keys.begin(); iter != info.access_keys.end(); ++iter) { + if (old_info && old_info->access_keys.count(iter->first) != 0) + continue; + auto& k = iter->second; + RGWUserInfo inf; + int r = svc.user->get_user_info_by_access_key(ctx, k.id, &inf, nullptr, nullptr, y, dpp); + if (r >= 0 && inf.user_id != info.user_id && + (!old_info || inf.user_id != old_info->user_id)) { + ldpp_dout(dpp, 0) << "WARNING: can't store user info, access key already mapped to another user" << dendl; + return -EEXIST; + } + } + + return 0; + } + + int put(const DoutPrefixProvider 
*dpp) { + bufferlist data_bl; + encode(ui, data_bl); + encode(info, data_bl); + + RGWSI_MBSObj_PutParams params(data_bl, pattrs, mtime, exclusive); + + int ret = svc.meta_be->put(ctx, RGWSI_User::get_meta_key(info.user_id), params, &ot, y, dpp); + if (ret < 0) + return ret; + + return 0; + } + + int complete(const DoutPrefixProvider *dpp) { + int ret; + + bufferlist link_bl; + encode(ui, link_bl); + + if (!info.user_email.empty()) { + if (!old_info || + old_info->user_email.compare(info.user_email) != 0) { /* only if new index changed */ + ret = rgw_put_system_obj(dpp, svc.sysobj, svc.zone->get_zone_params().user_email_pool, info.user_email, + link_bl, exclusive, NULL, real_time(), y); + if (ret < 0) + return ret; + } + } + + const bool renamed = old_info && old_info->user_id != info.user_id; + for (auto iter = info.access_keys.begin(); iter != info.access_keys.end(); ++iter) { + auto& k = iter->second; + if (old_info && old_info->access_keys.count(iter->first) != 0 && !renamed) + continue; + + ret = rgw_put_system_obj(dpp, svc.sysobj, svc.zone->get_zone_params().user_keys_pool, k.id, + link_bl, exclusive, NULL, real_time(), y); + if (ret < 0) + return ret; + } + + for (auto siter = info.swift_keys.begin(); siter != info.swift_keys.end(); ++siter) { + auto& k = siter->second; + if (old_info && old_info->swift_keys.count(siter->first) != 0 && !renamed) + continue; + + ret = rgw_put_system_obj(dpp, svc.sysobj, svc.zone->get_zone_params().user_swift_pool, k.id, + link_bl, exclusive, NULL, real_time(), y); + if (ret < 0) + return ret; + } + + if (old_info) { + ret = remove_old_indexes(*old_info, info, y, dpp); + if (ret < 0) { + return ret; + } + } + + return 0; + } + + int remove_old_indexes(const RGWUserInfo& old_info, const RGWUserInfo& new_info, optional_yield y, const DoutPrefixProvider *dpp) { + int ret; + + if (!old_info.user_id.empty() && + old_info.user_id != new_info.user_id) { + if (old_info.user_id.tenant != new_info.user_id.tenant) { + ldpp_dout(dpp, 0) << 
"ERROR: tenant mismatch: " << old_info.user_id.tenant << " != " << new_info.user_id.tenant << dendl; + return -EINVAL; + } + ret = svc.user->remove_uid_index(ctx, old_info, nullptr, y, dpp); + if (ret < 0 && ret != -ENOENT) { + set_err_msg("ERROR: could not remove index for uid " + old_info.user_id.to_str()); + return ret; + } + } + + if (!old_info.user_email.empty() && + old_info.user_email != new_info.user_email) { + ret = svc.user->remove_email_index(dpp, old_info.user_email, y); + if (ret < 0 && ret != -ENOENT) { + set_err_msg("ERROR: could not remove index for email " + old_info.user_email); + return ret; + } + } + + for ([[maybe_unused]] const auto& [name, access_key] : old_info.access_keys) { + if (!new_info.access_keys.count(access_key.id)) { + ret = svc.user->remove_key_index(dpp, access_key, y); + if (ret < 0 && ret != -ENOENT) { + set_err_msg("ERROR: could not remove index for key " + access_key.id); + return ret; + } + } + } + + for (auto old_iter = old_info.swift_keys.begin(); old_iter != old_info.swift_keys.end(); ++old_iter) { + const auto& swift_key = old_iter->second; + auto new_iter = new_info.swift_keys.find(swift_key.id); + if (new_iter == new_info.swift_keys.end()) { + ret = svc.user->remove_swift_name_index(dpp, swift_key.id, y); + if (ret < 0 && ret != -ENOENT) { + set_err_msg("ERROR: could not remove index for swift_name " + swift_key.id); + return ret; + } + } + } + + return 0; + } + + const string& get_err_msg() { + return err_msg; + } +}; + +int RGWSI_User_RADOS::store_user_info(RGWSI_MetaBackend::Context *ctx, + const RGWUserInfo& info, + RGWUserInfo *old_info, + RGWObjVersionTracker *objv_tracker, + const real_time& mtime, + bool exclusive, + map *attrs, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + PutOperation op(svc, ctx, + info, old_info, + objv_tracker, + mtime, exclusive, + attrs, + y); + + int r = op.prepare(dpp); + if (r < 0) { + return r; + } + + r = op.put(dpp); + if (r < 0) { + return r; + } + + r = 
op.complete(dpp); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_User_RADOS::remove_key_index(const DoutPrefixProvider *dpp, + const RGWAccessKey& access_key, + optional_yield y) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().user_keys_pool, access_key.id); + auto sysobj = svc.sysobj->get_obj(obj); + return sysobj.wop().remove(dpp, y); +} + +int RGWSI_User_RADOS::remove_email_index(const DoutPrefixProvider *dpp, + const string& email, + optional_yield y) +{ + if (email.empty()) { + return 0; + } + rgw_raw_obj obj(svc.zone->get_zone_params().user_email_pool, email); + auto sysobj = svc.sysobj->get_obj(obj); + return sysobj.wop().remove(dpp, y); +} + +int RGWSI_User_RADOS::remove_swift_name_index(const DoutPrefixProvider *dpp, + const string& swift_name, + optional_yield y) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().user_swift_pool, swift_name); + auto sysobj = svc.sysobj->get_obj(obj); + return sysobj.wop().remove(dpp, y); +} + +/** + * delete a user's presence from the RGW system. + * First remove their bucket ACLs, then delete them + * from the user and user email pools. This leaves the pools + * themselves alone, as well as any ACLs embedded in object xattrs. 
+ */ +int RGWSI_User_RADOS::remove_user_info(RGWSI_MetaBackend::Context *ctx, + const RGWUserInfo& info, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + int ret; + + auto kiter = info.access_keys.begin(); + for (; kiter != info.access_keys.end(); ++kiter) { + ldpp_dout(dpp, 10) << "removing key index: " << kiter->first << dendl; + ret = remove_key_index(dpp, kiter->second, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: could not remove " << kiter->first << " (access key object), should be fixed (err=" << ret << ")" << dendl; + return ret; + } + } + + auto siter = info.swift_keys.begin(); + for (; siter != info.swift_keys.end(); ++siter) { + auto& k = siter->second; + ldpp_dout(dpp, 10) << "removing swift subuser index: " << k.id << dendl; + /* check if swift mapping exists */ + ret = remove_swift_name_index(dpp, k.id, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: could not remove " << k.id << " (swift name object), should be fixed (err=" << ret << ")" << dendl; + return ret; + } + } + + ldpp_dout(dpp, 10) << "removing email index: " << info.user_email << dendl; + ret = remove_email_index(dpp, info.user_email, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: could not remove email index object for " + << info.user_email << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + rgw_raw_obj uid_bucks = get_buckets_obj(info.user_id); + ldpp_dout(dpp, 10) << "removing user buckets index" << dendl; + auto sysobj = svc.sysobj->get_obj(uid_bucks); + ret = sysobj.wop().remove(dpp, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + ret = remove_uid_index(ctx, info, objv_tracker, y, dpp); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + + return 0; +} + +int 
RGWSI_User_RADOS::remove_uid_index(RGWSI_MetaBackend::Context *ctx, const RGWUserInfo& user_info, RGWObjVersionTracker *objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 10) << "removing user index: " << user_info.user_id << dendl; + + RGWSI_MBSObj_RemoveParams params; + int ret = svc.meta_be->remove(ctx, get_meta_key(user_info.user_id), params, objv_tracker, y, dpp); + if (ret < 0 && ret != -ENOENT && ret != -ECANCELED) { + string key; + user_info.user_id.to_str(key); + rgw_raw_obj uid_obj(svc.zone->get_zone_params().user_uid_pool, key); + ldpp_dout(dpp, 0) << "ERROR: could not remove " << user_info.user_id << ":" << uid_obj << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + return 0; +} + +int RGWSI_User_RADOS::get_user_info_from_index(RGWSI_MetaBackend::Context* ctx, + const string& key, + const rgw_pool& pool, + RGWUserInfo *info, + RGWObjVersionTracker* objv_tracker, + real_time* pmtime, optional_yield y, + const DoutPrefixProvider* dpp) +{ + string cache_key = pool.to_str() + "/" + key; + + if (auto e = uinfo_cache->find(cache_key)) { + *info = e->info; + if (objv_tracker) + *objv_tracker = e->objv_tracker; + if (pmtime) + *pmtime = e->mtime; + return 0; + } + + user_info_cache_entry e; + bufferlist bl; + RGWUID uid; + + int ret = rgw_get_system_obj(svc.sysobj, pool, key, bl, nullptr, &e.mtime, y, dpp); + if (ret < 0) + return ret; + + rgw_cache_entry_info cache_info; + + auto iter = bl.cbegin(); + try { + decode(uid, iter); + + int ret = read_user_info(ctx, uid.user_id, + &e.info, &e.objv_tracker, nullptr, &cache_info, nullptr, + y, dpp); + if (ret < 0) { + return ret; + } + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; + return -EIO; + } + + uinfo_cache->put(dpp, svc.cache, cache_key, &e, { &cache_info }); + + *info = e.info; + if (objv_tracker) + *objv_tracker = e.objv_tracker; + if (pmtime) + *pmtime = e.mtime; + + return 
0; +} + +/** + * Given an email, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +int RGWSI_User_RADOS::get_user_info_by_email(RGWSI_MetaBackend::Context *ctx, + const string& email, RGWUserInfo *info, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, optional_yield y, + const DoutPrefixProvider *dpp) +{ + return get_user_info_from_index(ctx, email, svc.zone->get_zone_params().user_email_pool, + info, objv_tracker, pmtime, y, dpp); +} + +/** + * Given an swift username, finds the user_info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +int RGWSI_User_RADOS::get_user_info_by_swift(RGWSI_MetaBackend::Context *ctx, + const string& swift_name, + RGWUserInfo *info, /* out */ + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime, optional_yield y, + const DoutPrefixProvider *dpp) +{ + return get_user_info_from_index(ctx, + swift_name, + svc.zone->get_zone_params().user_swift_pool, + info, objv_tracker, pmtime, y, dpp); +} + +/** + * Given an access key, finds the user info associated with it. 
+ * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +int RGWSI_User_RADOS::get_user_info_by_access_key(RGWSI_MetaBackend::Context *ctx, + const std::string& access_key, + RGWUserInfo *info, + RGWObjVersionTracker* objv_tracker, + real_time *pmtime, optional_yield y, + const DoutPrefixProvider *dpp) +{ + return get_user_info_from_index(ctx, + access_key, + svc.zone->get_zone_params().user_keys_pool, + info, objv_tracker, pmtime, y, dpp); +} + +int RGWSI_User_RADOS::cls_user_update_buckets(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, list& entries, bool add, optional_yield y) +{ + auto rados_obj = svc.rados->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + cls_user_set_buckets(op, entries, add); + r = rados_obj.operate(dpp, &op, y); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_User_RADOS::cls_user_add_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket_entry& entry, optional_yield y) +{ + list l; + l.push_back(entry); + + return cls_user_update_buckets(dpp, obj, l, true, y); +} + +int RGWSI_User_RADOS::cls_user_remove_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket& bucket, optional_yield y) +{ + auto rados_obj = svc.rados->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + ::cls_user_remove_bucket(op, bucket); + r = rados_obj.operate(dpp, &op, y); + if (r < 0) + return r; + + return 0; +} + +int RGWSI_User_RADOS::add_bucket(const DoutPrefixProvider *dpp, + const rgw_user& user, + const rgw_bucket& bucket, + ceph::real_time creation_time, + optional_yield y) +{ + int ret; + + cls_user_bucket_entry new_bucket; + + bucket.convert(&new_bucket.bucket); + new_bucket.size = 0; + if (real_clock::is_zero(creation_time)) + new_bucket.creation_time = real_clock::now(); + else + new_bucket.creation_time = creation_time; + + rgw_raw_obj obj = 
get_buckets_obj(user); + ret = cls_user_add_bucket(dpp, obj, new_bucket, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user: ret=" << ret << dendl; + return ret; + } + + return 0; +} + + +int RGWSI_User_RADOS::remove_bucket(const DoutPrefixProvider *dpp, + const rgw_user& user, + const rgw_bucket& _bucket, + optional_yield y) +{ + cls_user_bucket bucket; + bucket.name = _bucket.name; + rgw_raw_obj obj = get_buckets_obj(user); + int ret = cls_user_remove_bucket(dpp, obj, bucket, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: error removing bucket from user: ret=" << ret << dendl; + } + + return 0; +} + +int RGWSI_User_RADOS::cls_user_flush_bucket_stats(const DoutPrefixProvider *dpp, + rgw_raw_obj& user_obj, + const RGWBucketEnt& ent, optional_yield y) +{ + cls_user_bucket_entry entry; + ent.convert(&entry); + + list entries; + entries.push_back(entry); + + int r = cls_user_update_buckets(dpp, user_obj, entries, false, y); + if (r < 0) { + ldpp_dout(dpp, 20) << "cls_user_update_buckets() returned " << r << dendl; + return r; + } + + return 0; +} + +int RGWSI_User_RADOS::cls_user_list_buckets(const DoutPrefixProvider *dpp, + rgw_raw_obj& obj, + const string& in_marker, + const string& end_marker, + const int max_entries, + list& entries, + string * const out_marker, + bool * const truncated, + optional_yield y) +{ + auto rados_obj = svc.rados->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + int rc; + + cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc); + bufferlist ibl; + r = rados_obj.operate(dpp, &op, &ibl, y); + if (r < 0) + return r; + if (rc < 0) + return rc; + + return 0; +} + +int RGWSI_User_RADOS::list_buckets(const DoutPrefixProvider *dpp, + const rgw_user& user, + const string& marker, + const string& end_marker, + uint64_t max, + RGWUserBuckets *buckets, + bool *is_truncated, optional_yield y) +{ + int ret; + + 
buckets->clear(); + if (user.id == RGW_USER_ANON_ID) { + ldpp_dout(dpp, 20) << "RGWSI_User_RADOS::list_buckets(): anonymous user" << dendl; + *is_truncated = false; + return 0; + } + rgw_raw_obj obj = get_buckets_obj(user); + + bool truncated = false; + string m = marker; + + uint64_t total = 0; + + do { + std::list entries; + ret = cls_user_list_buckets(dpp, obj, m, end_marker, max - total, entries, &m, &truncated, y); + if (ret == -ENOENT) { + ret = 0; + } + + if (ret < 0) { + return ret; + } + + for (auto& entry : entries) { + buckets->add(RGWBucketEnt(user, std::move(entry))); + total++; + } + + } while (truncated && total < max); + + if (is_truncated) { + *is_truncated = truncated; + } + + return 0; +} + +int RGWSI_User_RADOS::flush_bucket_stats(const DoutPrefixProvider *dpp, + const rgw_user& user, + const RGWBucketEnt& ent, + optional_yield y) +{ + rgw_raw_obj obj = get_buckets_obj(user); + + return cls_user_flush_bucket_stats(dpp, obj, ent, y); +} + +int RGWSI_User_RADOS::reset_bucket_stats(const DoutPrefixProvider *dpp, + const rgw_user& user, + optional_yield y) +{ + return cls_user_reset_stats(dpp, user, y); +} + +int RGWSI_User_RADOS::cls_user_reset_stats(const DoutPrefixProvider *dpp, const rgw_user& user, optional_yield y) +{ + rgw_raw_obj obj = get_buckets_obj(user); + auto rados_obj = svc.rados->obj(obj); + int rval, r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + + cls_user_reset_stats2_op call; + cls_user_reset_stats2_ret ret; + + do { + buffer::list in, out; + librados::ObjectWriteOperation op; + + call.time = real_clock::now(); + ret.update_call(call); + + encode(call, in); + op.exec("user", "reset_user_stats2", in, &out, &rval); + r = rados_obj.operate(dpp, &op, y, librados::OPERATION_RETURNVEC); + if (r < 0) { + return r; + } + try { + auto bliter = out.cbegin(); + decode(ret, bliter); + } catch (ceph::buffer::error& err) { + return -EINVAL; + } + } while (ret.truncated); + + return rval; +} + +int 
RGWSI_User_RADOS::complete_flush_stats(const DoutPrefixProvider *dpp, + const rgw_user& user, optional_yield y) +{ + rgw_raw_obj obj = get_buckets_obj(user); + auto rados_obj = svc.rados->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + librados::ObjectWriteOperation op; + ::cls_user_complete_stats_sync(op); + return rados_obj.operate(dpp, &op, y); +} + +int RGWSI_User_RADOS::cls_user_get_header(const DoutPrefixProvider *dpp, + const rgw_user& user, cls_user_header *header, + optional_yield y) +{ + rgw_raw_obj obj = get_buckets_obj(user); + auto rados_obj = svc.rados->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + int rc; + bufferlist ibl; + librados::ObjectReadOperation op; + ::cls_user_get_header(op, header, &rc); + return rados_obj.operate(dpp, &op, &ibl, y); +} + +int RGWSI_User_RADOS::cls_user_get_header_async(const DoutPrefixProvider *dpp, const string& user_str, RGWGetUserHeader_CB *cb) +{ + rgw_raw_obj obj = get_buckets_obj(rgw_user(user_str)); + auto rados_obj = svc.rados->obj(obj); + int r = rados_obj.open(dpp); + if (r < 0) { + return r; + } + + auto& ref = rados_obj.get_ref(); + + r = ::cls_user_get_header_async(ref.pool.ioctx(), ref.obj.oid, cb); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_User_RADOS::read_stats(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const rgw_user& user, RGWStorageStats *stats, + ceph::real_time *last_stats_sync, + ceph::real_time *last_stats_update, + optional_yield y) +{ + string user_str = user.to_str(); + + RGWUserInfo info; + real_time mtime; + int ret = read_user_info(ctx, user, &info, nullptr, &mtime, nullptr, nullptr, y, dpp); + if (ret < 0) + { + return ret; + } + + cls_user_header header; + int r = cls_user_get_header(dpp, rgw_user(user_str), &header, y); + if (r < 0 && r != -ENOENT) + return r; + + const cls_user_stats& hs = header.stats; + + stats->size = hs.total_bytes; + stats->size_rounded = hs.total_bytes_rounded; + 
stats->num_objects = hs.total_entries; + + if (last_stats_sync) { + *last_stats_sync = header.last_stats_sync; + } + + if (last_stats_update) { + *last_stats_update = header.last_stats_update; + } + + return 0; +} + +class RGWGetUserStatsContext : public RGWGetUserHeader_CB { + RGWGetUserStats_CB *cb; + +public: + explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb) + : cb(cb) {} + + void handle_response(int r, cls_user_header& header) override { + const cls_user_stats& hs = header.stats; + if (r >= 0) { + RGWStorageStats stats; + + stats.size = hs.total_bytes; + stats.size_rounded = hs.total_bytes_rounded; + stats.num_objects = hs.total_entries; + + cb->set_response(stats); + } + + cb->handle_response(r); + + cb->put(); + } +}; + +int RGWSI_User_RADOS::read_stats_async(const DoutPrefixProvider *dpp, + const rgw_user& user, RGWGetUserStats_CB *_cb) +{ + string user_str = user.to_str(); + + RGWGetUserStatsContext *cb = new RGWGetUserStatsContext(_cb); + int r = cls_user_get_header_async(dpp, user_str, cb); + if (r < 0) { + delete cb; + return r; + } + + return 0; +} + diff --git a/src/rgw/services/svc_user_rados.h b/src/rgw/services/svc_user_rados.h new file mode 100644 index 000000000..177f720d6 --- /dev/null +++ b/src/rgw/services/svc_user_rados.h @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#pragma once + +#include "rgw_service.h" + +#include "svc_meta_be.h" +#include "svc_user.h" +#include "rgw_bucket.h" + +class RGWSI_RADOS; +class RGWSI_Zone; +class RGWSI_SysObj; +class RGWSI_SysObj_Cache; +class RGWSI_Meta; +class RGWSI_SyncModules; +class RGWSI_MetaBackend_Handler; + +struct rgw_cache_entry_info; + +class RGWGetUserHeader_CB; +class RGWGetUserStats_CB; + +template +class RGWChainedCacheImpl; + +class RGWSI_User_RADOS : public RGWSI_User +{ + friend class PutOperation; + + std::unique_ptr be_module; + RGWSI_MetaBackend_Handler *be_handler; + + struct user_info_cache_entry { + RGWUserInfo info; + RGWObjVersionTracker objv_tracker; + real_time mtime; + }; + + using RGWChainedCacheImpl_user_info_cache_entry = RGWChainedCacheImpl; + std::unique_ptr uinfo_cache; + + rgw_raw_obj get_buckets_obj(const rgw_user& user_id) const; + + int get_user_info_from_index(RGWSI_MetaBackend::Context *ctx, + const std::string& key, + const rgw_pool& pool, + RGWUserInfo *info, + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime, + optional_yield y, + const DoutPrefixProvider *dpp); + + int remove_uid_index(RGWSI_MetaBackend::Context *ctx, const RGWUserInfo& user_info, RGWObjVersionTracker *objv_tracker, + optional_yield y, const DoutPrefixProvider *dpp); + + int remove_key_index(const DoutPrefixProvider *dpp, const RGWAccessKey& access_key, optional_yield y); + int remove_email_index(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y); + int remove_swift_name_index(const DoutPrefixProvider *dpp, const std::string& swift_name, optional_yield y); + + /* admin management */ + int cls_user_update_buckets(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, std::list& entries, bool add, optional_yield y); + int cls_user_add_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket_entry& entry, optional_yield y); + int cls_user_remove_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const 
cls_user_bucket& bucket, optional_yield y); + + /* quota stats */ + int cls_user_flush_bucket_stats(const DoutPrefixProvider *dpp, rgw_raw_obj& user_obj, + const RGWBucketEnt& ent, optional_yield y); + int cls_user_list_buckets(const DoutPrefixProvider *dpp, + rgw_raw_obj& obj, + const std::string& in_marker, + const std::string& end_marker, + const int max_entries, + std::list& entries, + std::string * const out_marker, + bool * const truncated, + optional_yield y); + + int cls_user_reset_stats(const DoutPrefixProvider *dpp, const rgw_user& user, optional_yield y); + int cls_user_get_header(const DoutPrefixProvider *dpp, const rgw_user& user, cls_user_header *header, optional_yield y); + int cls_user_get_header_async(const DoutPrefixProvider *dpp, const std::string& user, RGWGetUserHeader_CB *cb); + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; +public: + struct Svc { + RGWSI_User_RADOS *user{nullptr}; + RGWSI_RADOS *rados{nullptr}; + RGWSI_Zone *zone{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + RGWSI_SysObj_Cache *cache{nullptr}; + RGWSI_Meta *meta{nullptr}; + RGWSI_MetaBackend *meta_be{nullptr}; + RGWSI_SyncModules *sync_modules{nullptr}; + } svc; + + RGWSI_User_RADOS(CephContext *cct); + ~RGWSI_User_RADOS(); + + void init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc, + RGWSI_SysObj_Cache *_cache_svc, RGWSI_Meta *_meta_svc, + RGWSI_MetaBackend *_meta_be_svc, + RGWSI_SyncModules *_sync_modules); + + RGWSI_MetaBackend_Handler *get_be_handler() override { + return be_handler; + } + + int read_user_info(RGWSI_MetaBackend::Context *ctx, + const rgw_user& user, + RGWUserInfo *info, + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime, + rgw_cache_entry_info * const cache_info, + std::map * const pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int store_user_info(RGWSI_MetaBackend::Context *ctx, + const RGWUserInfo& info, + RGWUserInfo *old_info, + 
RGWObjVersionTracker *objv_tracker, + const real_time& mtime, + bool exclusive, + std::map *attrs, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int remove_user_info(RGWSI_MetaBackend::Context *ctx, + const RGWUserInfo& info, + RGWObjVersionTracker *objv_tracker, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + int get_user_info_by_email(RGWSI_MetaBackend::Context *ctx, + const std::string& email, RGWUserInfo *info, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, + optional_yield y, + const DoutPrefixProvider *dpp) override; + int get_user_info_by_swift(RGWSI_MetaBackend::Context *ctx, + const std::string& swift_name, + RGWUserInfo *info, /* out */ + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime, + optional_yield y, + const DoutPrefixProvider *dpp) override; + int get_user_info_by_access_key(RGWSI_MetaBackend::Context *ctx, + const std::string& access_key, + RGWUserInfo *info, + RGWObjVersionTracker* objv_tracker, + real_time *pmtime, + optional_yield y, + const DoutPrefixProvider *dpp) override; + + /* user buckets directory */ + + int add_bucket(const DoutPrefixProvider *dpp, + const rgw_user& user, + const rgw_bucket& bucket, + ceph::real_time creation_time, + optional_yield y) override; + int remove_bucket(const DoutPrefixProvider *dpp, + const rgw_user& user, + const rgw_bucket& _bucket, + optional_yield y) override; + int list_buckets(const DoutPrefixProvider *dpp, + const rgw_user& user, + const std::string& marker, + const std::string& end_marker, + uint64_t max, + RGWUserBuckets *buckets, + bool *is_truncated, + optional_yield y) override; + + /* quota related */ + int flush_bucket_stats(const DoutPrefixProvider *dpp, + const rgw_user& user, + const RGWBucketEnt& ent, optional_yield y) override; + + int complete_flush_stats(const DoutPrefixProvider *dpp, + const rgw_user& user, optional_yield y) override; + + int reset_bucket_stats(const DoutPrefixProvider *dpp, + const rgw_user& 
user, + optional_yield y) override; + int read_stats(const DoutPrefixProvider *dpp, + RGWSI_MetaBackend::Context *ctx, + const rgw_user& user, RGWStorageStats *stats, + ceph::real_time *last_stats_sync, /* last time a full stats sync completed */ + ceph::real_time *last_stats_update, + optional_yield y) override; /* last time a stats update was done */ + + int read_stats_async(const DoutPrefixProvider *dpp, const rgw_user& user, + RGWGetUserStats_CB *cb) override; +}; + diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc new file mode 100644 index 000000000..180d93712 --- /dev/null +++ b/src/rgw/services/svc_zone.cc @@ -0,0 +1,1100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_zone.h" +#include "svc_rados.h" +#include "svc_sys_obj.h" +#include "svc_sync_modules.h" + +#include "rgw_zone.h" +#include "rgw_rest_conn.h" +#include "rgw_bucket_sync.h" + +#include "common/errno.h" +#include "include/random.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace rgw_zone_defaults; + +RGWSI_Zone::RGWSI_Zone(CephContext *cct) : RGWServiceInstance(cct) +{ +} + +void RGWSI_Zone::init(RGWSI_SysObj *_sysobj_svc, + RGWSI_RADOS * _rados_svc, + RGWSI_SyncModules * _sync_modules_svc, + RGWSI_Bucket_Sync *_bucket_sync_svc) +{ + sysobj_svc = _sysobj_svc; + rados_svc = _rados_svc; + sync_modules_svc = _sync_modules_svc; + bucket_sync_svc = _bucket_sync_svc; + + realm = new RGWRealm(); + zonegroup = new RGWZoneGroup(); + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); +} + +RGWSI_Zone::~RGWSI_Zone() +{ + delete realm; + delete zonegroup; + delete zone_public_config; + delete zone_params; + delete current_period; +} + +std::shared_ptr RGWSI_Zone::get_sync_policy_handler(std::optional zone) const { + if (!zone || *zone == zone_id()) { + return sync_policy_handler; + } + auto iter = 
sync_policy_handlers.find(*zone); + if (iter == sync_policy_handlers.end()) { + return std::shared_ptr(); + } + return iter->second; +} + +bool RGWSI_Zone::zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const +{ + return target_zone.syncs_from(source_zone.name) && + sync_modules_svc->get_manager()->supports_data_export(source_zone.tier_type); +} + +bool RGWSI_Zone::zone_syncs_from(const RGWZone& source_zone) const +{ + auto target_zone = get_zone(); + bool found = false; + + for (auto s : data_sync_source_zones) { + if (s->id == source_zone.id) { + found = true; + break; + } + } + return found && target_zone.syncs_from(source_zone.name) && + sync_modules_svc->get_manager()->supports_data_export(source_zone.tier_type); +} + +int RGWSI_Zone::search_realm_with_zone(const DoutPrefixProvider *dpp, + const rgw_zone_id& zid, + RGWRealm *prealm, + RGWPeriod *pperiod, + RGWZoneGroup *pzonegroup, + bool *pfound, + optional_yield y) +{ + auto& found = *pfound; + + found = false; + + list realms; + int r = list_realms(dpp, realms); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to list realms: r=" << r << dendl; + return r; + } + + for (auto& realm_name : realms) { + string realm_id; + RGWRealm realm(realm_id, realm_name); + r = realm.init(dpp, cct, sysobj_svc, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: can't open realm " << realm_name << ": " << cpp_strerror(-r) << " ... 
skipping" << dendl; + continue; + } + + r = realm.find_zone(dpp, zid, pperiod, + pzonegroup, &found, y); + if (r < 0) { + ldpp_dout(dpp, 20) << __func__ << "(): ERROR: realm.find_zone() returned r=" << r<< dendl; + return r; + } + + if (found) { + *prealm = realm; + ldpp_dout(dpp, 20) << __func__ << "(): found realm_id=" << realm_id << " realm_name=" << realm_name << dendl; + return 0; + } + } + + return 0; +} + +int RGWSI_Zone::do_start(optional_yield y, const DoutPrefixProvider *dpp) +{ + int ret = sysobj_svc->start(y, dpp); + if (ret < 0) { + return ret; + } + + assert(sysobj_svc->is_started()); /* if not then there's ordering issue */ + + ret = rados_svc->start(y, dpp); + if (ret < 0) { + return ret; + } + + ret = realm->init(dpp, cct, sysobj_svc, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << "realm " << realm->get_name() << " " << realm->get_id() << dendl; + ret = current_period->init(dpp, cct, sysobj_svc, realm->get_id(), y, + realm->get_name()); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = zone_params->init(dpp, cct, sysobj_svc, y); + bool found_zone = (ret == 0); + if (ret < 0 && ret != -ENOENT) { + lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + + cur_zone_id = rgw_zone_id(zone_params->get_id()); + + bool found_period_conf = false; + + /* try to find zone in period config (if we have one) */ + if (found_zone && + !current_period->get_id().empty()) { + found_period_conf = current_period->find_zone(dpp, + cur_zone_id, + zonegroup, + y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: current_period->find_zone() returned ret=" << ret << dendl; + return ret; + } + if (!found_period_conf) { + ldpp_dout(dpp, 0) << "period (" << 
current_period->get_id() << " does not have zone " << cur_zone_id << " configured" << dendl; + } + } + + RGWRealm search_realm; + + if (found_zone && + !found_period_conf) { + ldpp_dout(dpp, 20) << "searching for the correct realm" << dendl; + ret = search_realm_with_zone(dpp, + cur_zone_id, + realm, + current_period, + zonegroup, + &found_period_conf, + y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: search_realm_conf() failed: ret="<< ret << dendl; + return ret; + } + } + bool zg_initialized = found_period_conf; + + if (!zg_initialized) { + /* couldn't find a proper period config, use local zonegroup */ + ret = zonegroup->init(dpp, cct, sysobj_svc, y); + zg_initialized = (ret == 0); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + auto& zonegroup_param = cct->_conf->rgw_zonegroup; + bool init_from_period = found_period_conf; + bool explicit_zg = !zonegroup_param.empty(); + + if (!zg_initialized && + (!explicit_zg || zonegroup_param == default_zonegroup_name)) { + /* we couldn't initialize any zonegroup, + falling back to a non-multisite config with default zonegroup */ + ret = create_default_zg(dpp, y); + if (ret < 0) { + return ret; + } + zg_initialized = true; + } + + if (!zg_initialized) { + ldpp_dout(dpp, 0) << "ERROR: could not find zonegroup (" << zonegroup_param << ")" << dendl; + return -ENOENT; + } + + /* we have zonegroup now */ + + if (explicit_zg && + zonegroup->get_name() != zonegroup_param) { + ldpp_dout(dpp, 0) << "ERROR: incorrect zonegroup: " << zonegroup_param << " (got: " << zonegroup_param << ", expected: " << zonegroup->get_name() << ")" << dendl; + return -EINVAL; + } + + auto& zone_param = cct->_conf->rgw_zone; + bool explicit_zone = !zone_param.empty(); + + if (!found_zone) { + if ((!explicit_zone || zone_param == default_zone_name) && + zonegroup->get_name() == default_zonegroup_name) { + ret = init_default_zone(dpp, y); + if (ret < 0 
&& ret != -ENOENT) { + return ret; + } + cur_zone_id = zone_params->get_id(); + } else { + ldpp_dout(dpp, 0) << "ERROR: could not find zone (" << zone_param << ")" << dendl; + return -ENOENT; + } + } + + /* we have zone now */ + + auto zone_iter = zonegroup->zones.find(zone_params->get_id()); + if (zone_iter == zonegroup->zones.end()) { + /* shouldn't happen if relying on period config */ + if (!init_from_period) { + ldpp_dout(dpp, -1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ")" << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << "), switching to local zonegroup configuration" << dendl; + init_from_period = false; + zone_iter = zonegroup->zones.find(zone_params->get_id()); + } + if (zone_iter == zonegroup->zones.end()) { + ldpp_dout(dpp, -1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ")" << dendl; + return -EINVAL; + } + *zone_public_config = zone_iter->second; + ldout(cct, 20) << "zone " << zone_params->get_name() << " found" << dendl; + + ldpp_dout(dpp, 4) << "Realm: " << std::left << setw(20) << realm->get_name() << " (" << realm->get_id() << ")" << dendl; + ldpp_dout(dpp, 4) << "ZoneGroup: " << std::left << setw(20) << zonegroup->get_name() << " (" << zonegroup->get_id() << ")" << dendl; + ldpp_dout(dpp, 4) << "Zone: " << std::left << setw(20) << zone_params->get_name() << " (" << zone_params->get_id() << ")" << dendl; + + if (init_from_period) { + ldpp_dout(dpp, 4) << "using period configuration: " << current_period->get_id() << ":" << current_period->get_epoch() << dendl; + ret = init_zg_from_period(dpp, y); + if (ret < 0) { + return ret; + } + } else { + ldout(cct, 10) << "cannot find current period zonegroup using local zonegroup configuration" << dendl; + ret = init_zg_from_local(dpp, y); + if (ret < 0) { + return ret; + } + // read period_config into 
current_period + auto& period_config = current_period->get_config(); + ret = period_config.read(dpp, sysobj_svc, zonegroup->realm_id, y); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "ERROR: failed to read period config: " + << cpp_strerror(ret) << dendl; + return ret; + } + } + + zone_short_id = current_period->get_map().get_zone_short_id(zone_params->get_id()); + + for (auto ziter : zonegroup->zones) { + auto zone_handler = std::make_shared(this, sync_modules_svc, bucket_sync_svc, ziter.second.id); + ret = zone_handler->init(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: could not initialize zone policy handler for zone=" << ziter.second.name << dendl; + return ret; + } + sync_policy_handlers[ziter.second.id] = zone_handler; + } + + sync_policy_handler = sync_policy_handlers[zone_id()]; /* we made sure earlier that zonegroup->zones has our zone */ + + set source_zones; + set target_zones; + + sync_policy_handler->reflect(dpp, nullptr, nullptr, + nullptr, nullptr, + &source_zones, + &target_zones, + false); /* relaxed: also get all zones that we allow to sync to/from */ + + ret = sync_modules_svc->start(y, dpp); + if (ret < 0) { + return ret; + } + + auto sync_modules = sync_modules_svc->get_manager(); + RGWSyncModuleRef sm; + if (!sync_modules->get_module(zone_public_config->tier_type, &sm)) { + ldpp_dout(dpp, -1) << "ERROR: tier type not found: " << zone_public_config->tier_type << dendl; + return -EINVAL; + } + + writeable_zone = sm->supports_writes(); + exports_data = sm->supports_data_export(); + + /* first build all zones index */ + for (auto ziter : zonegroup->zones) { + const rgw_zone_id& id = ziter.first; + RGWZone& z = ziter.second; + zone_id_by_name[z.name] = id; + zone_by_id[id] = z; + } + + if (zone_by_id.find(zone_id()) == zone_by_id.end()) { + ldpp_dout(dpp, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl; + } + + for (const auto& ziter : zonegroup->zones) 
{ + const rgw_zone_id& id = ziter.first; + const RGWZone& z = ziter.second; + if (id == zone_id()) { + continue; + } + if (z.endpoints.empty()) { + ldpp_dout(dpp, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl; + continue; + } + ldpp_dout(dpp, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl; + RGWRESTConn *conn = new RGWRESTConn(cct, z.id, z.endpoints, zone_params->system_key, zonegroup->get_id(), zonegroup->api_name); + zone_conn_map[id] = conn; + + bool zone_is_source = source_zones.find(z.id) != source_zones.end(); + bool zone_is_target = target_zones.find(z.id) != target_zones.end(); + + if (zone_is_source || zone_is_target) { + if (zone_is_source && sync_modules->supports_data_export(z.tier_type)) { + data_sync_source_zones.push_back(&z); + } + if (zone_is_target) { + zone_data_notify_to_map[id] = conn; + } + } else { + ldpp_dout(dpp, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl; + } + } + + ldpp_dout(dpp, 20) << "started zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << + ") with tier type = " << zone_public_config->tier_type << dendl; + + return 0; +} + +void RGWSI_Zone::shutdown() +{ + delete rest_master_conn; + + for (auto& item : zone_conn_map) { + auto conn = item.second; + delete conn; + } + + for (auto& item : zonegroup_conn_map) { + auto conn = item.second; + delete conn; + } +} + +int RGWSI_Zone::list_regions(const DoutPrefixProvider *dpp, list& regions) +{ + RGWZoneGroup zonegroup; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zonegroup.get_pool(cct)); + + return syspool.list_prefixed_objs(dpp, region_info_oid_prefix, ®ions); +} + +int RGWSI_Zone::list_zonegroups(const DoutPrefixProvider *dpp, list& zonegroups) +{ + RGWZoneGroup zonegroup; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zonegroup.get_pool(cct)); + + return syspool.list_prefixed_objs(dpp, 
zonegroup_names_oid_prefix, &zonegroups); +} + +int RGWSI_Zone::list_zones(const DoutPrefixProvider *dpp, list& zones) +{ + RGWZoneParams zoneparams; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zoneparams.get_pool(cct)); + + return syspool.list_prefixed_objs(dpp, zone_names_oid_prefix, &zones); +} + +int RGWSI_Zone::list_realms(const DoutPrefixProvider *dpp, list& realms) +{ + RGWRealm realm(cct, sysobj_svc); + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(realm.get_pool(cct)); + + return syspool.list_prefixed_objs(dpp, realm_names_oid_prefix, &realms); +} + +int RGWSI_Zone::list_periods(const DoutPrefixProvider *dpp, list& periods) +{ + RGWPeriod period; + list raw_periods; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(period.get_pool(cct)); + int ret = syspool.list_prefixed_objs(dpp, period.get_info_oid_prefix(), &raw_periods); + if (ret < 0) { + return ret; + } + for (const auto& oid : raw_periods) { + size_t pos = oid.find("."); + if (pos != std::string::npos) { + periods.push_back(oid.substr(0, pos)); + } else { + periods.push_back(oid); + } + } + periods.sort(); // unique() only detects duplicates if they're adjacent + periods.unique(); + return 0; +} + + +int RGWSI_Zone::list_periods(const DoutPrefixProvider *dpp, const string& current_period, list& periods, optional_yield y) +{ + int ret = 0; + string period_id = current_period; + while(!period_id.empty()) { + RGWPeriod period(period_id); + ret = period.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + return ret; + } + periods.push_back(period.get_id()); + period_id = period.get_predecessor(); + } + + return ret; +} + +/** + * Add new connection to connections map + * @param zonegroup_conn_map map which new connection will be added to + * @param zonegroup zonegroup which new connection will connect to + * @param new_connection pointer to new connection instance + */ +static void add_new_connection_to_map(map &zonegroup_conn_map, + const RGWZoneGroup &zonegroup, RGWRESTConn 
*new_connection) +{ + // Delete if connection is already exists + map::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id()); + if (iterZoneGroup != zonegroup_conn_map.end()) { + delete iterZoneGroup->second; + } + + // Add new connection to connections map + zonegroup_conn_map[zonegroup.get_id()] = new_connection; +} + +int RGWSI_Zone::init_zg_from_period(const DoutPrefixProvider *dpp, optional_yield y) +{ + ldout(cct, 20) << "period zonegroup name " << zonegroup->get_name() << dendl; + + map::const_iterator iter = + current_period->get_map().zonegroups.find(zonegroup->get_id()); + + if (iter != current_period->get_map().zonegroups.end()) { + ldpp_dout(dpp, 20) << "using current period zonegroup " << zonegroup->get_name() << dendl; + *zonegroup = iter->second; + int ret = zonegroup->init(dpp, cct, sysobj_svc, y, false); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl; + return ret; + } + } + for (iter = current_period->get_map().zonegroups.begin(); + iter != current_period->get_map().zonegroups.end(); ++iter){ + const RGWZoneGroup& zg = iter->second; + // use endpoints from the zonegroup's master zone + auto master = zg.zones.find(zg.master_zone); + if (master == zg.zones.end()) { + // Check for empty zonegroup which can happen if zone was deleted before removal + if (zg.zones.size() == 0) + continue; + // fix missing master zone for a single zone zonegroup + if (zg.master_zone.empty() && zg.zones.size() == 1) { + master = zg.zones.begin(); + ldpp_dout(dpp, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " << + master->second.name << " id:" << master->second.id << " as master" << dendl; + if (zonegroup->get_id() == zg.get_id()) { + zonegroup->master_zone = master->second.id; + int ret = zonegroup->update(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + } else { + RGWZoneGroup 
fixed_zg(zg.get_id(),zg.get_name()); + int ret = fixed_zg.init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + fixed_zg.master_zone = master->second.id; + ret = fixed_zg.update(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + } + } else { + ldpp_dout(dpp, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" << + zg.master_zone << dendl; + return -EINVAL; + } + } + const auto& endpoints = master->second.endpoints; + add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, zg.get_id(), endpoints, zone_params->system_key, zonegroup->get_id(), zg.api_name)); + if (!current_period->get_master_zonegroup().empty() && + zg.get_id() == current_period->get_master_zonegroup()) { + rest_master_conn = new RGWRESTConn(cct, zg.get_id(), endpoints, zone_params->system_key, zonegroup->get_id(), zg.api_name); + } + } + + return 0; +} + +int RGWSI_Zone::create_default_zg(const DoutPrefixProvider *dpp, optional_yield y) +{ + ldout(cct, 10) << "Creating default zonegroup " << dendl; + int ret = zonegroup->create_default(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + ret = zonegroup->init(dpp, cct, sysobj_svc, y); + if (ret < 0) { + ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + + return 0; +} + +int RGWSI_Zone::init_default_zone(const DoutPrefixProvider *dpp, optional_yield y) +{ + ldpp_dout(dpp, 10) << " Using default name "<< default_zone_name << dendl; + zone_params->set_name(default_zone_name); + int ret = zone_params->init(dpp, cct, sysobj_svc, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << 
dendl; + return ret; + } + + return 0; +} + +int RGWSI_Zone::init_zg_from_local(const DoutPrefixProvider *dpp, optional_yield y) +{ + ldpp_dout(dpp, 20) << "zonegroup " << zonegroup->get_name() << dendl; + if (zonegroup->is_master_zonegroup()) { + // use endpoints from the zonegroup's master zone + auto master = zonegroup->zones.find(zonegroup->master_zone); + if (master == zonegroup->zones.end()) { + // fix missing master zone for a single zone zonegroup + if (zonegroup->master_zone.empty() && zonegroup->zones.size() == 1) { + master = zonegroup->zones.begin(); + ldpp_dout(dpp, 0) << "zonegroup " << zonegroup->get_name() << " missing master_zone, setting zone " << + master->second.name << " id:" << master->second.id << " as master" << dendl; + zonegroup->master_zone = master->second.id; + int ret = zonegroup->update(dpp, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + } else { + ldpp_dout(dpp, 0) << "zonegroup " << zonegroup->get_name() << " missing zone for " + "master_zone=" << zonegroup->master_zone << dendl; + return -EINVAL; + } + } + const auto& endpoints = master->second.endpoints; + rest_master_conn = new RGWRESTConn(cct, zonegroup->get_id(), endpoints, zone_params->system_key, zonegroup->get_id(), zonegroup->api_name); + } + + return 0; +} + +const RGWZoneParams& RGWSI_Zone::get_zone_params() const +{ + return *zone_params; +} + +const RGWZone& RGWSI_Zone::get_zone() const +{ + return *zone_public_config; +} + +const RGWZoneGroup& RGWSI_Zone::get_zonegroup() const +{ + return *zonegroup; +} + +int RGWSI_Zone::get_zonegroup(const string& id, RGWZoneGroup& zg) const +{ + int ret = 0; + if (id == zonegroup->get_id()) { + zg = *zonegroup; + } else if (!current_period->get_id().empty()) { + ret = current_period->get_zonegroup(zg, id); + } + return ret; +} + +const RGWRealm& RGWSI_Zone::get_realm() const +{ + return *realm; +} + +const RGWPeriod& RGWSI_Zone::get_current_period() 
const +{ + return *current_period; +} + +const string& RGWSI_Zone::get_current_period_id() const +{ + return current_period->get_id(); +} + +bool RGWSI_Zone::has_zonegroup_api(const std::string& api) const +{ + if (!current_period->get_id().empty()) { + const auto& zonegroups_by_api = current_period->get_map().zonegroups_by_api; + if (zonegroups_by_api.find(api) != zonegroups_by_api.end()) + return true; + } else if (zonegroup->api_name == api) { + return true; + } + return false; +} + +bool RGWSI_Zone::zone_is_writeable() +{ + return writeable_zone && !get_zone().is_read_only(); +} + +uint32_t RGWSI_Zone::get_zone_short_id() const +{ + return zone_short_id; +} + +const string& RGWSI_Zone::zone_name() const +{ + return get_zone_params().get_name(); +} + +RGWZone* RGWSI_Zone::find_zone(const rgw_zone_id& id) +{ + auto iter = zone_by_id.find(id); + if (iter == zone_by_id.end()) { + return nullptr; + } + return &(iter->second); +} + +RGWRESTConn *RGWSI_Zone::get_zone_conn(const rgw_zone_id& zone_id) { + auto citer = zone_conn_map.find(zone_id.id); + if (citer == zone_conn_map.end()) { + return NULL; + } + + return citer->second; +} + +RGWRESTConn *RGWSI_Zone::get_zone_conn_by_name(const string& name) { + auto i = zone_id_by_name.find(name); + if (i == zone_id_by_name.end()) { + return NULL; + } + + return get_zone_conn(i->second); +} + +bool RGWSI_Zone::find_zone_id_by_name(const string& name, rgw_zone_id *id) { + auto i = zone_id_by_name.find(name); + if (i == zone_id_by_name.end()) { + return false; + } + *id = i->second; + return true; +} + +bool RGWSI_Zone::need_to_sync() const +{ + return !(zonegroup->master_zone.empty() || + !rest_master_conn || + current_period->get_id().empty()); +} + +bool RGWSI_Zone::need_to_log_data() const +{ + return (zone_public_config->log_data && sync_module_exports_data()); +} + +bool RGWSI_Zone::is_meta_master() const +{ + if (!zonegroup->is_master_zonegroup()) { + return false; + } + + return (zonegroup->master_zone == 
zone_public_config->id); +} + +bool RGWSI_Zone::need_to_log_metadata() const +{ + return is_meta_master() && + (zonegroup->zones.size() > 1 || current_period->is_multi_zonegroups_with_zones()); +} + +bool RGWSI_Zone::can_reshard() const +{ + if (current_period->get_id().empty()) { + return true; // no realm + } + if (zonegroup->zones.size() == 1 && current_period->is_single_zonegroup()) { + return true; // single zone/zonegroup + } + // 'resharding' feature enabled in zonegroup + return zonegroup->supports(rgw::zone_features::resharding); +} + +/** + * Check to see if the bucket metadata could be synced + * bucket: the bucket to check + * Returns false is the bucket is not synced + */ +bool RGWSI_Zone::is_syncing_bucket_meta(const rgw_bucket& bucket) +{ + + /* no current period */ + if (current_period->get_id().empty()) { + return false; + } + + /* zonegroup is not master zonegroup */ + if (!zonegroup->is_master_zonegroup()) { + return false; + } + + /* single zonegroup and a single zone */ + if (current_period->is_single_zonegroup() && zonegroup->zones.size() == 1) { + return false; + } + + /* zone is not master */ + if (zonegroup->master_zone != zone_public_config->id) { + return false; + } + + return true; +} + + +int RGWSI_Zone::select_new_bucket_location(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const string& zonegroup_id, + const rgw_placement_rule& request_rule, + rgw_placement_rule *pselected_rule_name, RGWZonePlacementInfo *rule_info, + optional_yield y) +{ + /* first check that zonegroup exists within current period. */ + RGWZoneGroup zonegroup; + int ret = get_zonegroup(zonegroup_id, zonegroup); + if (ret < 0) { + ldpp_dout(dpp, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl; + return ret; + } + + const rgw_placement_rule *used_rule; + + /* find placement rule. 
Hierarchy: request rule > user default rule > zonegroup default rule */ + std::map::const_iterator titer; + + if (!request_rule.name.empty()) { + used_rule = &request_rule; + titer = zonegroup.placement_targets.find(request_rule.name); + if (titer == zonegroup.placement_targets.end()) { + ldpp_dout(dpp, 0) << "could not find requested placement id " << request_rule + << " within zonegroup " << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + } else if (!user_info.default_placement.name.empty()) { + used_rule = &user_info.default_placement; + titer = zonegroup.placement_targets.find(user_info.default_placement.name); + if (titer == zonegroup.placement_targets.end()) { + ldpp_dout(dpp, 0) << "could not find user default placement id " << user_info.default_placement + << " within zonegroup " << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + } else { + if (zonegroup.default_placement.name.empty()) { // zonegroup default rule as fallback, it should not be empty. + ldpp_dout(dpp, 0) << "misconfiguration, zonegroup default placement id should not be empty." 
<< dendl; + return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION; + } else { + used_rule = &zonegroup.default_placement; + titer = zonegroup.placement_targets.find(zonegroup.default_placement.name); + if (titer == zonegroup.placement_targets.end()) { + ldpp_dout(dpp, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement + << " within zonegroup " << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + } + } + + /* now check tag for the rule, whether user is permitted to use rule */ + const auto& target_rule = titer->second; + if (!target_rule.user_permitted(user_info.placement_tags)) { + ldpp_dout(dpp, 0) << "user not permitted to use placement rule " << titer->first << dendl; + return -EPERM; + } + + const string *storage_class = &request_rule.storage_class; + + if (storage_class->empty()) { + storage_class = &used_rule->storage_class; + } + + rgw_placement_rule rule(titer->first, *storage_class); + + if (pselected_rule_name) { + *pselected_rule_name = rule; + } + + return select_bucket_location_by_rule(dpp, rule, rule_info, y); +} + +int RGWSI_Zone::select_bucket_location_by_rule(const DoutPrefixProvider *dpp, const rgw_placement_rule& location_rule, RGWZonePlacementInfo *rule_info, optional_yield y) +{ + if (location_rule.name.empty()) { + /* we can only reach here if we're trying to set a bucket location from a bucket + * created on a different zone, using a legacy / default pool configuration + */ + if (rule_info) { + return select_legacy_bucket_placement(dpp, rule_info, y); + } + + return 0; + } + + /* + * make sure that zone has this rule configured. We're + * checking it for the local zone, because that's where this bucket object is going to + * reside. 
+ */ + auto piter = zone_params->placement_pools.find(location_rule.name); + if (piter == zone_params->placement_pools.end()) { + /* couldn't find, means we cannot really place data for this bucket in this zone */ + ldpp_dout(dpp, 0) << "ERROR: This zone does not contain placement rule " + << location_rule << " present in the zonegroup!" << dendl; + return -EINVAL; + } + + auto storage_class = location_rule.get_storage_class(); + if (!piter->second.storage_class_exists(storage_class)) { + ldpp_dout(dpp, 5) << "requested storage class does not exist: " << storage_class << dendl; + return -EINVAL; + } + + + RGWZonePlacementInfo& placement_info = piter->second; + + if (rule_info) { + *rule_info = placement_info; + } + + return 0; +} + +int RGWSI_Zone::select_bucket_placement(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const string& zonegroup_id, + const rgw_placement_rule& placement_rule, + rgw_placement_rule *pselected_rule, RGWZonePlacementInfo *rule_info, + optional_yield y) +{ + if (!zone_params->placement_pools.empty()) { + return select_new_bucket_location(dpp, user_info, zonegroup_id, placement_rule, + pselected_rule, rule_info, y); + } + + if (pselected_rule) { + pselected_rule->clear(); + } + + if (rule_info) { + return select_legacy_bucket_placement(dpp, rule_info, y); + } + + return 0; +} + +int RGWSI_Zone::select_legacy_bucket_placement(const DoutPrefixProvider *dpp, RGWZonePlacementInfo *rule_info, + optional_yield y) +{ + bufferlist map_bl; + map m; + string pool_name; + bool write_map = false; + + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + + auto sysobj = sysobj_svc->get_obj(obj); + int ret = sysobj.rop().read(dpp, &map_bl, y); + if (ret < 0) { + goto read_omap; + } + + try { + auto iter = map_bl.cbegin(); + decode(m, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode avail_pools" << dendl; + } + +read_omap: + if (m.empty()) { + ret = sysobj.omap().get_all(dpp, &m, y); + + write_map = 
true; + } + + if (ret < 0 || m.empty()) { + vector pools; + string s = string("default.") + default_storage_pool_suffix; + pools.push_back(rgw_pool(s)); + vector retcodes; + bufferlist bl; + ret = rados_svc->pool().create(dpp, pools, &retcodes); + if (ret < 0) + return ret; + ret = sysobj.omap().set(dpp, s, bl, y); + if (ret < 0) + return ret; + m[s] = bl; + } + + if (write_map) { + bufferlist new_bl; + encode(m, new_bl); + ret = sysobj.wop().write(dpp, new_bl, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl; + } + } + + auto miter = m.begin(); + if (m.size() > 1) { + // choose a pool at random + auto r = ceph::util::generate_random_number(0, m.size() - 1); + std::advance(miter, r); + } + pool_name = miter->first; + + rgw_pool pool = pool_name; + + rule_info->storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &pool, nullptr); + rule_info->data_extra_pool = pool_name; + rule_info->index_pool = pool_name; + rule_info->index_type = rgw::BucketIndexType::Normal; + + return 0; +} + +int RGWSI_Zone::update_placement_map(const DoutPrefixProvider *dpp, optional_yield y) +{ + bufferlist header; + map m; + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + + auto sysobj = sysobj_svc->get_obj(obj); + int ret = sysobj.omap().get_all(dpp, &m, y); + if (ret < 0) + return ret; + + bufferlist new_bl; + encode(m, new_bl); + ret = sysobj.wop().write(dpp, new_bl, y); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl; + } + + return ret; +} + +int RGWSI_Zone::add_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& new_pool, optional_yield y) +{ + int ret = rados_svc->pool(new_pool).lookup(); + if (ret < 0) { // DNE, or something + return ret; + } + + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + auto sysobj = sysobj_svc->get_obj(obj); + + bufferlist empty_bl; + ret = sysobj.omap().set(dpp, new_pool.to_str(), empty_bl, y); + 
+ // don't care about return value + update_placement_map(dpp, y); + + return ret; +} + +int RGWSI_Zone::remove_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& old_pool, optional_yield y) +{ + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + auto sysobj = sysobj_svc->get_obj(obj); + int ret = sysobj.omap().del(dpp, old_pool.to_str(), y); + + // don't care about return value + update_placement_map(dpp, y); + + return ret; +} + +int RGWSI_Zone::list_placement_set(const DoutPrefixProvider *dpp, set& names, optional_yield y) +{ + bufferlist header; + map m; + + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + auto sysobj = sysobj_svc->get_obj(obj); + int ret = sysobj.omap().get_all(dpp, &m, y); + if (ret < 0) + return ret; + + names.clear(); + map::iterator miter; + for (miter = m.begin(); miter != m.end(); ++miter) { + names.insert(rgw_pool(miter->first)); + } + + return names.size(); +} + +bool RGWSI_Zone::get_redirect_zone_endpoint(string *endpoint) +{ + if (zone_public_config->redirect_zone.empty()) { + return false; + } + + auto iter = zone_conn_map.find(zone_public_config->redirect_zone); + if (iter == zone_conn_map.end()) { + ldout(cct, 0) << "ERROR: cannot find entry for redirect zone: " << zone_public_config->redirect_zone << dendl; + return false; + } + + RGWRESTConn *conn = iter->second; + + int ret = conn->get_url(*endpoint); + if (ret < 0) { + ldout(cct, 0) << "ERROR: redirect zone, conn->get_endpoint() returned ret=" << ret << dendl; + return false; + } + + return true; +} + diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h new file mode 100644 index 000000000..7b0a277c4 --- /dev/null +++ b/src/rgw/services/svc_zone.h @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" + + +class RGWSI_RADOS; +class RGWSI_SysObj; +class RGWSI_SyncModules; +class RGWSI_Bucket_Sync; + +class RGWRealm; 
+class RGWZoneGroup; +class RGWZone; +class RGWZoneParams; +class RGWPeriod; +class RGWZonePlacementInfo; + +class RGWBucketSyncPolicyHandler; + +class RGWRESTConn; + +struct rgw_sync_policy_info; + +class RGWSI_Zone : public RGWServiceInstance +{ + friend struct RGWServices_Def; + + RGWSI_SysObj *sysobj_svc{nullptr}; + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_SyncModules *sync_modules_svc{nullptr}; + RGWSI_Bucket_Sync *bucket_sync_svc{nullptr}; + + RGWRealm *realm{nullptr}; + RGWZoneGroup *zonegroup{nullptr}; + RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */ + RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */ + RGWPeriod *current_period{nullptr}; + rgw_zone_id cur_zone_id; + uint32_t zone_short_id{0}; + bool writeable_zone{false}; + bool exports_data{false}; + + std::shared_ptr sync_policy_handler; + std::map > sync_policy_handlers; + + RGWRESTConn *rest_master_conn{nullptr}; + std::map zone_conn_map; + std::vector data_sync_source_zones; + std::map zone_data_notify_to_map; + std::map zonegroup_conn_map; + + std::map zone_id_by_name; + std::map zone_by_id; + + std::unique_ptr sync_policy; + + void init(RGWSI_SysObj *_sysobj_svc, + RGWSI_RADOS *_rados_svc, + RGWSI_SyncModules *_sync_modules_svc, + RGWSI_Bucket_Sync *_bucket_sync_svc); + int do_start(optional_yield y, const DoutPrefixProvider *dpp) override; + void shutdown() override; + + int init_zg_from_period(const DoutPrefixProvider *dpp, optional_yield y); + int init_zg_from_local(const DoutPrefixProvider *dpp, optional_yield y); + + int update_placement_map(const DoutPrefixProvider *dpp, optional_yield y); + + int create_default_zg(const DoutPrefixProvider *dpp, optional_yield y); + int init_default_zone(const DoutPrefixProvider *dpp, optional_yield y); + + int search_realm_with_zone(const DoutPrefixProvider *dpp, + const rgw_zone_id& zid, + RGWRealm *prealm, + RGWPeriod *pperiod, + RGWZoneGroup *pzonegroup, + bool *pfound, 
+ optional_yield y); +public: + RGWSI_Zone(CephContext *cct); + ~RGWSI_Zone(); + + const RGWZoneParams& get_zone_params() const; + const RGWPeriod& get_current_period() const; + const RGWRealm& get_realm() const; + const RGWZoneGroup& get_zonegroup() const; + int get_zonegroup(const std::string& id, RGWZoneGroup& zonegroup) const; + const RGWZone& get_zone() const; + + std::shared_ptr get_sync_policy_handler(std::optional zone = std::nullopt) const; + + const std::string& zone_name() const; + const rgw_zone_id& zone_id() const { + return cur_zone_id; + } + uint32_t get_zone_short_id() const; + + const std::string& get_current_period_id() const; + bool has_zonegroup_api(const std::string& api) const; + + bool zone_is_writeable(); + bool zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const; + bool zone_syncs_from(const RGWZone& source_zone) const; + bool get_redirect_zone_endpoint(std::string *endpoint); + bool sync_module_supports_writes() const { return writeable_zone; } + bool sync_module_exports_data() const { return exports_data; } + + RGWRESTConn *get_master_conn() { + return rest_master_conn; + } + + std::map& get_zonegroup_conn_map() { + return zonegroup_conn_map; + } + + std::map& get_zone_conn_map() { + return zone_conn_map; + } + + std::vector& get_data_sync_source_zones() { + return data_sync_source_zones; + } + + std::map& get_zone_data_notify_to_map() { + return zone_data_notify_to_map; + } + + RGWZone* find_zone(const rgw_zone_id& id); + + RGWRESTConn *get_zone_conn(const rgw_zone_id& zone_id); + RGWRESTConn *get_zone_conn_by_name(const std::string& name); + bool find_zone_id_by_name(const std::string& name, rgw_zone_id *id); + + int select_bucket_placement(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const std::string& zonegroup_id, + const rgw_placement_rule& rule, + rgw_placement_rule *pselected_rule, RGWZonePlacementInfo *rule_info, optional_yield y); + int select_legacy_bucket_placement(const 
DoutPrefixProvider *dpp, RGWZonePlacementInfo *rule_info, optional_yield y); + int select_new_bucket_location(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const std::string& zonegroup_id, + const rgw_placement_rule& rule, + rgw_placement_rule *pselected_rule_name, RGWZonePlacementInfo *rule_info, + optional_yield y); + int select_bucket_location_by_rule(const DoutPrefixProvider *dpp, const rgw_placement_rule& location_rule, RGWZonePlacementInfo *rule_info, optional_yield y); + + int add_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& new_pool, optional_yield y); + int remove_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& old_pool, optional_yield y); + int list_placement_set(const DoutPrefixProvider *dpp, std::set& names, optional_yield y); + + bool is_meta_master() const; + + bool need_to_sync() const; + bool need_to_log_data() const; + bool need_to_log_metadata() const; + bool can_reshard() const; + bool is_syncing_bucket_meta(const rgw_bucket& bucket); + + int list_zonegroups(const DoutPrefixProvider *dpp, std::list& zonegroups); + int list_regions(const DoutPrefixProvider *dpp, std::list& regions); + int list_zones(const DoutPrefixProvider *dpp, std::list& zones); + int list_realms(const DoutPrefixProvider *dpp, std::list& realms); + int list_periods(const DoutPrefixProvider *dpp, std::list& periods); + int list_periods(const DoutPrefixProvider *dpp, const std::string& current_period, std::list& periods, optional_yield y); +}; diff --git a/src/rgw/services/svc_zone_utils.cc b/src/rgw/services/svc_zone_utils.cc new file mode 100644 index 000000000..712bb97c9 --- /dev/null +++ b/src/rgw/services/svc_zone_utils.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "svc_zone_utils.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw_zone.h" + +using namespace std; + +int RGWSI_ZoneUtils::do_start(optional_yield, const 
DoutPrefixProvider *dpp) +{ + init_unique_trans_id_deps(); + + return 0; +} + +string RGWSI_ZoneUtils::gen_host_id() { + /* uint64_t needs 16, two '-' separators and a trailing null */ + const string& zone_name = zone_svc->get_zone().name; + const string& zonegroup_name = zone_svc->get_zonegroup().get_name(); + char charbuf[16 + zone_name.size() + zonegroup_name.size() + 2 + 1]; + snprintf(charbuf, sizeof(charbuf), "%llx-%s-%s", (unsigned long long)rados_svc->instance_id(), zone_name.c_str(), zonegroup_name.c_str()); + return string(charbuf); +} + +string RGWSI_ZoneUtils::unique_id(uint64_t unique_num) +{ + char buf[32]; + snprintf(buf, sizeof(buf), ".%llu.%llu", (unsigned long long)rados_svc->instance_id(), (unsigned long long)unique_num); + string s = zone_svc->get_zone_params().get_id() + buf; + return s; +} + +void RGWSI_ZoneUtils::init_unique_trans_id_deps() { + char buf[16 + 2 + 1]; /* uint64_t needs 16, 2 hyphens add further 2 */ + + snprintf(buf, sizeof(buf), "-%llx-", (unsigned long long)rados_svc->instance_id()); + url_encode(string(buf) + zone_svc->get_zone().name, trans_id_suffix); +} + +/* In order to preserve compatibility with Swift API, transaction ID + * should contain at least 32 characters satisfying following spec: + * - first 21 chars must be in range [0-9a-f]. Swift uses this + * space for storing fragment of UUID obtained through a call to + * uuid4() function of Python's uuid module; + * - char no. 22 must be a hyphen; + * - at least 10 next characters constitute hex-formatted timestamp + * padded with zeroes if necessary. All bytes must be in [0-9a-f] + * range; + * - last, optional part of transaction ID is any url-encoded string + * without restriction on length. 
*/ +string RGWSI_ZoneUtils::unique_trans_id(const uint64_t unique_num) { + char buf[41]; /* 2 + 21 + 1 + 16 (timestamp can consume up to 16) + 1 */ + time_t timestamp = time(NULL); + + snprintf(buf, sizeof(buf), "tx%021llx-%010llx", + (unsigned long long)unique_num, + (unsigned long long)timestamp); + + return string(buf) + trans_id_suffix; +} + diff --git a/src/rgw/services/svc_zone_utils.h b/src/rgw/services/svc_zone_utils.h new file mode 100644 index 000000000..43e3fee8d --- /dev/null +++ b/src/rgw/services/svc_zone_utils.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_service.h" + + +class RGWSI_RADOS; +class RGWSI_Zone; + +class RGWSI_ZoneUtils : public RGWServiceInstance +{ + friend struct RGWServices_Def; + + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_Zone *zone_svc{nullptr}; + + std::string trans_id_suffix; + + void init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc) { + rados_svc = _rados_svc; + zone_svc = _zone_svc; + } + + int do_start(optional_yield, const DoutPrefixProvider *dpp) override; + + void init_unique_trans_id_deps(); + +public: + RGWSI_ZoneUtils(CephContext *cct): RGWServiceInstance(cct) {} + + std::string gen_host_id(); + std::string unique_id(uint64_t unique_num); + + std::string unique_trans_id(const uint64_t unique_num); +}; -- cgit v1.2.3